# Logistic Regression

In [1]:
import pandas as pd
import numpy as np
# Seed NumPy's global RNG so any step relying on it is repeatable.
np.random.seed(seed=1)

# Read the spam e-mail dataset from disk into a DataFrame.
df = pd.read_csv(filepath_or_buffer='output/spam_email.csv')

In [2]:
# Split the data into training and test samples.
from sklearn.model_selection import train_test_split

# Predictors are every column except the target; the target is the 'spam' column.
df_predictors = df.drop('spam', axis = 1)
df_predicted = df['spam']

# An explicit random_state makes this cell idempotent. Without it the split is
# drawn from NumPy's global RNG, so re-running the cell (or running cells out
# of order) silently yields a different train/test partition and different
# scores in the cells that follow. The test size is left at the library default.
X_train, X_test, y_train, y_test = train_test_split(
    df_predictors, df_predicted, random_state=1
)

In [3]:
# Fit a logistic-regression classifier on the training sample.
from sklearn.linear_model import LogisticRegression

LRmodel = LogisticRegression(
    solver='lbfgs',
    max_iter=2000,  # iteration cap handed to the solver
)
LRmodel.fit(X=X_train, y=y_train)

In [4]:
# Score the fitted model on the held-out test sample.
LRmodel.score(X=X_test, y=y_test)

0.9348392701998263

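The model correctly classifies about 93.48% of the emails in the held-out test set (overall accuracy over spam and non-spam messages, not the share of spam detected).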

In [5]:
# Test-set error.
from sklearn.metrics import mean_squared_error

# Only test-set predictions are needed; predicting on the training sample and
# discarding the result would just be wasted work.
# NOTE(review): the value printed here equals 1 - the accuracy reported above,
# which is what MSE gives for 0/1 labels (i.e. the misclassification rate) —
# confirm the label encoding.
mean_squared_error(y_test, LRmodel.predict(X_test))

0.06516072980017376

# Resampling methods

## Cross Validation

In [21]:
# Cross-validated scores of the model on the training sample, using 20 folds.
from sklearn.model_selection import cross_val_score

scores = cross_val_score(estimator=LRmodel, X=X_train, y=y_train, cv=20)

In [22]:
def display_scores(scores):
    """Print the scores, then their mean and standard deviation.

    `scores` must provide `.mean()` and `.std()` (for example a NumPy array).
    Each statistic is printed on its own line, preceded by a blank line.
    """
    report = (
        ("Scores:", scores),
        ("\nMean:", scores.mean()),
        ("\nStd deviation:", scores.std()),
    )
    for label, value in report:
        print(label, value)

# Print the per-fold scores from cross_val_score with their mean and standard deviation.
display_scores(scores)

Scores: [0.9017341  0.94797688 0.89017341 0.90751445 0.93063584 0.90751445
 0.94219653 0.93063584 0.89017341 0.9017341  0.90697674 0.94186047
 0.93023256 0.93023256 0.93604651 0.93604651 0.94186047 0.95348837
 0.93604651 0.94767442]

Mean: 0.9255377066810058

Std deviation: 0.019546376020078658


## K-Fold

In [94]:
from sklearn.model_selection import KFold

# 20-fold cross-validation over the full dataset, shuffled with a fixed seed
# so the fold assignment is repeatable.
kf = KFold(n_splits=20, shuffle = True, random_state = 1)

fold_records = []
for train, test in kf.split(df_predictors):
    # `train` / `test` are positional row indices, hence .iloc.
    X_traink, X_testk = df_predictors.iloc[train], df_predictors.iloc[test]
    y_traink, y_testk = df_predicted.iloc[train], df_predicted.iloc[test]

    # A fresh estimator per fold, bound to its own name: rebinding `LRmodel`
    # here would silently replace the model fitted on the train/test split
    # that the earlier cells score.
    fold_model = LogisticRegression(solver='lbfgs', max_iter=3000)
    fold_model.fit(X_traink, y_traink)

    fold_records.append({
        'train': train.shape[0],   # number of training rows in this fold
        'test': test.shape[0],     # number of held-out rows in this fold
        'accuracy': fold_model.score(X_testk, y_testk),
    })

# Build the summary table once from the collected per-fold records.
results = pd.DataFrame(fold_records, columns=['train', 'test', 'accuracy'])

print(results)
print('average accuracy:', results['accuracy'].mean())
print()

    train  test  accuracy
0    4370   231  0.922078
1    4371   230  0.934783
2    4371   230  0.917391
3    4371   230  0.956522
4    4371   230  0.939130
5    4371   230  0.926087
6    4371   230  0.913043
7    4371   230  0.926087
8    4371   230  0.921739
9    4371   230  0.921739
10   4371   230  0.943478
11   4371   230  0.873913
12   4371   230  0.913043
13   4371   230  0.926087
14   4371   230  0.930435
15   4371   230  0.934783
16   4371   230  0.939130
17   4371   230  0.939130
18   4371   230  0.934783
19   4371   230  0.956522
average accuracy: 0.9284952004517221



## Repeated K-Fold

In [92]:
from sklearn.model_selection import KFold

# Repeat the 20-fold cross-validation with four different shuffles
# (random_state 1 through 4) and average the per-repetition mean accuracies.
acc_mean = []
for rep in range(1, 5):
    kf = KFold(n_splits=20, shuffle = True, random_state = rep)

    fold_records = []
    for train, test in kf.split(df_predictors):
        # `train` / `test` are positional row indices, hence .iloc.
        X_traink, X_testk = df_predictors.iloc[train], df_predictors.iloc[test]
        y_traink, y_testk = df_predicted.iloc[train], df_predicted.iloc[test]

        # A fresh estimator per fold, bound to its own name: rebinding
        # `LRmodel` here would silently replace the model fitted on the
        # train/test split that the earlier cells score.
        fold_model = LogisticRegression(solver='lbfgs', max_iter=3000)
        fold_model.fit(X_traink, y_traink)

        fold_records.append({
            'train': train.shape[0],   # number of training rows in this fold
            'test': test.shape[0],     # number of held-out rows in this fold
            'accuracy': fold_model.score(X_testk, y_testk),
        })

    # Build the per-repetition table once from the collected fold records.
    results = pd.DataFrame(fold_records, columns=['train', 'test', 'accuracy'])
    # Mean accuracy of this repetition, computed once and reused below.
    rep_accuracy = results['accuracy'].mean()

    print('random_state', rep)
    print(results)
    print('average accuracy:', rep_accuracy)
    print()
    acc_mean.append(rep_accuracy)

print('final average accuracy among folds:', np.mean(acc_mean))

random_state 1
    train  test  accuracy
0    4370   231  0.922078
1    4371   230  0.934783
2    4371   230  0.917391
3    4371   230  0.956522
4    4371   230  0.939130
5    4371   230  0.926087
6    4371   230  0.913043
7    4371   230  0.926087
8    4371   230  0.921739
9    4371   230  0.921739
10   4371   230  0.943478
11   4371   230  0.873913
12   4371   230  0.913043
13   4371   230  0.926087
14   4371   230  0.930435
15   4371   230  0.934783
16   4371   230  0.939130
17   4371   230  0.939130
18   4371   230  0.934783
19   4371   230  0.956522
average accuracy: 0.9284952004517221

random_state 2
    train  test  accuracy
0    4370   231  0.948052
1    4371   230  0.960870
2    4371   230  0.886957
3    4371   230  0.917391
4    4371   230  0.913043
5    4371   230  0.913043
6    4371   230  0.956522
7    4371   230  0.930435
8    4371   230  0.960870
9    4371   230  0.926087
10   4371   230  0.956522
11   4371   230  0.891304
12   4371   230  0.917391
13   4371   230  0.917