In [3]:
%run ../include/util.ipynb


import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import precision_score, recall_score, accuracy_score


# /Users/dduru/PythonProjects/data/smsspamcollection/SMSSpamCollection



pipeline = Pipeline([
    ('vect', TfidfVectorizer(stop_words='english')),
    ('clf', LogisticRegression())
])

parameters = {
    'vect__max_df': (0.25, 0.5, 0.75),
    'vect__stop_words': ('english', None),
    'vect__max_features': (2500, 5000, 10000, None),
    'vect__ngram_range': ((1, 1), (1, 2)),
    'vect__use_idf': (True, False),
    'vect__norm': ('l1', 'l2'),
    'clf__penalty': ('l1', 'l2'),
    'clf__C': (0.01, 0.1, 1, 10),
}


df = read_csv_frame(delimiter = '\t', header = None)
X = df[1].values
Y = df[0].values

label_encoder = LabelEncoder()
label_encoder.fit(Y)
Y = label_encoder.transform(Y)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y)


grid_search = GridSearchCV(
    pipeline, 
    parameters, 
    n_jobs=-1, 
    verbose=1, 
    scoring='accuracy', 
    cv=3
)

grid_search.fit(X_train, Y_train)
print('Best score: %0.3f' % grid_search.best_score_)
print('Best Parameter Set:')
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print('t%s: %r' % (param_name, best_parameters[param_name]))
    predictions = grid_search.predict(X_test)
    print('Accuracy: ', accuracy_score(Y_test, predictions))
    print('Precision: ', precision_score(Y_test, predictions))
    print('Recall: ', recall_score(Y_test, predictions))
    


Enter path to CSV: /Users/dduru/PythonProjects/data/smsspamcollection/SMSSpamCollection
Fitting 3 folds for each of 1536 candidates, totalling 4608 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    5.3s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   17.3s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   35.9s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 3192 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 4042 tasks      | elapsed:  7.5min
[Parallel(n_jobs=-1)]: Done 4608 out of 4608 | elapsed:  8.3min finished


Best score: 0.984
Best Parameter Set:
tclf__C: 10
Accuracy:  0.9877961234745154
Precision:  1.0
Recall:  0.9128205128205128
tclf__penalty: 'l2'
Accuracy:  0.9877961234745154
Precision:  1.0
Recall:  0.9128205128205128
tvect__max_df: 0.5
Accuracy:  0.9877961234745154
Precision:  1.0
Recall:  0.9128205128205128
tvect__max_features: 5000
Accuracy:  0.9877961234745154
Precision:  1.0
Recall:  0.9128205128205128
tvect__ngram_range: (1, 2)
Accuracy:  0.9877961234745154
Precision:  1.0
Recall:  0.9128205128205128
tvect__norm: 'l2'
Accuracy:  0.9877961234745154
Precision:  1.0
Recall:  0.9128205128205128
tvect__stop_words: None
Accuracy:  0.9877961234745154
Precision:  1.0
Recall:  0.9128205128205128
tvect__use_idf: True
Accuracy:  0.9877961234745154
Precision:  1.0
Recall:  0.9128205128205128
