In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelBinarizer
from sklearn.pipeline import Pipeline

In [None]:

# Location of the tab-separated SMS Spam Collection file.
# NOTE(review): machine-specific absolute path -- point this at your local copy.
DATA_PATH = '/home/jimbrootan/Desktop/ML_data/smsspamcollection/SMSSpamCollection'

# Fixed seed so the train/test split (and every score reported from it) is
# reproducible across kernel restarts.
RANDOM_STATE = 42

# The file has no header row, so pandas numbers the columns 0, 1, ...
data = pd.read_csv(DATA_PATH, delimiter='\t', header=None)

# Column 1 is used as the model input and column 0 as the target.
X_train, X_test, y_train, y_test = train_test_split(
    data.iloc[:, 1],
    data.iloc[:, 0],
    random_state=RANDOM_STATE,
)

# Encoder used to turn the target column into a 0/1 vector.
label = LabelBinarizer()

# Text -> TF-IDF features -> logistic regression.
# scikit-learn's only built-in stop-word list is the lowercase string
# 'english'; the previous value 'English' is rejected when the vectorizer is
# fitted.
pipeline = Pipeline([
    ('vect', TfidfVectorizer(stop_words='english')),
    ('clf', LogisticRegression()),
])

In [None]:
"""HYPER_PARAMETER && Permutaion and combinations"""

# Hyper-parameter grid. Keys follow the `<pipeline step>__<parameter>`
# convention, so `vect__*` entries configure the TfidfVectorizer step and
# `clf__*` entries configure the LogisticRegression step. Every combination
# of the values below is evaluated.
parameters = {
    'vect__max_df': (0.25, 0.5, 0.75),
    'vect__stop_words': ('english', None),
    'vect__max_features': (2500, 5000, 10000, None),
    'vect__ngram_range': ((1, 1), (1, 2)),
    'vect__use_idf': (True, False),
    'vect__norm': ('l1', 'l2'),
    'clf__penalty': ('l1', 'l2'),
    # LogisticRegression's default solver (lbfgs) cannot fit an L1 penalty;
    # liblinear supports both penalties searched above, so pin it here to keep
    # every candidate in the grid fittable.
    'clf__solver': ('liblinear',),
    # C is the inverse regularisation strength and must be strictly positive.
    # The previous grid contained 0, which is rejected at fit time; 10
    # continues the log-spaced series instead.
    'clf__C': (0.01, 0.1, 1, 10),
}

In [None]:
"""Grid Searching"""

# Encode the training labels. fit_transform returns a 2-D array; column 0 is
# kept so the classifier receives a flat label vector.
y_train = label.fit_transform(y_train)[:, 0]

# Search over `parameters` on the pipeline: accuracy scoring, 3-fold
# cross-validation, all available cores, progress messages enabled.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Report the best cross-validated score, then the winning value of every
# searched parameter in alphabetical order.
print("BEST SCORE : %0.3f" % grid_search.best_score_)
print("BEST PARAMETER SET: : ")
best_params = grid_search.best_estimator_.get_params()
for name in sorted(parameters.keys()):
    print('%s :%r' % (name, best_params[name]))

In [None]:
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score

# Encode the held-out labels as a flat vector (column 0 of the 2-D output).
# NOTE(review): fit_transform refits the binarizer on the test labels rather
# than reusing the mapping fitted on the training labels -- confirm whether
# transform() is what is wanted here.
y_test = label.fit_transform(y_test)[:, 0]

# Predict with the refitted best estimator from the grid search.
pred = grid_search.predict(X_test)

# Each entry pairs an output template with the metric that fills it in.
reports = (
    ("ACCURACY_SCORE: %0.5f", accuracy_score),
    ("PRECISION_SCORE: %0.5f", precision_score),
    ("RECALL_SCORE: %0.5f", recall_score),
    ("F1_SCORE: %0.5f", f1_score),
)
for template, metric in reports:
    print(template % metric(y_test, pred))
