In [1]:
from sklearn.svm import SVC

In [2]:
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict,cross_val_score
from sklearn import model_selection
from sklearn.model_selection import KFold

import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline,make_pipeline

from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
import pandas as pd

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

In [22]:
from sklearn.metrics import precision_score, recall_score, f1_score,classification_report

In [18]:
import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

In [4]:
sms = pd.read_table("sms.tsv",header = None,names = ["label","message"])

In [5]:
sms["label"] = sms.label.map({'ham':0,"spam":1})

In [7]:
X = sms.message
y = sms.label

In [8]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 1000,random_state = 123)

In [28]:
pipeline_svm = Pipeline([('counter',CountVectorizer(max_df=0.8,min_df=0.0)),
                       ('scaler',StandardScaler(with_mean=False)),
                        ('svm',SVC())])
param_grid = [{'svm__kernel':['rbf'],
                  'svm__gamma':[0.0,0.5],
                  'svm__C':[1.0,10.0,0.5]},
                 {'svm__kernel':['poly'],
                  'svm__degree':[1,2],
                 'svm__C':[1.0,10.0,0.5]},
                 {'svm__kernel':['sigmoid'],
                 'svm__C':[1.0,10.0,0.5]}]


In [29]:
grid = GridSearchCV(pipeline_svm,
                    param_grid,
                    cv=5,
                    refit=True,
                    n_jobs=3)
grid.fit(X_train,y_train)
    
print(grid.best_params_)
print(grid.best_score_)

{'svm__C': 1.0, 'svm__kernel': 'sigmoid'}
0.979440069991


In [30]:
best_model = grid.best_estimator_

In [31]:
print(classification_report(y_pred=best_model.predict(X_test),y_true=y_test))

             precision    recall  f1-score   support

          0       0.98      1.00      0.99       860
          1       1.00      0.87      0.93       140

avg / total       0.98      0.98      0.98      1000



In [32]:
accuracy_score(grid.predict(X_test),y_test)

0.98199999999999998

In [36]:
grid.grid_scores_



[mean: 0.86724, std: 0.00046, params: {'svm__C': 1.0, 'svm__gamma': 0.0, 'svm__kernel': 'rbf'},
 mean: 0.89808, std: 0.00591, params: {'svm__C': 1.0, 'svm__gamma': 0.5, 'svm__kernel': 'rbf'},
 mean: 0.86724, std: 0.00046, params: {'svm__C': 10.0, 'svm__gamma': 0.0, 'svm__kernel': 'rbf'},
 mean: 0.89829, std: 0.00625, params: {'svm__C': 10.0, 'svm__gamma': 0.5, 'svm__kernel': 'rbf'},
 mean: 0.86724, std: 0.00046, params: {'svm__C': 0.5, 'svm__gamma': 0.0, 'svm__kernel': 'rbf'},
 mean: 0.87095, std: 0.00074, params: {'svm__C': 0.5, 'svm__gamma': 0.5, 'svm__kernel': 'rbf'},
 mean: 0.97266, std: 0.00703, params: {'svm__C': 1.0, 'svm__degree': 1, 'svm__kernel': 'poly'},
 mean: 0.92082, std: 0.00441, params: {'svm__C': 1.0, 'svm__degree': 2, 'svm__kernel': 'poly'},
 mean: 0.97332, std: 0.00681, params: {'svm__C': 10.0, 'svm__degree': 1, 'svm__kernel': 'poly'},
 mean: 0.92542, std: 0.00423, params: {'svm__C': 10.0, 'svm__degree': 2, 'svm__kernel': 'poly'},
 mean: 0.96982, std: 0.00725, params