In [1]:
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict,cross_val_score
from sklearn import model_selection
from sklearn.model_selection import KFold

import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline,make_pipeline

from sklearn.tree import DecisionTreeClassifier

In [2]:
from sklearn.naive_bayes import MultinomialNB
import pandas as pd

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

In [55]:
import warnings
from sklearn.exceptions import DataConversionWarning
# Silence DataConversionWarning messages for the remainder of the session.
warnings.filterwarnings('ignore', category=DataConversionWarning)

In [4]:
# Load the SMS corpus: a tab-separated file without a header row, so the two
# columns are named explicitly.
sms = pd.read_csv(
    "sms.tsv",
    sep="\t",
    header=None,
    names=["label", "message"],
)

In [5]:
# Preview the first five rows of the loaded frame.
sms.head(n=5)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# Encode the string labels as integers (ham -> 0, spam -> 1).
# The numeric-dtype guard makes this cell safe to re-run: applying .map() a
# second time to already-encoded labels would find no matching keys and turn
# every value into NaN.
label_codes = {'ham': 0, "spam": 1}
if not pd.api.types.is_numeric_dtype(sms["label"]):
    sms["label"] = sms["label"].map(label_codes)

### Żeby model działał, musi mieć predykcję lepszą niż 87%!

In [7]:
# Mean of the 0/1-encoded label column, i.e. the share of messages labelled spam.
sms["label"].mean()

0.13406317300789664

In [8]:
# Features are the raw message texts; the target is the label column.
X, y = sms["message"], sms["label"]

In [69]:
# Shape of the feature series, checked before the train/test split.
X.shape

(5572,)

### należy wyrzucić najczęściej występujące i najrzadziej występujące słowa

In [9]:
# Hold out 1000 messages for testing; the fixed random_state keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=1000,
    random_state=123,
)

## Naive Bayes

In [63]:
# Token counts fed into a multinomial Naive Bayes classifier.
pipeline_NB = Pipeline(steps=[
    ('counter', CountVectorizer()),
    ('Bayes', MultinomialNB()),
])

# Search over the vectorizer's document-frequency cut-offs; keys follow the
# "<step name>__<parameter>" convention used by Pipeline.
param_grid_NB = dict(
    counter__max_df=[0.8, 0.9, 1.0],
    counter__min_df=[0.0, 0.02, 0.05],
)

## Logistic regression

In [66]:
# Token counts -> variance scaling -> logistic regression.
pipeline_log = Pipeline([
    ('counter', CountVectorizer()),
    # with_mean=False: scale only, without centering, because centering is not
    # supported on the sparse matrices that CountVectorizer produces.
    ('scaler', StandardScaler(with_mean=False)),
    # The grid below tries both 'l1' and 'l2' penalties. The liblinear solver
    # supports both, whereas the 'lbfgs' default of scikit-learn >= 0.22
    # rejects 'l1', so the solver is pinned explicitly.
    ('logreg', LogisticRegression(solver='liblinear')),
])

# Search over the vectorizer's document-frequency cut-offs plus the penalty
# type and inverse regularisation strength C of the classifier.
param_grid_log = {
    "counter__max_df": [0.8, 0.9, 1.0],
    "counter__min_df": [0.0, 0.02, 0.05],
    "logreg__penalty": ["l1", "l2"],
    'logreg__C': [0.5, 1.0, 5.0],
}

## Decision tree

In [68]:
# Token counts fed into a decision tree classifier.
pipeline_tree = Pipeline([
    ('counter', CountVectorizer()),
    # A fixed random_state makes the fitted tree, and therefore the grid-search
    # scores, reproducible between runs; 123 matches the seed used for the
    # train/test split.
    ('tree', DecisionTreeClassifier(random_state=123)),
])

# Search over the vectorizer's document-frequency cut-offs plus the split
# criterion and the minimum number of samples required to split a node.
param_grid_tree = {
    "counter__max_df": [0.8, 0.9, 1.0],
    "counter__min_df": [0.0, 0.02, 0.05],
    "tree__criterion": ['gini', 'entropy'],
    "tree__min_samples_split": [5, 10, 20],
}

In [62]:
# Parallel collections: display name, parameter grid and pipeline per model.
names = np.array(['NaiveBayes','LogisticRegression','DecisionTreeClasifier'])
# The grids and pipelines are kept in plain lists rather than NumPy arrays:
# they are only iterated with zip(), and wrapping Pipeline objects in np.array
# is fragile because NumPy may try to unpack objects that define
# __len__/__getitem__ (as recent scikit-learn pipelines do) as nested sequences.
params = [param_grid_NB, param_grid_log, param_grid_tree]
pipelines = [pipeline_NB, pipeline_log, pipeline_tree]

In [61]:
# For every model: run a 10-fold cross-validated grid search on the training
# data, then report the selected parameters, the best cross-validation score
# and the accuracy of the refitted best estimator on the held-out test set.
for pipe, param_grid, name in zip(pipelines, params, names):
    print(name)
    grid = GridSearchCV(
        pipe,
        param_grid,
        cv=10,
        refit=True,  # refit the best configuration on all of X_train
        n_jobs=3,
    )
    grid.fit(X_train, y_train)

    print(grid.best_params_)
    print(grid.best_score_)
    # accuracy_score expects (y_true, y_pred); the arguments were previously
    # swapped, which happens to give the same value for accuracy but is wrong
    # for asymmetric metrics.
    print(accuracy_score(y_test, grid.predict(X_test)))

{'counter__max_df': 0.8, 'counter__min_df': 0.0}
0.986220472441
0.986
{'counter__max_df': 0.8, 'counter__min_df': 0.0, 'logreg__C': 1.0, 'logreg__penalty': 'l1'}
0.978783902012
0.978
{'counter__max_df': 1.0, 'counter__min_df': 0.0, 'tree__criterion': 'gini', 'tree__min_samples_split': 5}
0.971128608924
0.961
