# Import Libraries

In [123]:
import pandas as pd
import numpy as np

In [112]:
from sklearn.naive_bayes import *
from sklearn.dummy import *
from sklearn.ensemble import *
from sklearn.neighbors import *
from sklearn.tree import *
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.calibration import *
from sklearn.linear_model import *
from sklearn.multiclass import *
from sklearn.svm import *

Load the SMS spam dataset into a pandas DataFrame.

In [113]:
# Read the raw SMS CSV. The file is decoded as latin-1 rather than the
# pandas default encoding.
df = pd.read_csv(
    'data/sms-spam-collection-dataset/spam.csv',
    encoding="latin-1",
)
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


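Remove the columns `Unnamed: 2`, `Unnamed: 3` and `Unnamed: 4`, as they contain no useful information. Then rename `v2` to `SMS`, encode `v1` as a numeric `label` (ham = 0, spam = 1), and split the rows into a training set (first 4400 rows) and a test set (the rest).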

In [114]:
# Reduce the raw frame to two columns: the message text (`SMS`) and a numeric
# target (`label`, ham -> 0, spam -> 1).
df = (
    df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])
      .assign(
          SMS=lambda frame: frame['v2'],
          label=lambda frame: frame['v1'].map({'ham': 0, 'spam': 1}),
      )
      .drop(columns=['v1', 'v2'])
)

# Positional hold-out split: the first 4400 rows train, the remainder test.
train_data = df.iloc[:4400]
test_data = df.iloc[4400:]
df.head()

Unnamed: 0,SMS,label
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [116]:
def perform(classifiers, vectorizers, train_data, test_data):
    """Score every classifier/vectorizer pairing and print the best one.

    Each vectorizer is fitted on ``train_data.SMS`` and used to transform
    ``test_data.SMS``. Every classifier is then trained on each set of
    training features (with ``train_data.label``) and evaluated with its own
    ``score`` method on the matching test features (with ``test_data.label``).
    One line is printed per pairing, followed by a banner naming the pairing
    with the highest score.

    Parameters
    ----------
    classifiers : iterable
        Estimators exposing ``fit(X, y)`` and ``score(X, y)``.
    vectorizers : iterable
        Text vectorizers exposing ``fit_transform(texts)`` and
        ``transform(texts)``.
    train_data, test_data
        Objects exposing ``SMS`` (texts) and ``label`` (targets) attributes.

    Returns
    -------
    None
        Results are only printed. If no pairing was evaluated, the banner
        shows ``None None``.
    """
    # The features do not depend on the classifier, so vectorize once per
    # vectorizer instead of once per (classifier, vectorizer) pair.
    features = []
    for vectorizer in vectorizers:
        train_features = vectorizer.fit_transform(train_data.SMS)
        test_features = vectorizer.transform(test_data.SMS)
        features.append(
            (vectorizer.__class__.__name__, train_features, test_features)
        )

    max_score = None
    max_name = None
    for classifier in classifiers:
        for vectorizer_name, train_features, test_features in features:
            # train
            classifier.fit(train_features, train_data.label)

            # score
            score = classifier.score(test_features, test_data.label)
            name = classifier.__class__.__name__ + ' with ' + vectorizer_name
            print(name, score)

            # Compare inside the inner loop so that every pairing competes,
            # not just the last vectorizer tried for each classifier.
            if max_score is None or score > max_score:
                max_score = score
                max_name = name
    print('===========================================')
    print('===========================================')
    print(max_name, max_score)
    print('===========================================')
    print('===========================================')

List of the classifiers we are going to compare.

In [117]:
# Fixed seed for the estimators that involve randomness, so that the reported
# scores are the same on every Restart & Run All.
SEED = 42

# Candidate models, evaluated in this order. Estimators without a
# `random_state` argument below are left at their defaults.
# NOTE(review): CalibratedClassifierCV() uses its default base estimator,
# which may itself be randomized; it is not seeded here.
classifiers = [
    BernoulliNB(),
    RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=SEED),
    AdaBoostClassifier(random_state=SEED),
    BaggingClassifier(random_state=SEED),
    ExtraTreesClassifier(random_state=SEED),
    GradientBoostingClassifier(random_state=SEED),
    DecisionTreeClassifier(random_state=SEED),
    CalibratedClassifierCV(),
    DummyClassifier(random_state=SEED),
    PassiveAggressiveClassifier(random_state=SEED),
    RidgeClassifier(),
    RidgeClassifierCV(),
    SGDClassifier(random_state=SEED),
    OneVsRestClassifier(SVC(kernel='linear')),
    OneVsRestClassifier(LogisticRegression()),
    KNeighborsClassifier(),
]

List of the text vectorizers we are going to compare.

In [118]:
# Text-to-feature encoders paired with every classifier; each is constructed
# with no arguments.
vectorizers = [
    CountVectorizer(),
    TfidfVectorizer(),
    HashingVectorizer(),
]

Perform classification with every classifier/vectorizer combination and print the test-set score of each, followed by the best-scoring combination.

In [119]:
# Run the full comparison grid on the train/test split defined earlier.
perform(classifiers, vectorizers, train_data, test_data)

BernoulliNB with CountVectorizer 0.9778156996587031
BernoulliNB with TfidfVectorizer 0.9778156996587031
BernoulliNB with HashingVectorizer 0.8728668941979523
RandomForestClassifier with CountVectorizer 0.976962457337884
RandomForestClassifier with TfidfVectorizer 0.9761092150170648
RandomForestClassifier with HashingVectorizer 0.9658703071672355
AdaBoostClassifier with CountVectorizer 0.9718430034129693
AdaBoostClassifier with TfidfVectorizer 0.9692832764505119
AdaBoostClassifier with HashingVectorizer 0.9735494880546075
BaggingClassifier with CountVectorizer 0.9684300341296929
BaggingClassifier with TfidfVectorizer 0.9658703071672355
BaggingClassifier with HashingVectorizer 0.9701365187713311
ExtraTreesClassifier with CountVectorizer 0.9726962457337884
ExtraTreesClassifier with TfidfVectorizer 0.9778156996587031
ExtraTreesClassifier with HashingVectorizer 0.9650170648464164
GradientBoostingClassifier with CountVectorizer 0.9709897610921502
GradientBoostingClassifier with TfidfVectoriz

In [120]:
# Fit the final model on the training split: a linear-kernel SVC wrapped in
# one-vs-rest, on TF-IDF features.
# NOTE(review): the recorded comparison output is cut off before the SVC rows,
# so confirm that this pairing really is the top scorer.
# Unlike the SVC in the comparison list, this one sets probability=True.
Classifier = OneVsRestClassifier(SVC(kernel='linear', probability=True))
Vectorizer = TfidfVectorizer()
# Fit the vectorizer on the training messages only; later cells reuse it.
vectorize_text = Vectorizer.fit_transform(train_data.SMS)
# Last expression of the cell, so the fitted estimator is displayed as output.
Classifier.fit(vectorize_text, train_data.label)

OneVsRestClassifier(estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=True, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
          n_jobs=None)

In [121]:
# Hand-written sample message used to try out the fitted model.
SMS = ' won a 1 week FREE membership in our $100,000 Prize Jackpot! Txt the word: C'
# transform (not fit_transform): reuse the vectorizer as fitted on the training split.
vectorize_message = Vectorizer.transform([SMS])
# A single message was passed in, so take the first (only) predicted label.
predict = Classifier.predict(vectorize_message)[0]

In [122]:
# Translate the numeric label back to its name (0 was mapped from 'ham').
print('ham' if predict == 0 else 'spam')

spam
