In [2]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer built with the library defaults.
to_numeric = CountVectorizer()

In [5]:
# Load the SMS corpus. The file has no header row, so the two columns are
# named explicitly: the class label and the raw message text.
df = pd.read_table(
    'sms.tsv',
    header=None,
    names=['label', 'message'],
)
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
# (rows, columns) of the loaded frame.
df.shape

(5572, 2)

In [10]:
# Encode the string label as an integer target: ham -> 0, spam -> 1.
df['label_num'] = df['label'].map({
    'ham': 0,
    'spam': 1,
})

In [12]:
# Preview the frame to confirm the numeric label column is present.
df.head()

Unnamed: 0,label,message,label_num
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [13]:
# Class balance: number of messages per label.
df['label'].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [14]:
# Features are the raw message text; the target is the 0/1 encoded label.
X, y = df['message'], df['label_num']

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
# Hold-out split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=7,
)

In [20]:
# Report the size of every split, one line per split.
for split_label, split_data in (
    ('X_Train : ', X_train),
    ('y_Train : ', y_train),
    ('X_Test : ', X_test),
    ('y_Test : ', y_test),
):
    print(split_label, split_data.shape)

X_Train :  (4179,)
y_Train :  (4179,)
X_Test :  (1393,)
y_Test :  (1393,)


In [21]:
# Rebind `to_numeric` to a fresh, unfitted vectorizer before fitting it on the
# training split.
to_numeric = CountVectorizer()

In [22]:
# Create the vocabulary from the training messages only, so that no tokens
# from the test split influence the feature space.
to_numeric.fit(X_train)


CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [25]:
# Encode the training messages with the fitted vocabulary; the result is only
# displayed here, not stored.
to_numeric.transform(X_train)

<4179x7535 sparse matrix of type '<class 'numpy.int64'>'
	with 56144 stored elements in Compressed Sparse Row format>

In [26]:
# Document-term matrix for the training split as a DataFrame: one row per
# message, one column per vocabulary token.
# `get_feature_names()` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out()`; prefer the new accessor and fall back to the old
# one so this cell runs on both old and current releases.
# NOTE: `.toarray()` materialises the full dense matrix in memory.
X_train_DTM = pd.DataFrame(
    to_numeric.transform(X_train).toarray(),
    columns=(
        to_numeric.get_feature_names_out()
        if hasattr(to_numeric, 'get_feature_names_out')
        else to_numeric.get_feature_names()
    ),
)
X_train_DTM.head()

Unnamed: 0,00,000,000pes,008704050406,0089,0121,01223585236,0125698789,02,0207,...,zeros,zhong,zindgi,zoe,zogtorius,zoom,zouk,zyada,èn,ú1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
# Document-term matrix for the test split, encoded with the vocabulary learned
# from the training messages (transform only, no re-fit).
# `get_feature_names()` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out()`; prefer the new accessor and fall back to the old
# one so this cell runs on both old and current releases.
X_test_DTM = pd.DataFrame(
    to_numeric.transform(X_test).toarray(),
    columns=(
        to_numeric.get_feature_names_out()
        if hasattr(to_numeric, 'get_feature_names_out')
        else to_numeric.get_feature_names()
    ),
)
# Preview the frame that was just built (the original cell showed the
# training frame here by mistake).
X_test_DTM.head()

Unnamed: 0,00,000,000pes,008704050406,0089,0121,01223585236,0125698789,02,0207,...,zeros,zhong,zindgi,zoe,zogtorius,zoom,zouk,zyada,èn,ú1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Model Building

In [29]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes classifier with default hyperparameters.
nb = MultinomialNB()

In [31]:
# Train the classifier on the training document-term matrix and its labels.
nb.fit(X_train_DTM, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [37]:
# Hard class predictions (0 = ham, 1 = spam) for the held-out messages.
y_pred = nb.predict(X_test_DTM)

In [34]:
from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,recall_score

In [38]:
# Rows are the true classes and columns the predicted classes, ordered
# 0 (ham) then 1 (spam).
confusion_matrix(y_test, y_pred)

array([[1194,    7],
       [  18,  174]], dtype=int64)

In [39]:
# Fraction of test messages whose predicted label matches the true label.
accuracy_score(y_test, y_pred)

0.9820531227566404

In [40]:
# Precision for the positive class (label 1 = spam): of the messages flagged
# as spam, the share that really are spam.
precision_score(y_test, y_pred)

0.9613259668508287

In [41]:
# Recall for the positive class (label 1 = spam): of the real spam messages,
# the share that the model caught.
recall_score(y_test, y_pred)

0.90625

In [49]:
# Keep only the second probability column, i.e. the predicted probability of
# class 1 (spam under the label_num encoding).
y_pred_proba = nb.predict_proba(X_test_DTM)[:, 1]

In [51]:
from sklearn.metrics import roc_auc_score

# Area under the ROC curve, computed from the spam probabilities rather than
# from the hard 0/1 predictions.
roc_auc_score(y_test, y_pred_proba)

0.9922894116014432

# Compare Multiple Models

In [42]:
from sklearn.linear_model import LogisticRegression

In [43]:
# Logistic regression classifier with default hyperparameters.
lr = LogisticRegression()