## SMS Spam Classifier: Multinomial Naive Bayes

The notebook is divided into the following sections:
1. Importing and preprocessing data
2. Building the model: Multinomial Naive Bayes
    - Model building 
    - Model evaluation

### 1. Importing and Preprocessing Data

In [1]:
import pandas as pd

# reading the training data
docs = pd.read_csv('movie_review_train.csv')
docs.head()

Unnamed: 0,class,text
0,Pos,a common complaint amongst film critics is ...
1,Pos,whew this film oozes energy the kind of b...
2,Pos,steven spielberg s amistad which is bas...
3,Pos,he has spent his entire life in an awful litt...
4,Pos,being that it is a foreign language film with...


In [2]:
# number of SMSes / documents
len(docs)

1600

In [3]:
docs.columns 

Index(['class', 'text'], dtype='object')

In [4]:
ham_spam = docs['class'].value_counts()


In [5]:
#spam pos
#ham neg
ham_spam

Neg    800
Pos    800
Name: class, dtype: int64

In [6]:
print("pos is about {0}%".format(
    round((ham_spam[1]/float(ham_spam[0]+ham_spam[1]))*100), 2))

pos is about 50.0%


In [7]:
# mapping labels to 0 and 1
docs['label'] = docs['class'].map({'Neg':0, 'Pos':1})

In [8]:
docs.head()

Unnamed: 0,class,text,label
0,Pos,a common complaint amongst film critics is ...,1
1,Pos,whew this film oozes energy the kind of b...,1
2,Pos,steven spielberg s amistad which is bas...,1
3,Pos,he has spent his entire life in an awful litt...,1
4,Pos,being that it is a foreign language film with...,1


In [9]:
# we can now drop the column 'Class'
docs = docs.drop('class', axis=1)
docs.head()

Unnamed: 0,text,label
0,a common complaint amongst film critics is ...,1
1,whew this film oozes energy the kind of b...,1
2,steven spielberg s amistad which is bas...,1
3,he has spent his entire life in an awful litt...,1
4,being that it is a foreign language film with...,1


In [10]:
# convert to X and y
X = docs.text
y = docs.label
print(X.shape)
print(y.shape)

(1600,)
(1600,)


In [11]:
# splitting into test and train
from sklearn.model_selection  import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [12]:
X_train.head()

1145     this movie about two dysfunctional families n...
73       felix   sami bouajila     the siege     lives...
446      vampire lore and legend has always been a pop...
399      kevin smith is like a big kid    his humor is...
647      bruce lee was a bigger than life martial arti...
Name: text, dtype: object

In [13]:
y_train.head()

1145    0
73      1
446     1
399     1
647     1
Name: label, dtype: int64

In [14]:
# vectorizing the sentences; removing stop words
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer(stop_words='english')

In [22]:
 vect2 = CountVectorizer(stop_words='english', min_df=.03, max_df=.8)

In [23]:
vect2.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.8, max_features=None, min_df=0.03,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [34]:
len(vect2.vocabulary_)

1624

In [15]:
vect.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [16]:
# printing the vocabulary
vect.vocabulary_

{'movie': 18546,
 'dysfunctional': 8768,
 'families': 10177,
 'really': 22761,
 'gets': 11728,
 'ground': 12343,
 'despite': 7542,
 'good': 12030,
 'performances': 20704,
 'basically': 2466,
 'competent': 5636,
 'cast': 4442,
 'eddie': 8868,
 'sean': 24854,
 'penn': 20645,
 'maureen': 17485,
 'robin': 23922,
 'wright': 31536,
 'happily': 12699,
 'married': 17322,
 'couple': 6336,
 'luck': 16831,
 'living': 16568,
 'rented': 23298,
 'rooms': 24044,
 'seedier': 24921,
 'unnamed': 29896,
 'city': 5089,
 'spend': 26480,
 'little': 16550,
 'income': 14176,
 'local': 16600,
 'bar': 2368,
 'owned': 20137,
 'shorty': 25433,
 'stanton': 26812,
 'best': 2843,
 'friend': 11243,
 'wife': 31212,
 'georgie': 11696,
 'mazar': 17510,
 'share': 25243,
 'odd': 19539,
 'relationship': 23156,
 'marked': 17291,
 'frequent': 11220,
 'disappearances': 7869,
 'return': 23608,
 'promises': 21985,
 'world': 31467,
 'professes': 21915,
 'undying': 29710,
 'love': 16773,
 'manic': 17182,
 'reunions': 23615,
 'soo

In [35]:
# vocab size
len(vect2.vocabulary_)

1624

In [36]:
# transforming the train and test datasets
X_train_transformed = vect2.transform(X_train)
X_test_transformed = vect2.transform(X_test)

In [42]:
# note that the type is transformed (sparse) matrix
print(type(X_train_transformed))

print(X_train_transformed)

<class 'scipy.sparse.csr.csr_matrix'>
  (0, 1)	1
  (0, 67)	1
  (0, 108)	1
  (0, 111)	1
  (0, 116)	1
  (0, 128)	1
  (0, 130)	1
  (0, 168)	1
  (0, 199)	1
  (0, 226)	1
  (0, 235)	1
  (0, 250)	1
  (0, 254)	1
  (0, 286)	1
  (0, 295)	1
  (0, 332)	1
  (0, 343)	1
  (0, 349)	1
  (0, 360)	1
  (0, 428)	14
  (0, 447)	1
  (0, 503)	1
  (0, 518)	2
  (0, 522)	1
  (0, 586)	1
  :	:
  (1199, 1103)	1
  (1199, 1115)	1
  (1199, 1160)	1
  (1199, 1161)	1
  (1199, 1176)	1
  (1199, 1199)	1
  (1199, 1229)	1
  (1199, 1245)	1
  (1199, 1264)	1
  (1199, 1300)	1
  (1199, 1313)	2
  (1199, 1361)	1
  (1199, 1378)	3
  (1199, 1410)	1
  (1199, 1420)	1
  (1199, 1444)	1
  (1199, 1448)	1
  (1199, 1457)	1
  (1199, 1485)	1
  (1199, 1487)	1
  (1199, 1520)	1
  (1199, 1562)	1
  (1199, 1586)	1
  (1199, 1604)	2
  (1199, 1618)	1


### 2. Building and Evaluating the Model

In [None]:
# training the NB model and making predictions
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()

# fit
mnb.fit(X_train_transformed,y_train)

# predict class
y_pred_class = mnb.predict(X_test_transformed)

# predict probabilities
y_pred_proba = mnb.predict_proba(X_test_transformed)


In [None]:
# note that alpha=1 is used by default for smoothing
mnb

### Model Evaluation

In [None]:
# printing the overall accuracy
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)

In [None]:
# confusion matrix
metrics.confusion_matrix(y_test, y_pred_class)
# help(metrics.confusion_matrix)

In [None]:
confusion = metrics.confusion_matrix(y_test, y_pred_class)
print(confusion)
TN = confusion[0, 0]
FP = confusion[0, 1]
FN = confusion[1, 0]
TP = confusion[1, 1]

In [None]:
sensitivity = TP / float(FN + TP)
print("sensitivity",sensitivity)

In [None]:
specificity = TN / float(TN + FP)
print("specificity",specificity)

In [None]:
precision = TP / float(TP + FP)
print("precision",precision)
print(metrics.precision_score(y_test, y_pred_class))

In [None]:
print("precision",precision)
print("PRECISION SCORE :",metrics.precision_score(y_test, y_pred_class))
print("RECALL SCORE :", metrics.recall_score(y_test, y_pred_class))
print("F1 SCORE :",metrics.f1_score(y_test, y_pred_class))

In [None]:
y_pred_class

In [None]:
y_pred_proba

In [None]:
# creating an ROC curve
from sklearn.metrics import confusion_matrix as sk_confusion_matrix
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred_proba[:,1])
roc_auc = auc(false_positive_rate, true_positive_rate)

In [None]:
# area under the curve
print (roc_auc)

In [None]:
# matrix of thresholds, tpr, fpr
pd.DataFrame({'Threshold': thresholds, 
              'TPR': true_positive_rate, 
              'FPR':false_positive_rate
             })

In [None]:
# plotting the ROC curve
%matplotlib inline  
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.title('ROC')
plt.plot(false_positive_rate, true_positive_rate)