# SMS Spam Classifier

[SMS Spam Collection Data Set](https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection)

In [94]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed
# in 0.20; `train_test_split` now lives in `sklearn.model_selection`.
# Fall back to the legacy module only on old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

In [95]:
# Load the tab-separated corpus. The file carries no header row, so the two
# columns are named here: the ham/spam label first, the message text second.
df = pd.read_csv('SMSSpamCollection', sep='\t', header=None, names=['Status', 'Message'])
df.head()

Unnamed: 0,Status,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [96]:
# Row count of the whole data set (ham and spam together)
df.shape[0]

5572

In [97]:
# How many rows carry the 'spam' label
int((df["Status"] == 'spam').sum())

747

In [98]:
# Encode the labels in place: ham -> 0, spam -> 1
for label, code in (('ham', 0), ('spam', 1)):
    df.loc[df["Status"] == label, "Status"] = code

In [99]:
# Re-inspect the first rows to confirm the labels are now 0/1
df.head()

Unnamed: 0,Status,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [100]:
# Separate the inputs (message text) from the targets (0/1 label).
# The train/test split itself happens in a later cell.
df_x, df_y = df["Message"], df["Status"]

In [174]:
# Bag-of-words encoder that drops scikit-learn's built-in English stop words.
# Alternative to try: TfidfVectorizer(stop_words='english'), which takes the
# same argument.
vectorizer = CountVectorizer(stop_words='english')

In [175]:
# Hold out 20% of the messages for evaluation; the fixed random_state makes
# the split reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    df_x,
    df_y,
    test_size=0.2,
    random_state=4,
)


In [176]:
# Learn the vocabulary from the training messages only and encode them in the
# same step; `features` is the resulting document-term count matrix.
features = vectorizer.fit_transform(x_train)

In [177]:
# Dense copy of the count matrix, used only for inspection in the next cells
# (one row per training message, one column per vocabulary term).
# The classifier below is fitted on `features`, not on this array.
words_array = features.toarray()
words_array

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [178]:
# Count vector of the first training message
words_array[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [179]:
# Width of the count matrix, i.e. the number of vocabulary terms
words_array.shape[1]

7498

In [180]:
# Recover the vocabulary terms present in the first training message.
# inverse_transform expects a 2-D (n_samples, n_features) input, so pass a
# one-row slice rather than the 1-D row `words_array[0]`, which current
# scikit-learn versions reject.
vectorizer.inverse_transform(words_array[:1])

[array(['checking', 'going', 'got', 'haha', 'lor', 'mails', 'online',
        'replying', 'sleeping', 'spys', 'wat'],
       dtype='<U27')]

In [181]:
# Raw text of the first training message, for comparison with the terms
# recovered by inverse_transform
x_train.iloc[0]

'U sleeping now.. Or you going to take? Haha.. I got spys wat.. Me online checking n replying mails lor..'

In [182]:
# Multinomial naive Bayes classifier, created with its default settings
mnb = MultinomialNB()

In [183]:
# The in-place ham/spam replacement leaves the label column holding generic
# Python objects; give the classifier a proper integer target instead.
y_train = y_train.astype(int)

In [184]:
# Train the classifier on the training count matrix and the integer labels
mnb.fit(features, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [185]:
# Encode the held-out messages with the vocabulary already learned from the
# training set (transform only; the vectorizer is not re-fitted).
test_features = vectorizer.transform(x_test)

In [186]:
# Predicted label for every held-out message
predictions = mnb.predict(test_features)

In [187]:
# Peek at the predicted labels (0 = ham, 1 = spam)
predictions

array([0, 0, 0, ..., 0, 0, 1])

In [188]:
# Ground-truth labels of the held-out messages as a NumPy array.
# Unlike y_train, y_test was never cast to int, so this array comes out with
# dtype=object; the element-wise comparison with `predictions` still works.
actual = np.array(y_test)

In [189]:
# Peek at the ground-truth labels (0 = ham, 1 = spam)
actual

array([0, 0, 0, ..., 0, 0, 1], dtype=object)

In [214]:
# Count the positions where the predicted label equals the true label
num_correct = np.sum(predictions == actual)

In [215]:
# Number of held-out messages classified correctly
num_correct

1096

In [220]:
# Size of the held-out set (one prediction per test message)
total_examples = predictions.shape[0]
total_examples

1115

In [None]:
# Accuracy on the held-out set, as a percentage rounded to two decimals.
# Multiplying by 100.0 first keeps the arithmetic in floating point, so the
# ratio is not truncated to 0 under Python 2 integer-division semantics.
accuracy = 100.0 * num_correct / total_examples
round(accuracy, 2)

In [218]:
# Predict the label of the first held-out message (its text is displayed in
# the next cell). Index into `test_features`, not the training matrix
# `features`, so the prediction and the displayed text refer to the same
# message.
mnb.predict(test_features[0])

array([0])

In [219]:
# Raw text of the first held-out message
x_test.iloc[0]

'somewhere out there beneath the pale moon light someone think in of u some where out there where dreams come true... goodnite &amp; sweet dreams'