#### Spam Detection

Data Source: http://archive.ics.uci.edu/ml/machine-learning-databases/00228/<br/>
Application: https://www.youtube.com/watch?v=RZYjsw6P4nI, https://www.youtube.com/watch?v=bPYJi1E9xeM

In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [2]:
df = pd.read_csv('Data\SMSSpamCollection.csv', sep = '\t', names = ['Status', 'Message'])
df.head()

Unnamed: 0,Status,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
len(df)

5572

In [4]:
# Instead of operating on entire df we take only 20 rows so that we can easily visualize
df = df.head(200)

In [5]:
print("No. of non-spam messages = ", len(df[df.Status == 'ham']))
print("No. of spam messages = ", len(df[df.Status == 'spam']))

No. of non-spam messages =  167
No. of spam messages =  33


In [6]:
df_x = df["Message"]
df_y = df["Status"]
print(df_x.head())
print()
print(df_y.head())

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: Message, dtype: object

0     ham
1     ham
2    spam
3     ham
4     ham
Name: Status, dtype: object


In [7]:
# We've two tasks to perform
# 1) vectorization of df_x text data (as we can't use the text as it is)
# 2) transform df_y to numerics

In [8]:
# Step 1) vectorization of df_x text data (as we can't use the text as it is)

In [9]:
cv = TfidfVectorizer()

cv_x = cv.fit_transform(df_x)
print(cv.get_feature_names())

['000', '03', '07046744435', '07732584351', '07742676969', '0800', '08000930705', '08002986030', '0808', '08452810075over18', '08700621170150p', '08712300220', '0871277810810', '0871277810910p', '08719180248', '09061209465', '09061701461', '09061701939', '09064012160', '09064019788', '09066364589', '10', '100', '1000', '10am', '11', '11pm', '12', '12hrs', '145', '1500', '150p', '150pm', '150ppm', '16', '169', '18', '1pm', '20', '2000', '2004', '2005', '21st', '2nd', '30th', '31p', '32', '350', '3aj', '4041', '434', '4403ldnw1a7rw18', '450ppw', '45239', '4742', '4the', '4txt', '50', '5000', '5249', '530', '5we', '6031', '62468', '69888', '69988', '6days', '7548', '786', '81010', '82277', '85069', '87077', '87121', '87575', '8am', '900', '92h', '9am', '9pm', 'aaooooright', 'abiola', 'able', 'about', 'abt', 'ac', 'accomodate', 'accomodations', 'account', 'aco', 'actin', 'activities', 'address', 'advise', 'aft', 'after', 'afternoon', 'again', 'age', 'ah', 'ahead', 'ahhh', 'aids', 'aight', 

In [10]:
cv_x = cv_x.toarray()
print(cv_x[0])

[0. 0. 0. ... 0. 0. 0.]


In [11]:
# Verify first row
print(cv.inverse_transform(cv_x[0]))

[array(['amore', 'available', 'buffet', 'bugis', 'cine', 'crazy', 'go',
       'got', 'great', 'in', 'jurong', 'la', 'only', 'point', 'there',
       'until', 'wat', 'world'], dtype='<U18')]


In [12]:
# 2) transform df_y to numerics

In [13]:
# The response variable is string and we need to transform to numeric form
df.loc[df['Status'] == 'ham', 'Status'] = 0
df.loc[df['Status'] == 'spam', 'Status'] = 1
df.head()

Unnamed: 0,Status,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [14]:
# We notice that the response variable 'Status' is of 'Object' type so we need to convert to 'int'
df['Status'].head()

0    0
1    0
2    1
3    0
4    0
Name: Status, dtype: object

In [15]:
df['Status'] = df['Status'].astype(int)
df['Status'].head()

0    0
1    0
2    1
3    0
4    0
Name: Status, dtype: int32

In [16]:
df_x = df["Message"]
df_y = df["Status"]

In [17]:
df_x.head()

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: Message, dtype: object

In [18]:
# Our transformed features are now
cv_x

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [19]:
df_y.head()

0    0
1    0
2    1
3    0
4    0
Name: Status, dtype: int32

In [20]:
# So our original problem with df['Message', 'Status'] is transformed to cv_x, df_y

In [21]:
# Split data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(cv_x, df_y, test_size = 0.2)

In [22]:
mnb = MultinomialNB()

In [23]:
mnb.fit(x_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [24]:
# We predict using transformed vectors
pred = mnb.predict(x_test)
pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [25]:
actual = np.array(y_test)
actual

array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0])

In [26]:
print(len(x_train), len(x_test), len(y_train), len(y_test), len(pred))

160 40 160 40 40


In [27]:
# Let us calculate accuracy
count = 0
for i in range(len(pred)):
    if (pred[i] == actual[i]):
        count = count + 1
count

35

In [28]:
len(pred)

40

In [29]:
accuracy = count / len(pred) * 100
accuracy

87.5