In [156]:
import nltk # NLP Library
import re  # regular expression

In [157]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [158]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [159]:
# Location of the SMS Spam Collection data file.
# Source: UCI repository, https://archive.ics.uci.edu/ml/machine-learning-databases/00228/
# NOTE(review): this is an absolute, machine-specific Windows path; point it at
# your own copy of the file before running the notebook.
path_data='C:\\Users\\Ejaz\\Documents\\ML Practise\\ML code\\Jan2020\\Spam Classifier\\smsspamcollection\\SMSSpamCollection'

In [160]:
# Load the tab-separated file. No header row is read from the file: the two
# column names (label, then message text) are supplied explicitly.
# NOTE(review): quoting is left at the pandas default. If any message contains
# an unbalanced double quote, neighbouring lines can be merged into one field;
# consider quoting=csv.QUOTE_NONE and compare the row count with the dataset's
# documentation — TODO confirm.
df = pd.read_csv(
    path_data,
    sep='\t',
    names=['output', 'message'],
)
df.tail(10)

Unnamed: 0,output,message
5562,ham,Ok lor... Sony ericsson salesman... I ask shuh...
5563,ham,Ard 6 like dat lor.
5564,ham,Why don't you wait 'til at least wednesday to ...
5565,ham,Huh y lei...
5566,spam,REMINDER FROM O2: To get 2.50 pounds free call...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...
5571,ham,Rofl. Its true to its name


In [161]:
# (rows, columns) of the loaded frame.
df.shape

(5572, 2)

In [162]:
# Raw text of the message stored under index label 0.
df['message'][0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [163]:
# Dry run of the tokenising step on one message: every non-letter character
# becomes a space, then the text is split on whitespace. Confirms the result
# is a list of tokens.
type(re.sub('[^a-zA-Z]',' ', df['message'][1]).split())

list

In [164]:
# Stemmer used by the cleaning step. Per the author's note, WordNetLemmatizer is
# an alternative but takes more time, and dictionary-valid words are not needed here.
ps=PorterStemmer()

In [165]:
# Data cleaning and processing.
# For each message: replace every non-letter with a space, lower-case, split on
# whitespace, drop English stop words, stem the remaining tokens with `ps`, and
# join them back into one space-separated string.
#
# The stop-word list is loaded once into a set. Previously
# `stopwords.words('english')` was re-evaluated and scanned linearly for every
# single token, which dominated the cell's run time.
english_stopwords = set(stopwords.words('english'))

new_mes = []
# Iterate over the column values directly, so the loop does not depend on the
# frame having a 0..n-1 integer index.
for raw_message in df['message']:
    tokens = re.sub('[^a-zA-Z]', ' ', raw_message).lower().split()
    stemmed = [ps.stem(tok) for tok in tokens if tok not in english_stopwords]
    new_mes.append(' '.join(stemmed))

In [166]:
# One cleaned string is appended per row, so this should equal the row count of df.
len(new_mes)

5572

In [167]:
# First message after cleaning, stop-word removal and stemming.
new_mes[0]

'go jurong point crazi avail bugi n great world la e buffet cine got amor wat'

In [168]:
#Creating bag of words model
# max_features=3000 keeps only the 3000 terms with the highest total count over
# the corpus; the cut is by frequency, not random. (The original note reported a
# full vocabulary of 6296 terms; that figure is not re-derived in this notebook.)
# NOTE(review): the vocabulary is fitted on all messages, including those that
# later land in the test split. Fitting on the training split only would avoid
# leaking test-set vocabulary into the features.

cv=CountVectorizer(max_features=3000)
X=cv.fit_transform(new_mes)

In [169]:
# Check the container type returned by fit_transform.
type(X)

scipy.sparse.csr.csr_matrix

In [170]:
# Summary repr of the feature matrix.
X

<5572x3000 sparse matrix of type '<class 'numpy.int64'>'
	with 41592 stored elements in Compressed Sparse Row format>

In [171]:
# Convert the sparse count matrix to a dense NumPy array.
# The guard keeps this cell re-runnable: once X is already an ndarray it has no
# .toarray() method, so an unguarded second run would raise AttributeError.
if hasattr(X, 'toarray'):
    X = X.toarray()  # csr_matrix -> numpy.ndarray

In [172]:
# After the dense conversion X is expected to be a numpy.ndarray.
type(X)

numpy.ndarray

In [173]:
# Dense count matrix (NumPy abbreviates the display of large arrays).
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [174]:
X.shape # rows = messages (5572), columns = vocabulary terms kept (3000)

(5572, 3000)

In [175]:
# Container type of the label column.
type(df['output'])

pandas.core.series.Series

In [176]:
# One label per message.
df['output'].shape

(5572,)

In [177]:
# Class balance: number of messages per label.
df['output'].value_counts()

ham     4825
spam     747
Name: output, dtype: int64

In [178]:
# One-hot encode the label column and drop the first category, leaving a single
# indicator column named `spam`.
# dtype is pinned to uint8 so the 0/1 values shown below are reproduced on any
# pandas version; newer releases default to bool indicator columns.
y = pd.get_dummies(df['output'], drop_first=True, dtype='uint8')

In [179]:
# First rows of the encoded label frame.
y.head()

Unnamed: 0,spam
0,0
1,0
2,1
3,0
4,0


In [180]:
# Check what kind of object the encoding step produced.
type(y)

pandas.core.frame.DataFrame

In [181]:
# Binary target as a 1-D uint8 NumPy array: 1 where the label is 'spam', 0 otherwise.
# It is built straight from df instead of from the previous value of `y`, so the
# cell can be re-run. `y = y['spam'].values` fails once y is already an array.
y = (df['output'] == 'spam').values.astype('uint8')

In [182]:
# Target vector (NumPy abbreviates the display).
y

array([0, 0, 1, ..., 0, 0, 0], dtype=uint8)

In [183]:
# Should be 1-D with one entry per message.
y.shape

(5572,)

In [184]:
# y is expected to be a plain NumPy array at this point.
type(y)

numpy.ndarray

In [185]:
# Print both shapes side by side; features and labels must have the same
# number of rows.
print('X.shape :- ', X.shape)
print('y.shape :- ', y.shape)

X.shape :-  (5572, 3000)
y.shape :-  (5572,)


In [186]:
# Hold out 30% of the rows for testing; random_state fixes the shuffle so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=10,
)

In [187]:
# Model creation: multinomial Naive Bayes trained on the word-count features
# (the author's choice for this NLP task).
nb = MultinomialNB()
fit_model = nb.fit(X_train, y_train)  # fit() returns the fitted estimator itself


In [188]:
# Predicted labels for the held-out test rows.
y_pred=fit_model.predict(X_test)

In [189]:
# Confusion matrix: rows are the true labels, columns the predicted labels.
conf_mat = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(conf_mat)

[[1431   26]
 [  17  198]]


In [190]:
# Flatten the 2x2 confusion matrix row by row; for labels (0, 1) this yields
# true negatives, false positives, false negatives, true positives in that order.
tn, fp, fn, tp = confusion_matrix(y_true=y_test, y_pred=y_pred).ravel()
print('tn : ', tn)
print('fp : ', fp)
print('fn : ', fn)
print('tp : ', tp)

tn :  1431
fp :  26
fn :  17
tp :  198


In [191]:
# Accuracy: share of test messages whose predicted label equals the true label.
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print('fraction of correctly classified samples : ', acc)

fraction of correctly classified samples :  0.9742822966507177


In [192]:
# normalize=False returns the number of correct predictions instead of the fraction.
# The count gets its own name so that `acc` keeps holding the accuracy fraction
# rather than being silently rebound to a value with a different meaning.
n_correct = accuracy_score(y_test, y_pred, normalize=False)
print('number of correctly classified samples : ', n_correct)

number of correctly classified samples :  1629


In [193]:
precision= precision_score(y_test,y_pred)
recall= recall_score(y_test,y_pred)
f1_score= f1_score(y_test,y_pred)

In [194]:
print('precision : ',precision )
print('recall : ',recall )
print('f1_score : ',f1_score )

precision :  0.8839285714285714
recall :  0.9209302325581395
f1_score :  0.9020501138952165
