## 2. Model Train And Test

### 2.1 Train and Test on raw data after removing stopwords and using Tf-Idf but No Extra Feature

In [1]:
import sklearn
import pandas as pd
from IPython.display import display
from nltk.corpus import stopwords
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [2]:
smsData = pd.read_csv('C:\\DataSets\\SmsSpam\\TRAIN_SMS.csv',encoding = "ISO-8859-1")

#### Remove Stop Words and Encode Labels

In [3]:
stopList = list(stopwords.words('english')) # Stopwords
cleaned = []
for i in range(len(smsData['Message'])):
    clean = smsData['Message'][i]
    clean = clean.lower().split()
    clean = [word for word in clean if word not in stopList]
    clean = ' '.join(str(w) for w in clean)
    #print(clean)
    cleaned.append(clean)

intLabel = smsData['Label'].copy()
i=0
for l in intLabel:
    if l == 'ham':
        intLabel[i]=0
    elif l == 'spam':
        intLabel[i]=1
    else:
        intLabel[i]=2
    i=i+1

In [4]:
smsData.insert(loc=2, column="Cleaned", value=cleaned)
smsData.insert(loc=0, column="IntLabel", value=intLabel)
smsData.head()

Unnamed: 0,IntLabel,Label,Message,Cleaned
0,0,ham,oh how abt 2 days before Christmas,oh abt 2 days christmas
1,2,info,"Welcome to OVATION HOLD R.No. 184, 114, 395, 3...","welcome ovation hold r.no. 184, 114, 395, 378 ..."
2,2,info,Thank you for using your ICICI bank CREDITcard...,thank using icici bank creditcard ending 5253 ...
3,0,ham,schedule a meeting with the entire team in the...,schedule meeting entire team office tomorrow
4,0,ham,Tommy is my brother,tommy brother


In [5]:
vectorizer = TfidfVectorizer("english")
X = vectorizer.fit_transform(smsData['Cleaned'])
#print(X)
Y = smsData['IntLabel'].values.astype('int')
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,stratify=Y,test_size=.4) #stratify to ensure proportion of classes

In [6]:
from sklearn.svm import LinearSVC
classifier = OneVsRestClassifier(LinearSVC(max_iter=3000))
classifier.fit(X_train, Y_train)

OneVsRestClassifier(estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=3000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
          n_jobs=None)

In [7]:
from sklearn.metrics import accuracy_score,f1_score,confusion_matrix,classification_report
predicted = classifier.predict(X_test)
print ("Accuracy Score : ",accuracy_score(Y_test, predicted))
print ("F1 Score : ",accuracy_score(Y_test, predicted))
print ("Confusion Matrix : \n",confusion_matrix(Y_test, predicted))
print ('\nClasification report:\n', classification_report(Y_test, predicted))

Accuracy Score :  0.9924166666666666
F1 Score :  0.9924166666666666
Confusion Matrix : 
 [[3985   11    4]
 [  76 2574    0]
 [   0    0 5350]]

Clasification report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99      4000
           1       1.00      0.97      0.98      2650
           2       1.00      1.00      1.00      5350

   micro avg       0.99      0.99      0.99     12000
   macro avg       0.99      0.99      0.99     12000
weighted avg       0.99      0.99      0.99     12000



### Above results are just by using raw data with little cleaning, it would be nice to see if extra features like message length etc., analyzed in data exploration phase help in improving our result

### 2.2 Including Message Length As Feature

In [11]:
#length of message in its original form, not the cleaned one
import numpy as np
from scipy.sparse import csr_matrix,hstack,csr_matrix
smsData['length'] = smsData['Message'].map(lambda msg: len(msg))
smsData['msgTfIdf']=list(X)
metadata = csr_matrix(smsData['length'].values).T
NewData = hstack([X, metadata]).tocsr()

X_train, X_test, Y_train, Y_test = train_test_split(NewData,Y,stratify=Y,test_size=.4) #stratify to ensure proportion of classes
classifier = OneVsRestClassifier(LinearSVC(max_iter=3000))
classifier.fit(X_train, Y_train)
predicted = classifier.predict(X_test)
print ("Accuracy Score : ",accuracy_score(Y_test, predicted))
print ("F1 Score : ",accuracy_score(Y_test, predicted))
print ("Confusion Matrix : \n",confusion_matrix(Y_test, predicted))
print ('\nClasification report:\n', classification_report(Y_test, predicted))



Accuracy Score :  0.9961666666666666
F1 Score :  0.9961666666666666
Confusion Matrix : 
 [[3985   15    0]
 [  29 2619    2]
 [   0    0 5350]]

Clasification report:
               precision    recall  f1-score   support

           0       0.99      1.00      0.99      4000
           1       0.99      0.99      0.99      2650
           2       1.00      1.00      1.00      5350

   micro avg       1.00      1.00      1.00     12000
   macro avg       1.00      0.99      1.00     12000
weighted avg       1.00      1.00      1.00     12000



### As we see there is an increase in accuracy by .4 % by including message length in the data.
### We might need to scale the features to address the convergence problem as SVM tends to be sensitive to features scaling
#### We may hope to expect further increase in accuracy by inculing other features that we discovered in data exploration phase