<a href="https://colab.research.google.com/github/cychen116/DataAnalysis_practice/blob/main/18_ML_NLP_SVM_spamPred.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Practice homework:

Spam dataset: 
https://www.kaggle.com/uciml/sms-spam-collection-dataset

### Features: 
* Number of words
* "bag of words" --
** Split text into words, remove punctuation, lower case ✓
** Create a matrix where every word is a column ✓
** Values: word count in a message or 0 ✓

** Think about feature selection -- using coorrelation or P-value or chi-squarred or??? ✓

** Train and test an SVM; do a few iterations to refine feature selection ✓

In [1]:
#Data Basic
import pandas as pd
import numpy as np
# Visualization
import matplotlib.pyplot as plt 
import seaborn as sns
sns.set_theme(style="white")

In [2]:
from google.colab import drive
drive.mount("/content/drive")
data_dir = "/content/drive/MyDrive/Colab Notebooks/EMSE 6575/Data"

Mounted at /content/drive


# Loading Data

In [3]:
df = pd.read_excel(data_dir + "/spam_resaved.xlsx")
df = pd.read_excel(data_dir + "/spam_resaved.xlsx")
df = df[['v1', 'v2']]
df.columns = ['category', 'text']
df['doc_id'] = df.index
df['doc_id'] = df['doc_id'].apply(lambda x: "doc_" + str(x))
print(df['category'].value_counts())
df.head()

ham     4825
spam     747
Name: category, dtype: int64


Unnamed: 0,category,text,doc_id
0,ham,"Go until jurong point, crazy.. Available only ...",doc_0
1,ham,Ok lar... Joking wif u oni...,doc_1
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,doc_2
3,ham,U dun say so early hor... U c already then say...,doc_3
4,ham,"Nah I don't think he goes to usf, he lives aro...",doc_4


# Data Cleaning & Tokenize

In [4]:
import nltk
import re
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [5]:
# Minimal example
print(df['text'][0])
tokens = str(df['text'][0]).split() # split text
tokens = [token.lower() for token in tokens] # lower case
tokens = [re.sub(r'[^\w\s]','',token) for token in tokens] #Return the string obtained by replacing the leftmost non-overlapping occurrences of pattern in string by the replacement
tokens = [token for token in tokens if len(token) >= 3] 
tokens = [token for token in tokens if not token in stopwords.words()] #remove stop words
tokens

Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...


['jurong',
 'point',
 'crazy',
 'available',
 'bugis',
 'great',
 'world',
 'buffet',
 'got',
 'amore']

In [6]:
token_dfs = []
for i in range(0, len(df)):
  # split the text into tokens
  tokens = str(df['text'][i]).split()
  # drop case
  tokens = [token.lower() for token in tokens]
  
  # create a tidy structure for the parsed and cleaned tokens
  temp = pd.DataFrame({'token': tokens})
  
  # append  unique document id and text category
  temp['doc_id'] = df['doc_id'][i]
  temp['category'] = df['category'][i]

  # collect the temp token dataframes
  token_dfs.append(temp)

token_df = pd.concat(token_dfs)
print("Initial token count: " + str(len(token_df)))

#remove stop words
token_df = token_df[token_df['token'].isin(stopwords.words()) == False]
print("Drop stop words: " + str(len(token_df)))

# drop puncuation
token_df['token'] = token_df['token'].apply(lambda x: re.sub(r'[^\w\s]','',x))  

#remove rare words
rare_words = token_df['token'].value_counts().reset_index()
rare_words.columns = ['term', 'count']         
rare_words = rare_words['term'][rare_words['count'] <= 4].tolist()
token_df = token_df[token_df['token'].isin(rare_words) == False]
print("Drop rare words: " + str(len(token_df)))

#remove short words
token_df = token_df[token_df['token'].apply(len) >= 3]
print("Drop short words: " + str(len(token_df)))

token_df.head()

Initial token count: 86333
Drop stop words: 51728
Drop rare words: 39999
Drop short words: 33900


Unnamed: 0,token,doc_id,category
3,point,doc_0,ham
4,crazy,doc_0,ham
5,available,doc_0,ham
8,bugis,doc_0,ham
10,great,doc_0,ham


# Term Frequency Table

In [7]:
token_df['token'][df['category'] == 'spam'].value_counts().head(20)

call      198
get       128
know      107
like       82
got        81
ltgt       79
send       76
free       76
now        73
text       64
going      64
time       62
you        62
need       61
mobile     60
good       55
lor        54
home       53
day        52
back       52
Name: token, dtype: int64

In [8]:
token_df['token'][df['category'] == 'ham'].value_counts().head(20)

call     378
get      258
free     199
ltgt     197
good     179
like     160
now      158
got      157
day      150
know     150
love     147
ill      146
time     146
you      145
sorry    130
text     124
stop     118
txt      117
send     114
home     109
Name: token, dtype: int64

In [9]:
# drop common words seen in each category
common_words = ['call', 'get', 'free', 'ltgt', 'text', 'you', 'got', 'like',
                'send', 'now', 'txt', 'dont']
token_df = token_df[token_df['token'].isin(common_words) == False]
print("Drop common words across data categories: " + str(len(token_df)))

Drop common words across data categories: 30783


In [10]:
token_df['token'][df['category'] == 'spam'].value_counts().head(20)

know      107
going      64
time       62
need       61
mobile     60
good       55
lor        54
home       53
back       52
day        52
ill        51
see        50
still      49
love       48
today      47
week       45
right      45
make       43
way        43
phone      42
Name: token, dtype: int64

In [11]:
term_freq = (token_df
              .groupby(['doc_id', 'token'], as_index = False)
              .count())
term_freq.columns = ['doc_id', 'term', 'term_count']
term_freq.sort_values(by=['term_count'], ascending=False)

Unnamed: 0,doc_id,term,term_count
11892,doc_3015,happy,15
18059,doc_409,missing,6
854,doc_1139,missing,6
11888,doc_3015,day,6
3724,doc_1621,simple,5
...,...,...,...
10034,doc_2700,frnds,1
10033,doc_2700,feb,1
10031,doc_2700,day,1
10030,doc_2700,comes,1


In [12]:
df[df['doc_id'] == 'doc_409']

Unnamed: 0,category,text,doc_id
409,ham,Message:some text missing* Sender:Name Missing...,doc_409


#  Reshape TF table to the Document Term Matrix

In [13]:
doc_list = term_freq['doc_id'].unique().tolist()
doc_list = doc_list[0:3] # minimal example to verify code is working

reshape_dfs =[]
for doc in doc_list:
  temp = term_freq[['term_count']][term_freq['doc_id'] == doc].T
  temp.columns = term_freq['term'][term_freq['doc_id'] == doc].tolist()
  temp.index = [doc]
  reshape_dfs.append(temp)

doc_term_mat = pd.concat(reshape_dfs)
doc_term_mat = doc_term_mat.fillna(0)
doc_term_mat

Unnamed: 0,available,bugis,crazy,great,point,wat,world,joking,lar,wif,anymore,enough,gonna,home,ive,soon,stuff,talk,today,tonight
doc_0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
doc_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
doc_10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [14]:
import time
from datetime import timedelta

doc_list = term_freq['doc_id'].unique().tolist()
reshape_dfs =[]
counter = 0
for doc in doc_list:
  counter += 1
  if counter % 500 == 0:
    print(str(counter) + " complete out of " + str(len(doc_list)) + " documents")
  temp = term_freq[['term_count']][term_freq['doc_id'] == doc].T
  temp.columns = term_freq['term'][term_freq['doc_id'] == doc].tolist()
  temp.index = [doc]
  reshape_dfs.append(temp)

print("\n merging stuff")
start_time = time.time()
doc_term_mat = pd.concat(reshape_dfs)
doc_term_mat = doc_term_mat.fillna(0)

print("--- %s time elapsed ---" % str(timedelta(seconds=time.time() - start_time)))
print(doc_term_mat.shape)

500 complete out of 5386 documents
1000 complete out of 5386 documents
1500 complete out of 5386 documents
2000 complete out of 5386 documents
2500 complete out of 5386 documents
3000 complete out of 5386 documents
3500 complete out of 5386 documents
4000 complete out of 5386 documents
4500 complete out of 5386 documents
5000 complete out of 5386 documents

 merging stuff
--- 0:03:33.578274 time elapsed ---
(5386, 1591)


# Term Selection for Spam Prediction

## Remove highly correlated features

In [15]:
doc_term_mat2 = doc_term_mat
doc_term_mat2['doc_id'] = doc_term_mat2.index
doc_term_mat2 = doc_term_mat2.merge(df[['doc_id', 'category']], how = 'left', on = 'doc_id')

In [16]:
def dichotomize_cat(txt):
  x = 0
  if txt == "spam":
    x = 1
  return x

doc_term_mat2['category'] = doc_term_mat2['category'].apply(lambda x: dichotomize_cat(x))
doc_term_mat2['category'].value_counts()

0    4643
1     743
Name: category, dtype: int64

In [17]:
cols = list(doc_term_mat2)
cols.insert(0, cols.pop(cols.index('doc_id')))
cols.insert(1, cols.pop(cols.index('category')))
doc_term_mat2 = doc_term_mat2[cols]

## Keep only significant variables

In [18]:
Y = doc_term_mat2.loc[:, 'category']  # all rows of 'diagnosis' 
X = doc_term_mat2.drop(['doc_id', 'category'], axis=1)

from sklearn.model_selection import train_test_split as tts
X.insert(loc=len(X.columns), column='intercept', value=1) #### column of 1's
X_train, X_test, y_train, y_test = tts(X, Y, test_size=0.2, random_state=42)

In [19]:
import statsmodels.api as sm
regression_ols = sm.OLS(y_train, X_train).fit()
pvalues = regression_ols.pvalues 
sig_vars = []
for i in range(0, len(pvalues)):
  if pvalues[i] <= 0.5:
    sig_vars.append(pvalues.index[i])
print(len(sig_vars))
sig_vars[0:20]

  import pandas.util.testing as tm


850


['crazy',
 'wat',
 'lar',
 'enough',
 'gonna',
 'home',
 'ive',
 'talk',
 'today',
 'long',
 'pick',
 'price',
 'heard',
 'tat',
 'cash',
 'guaranteed',
 'please',
 'prize',
 'representative',
 'service']

In [20]:
X_train = X_train[sig_vars]
X_test = X_test[sig_vars]

regression_ols = sm.OLS(y_train, X_train).fit()
pvalues = regression_ols.pvalues 
sig_vars = []
for i in range(0, len(pvalues)):
  if pvalues[i] <= 0.1:
    sig_vars.append(pvalues.index[i])
print(len(sig_vars))
sig_vars[0:20]

480


['crazy',
 'enough',
 'gonna',
 'today',
 'pick',
 'price',
 'heard',
 'cash',
 'guaranteed',
 'please',
 'prize',
 'representative',
 'service',
 'â5000',
 'told',
 'give',
 '08000839402',
 'mobileupd8',
 'nokia',
 'orange']

In [21]:
reg_strength = 1000

### Formula in Slide 8
def compute_cost(W, X, Y):
  N = X.shape[0]
  distances = 1 - Y * (np.dot(X, W))
  distances[distances < 0] = 0
  hinge_loss = reg_strength * (np.sum(distances) / N)
  cost = 1 / 2 * np.dot(W, W) + hinge_loss
  return(cost)

def calculate_cost_gradient(W, X_batch, Y_batch):
    if type(Y_batch) != np.array: #float64:
        Y_batch = np.array([Y_batch])
        X_batch = np.array([X_batch])

    distance = 1 - (Y_batch * np.dot(X_batch, W))
#    if type(distance)== np.float64:
#      distance = np.array([distance])
    dw = np.zeros(len(W))

    #### for every dimension, apply formula in Slide 9
    for ind, d in enumerate(distance):
        if max(0, d) == 0:
            di = W
        else:
            di = W - (reg_strength * Y_batch[ind] * X_batch[ind])
        dw += di  
    dw = dw/len(Y_batch)  # average
    return dw

from sklearn.utils import shuffle
def sgd(features, outputs, max_epochs=5000, learning_rate=0.05,cost_threshold = 0.01):
  weights = np.zeros(features.shape[1])
  prev_cost = float("inf")
  nth = 0
  for epoch in range(1, max_epochs):
    X, Y = shuffle(features, outputs)
    for ind, x in enumerate(X):  #### itereate through the dimensions
      ascent = calculate_cost_gradient(weights, x, Y[ind])
      weights = weights - (learning_rate * ascent)

    if epoch == 2 ** nth or epoch == max_epochs - 1:
      cost = compute_cost(weights, features, outputs)
      print("Epoch is:{} and Cost is: {}".format(epoch, cost))

      #### If diffrence in the cost from last ^2 iterations to now is < 1%: get out
      if abs(prev_cost - cost) < cost_threshold * prev_cost:
            return weights
      prev_cost = cost
      nth += 1
  return weights

In [22]:
import numpy as np
X_train = X_train[sig_vars]
X_test = X_test[sig_vars]

W = sgd(X_train.to_numpy(), y_train.to_numpy())

from sklearn.metrics import accuracy_score, recall_score, precision_score 
def test():
  y_test_predicted = np.array([])
  for i in range(X_test.shape[0]):
      yp = np.sign(np.dot(W, X_test.to_numpy()[i])) #model
      y_test_predicted = np.append(y_test_predicted, yp)
  print("accuracy on test dataset: {}".format(accuracy_score(y_test.to_numpy(), y_test_predicted)))
  print("recall on test dataset: {}".format(recall_score(y_test.to_numpy(), y_test_predicted)))
  print("precision on test dataset: {}".format(precision_score(y_test.to_numpy(), y_test_predicted)))

test()

Epoch is:1 and Cost is: 916.9126120106954
Epoch is:2 and Cost is: 3508.934350288637
Epoch is:4 and Cost is: 11059.891259241971
Epoch is:8 and Cost is: 979.4990235813021
Epoch is:16 and Cost is: 3133.5356435968833
Epoch is:32 and Cost is: 880.8057349166486
Epoch is:64 and Cost is: 1962.9599267581166
Epoch is:128 and Cost is: 879.2176581074859
Epoch is:256 and Cost is: 1506.7658122262671
Epoch is:512 and Cost is: 868.185276870133
Epoch is:1024 and Cost is: 865.6314050822336
accuracy on test dataset: 0.14285714285714285
recall on test dataset: 1.0
precision on test dataset: 0.14285714285714285


Precision of the model is very low. Let's try sklearn.

# Sklearn Model

In [24]:
from sklearn.model_selection import cross_val_score, cross_val_predict, KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from statistics import mean
from numpy import std


In [25]:
def eval_model(model, print_text, X, y):
  cv = KFold(n_splits=10) 
  accuracy = cross_val_score(model, X, y, scoring = 'accuracy', cv = cv)
  accuracy = list(accuracy)
  f1_scores = cross_val_score(model, X, y, scoring = 'f1', cv = cv)
  f1_scores = list(f1_scores)
  precision_scores = cross_val_score(model, X, y, scoring = 'precision', cv = cv)
  precision_scores = list(precision_scores)
  recall_scores = cross_val_score(model, X, y, scoring = 'recall', cv = cv)
  recall_scores = list(recall_scores)

  print(print_text)
  print('accuracy score: ' + str(mean(accuracy))[0:5] + " +/- " + str(std(accuracy))[0:5])
  print('f1 score: ' + str(mean(f1_scores))[0:5] + " +/- " + str(std(f1_scores))[0:5])
  print('precision: ' + str(mean(precision_scores))[0:5] + " +/- " + str(std(precision_scores))[0:5])
  print('recall: '+ str(mean(recall_scores))[0:5] + " +/- " + str(std(recall_scores))[0:5] + "\n")

In [26]:
eval_model(model = GaussianNB(), X = X_train, y = y_train, print_text="Naive Bayes")
eval_model(model = SVC(), X = X_train, y = y_train, print_text="Support Vector Machine")
eval_model(model = RandomForestClassifier(), X = X_train, y = y_train, print_text="Random Forest")
eval_model(model = KNeighborsClassifier(), X = X_train, y = y_train, print_text="KNN")

Naive Bayes
accuracy score: 0.965 +/- 0.006
f1 score: 0.876 +/- 0.024
precision: 0.851 +/- 0.036
recall: 0.905 +/- 0.044

Support Vector Machine
accuracy score: 0.972 +/- 0.007
f1 score: 0.896 +/- 0.029
precision: 0.923 +/- 0.039
recall: 0.871 +/- 0.039

Random Forest
accuracy score: 0.971 +/- 0.006
f1 score: 0.882 +/- 0.028
precision: 0.956 +/- 0.026
recall: 0.824 +/- 0.054

KNN
accuracy score: 0.918 +/- 0.009
f1 score: 0.576 +/- 0.055
precision: 0.996 +/- 0.012
recall: 0.408 +/- 0.054



The model performance is much better.

In [27]:
def remove_correlated_features(X, corr_threshold=0.9):
  corr = X.corr()
  drop_columns = np.full(corr.shape[0], False, dtype=bool)
  for i in range(corr.shape[0]):
      for j in range(i + 1, corr.shape[0]):
          if corr.iloc[i, j] >= corr_threshold:
              drop_columns[j] = True
  columns_dropped = X.columns[drop_columns]
  print("dropping",columns_dropped)
  X_drop = X.drop(columns_dropped, axis=1)
  return X_drop

In [28]:
X = doc_term_mat2.drop(['doc_id', 'category'], axis=1)
X_drop = remove_correlated_features(X, corr_threshold=0.9)

dropping Index(['norm150ptone', 'w45wq', 'callertune', 'onto', 'cashin', 'maximize',
       'identifier', 'statement', 'unredeemed', 'quoting', 'standard', 'voda',
       '1327', '20p', '5wb', 'cr9', 'croydon', 'yan', '800', 'specialcall',
       'ufind', 'inviting', 'sae', 'costa', 'costâ150pm', 'max10mins',
       'pobox334', 'sk38xh', 'sol', 'stockport', 'toclaim', 'â300', 'ip4',
       'latr', 'minuts', 'sed', 'murderer', 'java', 'noline', 'rentl',
       'videochat', 'videophones', 'camcorder', 'wer'],
      dtype='object')


In [29]:
import statsmodels.api as sm
def remove_less_significant_features(X, Y, sl=0.05):
  regression_ols = None
  columns_dropped = np.array([])
  #for itr in range(0, len(X.columns)):
  for itr in range(0, 60):
    if itr % 5 == 0:
      print("Iteration number " + str(itr))
    regression_ols = sm.OLS(Y, X).fit()
    max_col = regression_ols.pvalues.idxmax()
    max_val = regression_ols.pvalues.max()
    if max_val > sl:
        X.drop(max_col, axis='columns', inplace=True)
        columns_dropped = np.append(columns_dropped, [max_col])
    else:
        break
  regression_ols.summary()
  return columns_dropped

In [30]:
start_time = time.time()
cols_dropped =  remove_less_significant_features(X_train,y_train, sl=0.05)
print("--- %s time elapsed ---" % str(timedelta(seconds=time.time() - start_time)))

Iteration number 0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Iteration number 5
Iteration number 10
Iteration number 15
Iteration number 20
Iteration number 25
Iteration number 30
Iteration number 35
Iteration number 40
Iteration number 45
Iteration number 50
Iteration number 55
--- 0:00:36.703276 time elapsed ---


In [31]:
cols_dropped

array(['lover', 'vry', 'crazy', 'gal', 'totally', 'game', 'keep',
       'created', 'gift', 'god', 'ntt', 'bill', 'theatre', 'gonna',
       'midnight', 'fighting', 'give', 'confirm', 'stay', 'unable',
       'empty', 'without', 'cuz', 'hand', 'unlimited', 'sign', 'sport',
       'wins', 'rooms', 'congratulations', 'news', 'thank', 'horny',
       'girls', 'india', 'reason', 'light', 'wherever', 'problems',
       'intercept', 'mind', 'giving', 'rply', 'eerie', 'cancel', 'told',
       'wer', 'move', 'gives', 'ill', 'version', 'calls', 'class', 'five',
       '86688', 'cool', 'yrs', 'thats', 'looking', 'google'], dtype='<U32')