# ML models with textual data

In [7]:
import pandas as pd
import string
import re
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
#nltk.download('omw-1.4')

from sklearn.model_selection import train_test_split

import pickle

## Load data

In [8]:
url = 'https://raw.githubusercontent.com/mohitgupta-omg/Kaggle-SMS-Spam-Collection-Dataset-/master/spam.csv'

In [9]:
data = pd.read_csv(url, encoding='latin1', usecols=['v1','v2'])
data.columns = ['label','raw_text']
data.head()

Unnamed: 0,label,raw_text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## Treat target

In [10]:
data['label'] = data.label.replace({'spam':1,'ham':0})
data.head()

Unnamed: 0,label,raw_text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [11]:
data.label.value_counts(normalize=True)

0    0.865937
1    0.134063
Name: label, dtype: float64

## Preprocess features

In [12]:
def _remove_punctuation(text):
    punctuationfree="".join([i for i in text if i not in string.punctuation])
    return punctuationfree

def _remove_number(string):
    return(re.sub(r'[0-9]*','',string))

def _tokenization(text):
    tokens = re.split(' ',text)
    return tokens

def _remove_stopwords(text):
    stopwords = nltk.corpus.stopwords.words('english')
    output= [i for i in text if i not in stopwords]
    return output

def _lemmatizer(text):
    wordnet_lemmatizer = WordNetLemmatizer()
    lemm_text = [wordnet_lemmatizer.lemmatize(word) for word in text]
    return lemm_text

def preprocess(data):

    data['text'] = data.raw_text
    data['text'] = data.text.apply(_remove_punctuation)
    data['text']= data['text'].apply(lambda x: x.lower())
    data['text'] = data.text.apply(lambda x: _remove_number(x))
    data['text'] = data.text.apply(_tokenization)
    data['text'] = data.text.apply(_remove_stopwords)
    data['text'] = data['text'].apply(lambda x:_lemmatizer(x))
    data['text'] = data['text'].apply(lambda x: ' '.join(x))
    data = data.drop(columns=['raw_text'])
    
    return(data)

data = preprocess(data)
data.head()

Unnamed: 0,label,text
0,0,go jurong point crazy available bugis n great ...
1,0,ok lar joking wif u oni
2,1,free entry wkly comp win fa cup final tkts st...
3,0,u dun say early hor u c already say
4,0,nah dont think go usf life around though


In [13]:
data.shape

(5572, 2)

## Split train and test data

In [14]:
Xtrn, Xtst, ytrn, ytst = train_test_split(data['text'], data['label'], test_size=500, random_state=123)
Xtrn.shape, Xtst.shape, ytrn.shape, ytst.shape

((5072,), (500,), (5072,), (500,))

## Create DTM

In [15]:
vectorizer = TfidfVectorizer()
Xtrn = vectorizer.fit_transform(Xtrn).toarray()
Xtst = vectorizer.transform(Xtst).toarray()

obs.: `toarray` is required by sklearn because it doesn't know how to work with sparse arrays.

In [16]:
Xtrn.shape, Xtst.shape

((5072, 7520), (500, 7520))

## Fit models

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBRFClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [18]:
models = {'lr': LogisticRegression(), 
          'gnb':GaussianNB(),
          'tree': DecisionTreeClassifier(),
          'rf':RandomForestClassifier(),
          'xgb': XGBRFClassifier(),
          'svm': SVC(),
          'knn': KNeighborsClassifier()
         }

performance = pd.DataFrame([], index=models.keys(), columns=['accuracy','precision','recall','F1'])
performance

Unnamed: 0,accuracy,precision,recall,F1
lr,,,,
gnb,,,,
tree,,,,
rf,,,,
xgb,,,,
svm,,,,
knn,,,,


Warning: The next cell takes ~11 minutes to run

In [19]:
%%time
for name, model in models.items():
    
    print('Fitting ' + name)
    
    #Fit model
    models[name] = model.fit(Xtrn, ytrn)
    yhat = model.predict(Xtst)
    
    #Calculate metrics
    accuracy = accuracy_score(ytst, yhat)
    precision = precision_score(ytst, yhat)
    recall = recall_score(ytst, yhat)
    f1 = f1_score(ytst, yhat)
    
    #Fill performance dataframe
    performance.loc[name,:] = accuracy, precision, recall, f1

#Order performance dataframe and show
performance = performance.sort_values('F1', ascending=False)
performance

Fitting lr
Fitting gnb
Fitting tree
Fitting rf
Fitting xgb
Fitting svm
Fitting knn
CPU times: total: 6min 55s
Wall time: 3min 40s


Unnamed: 0,accuracy,precision,recall,F1
svm,0.978,1.0,0.825397,0.904348
rf,0.97,1.0,0.761905,0.864865
tree,0.956,0.859649,0.777778,0.816667
lr,0.958,0.977273,0.68254,0.803738
xgb,0.948,0.911111,0.650794,0.759259
gnb,0.884,0.524752,0.84127,0.646341
knn,0.93,1.0,0.444444,0.615385


## Get best model and save it

In [20]:
best_model_name = performance.index[0]
best_model = models[best_model_name]
best_model

In [21]:
with open('spam_classifier.pkl','wb') as f:
    pickle.dump(best_model, f)