In [1]:
%autosave 0

Autosave disabled


In [2]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

from env import get_connection
from prepare import clean, lemmatize

In [3]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/edwige/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/edwige/nltk_data...


True

In [4]:
url = get_connection('spam_db')

query = 'SELECT * FROM spam'

df = pd.read_sql(query, url, index_col='id')

df.head()

Unnamed: 0_level_0,label,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
df['clean_text'] = df.text.apply(clean, args=['us'])
df.head()

Unnamed: 0_level_0,label,text,clean_text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,U dun say so early hor... U c already then say...,dun say early hor c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah ' think goes usf lives around though


In [6]:
df['lemmas'] = df.clean_text.apply(lemmatize)
df.head()

Unnamed: 0_level_0,label,text,clean_text,lemmas
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...,go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif oni,ok lar joking wif oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,U dun say so early hor... U c already then say...,dun say early hor c already say,dun say early hor c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah ' think goes usf lives around though,nah ' think go usf life around though


In [7]:
X = df.lemmas
y = df.label

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7,
                                                    random_state=42)

In [26]:
tfidf = TfidfVectorizer()

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

X_train_tfidf[:5]

<5x7197 sparse matrix of type '<class 'numpy.float64'>'
	with 49 stored elements in Compressed Sparse Row format>

we have generated over 7,000 features from our dataset! We cacn feed these features into a machine learning model and generate predictions.

In [9]:
lm = LogisticRegression()

lm.fit(X_train_tfidf, y_train)

In [10]:
y_train_results = pd.DataFrame({'actual':y_train,
                               'preds': lm.predict(X_train_tfidf)})

In [12]:
y_train_results.head()

Unnamed: 0_level_0,actual,preds
id,Unnamed: 1_level_1,Unnamed: 2_level_1
708,spam,spam
4338,ham,ham
5029,ham,ham
4921,ham,ham
2592,ham,ham


In [14]:
print(classification_report(y_train_results.actual, y_train_results.preds))

              precision    recall  f1-score   support

         ham       0.96      1.00      0.98      3372
        spam       0.99      0.74      0.85       528

    accuracy                           0.96      3900
   macro avg       0.98      0.87      0.91      3900
weighted avg       0.96      0.96      0.96      3900



In [15]:
y_test_res = pd.DataFrame({'acutal':y_test,
                          'preds':lm.predict(X_test_tfidf)})

In [16]:
y_test_res

Unnamed: 0_level_0,acutal,preds
id,Unnamed: 1_level_1,Unnamed: 2_level_1
3245,ham,ham
944,ham,ham
1044,spam,ham
2484,ham,ham
812,spam,spam
...,...,...
2505,ham,ham
2525,ham,ham
4975,ham,ham
650,ham,ham


In [19]:
print(classification_report(y_test_res.acutal, y_test_res.preds))

              precision    recall  f1-score   support

         ham       0.95      1.00      0.97      1453
        spam       0.96      0.64      0.77       219

    accuracy                           0.95      1672
   macro avg       0.95      0.82      0.87      1672
weighted avg       0.95      0.95      0.94      1672

