# NLP Modeling

In [1]:
# basic imports
import pandas as pd
import numpy as np

# visualization
%matplotlib inline
import matplotlib.pyplot as plt

# NLP
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# classification 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

#local modules
import acquire as a
import prepare as p
import explore as e


In [2]:
df = p.prep_spam(a.get_spam_data())

In [3]:
df.head()

Unnamed: 0,label,text
0,ham,go jurong point crazy available bugis n great ...
1,ham,ok lar joking wif oni
2,spam,free entry wkly comp win fa cup final tkts 21s...
3,ham,dun say early hor c already say
4,ham,nah dont think goes usf lives around though


In [4]:
#split the data
train, validate, test = e.split_data(df, target='label')

In [5]:
# separating features from target 
X_train = train.text
X_validate = validate.text
X_test = test.text
y_train = train.label
y_validate = validate.label
y_test = test.label

In [6]:
tfidf = TfidfVectorizer()
X_train_tf_idf = tfidf.fit_transform(X_train)
X_test_tf_idf = tfidf.transform(X_test)

lm = LogisticRegression().fit(X_train_tf_idf, y_train)

y_train = pd.DataFrame(dict(actual=y_train))
y_test = pd.DataFrame(dict(actual=y_test))

y_train['predicted'] = lm.predict(X_train_tf_idf)
y_test['predicted'] = lm.predict(X_test_tf_idf)

In [7]:
print('Accuracy: {:.2%}'.format(accuracy_score(y_train.actual, y_train.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(y_train.predicted, y_train.actual))
print('---')
print(classification_report(y_train.actual, y_train.predicted))


Accuracy: 95.64%
---
Confusion Matrix
actual      ham  spam
predicted            
ham        2700   134
spam          2   284
---
              precision    recall  f1-score   support

         ham       0.95      1.00      0.98      2702
        spam       0.99      0.68      0.81       418

    accuracy                           0.96      3120
   macro avg       0.97      0.84      0.89      3120
weighted avg       0.96      0.96      0.95      3120



In [8]:
# now let's try the count vectorizer and compare the two
# for the same model

cv = CountVectorizer()
X_train_cv = cv.fit_transform(X_train)
X_test_cv = cv.transform(X_test)

train = pd.DataFrame(dict(actual=y_train))
test = pd.DataFrame(dict(actual=y_test))

lm = LogisticRegression().fit(X_train, y_train)

train['predicted'] = lm.predict(X_train)
test['predicted'] = lm.predict(X_test)

ValueError: If using all scalar values, you must pass an index

In [None]:
print('Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train.predicted, train.actual))
print('---')
print(classification_report(train.actual, train.predicted))
