In [1]:
import pandas as pd 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn import metrics
import scattertext as st
from pprint import pprint

In [12]:
# --- Load -------------------------------------------------------------------
# Tab-separated file with no header row: first column is the label
# ('ham' / 'spam'), second column is the raw message text.
filename = 'sms_data'
sms_df = pd.read_csv(filename, header=None, sep='\t', names=['class_sms', 'sms'])


# --- Clean ------------------------------------------------------------------
def _clean_sms(text, min_word_len=3):
    """Return `text` re-joined with ', ' after dropping short words.

    The text is split on whitespace and only words of at least
    `min_word_len` characters are kept.
    """
    return ', '.join(word for word in text.split() if len(word) >= min_word_len)


# Strip every run of digits. `regex=True` must be explicit: since pandas 2.0
# the default is a literal match, which would leave the digits in place.
sms_df['sms'] = sms_df['sms'].str.replace(r'\d+', '', regex=True)
sms_df['sms'] = sms_df['sms'].apply(_clean_sms)
# print(sms_df['sms'])

# --- Split and vectorize ----------------------------------------------------
X = sms_df.sms
y = sms_df.class_sms

# Fixed seed so the split (and every number below) is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
vect = CountVectorizer()
X_train_dtm = vect.fit_transform(X_train)  # learn the vocabulary on the training set only
X_test_dtm = vect.transform(X_test)        # reuse that vocabulary for the test set

# --- Train and predict ------------------------------------------------------
nb = MultinomialNB()
nb.fit(X_train_dtm, y_train)
y_pred_class = nb.predict(X_test_dtm)

# 'ham' is treated as the positive class throughout this cell:
#   false positive = spam that was let through as ham
#   false negative = ham that was flagged as spam
false_positive = X_test[(y_test == 'spam') & (y_pred_class == 'ham')]
false_negative = X_test[(y_test == 'ham') & (y_pred_class == 'spam')]
# print(false_positive)
# print(false_negative)

# --- Evaluate ---------------------------------------------------------------
# print(y_test.value_counts())  # examine the class distribution of the testing set (using a Pandas Series method)
# print(y_test.value_counts().head(1) / len(y_test))
print(metrics.accuracy_score(y_test, y_pred_class)) # 0.9885

# Pin the label order so that row/column 0 is 'ham' and row/column 1 is 'spam'
# instead of relying on the implicit sorted order of the labels.
confusion_matrix = metrics.confusion_matrix(y_test, y_pred_class, labels=['ham', 'spam'])
spam_as_ham = confusion_matrix[1][0]   # actual spam, predicted ham  (false positives)
spam_as_spam = confusion_matrix[1][1]  # actual spam, predicted spam (true negatives)
false_positive_rate = spam_as_ham / (spam_as_ham + spam_as_spam)
print(false_positive_rate*100) # 5.405% <-- terrible!


943     How, about, getting, touch, with, folks, waiti...
5       FreeMsg, Hey, there, darling, it's, been, week...
3530    Xmas, New, Years, Eve, tickets, are, now, sale...
1875    Would, you, like, see, XXX, pics, they, are, h...
1893    CALL, LISTEN, EXTREME, DIRTY, LIVE, CHAT, GOIN...
4298    thesmszone.com, lets, you, send, free, anonymo...
4949    this, Amy,, will, sending, you, free, phone, n...
2821    INTERFLORA, It's, not, too, late, order, Inte...
2247    babe, goten, bout, me?', scammers, getting, sm...
4514          Money, have, won, wining, number, wot, next
Name: sms, dtype: object
574                             Waiting, for, your, call.
4773    Hi,, Mobile, no., &lt;#&gt;, has, added, you, ...
3375                          Also, andros, ice, etc, etc
45                         calls..messages..missed, calls
3415                               pic., Please, re-send.
1988                       calls..messages..missed, calls
Name: sms, dtype: object
0.988513998564
5.40540

In [None]:
# Vocabulary learned by the vectorizer. `get_feature_names` was removed in
# scikit-learn 1.2, so prefer the replacement and fall back on old versions.
X_train_tokens = (
    vect.get_feature_names_out()
    if hasattr(vect, 'get_feature_names_out')
    else vect.get_feature_names()
)

# Look up each class's row instead of hard-coding 0/1, so the counts cannot be
# silently swapped if the label order ever differs.
class_labels = list(nb.classes_)
ham_row = class_labels.index('ham')
spam_row = class_labels.index('spam')

# Number of times each token appeared in training messages of each class.
ham_token_count = nb.feature_count_[ham_row, :]
spam_token_count = nb.feature_count_[spam_row, :]
tokens = pd.DataFrame({'token': X_train_tokens, 'ham': ham_token_count, 'spam': spam_token_count})

# Add one to every count so that no ratio below divides by zero.
tokens['ham'] = tokens.ham + 1
tokens['spam'] = tokens.spam + 1

# Normalize by the number of training messages in each class so the two
# columns are comparable despite the class imbalance.
tokens['ham'] = tokens.ham / nb.class_count_[ham_row]
tokens['spam'] = tokens.spam / nb.class_count_[spam_row]

# How much more frequent a token is in one class than in the other.
tokens['ham_ratio'] = tokens.ham / tokens.spam
tokens['spam_ratio'] = tokens.spam / tokens.ham
print(tokens.sort_values('ham_ratio', ascending=False).head(10))  # top 10 tokens predictive for ham
print(tokens.sort_values('spam_ratio', ascending=False).head(10))  # top 10 tokens predictive for spam
# print(tokens.sort_values('one_star_ratio', ascending=False).head(10))  # top 10 tokens predictive for 1-star

In [None]:
# nlp = st.WhitespaceNLP.whitespace_nlp
# corpus = st.CorpusFromPandas(sms_df, 
#                               category_col='class_sms', 
#                               text_col='sms',
#                               nlp=nlp).build()
# term_freq_df = corpus.get_term_freq_df()
# term_freq_df['spam'] = corpus.get_scaled_f_scores('spam')
# pprint(list(term_freq_df.sort_values(by='spam', ascending=False).index[:10])) # words most associated with spam
# term_freq_df['ham'] = corpus.get_scaled_f_scores('ham')
# pprint(list(term_freq_df.sort_values(by='ham', ascending=False).index[:10]))  # words most associated with ham

In [None]:
# html = st.produce_scattertext_explorer(corpus,
#           category='spam',
#           category_name='spam',
#           not_category_name='ham',
#           width_in_pixels=1000,
#           )
# open("Convention-Visualization.html", 'wb').write(html.encode('utf-8'))