In [1]:
import pandas as pd 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import scattertext as st
from pprint import pprint

In [2]:
# --- Load -------------------------------------------------------------------
# Tab-separated file without a header row: first column is the label
# ('ham' / 'spam'), second column is the raw message text.
filename = 'sms_data'
sms_df = pd.read_csv(filename, header=None, sep='\t', names=['class_sms', 'sms'])

# --- Clean ------------------------------------------------------------------
# Strip every run of digits. `regex=True` is passed explicitly because
# pandas >= 2.0 treats the pattern as a literal string by default, which would
# silently leave all digits in place; the raw string avoids the invalid '\d'
# escape warning.
sms_df['sms'] = sms_df['sms'].str.replace(r'\d+', '', regex=True)

# Split on whitespace, drop tokens of one or two characters, and re-join the
# survivors with ', ' (the vectorizer below ignores the punctuation).
sms_df['sms'] = sms_df['sms'].apply(
    lambda text: ', '.join(word for word in text.split() if len(word) > 2)
)

# --- Split and vectorise ----------------------------------------------------
X = sms_df.sms
y = sms_df.class_sms

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
vect = CountVectorizer(stop_words='english')
X_train_dtm = vect.fit_transform(X_train)  # learn the vocabulary on train only
X_test_dtm = vect.transform(X_test)        # reuse that vocabulary for test

# --- Fit and predict --------------------------------------------------------
nb = MultinomialNB()
nb.fit(X_train_dtm, y_train)
y_pred_class = nb.predict(X_test_dtm)

# --- Inspect the misclassified messages -------------------------------------
# 'spam' is the positive class, so:
#   false negative = real spam that the model let through as ham
#   false positive = real ham that the model flagged as spam
false_negative = X_test[(y_test == 'spam') & (y_pred_class == 'ham')]
false_positive = X_test[(y_test == 'ham') & (y_pred_class == 'spam')]
print(false_negative)
print(false_positive)

# --- Headline metrics -------------------------------------------------------
print(metrics.accuracy_score(y_test, y_pred_class))
print(metrics.confusion_matrix(y_test, y_pred_class))

2295               You, have, new, message., Please, call
5110               You, have, new, message., Please, call
3530    Xmas, New, Years, Eve, tickets, are, now, sale...
684     I'm, sue., years, old, and, work, lapdancer., ...
1893    CALL, LISTEN, EXTREME, DIRTY, LIVE, CHAT, GOIN...
2941               You, have, new, message., Please, call
2821    INTERFLORA, It's, not, too, late, order, Inte...
2247    babe, goten, bout, me?', scammers, getting, sm...
4514          Money, have, won, wining, number, wot, next
Name: sms, dtype: object
4419                          When, you, get, free,, call
1587    There, are, other, charges, after, transfer, c...
2903    Bill,, in:, Are, there, any, letters, for, me....
45                         calls..messages..missed, calls
3589    you, were/are, free, can, give., Otherwise, na...
2162    she, replying., Has, boye, changed, his, phone...
3415                               pic., Please, re-send.
1988                       calls..messages..mis

In [3]:
# Vocabulary learned by the vectorizer, in the same column order as the
# document-term matrix. `get_feature_names()` was removed in scikit-learn 1.2,
# so prefer `get_feature_names_out()` and fall back only on old releases.
if hasattr(vect, 'get_feature_names_out'):
    X_train_tokens = list(vect.get_feature_names_out())
else:
    X_train_tokens = vect.get_feature_names()

# Look the class rows up by label instead of hard-coding 0 and 1.
class_labels = list(nb.classes_)
ham_row = class_labels.index('ham')
spam_row = class_labels.index('spam')

ham_token_count = nb.feature_count_[ham_row, :]
spam_token_count = nb.feature_count_[spam_row, :]
tokens = pd.DataFrame({'token': X_train_tokens, 'ham': ham_token_count, 'spam': spam_token_count})

# Add one to every count so that the ratios below never divide by zero.
tokens['ham'] = tokens.ham + 1
tokens['spam'] = tokens.spam + 1

# Normalise each column by that class's entry in nb.class_count_ so the two
# classes are comparable despite their different sizes.
tokens['ham'] = tokens.ham / nb.class_count_[ham_row]
tokens['spam'] = tokens.spam / nb.class_count_[spam_row]

# Relative frequency of each token in one class versus the other.
tokens['ham_ratio'] = tokens.ham / tokens.spam
tokens['spam_ratio'] = tokens.spam / tokens.ham

print(tokens.sort_values('ham_ratio', ascending=False).head(10))   # top 10 tokens predictive of ham
print(tokens.sort_values('spam_ratio', ascending=False).head(10))  # top 10 tokens predictive of spam

           ham      spam  token  ham_ratio  spam_ratio
2272  0.063865  0.001779     gt  35.892176    0.027861
3176  0.063865  0.001779     lt  35.892176    0.027861
3133  0.032900  0.001779    lor  18.489909    0.054084
2960  0.030688  0.001779  later  17.246890    0.057981
3761  0.027647  0.001779     ok  15.537738    0.064359
1035  0.048936  0.003559   come  13.750899    0.072723
6041  0.019630  0.001779    way  11.031794    0.090647
309   0.019630  0.001779    ask  11.031794    0.090647
1496  0.019077  0.001779  doing  10.721040    0.093275
6303  0.017694  0.001779   yeah   9.944153    0.100562
           ham      spam       token  ham_ratio  spam_ratio
963   0.000276  0.158363       claim   0.001746  572.798932
4221  0.000276  0.135231       prize   0.002044  489.131673
5772  0.000276  0.090747          uk   0.003047  328.233096
5635  0.000276  0.085409        tone   0.003237  308.925267
2277  0.000276  0.076512  guaranteed   0.003613  276.745552
4152  0.000276  0.069395         pp

In [4]:
# Build a scattertext corpus from the SMS frame, using scattertext's
# whitespace_nlp as the parser and the message label as the category.
nlp = st.WhitespaceNLP.whitespace_nlp
corpus_builder = st.CorpusFromPandas(
    sms_df,
    category_col='class_sms',
    text_col='sms',
    nlp=nlp,
)
corpus = corpus_builder.build()
term_freq_df = corpus.get_term_freq_df()

# For each category in turn, attach its scaled F-score as a new column and
# show the ten terms with the highest score (spam first, then ham).
for category in ('spam', 'ham'):
    term_freq_df[category] = corpus.get_scaled_f_scores(category)
    ranked_terms = term_freq_df.sort_values(by=category, ascending=False)
    pprint(list(ranked_terms.index[:10]))

['prize',
 'have won',
 'your mobile',
 'claim',
 'tone',
 'guaranteed',
 'ppm',
 'awarded',
 'co uk',
 'uk']
['lt', 'she', 'i ll', 'lor', 'gt', 'lt gt', 'later', 'ask', 'said', 'doing']


In [7]:
# Render the interactive scattertext explorer (spam vs. ham) as an HTML string.
html = st.produce_scattertext_explorer(
    corpus,
    category='spam',
    category_name='spam',
    not_category_name='ham',
    width_in_pixels=1000,
)

# Write the page to disk. The `with` block guarantees the file handle is
# flushed and closed; the original bare open(...).write(...) never closed it.
with open("Convention-Visualization.html", 'wb') as html_file:
    bytes_written = html_file.write(html.encode('utf-8'))

# Keep the byte count as the cell's displayed result, as before.
bytes_written

1165447