In [1]:
import pandas as pd 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import scattertext as st
from pprint import pprint

In [2]:
# Load the SMS corpus: a headerless, tab-separated file whose two columns are
# named here `class_sms` (the label) and `sms` (the message text).
filename = 'sms_data'
# '\t' is spelled as an escape rather than a literal tab so the separator
# stays visible and cannot be silently turned into spaces by an editor.
sms_df = pd.read_csv(filename, header=None, sep='\t', names=['class_sms', 'sms'])
X = sms_df.sms
y = sms_df.class_sms

# Hold out a test set using train_test_split's default split size; the fixed
# random_state makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Bag-of-words counts with English stop words removed. The vocabulary is
# learned from the training messages only and then reused for the test set.
vect = CountVectorizer(stop_words='english')
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)

# Fit multinomial Naive Bayes on the training document-term matrix and
# predict a class for every held-out message.
nb = MultinomialNB()
nb.fit(X_train_dtm, y_train)
y_pred_class = nb.predict(X_test_dtm)

# Optional diagnostics, left disabled:
# print(y_test.value_counts())  # class distribution of the testing set
# print(y_test.value_counts().head(1) / len(y_test))  # share of the first class listed by value_counts()
print(metrics.accuracy_score(y_test, y_pred_class))
# print(metrics.confusion_matrix(y_test, y_pred_class))

0.987796123475


In [3]:
# Rank vocabulary tokens by how strongly they lean towards ham or spam, using
# the per-class token counts stored on the fitted Naive Bayes model.

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when it exists and fall back for old releases.
# list(...) keeps X_train_tokens a plain list in both cases.
if hasattr(vect, 'get_feature_names_out'):
    X_train_tokens = list(vect.get_feature_names_out())
else:
    X_train_tokens = list(vect.get_feature_names())

# Look up each class's row by label instead of assuming ham is row 0 and spam
# is row 1; list.index raises ValueError if a label is missing from the model.
class_labels = list(nb.classes_)
ham_row = class_labels.index('ham')
spam_row = class_labels.index('spam')

ham_token_count = nb.feature_count_[ham_row, :]
spam_token_count = nb.feature_count_[spam_row, :]
tokens = pd.DataFrame({'token': X_train_tokens, 'ham': ham_token_count, 'spam': spam_token_count})

# Add one to every count so no token has a zero count in either class
# (otherwise the ratios below would divide by zero).
tokens['ham'] = tokens.ham + 1
tokens['spam'] = tokens.spam + 1

# Normalise by the number of training messages seen for each class so the two
# columns are comparable despite the classes having different sizes.
tokens['ham'] = tokens.ham / nb.class_count_[ham_row]
tokens['spam'] = tokens.spam / nb.class_count_[spam_row]

# Ratios > 1 mean the token is relatively more frequent in that class.
tokens['ham_ratio'] = tokens.ham / tokens.spam
tokens['spam_ratio'] = tokens.spam / tokens.ham
print(tokens.sort_values('ham_ratio', ascending=False).head(10))  # top 10 tokens most indicative of ham
print(tokens.sort_values('spam_ratio', ascending=False).head(10))  # top 10 tokens most indicative of spam

           ham      spam  token  ham_ratio  spam_ratio
3044  0.064971  0.001779     gt  36.513685    0.027387
3974  0.064142  0.001779     lt  36.047553    0.027741
2015  0.032900  0.001779     da  18.489909    0.054084
3932  0.032900  0.001779    lor  18.489909    0.054084
3762  0.030688  0.001779  later  17.246890    0.057981
1794  0.048936  0.003559   come  13.750899    0.072723
1032  0.019630  0.001779    ask  11.031794    0.090647
6891  0.019630  0.001779    way  11.031794    0.090647
2259  0.019077  0.001779  doing  10.721040    0.093275
7149  0.017694  0.001779   yeah   9.944153    0.100562
           ham      spam       token  ham_ratio  spam_ratio
1718  0.000276  0.158363       claim   0.001746  572.798932
5043  0.000276  0.135231       prize   0.002044  489.131673
293   0.000276  0.087189        150p   0.003171  315.361210
6470  0.000276  0.085409        tone   0.003237  308.925267
3048  0.000276  0.076512  guaranteed   0.003613  276.745552
307   0.000276  0.069395          1

In [5]:
# Build a scattertext corpus from the full SMS frame, parsing each message
# with scattertext's whitespace_nlp and using `class_sms` as the category.
nlp = st.WhitespaceNLP.whitespace_nlp
corpus = st.CorpusFromPandas(
    sms_df,
    category_col='class_sms',
    text_col='sms',
    nlp=nlp,
).build()
term_freq_df = corpus.get_term_freq_df()

# Attach a scaled F-score column per category; each is computed from the
# corpus, so the two assignments are independent of one another.
term_freq_df['spam'] = corpus.get_scaled_f_scores('spam')
term_freq_df['ham'] = corpus.get_scaled_f_scores('ham')

# Ten highest-scoring terms for spam, then for ham.
pprint(term_freq_df.sort_values(by='spam', ascending=False).head(10).index.tolist())
pprint(term_freq_df.sort_values(by='ham', ascending=False).head(10).index.tolist())

['150p',
 'claim',
 'guaranteed',
 'prize',
 'have won',
 '18',
 'your mobile',
 'co uk',
 'tone',
 'to claim']
['i ll', 'lt gt', 'he', 'lt', 'gt', 'ü', 'she', 'lor', 'da', 'later']


In [6]:
# Render the interactive scattertext explorer contrasting spam with ham,
# attaching the raw message text as per-document metadata.
html = st.produce_scattertext_explorer(corpus,
          category='spam',
          category_name='spam',
          not_category_name='ham',
          width_in_pixels=1000,
          metadata=sms_df['sms'])

# Write the page as UTF-8 bytes. The `with` block guarantees the file handle
# is flushed and closed even if the write fails, instead of leaving it to the
# garbage collector.
with open("Convention-Visualization.html", 'wb') as html_file:
    bytes_written = html_file.write(html.encode('utf-8'))

# Show the number of bytes written as the cell's output.
bytes_written

1687602