In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from string import punctuation
from nltk import word_tokenize
from nltk.stem import snowball
import xgboost as xgb
import cPickle as pickle
import numpy as np
import pandas as pd
# import matplotlib
# matplotlib.use('Agg')
import matplotlib.pyplot as plt
plt.style.use('ggplot')


In [2]:
# Shared English Snowball stemmer; tokenize() below reads this module-level name.
stemmer = snowball.SnowballStemmer("english")

def stem_tokens(tokens, stemmer):
    '''Return a list holding the stemmed form of every token, in input order.

    tokens is an iterable of strings; stemmer is any object with a stem(word) method.'''
    return [stemmer.stem(token) for token in tokens]


def tokenize(text):
    '''Split text into word tokens and return their stems.

    Only the text is accepted as an argument (the module-level "stemmer" is
    used implicitly) so that the function works with TfidfVectorizer.'''
    return stem_tokens(word_tokenize(text), stemmer)


In [3]:
# write function to grab the feature importances & show top ~20

def top_features_words(d, vect, n=20):
    '''
    Return the top n important features with their scores and corresponding words.

    d is a dictionary mapping feature names to scores (from bst.get_fscore() in
    xgboost). Feature names are expected to carry the column index as their
    digits, e.g. 'f128396'.
    vect is the fitted vectorizer object itself (e.g. vect = TfidfVectorizer(stuff)),
    not the matrix it produced; its vocabulary_ maps each word to a column index.
    n is the maximum number of features to return.

    Returns a list of (feature name, score, word) tuples ordered by score,
    highest first (ties are broken by feature name, descending). When d holds
    fewer than n features, all of them are returned instead of raising IndexError.
    Raises KeyError if a feature index is missing from the vocabulary.
    '''
    # Invert the vocabulary so index_to_word[column index] returns the word.
    index_to_word = dict((index, word) for word, index in vect.vocabulary_.items())

    # items() and an indexing key function work on both Python 2 and Python 3
    # (tuple-unpacking lambdas and iteritems() are Python 2 only).
    ranked = sorted(d.items(), key=lambda item: (item[1], item[0]), reverse=True)

    topfeatures = []
    # Slicing caps the result at the number of available features.
    for fname, score in ranked[:max(n, 0)]:
        # Keep only the digits of the feature name to recover the column index.
        fnum = int(''.join(ch for ch in fname if ch.isdigit()))
        topfeatures.append((fname, score, index_to_word[fnum]))

    return topfeatures
    

In [None]:
#     To load saved model:
#     bst = xgb.Booster({'nthread': 4}) #init model
#     bst.load_model("hatespeech.model")  # file name of the saved booster, as used by the loading cell below

#     Or, if pickled:
#     bst = pickle.load(open('bst.p', 'rb'))

In [None]:
# To predict on a new comment:
#  1. Transform the comment with the already-fitted tf-idf vectorizer (transform, not fit)
#  2. Convert the resulting tf-idf matrix to an xgb.DMatrix
#  3. Pass that DMatrix to bst.predict

# yprob = bst.predict( xg_test ).reshape( y_test.shape[0], 5 )
# xg_test = xgb.DMatrix(tfidfv_fit_X_test, label=y_test)
# tfidfv_fit_X_test = tfidfv.transform(X_test)
# X_test : "an iterable which yields either str, unicode or file objects"
# e.g. X_test = vectorizer.transform((open(f).read() for f in news_test.filenames))


# get prediction, this is in 1D array, need reshape to (ndata, nclass)
#yprob = bst.predict( xg_test ).reshape( y_test.shape[0], 5 )

In [4]:
# Build the booster straight from the saved model file.
bst = xgb.Booster(model_file='../FinalModel/BuildModel/hatespeech.model')

In [5]:
# load vectors
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
# NOTE(review): vect presumably references the tokenize function defined earlier in
# this notebook, so that cell has to run before unpickling -- confirm.
# The context managers make sure both file handles are closed after loading.
with open('../FinalModel/BuildModel/tfidf_X.p', 'rb') as tfidf_file:
    tfidf_X = pickle.load(tfidf_file)
with open('../FinalModel/BuildModel/vect.p', 'rb') as vect_file:
    vect = pickle.load(vect_file)

In [6]:
# Human-readable labels; position i is looked up with the argmax of the model's output.
classes = [
    'Not Hate',
    'Size Hate',
    'Gender Hate',
    'Race Hate',
    'Religion Hate',
]

In [44]:
# Prompt for a single comment (raw_input is the Python 2 builtin). The text is wrapped
# in a list because it is later passed to vect.transform as an iterable of documents.
comment = [raw_input('Enter comment: ')]

Enter comment: I think fat people are not jerks.


In [51]:
# transform() (not fit) maps the comment onto the vocabulary the vectorizer
# already holds, so the columns line up with the features the model refers to.
comment_tfidf = vect.transform(comment)
comment_xgb = xgb.DMatrix(comment_tfidf)

# predict() output is reshaped to (number of comments, number of classes);
# the sizes are derived from the data instead of being hard-coded to (1, 5).
yprob = bst.predict(comment_xgb).reshape(len(comment), len(classes))

# Only one comment is entered at a time, so report the first (only) row.
# A plain int index is required: indexing a list with an ndarray fails on current NumPy.
best_class = int(np.argmax(yprob[0]))
ylabel = classes[best_class]

print('The class is: {0} with probability {1}%'.format(ylabel, round(100*float(yprob[0, best_class]),1)))

The class is: Size Hate with probability 83.8%


In [8]:
# Show the 50 highest-scoring features of the booster together with their vocabulary words.
top_features_words(bst.get_fscore(),vect, n=50)

[('f255261', 852, u'like'),
 ('f310038', 768, u'peopl'),
 ('f242158', 761, u'just'),
 ('f128396', 637, u'fat'),
 ('f109214', 530, u'dont'),
 ('f227631', 431, u'im'),
 ('f55520', 401, u'becaus'),
 ('f139322', 366, u'fuck'),
 ('f395295', 360, u'think'),
 ('f263062', 296, u'make'),
 ('f352019', 279, u'say'),
 ('f433573', 277, u'white'),
 ('f141754', 269, u'game'),
 ('f258673', 253, u'look'),
 ('f246769', 248, u'know'),
 ('f60685', 231, u'black'),
 ('f148767', 219, u'good'),
 ('f113906', 216, u'eat'),
 ('f394947', 215, u'thing'),
 ('f300231', 215, u'onli'),
 ('f400097', 212, u'time'),
 ('f436656', 211, u'women'),
 ('f428393', 193, u'want'),
 ('f431153', 189, u'weight'),
 ('f318901', 178, u'post'),
 ('f361157', 169, u'shit'),
 ('f239627', 168, u'jew'),
 ('f342871', 164, u'right'),
 ('f285454', 147, u'need'),
 ('f419812', 144, u'use'),
 ('f433376', 143, u'whi'),
 ('f287913', 143, u'nigger'),
 ('f39551', 142, u'ani'),
 ('f334728', 141, u'realli'),
 ('f162484', 136, u'guy'),
 ('f429915', 135, 

In [46]:
# Inspect the raw per-class probabilities for the entered comment (columns follow the order of `classes`).
yprob

array([[  1.50739864e-01,   8.38145196e-01,   8.80795345e-03,
          2.14899029e-03,   1.58049195e-04]], dtype=float32)

In [12]:
# Number of entries in the vectorizer's vocabulary.
len(vect.vocabulary_)

455756

'SizeHate'

In [13]:
from IPython.display import HTML

# Render a small demo table with coloured header and data cells.
# Each CSS rule block is closed with a plain "}"; a backslash-escaped brace is
# treated by CSS as an escaped character, which leaves the rule unterminated.
HTML(
        """
    <style>
th {
background-color:#55FF33;
}
td {
background-color:#00FFFF;
}
</style>
<table><tr><th>bar</th><th>bar</th></tr><tr><td>foo</td><td>foo</td></tr></table>
    """ 
)

bar,bar.1
foo,foo
