## Predictive Modeling

In [73]:
'''Run a prediction for a comment through a reddit hate speech model'''
from sklearn.feature_extraction.text import TfidfVectorizer
from string import punctuation
from nltk import word_tokenize
from nltk.stem import snowball
import xgboost as xgb
import cPickle as pickle
import numpy as np
import pandas as pd
# Module-level English Snowball stemmer; tokenize() below reads this global
# rather than taking a stemmer argument.
stemmer = snowball.SnowballStemmer("english")

def stem_tokens(tokens, stemmer):
    '''Return a new list with the stem of every token, in input order.

    tokens: iterable of tokens to stem.
    stemmer: any object exposing a ``stem(token)`` method.
    '''
    return [stemmer.stem(token) for token in tokens]

def tokenize(text):
    '''Split ``text`` into word tokens and return their stems.

    Stemming always uses the module-level ``stemmer``; it is deliberately not
    a parameter so this function can be handed to TfidfVectorizer as-is.
    '''
    return stem_tokens(word_tokenize(text), stemmer)

def predict_comment(comment, classes, bst, vect):
    '''
    Classify a single comment and print the winning class with its probability.

    comment: the comment text entered by the user.
    classes: sequence of class labels; the label at position i is reported
        when the model's i-th output is the largest. (Presumably this must be
        the label order used when the model was trained -- confirm.)
    bst: model object whose ``predict`` accepts an ``xgb.DMatrix`` and yields
        one probability per entry in ``classes`` for the comment.
    vect: vectorizer whose ``transform`` accepts a list of documents.

    Prints the result and returns None. Raises ValueError (from the reshape)
    if the model does not produce exactly ``len(classes)`` values.
    '''
    n_classes = len(classes)
    comment_tfidf = vect.transform([comment])
    comment_xgb = xgb.DMatrix(comment_tfidf)
    # Only one comment is scored per call, so the prediction is a single row
    # with one column per class (the count comes from `classes`, not a literal).
    yprob = np.asarray(bst.predict(comment_xgb)).reshape(1, n_classes)
    # Convert to a plain int: a Python list cannot be indexed with the
    # one-element array that argmax(..., axis=1) returns on current NumPy.
    best = int(np.argmax(yprob))
    ylabel = classes[best]

    print('The class is: {0} with probability {1}%'.format(ylabel, round(100 * np.max(yprob), 1)))


In [74]:
# Labels handed to predict_comment, which indexes them by the model's
# highest-probability output position.
classes = ['Not Hate', 'Size Hate', 'Gender Hate', 'Race Hate', 'Religion Hate']

# load saved xgboost model
bst = xgb.Booster()
bst.load_model('../data/hatepredictor_pyladies.model')
# load the pickled vectorizer (predict_comment calls its transform()).
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
# The `with` block closes the file handle once the object is loaded.
with open('../data/vect_pyladies.p', 'rb') as vect_file:
    vect = pickle.load(vect_file)

In [77]:
# Ask the user for the text to classify
comment = raw_input('Enter comment: ')
# Score it with the loaded model and vectorizer; the result is printed
predict_comment(comment=comment, classes=classes, bst=bst, vect=vect)

Enter comment: 
The class is: Not Hate with probability 69.1%


## Word2Vec


In [1]:
import gensim

Couldn't import dot_parser, loading of dot files will not be possible.


In [6]:
# Load the two saved Word2Vec models. Both paths use the lowercase '../data'
# spelling used by the other load calls in this notebook; the previous
# '../Data' spelling only resolves on case-insensitive filesystems.
HateModel = gensim.models.Word2Vec.load('../data/w2vHateModel.model')
NotHateModel = gensim.models.Word2Vec.load('../data/w2vNotHateModel.model')

In [68]:
def print_most_sim(words, HateModel, NotHateModel):
    '''Print, side by side, each word's most-similar entries from both models.

    words: list of words to look up.
    HateModel, NotHateModel: objects exposing ``most_similar(word)`` that
        returns a sequence; each entry is printed as-is.

    For every word a header is printed, followed by one row per pair of
    entries: NotHateModel's on the left, HateModel's on the right. Rows stop
    at the shorter of the two result lists (zip). Returns None.
    '''
    for word in words:
        nothatelist = NotHateModel.most_similar(word)
        hatelist = HateModel.most_similar(word)
        # Single-argument print(...) calls produce the same output under
        # Python 2 and Python 3, matching the style used in predict_comment.
        print(' ')
        print("Not Hate           word='{0}'               Hate".format(word))
        print("---------------------------------------------------")
        for nh, h in zip(nothatelist, hatelist):
            print("{0}\t\t{1}".format(nh, h))
    

In [71]:
# Words to look up with print_most_sim
words = [
    'fat',
    'dress',
    'pretty',
    'suck',
]

In [72]:
# Show each word's most-similar entries from both models side by side
print_most_sim(words=words, HateModel=HateModel, NotHateModel=NotHateModel)

 
Not Hate           word='dress'               Hate
---------------------------------------------------
(u'steak', 0.5374936461448669)		(u'soldier', 0.6863648891448975)
(u'wear', 0.5334213376045227)		(u'retard', 0.664968729019165)
(u'shop', 0.4940866231918335)		(u'gut', 0.654162585735321)
(u'taste', 0.4924619197845459)		(u'bowling', 0.651995062828064)
(u'calculus', 0.49234992265701294)		(u'linebacker', 0.6515874862670898)
(u'coffee', 0.489260196685791)		(u'slut', 0.6515154838562012)
(u'makeup', 0.47544825077056885)		(u'toad', 0.6447991728782654)
(u'balloon', 0.47303318977355957)		(u'fold', 0.6412398815155029)
(u'referral', 0.4679303467273712)		(u'rock', 0.6399352550506592)
(u'counselor', 0.46694833040237427)		(u'bloody', 0.6364386081695557)
 
Not Hate           word='fat'               Hate
---------------------------------------------------
(u'overweight', 0.6250482797622681)		(u'skinny', 0.6153160333633423)
(u'healthy', 0.6066110730171204)		(u'thin', 0.6055712699890137)
(u'obese', 0