In [1]:
import cPickle
import json
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
with open('BOTCOMMENTS2.json') as f:
    bot_data = json.loads(f.readlines()[0])

with open('BigBotComments.json') as f:
    big_bot_data = json.loads(f.readlines()[0])
    
with open('user_results.json') as infile:
    result_dict = json.load(infile)

In [7]:
bot_names = bot_data.keys()
string_comments = {}
reply_comments = {}
for bot in bot_names:
    comments = bot_data[bot]
    comment = ""
    replies = ""
    for c in comments:
        comment += c['body']
        for reply in c['replies']:
            replies += reply
            
    string_comments[bot] = comment
    reply_comments[bot] = replies
bot_data = string_comments


In [14]:
n_feats = 2000
doc_by_vocab = np.empty([len(bot_data), n_feats])


def build_vectorizer(max_features, stop_words, norm='l2'):
    return TfidfVectorizer(max_features = max_features, 
                                stop_words = stop_words,
                                max_df = 0.9,
                                min_df = 1,
                                norm = norm)

tfidf_vec = build_vectorizer(n_feats, "english")
doc_by_vocab = tfidf_vec.fit_transform([bot_data[d] for d in bot_data.keys()]).toarray()
bot_replies = tfidf_vec.fit_transform([reply_comments[d] for d in bot_data.keys()]).toarray()

In [41]:
from scipy.sparse.linalg import svds
matrix = doc_by_vocab.transpose()
vocabu, s, docv_trans = svds(matrix, k=100)
docs_compressed = docv_trans.transpose()
print(vocabu.shape)
print(docs_compressed.shape)

(2000, 100)
(3725, 100)


In [42]:
word_to_index = tfidf_vec.vocabulary_
index_to_word = {i:t for t,i in word_to_index.iteritems()}
from sklearn.preprocessing import normalize
vocabu = normalize(vocabu, axis = 1)


In [43]:
def closest_words(word_in, k = 10):
    if word_in not in word_to_index: return "Not in vocab."
    sims = vocabu.dot(vocabu[word_to_index[word_in],:])
    asort = np.argsort(-sims)[:k+1]
    print(asort)
    return [(index_to_word[i],sims[i]/sims[asort[0]]) for i in asort[1:]]

In [46]:
print(closest_words("politics"))

[1316 1765 1011 1564  725 1792  395  809  685  648  719]
[(u'term', 0.5070852945389837), (u'limit', 0.5001757559409654), (u'shit', 0.49980998394230997), (u'future', 0.4917336473589449), (u'til', 0.4852259522462252), (u'considering', 0.48450584217379755), (u'hax', 0.4833855272801485), (u'fixed', 0.4814028698287729), (u'father', 0.47814581662697087), (u'fucking', 0.47812225483047166)]


In [15]:
cPickle.dump( bot_data, open( "bot_data.p", "wb" ) )
cPickle.dump( bot_replies, open( "bot_replies.p", "wb" ) )
cPickle.dump( big_bot_data, open( "big_bot_data.p", "wb" ) )
cPickle.dump( result_dict, open( "user_results.p", "wb" ) )
cPickle.dump( doc_by_vocab, open( "doc_by_vocab.p", "wb") )
cPickle.dump( tfidf_vec, open( "vectorizer.p", "wb") )