In [55]:
import pandas as pd
import numpy as np

# In this notebook we train a Word2Vec embedding model on the clean_text column

In [56]:
# Load the cleaned tweets; row 0 of the CSV supplies the column names.
df = pd.read_csv(
    "./clean_data/clean_tweets.csv",
    header=0,
)

In [57]:
# Display the cleaned tweet text column.
df.loc[:, 'clean_text']

0        trump international tower chicago ranked th ta...
1                     wishing happy bountiful thanksgiving
2        donald trump partners tv new reality series en...
3        hear donald trump discuss big gov spending ban...
4        watch video ivanka trump sharing business advi...
5        read donald trump say daughter ivanka upcoming...
6        lot people imagination execute execute imagina...
7                   read donald trump top ten tips success
8        hysterical dsrl videos featuring donald trump ...
9        donald trump bids buy oreo double stuf racing ...
10       reminder miss universe competition live bahama...
11       watch miss universe competition live bahamas s...
12       watch donald trump recent appearance late show...
13       ivanka twitter follow @ivankatrump terrific we...
14       browse donald trump summer reading list busine...
15       check list donald trump books summer reading t...
16       congrats winners around world entered think li.

In [32]:
# Configure the built-in logging module so that gensim's Word2Vec reports its
# progress (vocabulary building, training, saving) as INFO messages.
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

from gensim.models import word2vec

# Hyperparameters for the Word2Vec model
num_features = 300    # Word vector dimensionality
min_word_count = 10   # Minimum word count
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size
downsampling = 1e-3   # Downsample setting for frequent words

# Tokenise every cleaned tweet on whitespace.
# The CSV contains rows whose clean_text is missing (e.g. tweets that were
# nothing but a link). pandas loads those as NaN, a float with no .split(),
# so they must be dropped before tokenising or this cell raises
# AttributeError on a fresh run.
sentences = [text.split() for text in df['clean_text'].dropna()]

# Initialize and train the model.
# print(...) with a single argument behaves the same on Python 2 and 3.
print("Training model...")
model = word2vec.Word2Vec(
    sentences,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling)

# init_sims(replace=True) L2-normalises the word vectors and keeps only the
# normalised copies. This saves memory, but it is only appropriate because we
# do not plan to train the model any further.
# NOTE(review): the most_similar() outputs recorded in this notebook are all
# ~0.9999, which suggests the model is under-trained for such a small corpus;
# consider more training epochs or a smaller vector size - TODO confirm.
model.init_sims(replace=True)

# Save the model under a meaningful name for later use.
# It can be loaded again with Word2Vec.load().
model_name = "trump2vec.bin"
model.save(model_name)

2017-03-21 05:44:20,238 : INFO : collecting all words and their counts
2017-03-21 05:44:20,240 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-03-21 05:44:20,249 : INFO : collected 5196 word types from a corpus of 27301 raw words and 2795 sentences
2017-03-21 05:44:20,251 : INFO : Loading a fresh vocabulary
2017-03-21 05:44:20,261 : INFO : min_count=10 retains 562 unique words (10% of original 5196, drops 4634)
2017-03-21 05:44:20,263 : INFO : min_count=10 leaves 17339 word corpus (63% of original 27301, drops 9962)
2017-03-21 05:44:20,267 : INFO : deleting the raw counts dictionary of 5196 items
2017-03-21 05:44:20,269 : INFO : sample=0.001 downsamples 85 most-common words
2017-03-21 05:44:20,270 : INFO : downsampling leaves estimated 13970 word corpus (80.6% of prior 17339)
2017-03-21 05:44:20,272 : INFO : estimated required memory for 562 words and 300 dimensions: 1629800 bytes
2017-03-21 05:44:20,275 : INFO : resetting layer weights
2017-03-21 05:44:

Training model...


2017-03-21 05:44:20,437 : INFO : worker thread finished; awaiting finish of 0 more threads
2017-03-21 05:44:20,438 : INFO : training on 136505 raw words (69823 effective words) took 0.1s, 486677 effective words/s
2017-03-21 05:44:20,443 : INFO : precomputing L2-norms of word weight vectors
2017-03-21 05:44:20,453 : INFO : saving Word2Vec object under trump2vec.bin, separately None
2017-03-21 05:44:20,457 : INFO : not storing attribute syn0norm
2017-03-21 05:44:20,460 : INFO : not storing attribute cum_table
2017-03-21 05:44:20,473 : INFO : saved trump2vec.bin


In [33]:
# The model is trained - list the ten words it places closest to "hillary".
model.most_similar(positive=["hillary"], topn=10)

[('crooked', 0.9999712705612183),
 ('u', 0.9999687075614929),
 ('bad', 0.9999686479568481),
 ('people', 0.9999662041664124),
 ('one', 0.999965488910675),
 ('president', 0.9999635815620422),
 ('never', 0.9999634027481079),
 ('many', 0.999963104724884),
 ('would', 0.9999619722366333),
 ('big', 0.9999618530273438)]

In [34]:
# Ten nearest neighbours of "sad" in the embedding space.
model.most_similar(positive=["sad"], topn=10)

[('like', 0.9999418258666992),
 ('u', 0.9999414682388306),
 ('one', 0.9999412298202515),
 ('many', 0.9999408721923828),
 ('last', 0.999938428401947),
 ('obama', 0.9999377131462097),
 ('people', 0.9999376535415649),
 ('bad', 0.9999370574951172),
 ('must', 0.9999368786811829),
 ('crooked', 0.9999368190765381)]

In [35]:
# Ten nearest neighbours of "jobs" in the embedding space.
model.most_similar(positive=["jobs"], topn=10)

[('u', 0.9999639391899109),
 ('big', 0.999962568283081),
 ('many', 0.9999623894691467),
 ('great', 0.9999606013298035),
 ('people', 0.9999601244926453),
 ('trump', 0.9999594688415527),
 ('crooked', 0.9999594688415527),
 ('one', 0.9999594688415527),
 ('media', 0.9999585151672363),
 ('much', 0.999958336353302)]

In [54]:
# Show every row that has a missing value in at least one column.
df.loc[df.isnull().any(axis='columns')]

Unnamed: 0,truncated,text,is_quote_status,id,favorite_count,source,retweeted,retweet_count,favorited,lang,created_at,clean_text
16553,False,https://t.co/T5JBFXOz3F,False,669980142475845632,4481,iphone,False,1989,False,und,Thu Nov 26 20:44:48 +0000 2015,
18901,False,http://t.co/PtViAyrO4A,False,619646907468632064,4345,iphone,False,3027,False,und,Fri Jul 10 23:18:29 +0000 2015,
19450,False,There is. https://t.co/nCOUYoClDN,True,609388571951239168,47,web,False,17,False,en,Fri Jun 12 15:55:32 +0000 2015,
25306,False,https://t.co/ZQ0osiFEJQ,False,708351381678088192,10424,iphone,False,4923,False,und,Fri Mar 11 17:58:24 +0000 2016,
26004,False,https://t.co/SmTkLPiBYD,False,692171744845664258,18204,iphone,False,8278,False,und,Wed Jan 27 02:26:18 +0000 2016,
