In [55]:
import pandas as pd
import numpy as np

# In this notebook we train a Word2Vec embedding model on the clean_text column

In [56]:
# Load the pre-cleaned tweets; the first CSV row supplies the column names.
clean_tweets_path = "./clean_data/clean_tweets.csv"
df = pd.read_csv(clean_tweets_path, header=0)

In [57]:
# Preview the cleaned tweet text that the model will be trained on.
df.loc[:, 'clean_text']

0        trump international tower chicago ranked th ta...
1                     wishing happy bountiful thanksgiving
2        donald trump partners tv new reality series en...
3        hear donald trump discuss big gov spending ban...
4        watch video ivanka trump sharing business advi...
5        read donald trump say daughter ivanka upcoming...
6        lot people imagination execute execute imagina...
7                   read donald trump top ten tips success
8        hysterical dsrl videos featuring donald trump ...
9        donald trump bids buy oreo double stuf racing ...
10       reminder miss universe competition live bahama...
11       watch miss universe competition live bahamas s...
12       watch donald trump recent appearance late show...
13       ivanka twitter follow @ivankatrump terrific we...
14       browse donald trump summer reading list busine...
15       check list donald trump books summer reading t...
16       congrats winners around world entered think li.

In [59]:
# Train a Word2Vec embedding on the cleaned tweets and save it to disk.
import logging

from gensim.models import word2vec

# Configure the built-in logging module so that Word2Vec reports its
# progress as readable INFO messages.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

# Set values for various parameters
num_features = 300    # Word vector dimensionality
min_word_count = 10   # Minimum word count
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size
downsampling = 1e-3   # Downsample setting for frequent words

# One list of tokens per tweet. Missing values are dropped first: pandas
# reads an empty CSV cell as a float NaN, which has no .split() method.
sentences = [text.split() for text in df['clean_text'].dropna()]

# Initialize and train the model (this will take some time).
# print(...) with a single argument behaves the same on Python 2 and 3.
print("Training model...")
# NOTE(review): `size` is the keyword used by the gensim release this
# notebook was written against; gensim >= 4.0 renamed it to `vector_size`.
model = word2vec.Word2Vec(
    sentences,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling)

# init_sims(replace=True) normalises the word vectors in place and drops the
# un-normalised copies, which saves memory. Only do this when the model will
# not be trained any further.
model.init_sims(replace=True)

# It can be helpful to create a meaningful model name and
# save the model for later use. You can load it later using Word2Vec.load()
model_name = "trump2vec.bin"
model.save(model_name)

Training model...


2017-03-21 06:32:40,075 : INFO : collecting all words and their counts
2017-03-21 06:32:40,088 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-03-21 06:32:40,114 : INFO : PROGRESS: at sentence #10000, processed 91795 words, keeping 14197 word types
2017-03-21 06:32:40,144 : INFO : PROGRESS: at sentence #20000, processed 188646 words, keeping 23890 word types
2017-03-21 06:32:40,165 : INFO : collected 28314 word types from a corpus of 254373 raw words and 26745 sentences
2017-03-21 06:32:40,167 : INFO : Loading a fresh vocabulary
2017-03-21 06:32:40,198 : INFO : min_count=10 retains 3220 unique words (11% of original 28314, drops 25094)
2017-03-21 06:32:40,200 : INFO : min_count=10 leaves 207305 word corpus (81% of original 254373, drops 47068)
2017-03-21 06:32:40,258 : INFO : deleting the raw counts dictionary of 28314 items
2017-03-21 06:32:40,260 : INFO : sample=0.001 downsamples 43 most-common words
2017-03-21 06:32:40,262 : INFO : downsampling leaves

In [60]:
# The model is trained, so start exploring it: list the words whose vectors
# are most similar to "hillary", as (word, similarity score) pairs.
model.most_similar(positive=["hillary"])

[('clinton', 0.9656050801277161),
 ('obama', 0.9429252743721008),
 ('china', 0.938691258430481),
 ('bad', 0.9173334836959839),
 ('@barackobama', 0.9101584553718567),
 ('money', 0.886329174041748),
 ('obamacare', 0.8773565888404846),
 ('must', 0.872538685798645),
 ('said', 0.8722615838050842),
 ('crooked', 0.8615908622741699)]

In [61]:
# Words whose vectors are most similar to "sad", as (word, similarity score) pairs.
model.most_similar(positive=["sad"])

[('replace', 0.9992004036903381),
 ('gave', 0.9988620281219482),
 ('lost', 0.998856246471405),
 ('ago', 0.9988492727279663),
 ('high', 0.9983710050582886),
 ('dumb', 0.9980428218841553),
 ('delay', 0.997380256652832),
 ('@nytimes', 0.9966719150543213),
 ('taken', 0.9966075420379639),
 ('dishonest', 0.9964410066604614)]

In [62]:
# Words whose vectors are most similar to "jobs", as (word, similarity score) pairs.
model.most_similar(positive=["jobs"])

[('must', 0.986316978931427),
 ('us', 0.9856156706809998),
 ('stop', 0.981418251991272),
 ('never', 0.9735686779022217),
 ('wants', 0.9718760848045349),
 ('deal', 0.9709449410438538),
 ('china', 0.9703025221824646),
 ('u', 0.9645163416862488),
 ('obama', 0.962763786315918),
 ('take', 0.9567989706993103)]

In [58]:
# Sanity check: show every row that has a missing value in any column.
has_missing_value = df.isnull().any(axis=1)
df[has_missing_value]

Unnamed: 0,truncated,text,is_quote_status,id,favorite_count,source,retweeted,retweet_count,favorited,lang,created_at,clean_text
