### load libraries

In [1]:
import pandas as pd
import numpy as np
from gensim.models import word2vec
import logging
import nltk
from collections import Counter
import itertools
from nltk.corpus import stopwords

import os.path

In [2]:
def load_data(file_name, data_path='./data/'):
    """Load a JSON-lines file into a DataFrame holding only its 'text' column.

    Parameters
    ----------
    file_name : str
        Name of the JSON-lines file (one JSON object per line).
    data_path : str, optional
        Directory that contains ``file_name``. Defaults to ``'./data/'``.

    Returns
    -------
    pandas.DataFrame
        A frame whose only column is ``'text'``. Rows are not filtered here,
        so missing values are left for the caller to handle.

    Raises
    ------
    ValueError
        If the loaded data has no ``'text'`` column.
    """
    # A parenthesised single-argument print behaves the same on Python 2 and 3.
    print("Loading: " + file_name + " ...")
    data_df = pd.read_json(os.path.join(data_path, file_name), lines=True)
    if 'text' not in data_df.columns:
        raise ValueError("no 'text' column found in " + file_name)
    # we only take the 'text' column
    drop_columns = [column for column in data_df.columns if column != 'text']
    data_df = data_df.drop(drop_columns, axis=1)
    print("Done loading json file to dataframe.")
    return data_df

In [3]:
# read the positive tweets and discard every row that has a missing value
df = load_data('positive.json').dropna(axis=0)
df.head()

Loading: positive.json ...
Done loading json file to dataframe.


Unnamed: 0,text
0,Thanks fam!!! 😘😘😘 https://t.co/sbNzT886Vs
1,RT @PinGDP: Right back at you Ms Cassie :)\n\n...
2,@notonIyou also my kinda coming out as bi :) #...
3,RT @kevingschmidt: Thank you ktla5news for hav...
4,RT @watchdogsgame: Celebrate our DedSec member...


### pre-processing text

In [4]:
# Every cleaning step assigns its result back to the column: calling
# `df['text'].replace(..., inplace=True)` mutates a temporary Series, which
# pandas with copy-on-write does not propagate back to `df`.

# replace new line chars with a space so words on adjacent lines stay apart
# (the previous pattern r'\\n' matched a literal backslash followed by 'n',
# so real newlines were never removed)
df['text'] = df['text'].replace(to_replace=r'\n', value=' ', regex=True)
# remove http/https links: the scheme plus everything up to the next whitespace
# (the previous class [^(\s|\b)] also stopped at '(', ')' and '|', which left
# URL fragments behind)
df['text'] = df['text'].replace(to_replace=r'https?://\S+', value='', regex=True)
# remove user name
df['text'] = df['text'].replace(to_replace=r'@\w+', value='', regex=True)
# remove non-alphabet, this includes number and punctuation
df['text'] = df['text'].replace(to_replace=r'[^a-zA-Z\s]', value='', regex=True)
# tokenize each (lower-cased) tweet to form sentences
df['tokenized'] = df['text'].apply(lambda text: nltk.word_tokenize(text.lower()))
# remove stop words
stop_words = stopwords.words('english')
add_stop_words = ['amp', 'rt']
stop_words += add_stop_words
# A parenthesised single-argument print behaves the same on Python 2 and 3;
# the two spaces reproduce the output of the original print statement.
print("sample stopping words:  " + str(stop_words[:5]))
# a set gives constant-time membership tests while filtering every token
stop_word_set = set(stop_words)
df['tokenized'] = df['tokenized'].apply(
    lambda tokens: [token for token in tokens if token not in stop_word_set])
df.head()

sample stopping words:  [u'i', u'me', u'my', u'myself', u'we']


Unnamed: 0,text,tokenized
0,Thanks fam,"[thanks, fam]"
1,RT Right back at you Ms Cassie \n\n,"[right, back, ms, cassie]"
2,also my kinda coming out as bi issagaygirl,"[also, kinda, coming, bi, issagaygirl]"
3,RT Thank you ktlanews for having and I on to...,"[thank, ktlanews, talk, link]"
4,RT Celebrate our DedSec members on their birt...,"[celebrate, dedsec, members, birthdays, gtgt, ..."


### now let us load the word2vec model trained on the text8 dataset (training it first if no saved model exists)

In [6]:
# check if the model already exists, if so load it else train the model
def load_wordvec(model_path='./wordvec/text.model.bin', corpus_path='./data/text8'):
    """Return a word2vec model, loading it from disk or training it if absent.

    Parameters
    ----------
    model_path : str, optional
        File the model is loaded from when it exists, and saved to after
        training otherwise. Defaults to ``'./wordvec/text.model.bin'``.
    corpus_path : str, optional
        Corpus file handed to ``word2vec.Text8Corpus`` when training is
        needed. Defaults to ``'./data/text8'``.

    Returns
    -------
    The loaded or freshly trained ``word2vec.Word2Vec`` model.
    """
    if os.path.isfile(model_path):
        # A parenthesised single-argument print behaves the same on Python 2 and 3.
        print("Loading existing model ...")
        model = word2vec.Word2Vec.load(model_path)
    else:
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
        sentences = word2vec.Text8Corpus(corpus_path)
        print("Training using Text8 ...")
        model = word2vec.Word2Vec(sentences, size=500, workers=4)
        # Save to the very path that is checked above; previously the model was
        # written to 'text.model.bin' in the working directory, so the check
        # never succeeded and the model was retrained on every run.
        model_dir = os.path.dirname(model_path)
        if model_dir and not os.path.isdir(model_dir):
            os.makedirs(model_dir)
        model.save(model_path)
        # If you're finished training a model (= no more updates, only querying), you can do
        # this; it runs after save(), so the file on disk holds the model as trained.
        model.init_sims(replace=True)
    print("Done loading.")
    return model
model = load_wordvec()

2017-03-02 21:21:23,331 : INFO : collecting all words and their counts
2017-03-02 21:21:23,334 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types


Training using Text8 ...


2017-03-02 21:21:28,711 : INFO : collected 253854 word types from a corpus of 17005207 raw words and 1701 sentences
2017-03-02 21:21:28,711 : INFO : Loading a fresh vocabulary
2017-03-02 21:21:29,090 : INFO : min_count=5 retains 71290 unique words (28% of original 253854, drops 182564)
2017-03-02 21:21:29,090 : INFO : min_count=5 leaves 16718844 word corpus (98% of original 17005207, drops 286363)
2017-03-02 21:21:29,259 : INFO : deleting the raw counts dictionary of 253854 items
2017-03-02 21:21:29,279 : INFO : sample=0.001 downsamples 38 most-common words
2017-03-02 21:21:29,280 : INFO : downsampling leaves estimated 12506280 word corpus (74.8% of prior 16718844)
2017-03-02 21:21:29,280 : INFO : estimated required memory for 71290 words and 500 dimensions: 320805000 bytes
2017-03-02 21:21:29,528 : INFO : resetting layer weights
2017-03-02 21:21:30,594 : INFO : training model with 4 workers on 71290 vocabulary and 500 features, using sg=0 hs=0 sample=0.001 negative=5 window=5
2017-03-

In [7]:
# sanity check of the embeddings: ask for the single closest word when
# 'woman' and 'king' count positively and 'man' counts negatively
model.most_similar(
    negative=['man'],
    positive=['woman', 'king'],
    topn=1
)

[(u'queen', 0.625677227973938)]