### load libraries

In [1]:
import pandas as pd
import numpy as np
from gensim.models import word2vec
import logging
import nltk
from collections import Counter
import itertools
from nltk.corpus import stopwords

import os.path
FILE_PATH = '/home/sam/Hhd/twitter_sentiment/'
# FILE_PATH = '/home/sam/Data/twitter_sentiment/'

In [2]:
# def function to load data from json to dataframe
def load_data(file_name):
    print "Loading: " + file_name + " ..."
    data_path = FILE_PATH + 'data/'
    data_df = pd.read_json(data_path + file_name, lines=True)
    # we only take the 'text' column
    drop_columns = list(data_df.columns)
    drop_columns.remove('text')
    data_df.drop(drop_columns, axis = 1, inplace = True)
    print "Done loading json file to dataframe."
    return data_df

In [3]:
df_pos = load_data('positive.json')
df_pos.dropna(axis=0, inplace=True) # drop na rows
print df_pos.head()
df_neg = load_data('negative.json')
df_neg.dropna(axis=0, inplace=True) # drop na rows
print df_neg.head()

Loading: positive.json ...
Done loading json file to dataframe.
                                                text
0          Thanks fam!!! 😘😘😘 https://t.co/sbNzT886Vs
1  RT @PinGDP: Right back at you Ms Cassie :)\n\n...
2  @notonIyou also my kinda coming out as bi :) #...
3  RT @kevingschmidt: Thank you ktla5news for hav...
4  RT @watchdogsgame: Celebrate our DedSec member...
Loading: negative.json ...
Done loading json file to dataframe.
                                                text
0                 I loved it https://t.co/nhezqTqCcc
1                     I want to get my nails done :(
2  RT @BucamanWWE: This is the last were seeing t...
3  @TheDauntingFray // Lucky I've never done that...
4                 @JonahBonahh I love her so much :(


### pre-processing text

In [4]:
def pre_process(df):
    # remove new line char
    df['text'].replace(regex=True,inplace=True,to_replace=r'\\n',value=r'')
    # remove https links
    df['text'].replace(regex=True,inplace=True,to_replace=r'(http|https):\/\/[^(\s|\b)]+',value=r'')
    # remove user name
    df['text'].replace(regex=True,inplace=True,to_replace=r'@\w+',value=r'')
    # remove non-alphabet, this includes number and punctuation
    df['text'].replace(regex=True,inplace=True,to_replace=r'[^a-zA-Z\s]',value=r'')
    # tokenize each tweets to form sentences.
    df['tokenized'] = df.apply(lambda row: nltk.word_tokenize(row['text'].lower()), axis=1)
    # remove stop words
    stop_words = stopwords.words('english')
    add_stop_words = ['amp', 'rt']
    stop_words += add_stop_words
    print "sample stopping words: ", stop_words[:5]
    df['tokenized'] = df['tokenized'].apply(lambda x: [item for item in x if item not in stop_words])
    print df.head(5)
#     return df
pre_process(df_pos)
pre_process(df_neg)

sample stopping words:  [u'i', u'me', u'my', u'myself', u'we']
                                                text  \
0                                       Thanks fam     
1         RT  Right back at you Ms Cassie \n\n         
2        also my kinda coming out as bi  issagaygirl   
3  RT  Thank you ktlanews for having  and I on to...   
4  RT  Celebrate our DedSec members on their birt...   

                                           tokenized  
0                                      [thanks, fam]  
1                          [right, back, ms, cassie]  
2             [also, kinda, coming, bi, issagaygirl]  
3                      [thank, ktlanews, talk, link]  
4  [celebrate, dedsec, members, birthdays, gtgt, ...  
sample stopping words:  [u'i', u'me', u'my', u'myself', u'we']
                                                text  \
0                                        I loved it    
1                       I want to get my nails done    
2  RT  This is the last were seeing the

### now let us bring in the wordvec trained using text8 dataset

In [5]:
# check if the model already exists, if so load it else train the model
def build_wordvec(sentences, model_name, size = 200):
    model_path = FILE_PATH + 'wordvec/' + model_name
    if os.path.isfile(model_path):
        print "Loading existing model {} ...".format(model_name)
        model = word2vec.Word2Vec.load(model_path)
    else:
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
        print "Training for {} ...".format(model_name)
        model = word2vec.Word2Vec(sentences, size=size, sg=1, workers=4)
        model.save(model_path)
        # If you’re finished training a model (=no more updates, only querying), you can do
        model.init_sims(replace=True)
    print "Done building."
    return model

sentences = word2vec.Text8Corpus(FILE_PATH + 'data/text8')              # use text 8
# sentences = list(df_pos['tokenized']) + list(df_neg['tokenized'])         # use just tweets itself
vec_size = 800
# model = load_wordvec(sentences, 'text8.model.bin', size = vec_size)
model = build_wordvec(sentences, 'tweets.model.bin', size = vec_size)

2017-03-06 21:18:55,166 : INFO : collecting all words and their counts
2017-03-06 21:18:55,168 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types


Training for tweets.model.bin ...


2017-03-06 21:18:58,739 : INFO : collected 253854 word types from a corpus of 17005207 raw words and 1701 sentences
2017-03-06 21:18:58,740 : INFO : Loading a fresh vocabulary
2017-03-06 21:18:59,019 : INFO : min_count=5 retains 71290 unique words (28% of original 253854, drops 182564)
2017-03-06 21:18:59,019 : INFO : min_count=5 leaves 16718844 word corpus (98% of original 17005207, drops 286363)
2017-03-06 21:18:59,137 : INFO : deleting the raw counts dictionary of 253854 items
2017-03-06 21:18:59,159 : INFO : sample=0.001 downsamples 38 most-common words
2017-03-06 21:18:59,160 : INFO : downsampling leaves estimated 12506280 word corpus (74.8% of prior 16718844)
2017-03-06 21:18:59,161 : INFO : estimated required memory for 71290 words and 800 dimensions: 491901000 bytes
2017-03-06 21:18:59,311 : INFO : resetting layer weights
2017-03-06 21:19:00,208 : INFO : training model with 4 workers on 71290 vocabulary and 800 features, using sg=1 hs=0 sample=0.001 negative=5 window=5
2017-03-

Done building.


### transform our tweets using vector representation

In [6]:
# first find the max length since that decides the padding
def max_len(df):
    df['size'] = df['tokenized'].apply(lambda x: len(x))
    print "max sentence length is: ", df['size'].max()
    return df['size'].max()
max_total = max(max_len(df_pos), max_len(df_neg))

max sentence length is:  17
max sentence length is:  20


In [7]:
# initialize empty arry to fill with vector repsentation
def convert2vec(df, max_total):
    tweet_tokens = df['tokenized']
    n = tweet_tokens.shape[0]
    m = max_total
    n_absent = 0
    tweet_vecs = np.zeros((n,m,vec_size))
    vocabs = model.wv.vocab.keys()
    for i in range(n):
        token_i = [x for x in tweet_tokens[i] if x in vocabs]
        m_i = len(token_i)
        if m_i == 0:
            n_absent += 1
        else:
            diff_i = abs(m_i - m)
            vecs_i = model[token_i]
            tweet_vecs[i] = np.lib.pad(vecs_i, ((0,diff_i),(0,0)), 'constant', constant_values=0)
    print "Done converting tweets to vec!"
    print "Total {} not in vocab.".format(n_absent)
    return tweet_vecs


pos_vecs = convert2vec(df_pos, max_total)
neg_vecs = convert2vec(df_neg, max_total)

Done converting tweets to vec!
Total 71 not in vocab.
Done converting tweets to vec!
Total 81 not in vocab.


In [8]:
# save tweet_vecs to disk in npy
def save_vec(tweet_vecs, name):
    file_name = FILE_PATH + name
    if os.path.isfile(file_name + '.npy') and os.path.isfile(file_name + '.npz'):
        print "npy already exists."
    else:
        np.save(file_name, tweet_vecs)
        np.savez(file_name, tweet_vecs)
        print "Saved {} to disk.".format(name)
save_vec(pos_vecs, 'pos')
save_vec(neg_vecs, 'neg')

Saved pos to disk.
Saved neg to disk.
