### load libraries

In [13]:
import pandas as pd
import numpy as np
from gensim.models import word2vec
import logging
import nltk
from collections import Counter
import itertools
from nltk.corpus import stopwords

import os.path
DATA_PATH = '/home/sam/Hhd/twitter_sentiment/'

In [14]:
# def function to load data from json to dataframe
def load_data(file_name):
    print "Loading: " + file_name + " ..."
    data_path = '/home/sam/Hhd/twitter_sentiment/data/'
    data_df = pd.read_json(data_path + file_name, lines=True)
    # we only take the 'text' column
    drop_columns = list(data_df.columns)
    drop_columns.remove('text')
    data_df.drop(drop_columns, axis = 1, inplace = True)
    print "Done loading json file to dataframe."
    return data_df

In [15]:
df = load_data('positive.json')
df.dropna(axis=0, inplace=True) # drop na rows
df.head()

Loading: positive.json ...
Done loading json file to dataframe.


Unnamed: 0,text
0,Thanks fam!!! 😘😘😘 https://t.co/sbNzT886Vs
1,RT @PinGDP: Right back at you Ms Cassie :)\n\n...
2,@notonIyou also my kinda coming out as bi :) #...
3,RT @kevingschmidt: Thank you ktla5news for hav...
4,RT @watchdogsgame: Celebrate our DedSec member...


### pre-processing text

In [16]:
# remove new line char
df['text'].replace(regex=True,inplace=True,to_replace=r'\\n',value=r'')
# remove https links
df['text'].replace(regex=True,inplace=True,to_replace=r'(http|https):\/\/[^(\s|\b)]+',value=r'')
# remove user name
df['text'].replace(regex=True,inplace=True,to_replace=r'@\w+',value=r'')
# remove non-alphabet, this includes number and punctuation
df['text'].replace(regex=True,inplace=True,to_replace=r'[^a-zA-Z\s]',value=r'')
# tokenize each tweets to form sentences.
df['tokenized'] = df.apply(lambda row: nltk.word_tokenize(row['text'].lower()), axis=1)
# remove stop words
stop_words = stopwords.words('english')
add_stop_words = ['amp', 'rt']
stop_words += add_stop_words
print "sample stopping words: ", stop_words[:5]
df['tokenized'] = df['tokenized'].apply(lambda x: [item for item in x if item not in stop_words])
df.head(10)

sample stopping words:  [u'i', u'me', u'my', u'myself', u'we']


Unnamed: 0,text,tokenized
0,Thanks fam,"[thanks, fam]"
1,RT Right back at you Ms Cassie \n\n,"[right, back, ms, cassie]"
2,also my kinda coming out as bi issagaygirl,"[also, kinda, coming, bi, issagaygirl]"
3,RT Thank you ktlanews for having and I on to...,"[thank, ktlanews, talk, link]"
4,RT Celebrate our DedSec members on their birt...,"[celebrate, dedsec, members, birthdays, gtgt, ..."
5,will i see the Jack Johnson followes you in my...,"[see, jack, johnson, followes, notifications, ..."
6,sexy live cam show in \nLets get naughty por...,"[sexy, live, cam, show, lets, get, naughty, po..."
7,people have decided that Sunbleach is cool en...,"[people, decided, sunbleach, cool, enough, fol..."
8,My Goddess Marilyn,"[goddess, marilyn]"
9,Also quite funny to see that outside our droni...,"[also, quite, funny, see, outside, dronies, wo..."


### now let us bring in the wordvec trained using text8 dataset

In [28]:
# check if the model already exists, if so load it else train the model
def load_wordvec(sentences, model_name, size = 50):
    data_path = '/home/sam/Hhd/twitter_sentiment/'
    model_path = data_path + 'wordvec/' + model_name
    if os.path.isfile(model_path):
        print "Loading existing model {} ...".format(model_name)
        model = word2vec.Word2Vec.load(model_path)
    else:
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
        print "Training using {} ...".format(model_name)
        model = word2vec.Word2Vec(sentences, size=size, sg=1, workers=4)
        model.save(model_path)
        # If you’re finished training a model (=no more updates, only querying), you can do
        model.init_sims(replace=True)
    print "Done loading."
    return model

# sentences = word2vec.Text8Corpus(data_path + 'data/text8')
sentences = list(df['tokenized'])
vec_size = 50
# model = load_wordvec(sentences, 'text8.model.bin')
model = load_wordvec(sentences, 'tweets.model.bin')

2017-03-04 11:14:10,126 : INFO : loading Word2Vec object from /home/sam/Hhd/twitter_sentiment/wordvec/tweets.model.bin
2017-03-04 11:14:10,128 : INFO : loading wv recursively from /home/sam/Hhd/twitter_sentiment/wordvec/tweets.model.bin.wv.* with mmap=None
2017-03-04 11:14:10,128 : INFO : setting ignored attribute syn0norm to None
2017-03-04 11:14:10,128 : INFO : setting ignored attribute cum_table to None
2017-03-04 11:14:10,129 : INFO : loaded /home/sam/Hhd/twitter_sentiment/wordvec/tweets.model.bin


Loading existing model tweets.model.bin ...
Done loading.


### transform our tweets using vector representation

In [18]:
# first find the max length since that decides the padding
df['size'] = df['tokenized'].apply(lambda x: len(x))
print "max sentence length is: ", df['size'].max()

max sentence length is:  17


In [48]:
# initialize empty arry to fill with vector repsentation
n = tweet_tokens.shape[0]
m = df['size'].max()
n_absent = 0
tweet_vecs = np.zeros((n,m,vec_size))
vocabs = model.wv.vocab.keys()
for i in range(n):
    token_i = [x for x in tweet_tokens[i] if x in vocabs]
    m_i = len(token_i)
    if m_i == 0:
        n_absent += 1
    else:
        diff_i = abs(m_i - m)
        vecs_i = model[token_i]
        tweet_vecs[i] = np.lib.pad(vecs_i, ((0,diff_i),(0,0)), 'constant', constant_values=0)
print "Done converting tweets to vec!"
print "Total {} not in vocab.".format(n_absent)

Done converting tweets to vec!
Total 124 not in vocab.
