In [1]:
import os
import re
import tempfile

import numpy as np
import pandas as pd
from six.moves import urllib
import tensorflow as tf
import word2vec

In [5]:
# train word2vec with sample word list dataset
# Training might take a minute or more, so it only runs when no trained model
# file exists yet; delete the .bin file to force a retrain.
# NOTE(review): an interrupted training run could leave a partial .bin behind --
# confirm the trainer only writes its output once training has finished.
vecfile = '/home/claire/Documents/jobs/job-hunting/massively/text8'
binfile = '/home/claire/Documents/jobs/job-hunting/massively/text8.bin'
if not os.path.exists(binfile):
    word2vec.word2vec(vecfile, binfile, size=100, verbose=True)
model = word2vec.load(binfile)

Starting training using file /home/claire/Documents/jobs/job-hunting/massively/text8


KeyboardInterrupt: 

In [2]:
# load the vector model
# Reads the binary file at the same text8.bin path that the training cell writes to,
# so the model is available without re-running training in this session.
model = word2vec.load('/home/claire/Documents/jobs/job-hunting/massively/text8.bin')

In [3]:
# import the sentiment data
# pd.DataFrame.from_csv was deprecated and later removed from pandas; read_csv with
# the same header / separator / index column arguments is the replacement.
# The first CSV column (the tweet id) becomes the index.
df = pd.read_csv('/home/claire/Documents/jobs/job-hunting/massively/sentiment_analysis/dfe_happysad_utf.csv', header=0, sep=',', index_col=0)
# Show a bounded preview rather than the whole frame.
df.head(10)

Unnamed: 0_level_0,features,label
id_nfpu,Unnamed: 1_level_1,Unnamed: 2_level_1
1956967666,Layin n bed with a headache ughhhh...waitin o...,sadness
1956967696,Funeral ceremony...gloomy friday...,sadness
1956968487,"I should be sleep, but im not! thinking about ...",sadness
1956969035,@charviray Charlene my love. I miss you,sadness
1956969172,@kelcouch I'm sorry at least it's Friday?,sadness
1956970047,Ugh! I have to beat this stupid song to get to...,sadness
1956970424,@BrodyJenner if u watch the hills in london u ...,sadness
1956971077,The storm is here and the electricity is gone,sadness
1956971206,So sleepy again and it's not even that late. I...,sadness
1956971586,How are YOU convinced that I have always wante...,sadness


In [4]:
# split the data into test and train (roughly 70% train / 30% test)
# A dedicated, seeded generator makes the split repeatable after a kernel restart
# instead of producing different train/test rows on every run.
split_rng = np.random.RandomState(42)

# One random value per row, as a 1-D array so it maps cleanly onto a single column.
# NOTE(review): nothing visible in this section reads 'split'; kept in case later cells do.
df['split'] = split_rng.randn(df.shape[0])

# Boolean mask: True rows go to the training set, False rows to the test set.
msk = split_rng.rand(len(df)) <= 0.7

df_train = df[msk]
df_test = df[~msk]

In [79]:
# split training and testing tweets by sadness and happiness
# The happy subsets are truncated to the size of the matching sad subset so both
# classes have the same number of rows. The cap is derived from the data rather
# than hard-coded, because the sad counts change whenever the random split changes.
sad_train = df_train.loc[df_train['label'] == 'sadness']
happy_train = df_train.loc[df_train['label'] == 'happiness'].head(len(sad_train))

sad_test = df_test.loc[df_test['label'] == 'sadness']
happy_test = df_test.loc[df_test['label'] == 'happiness'].head(len(sad_test))

In [80]:
# create numpy arrays from the first two columns of each subset
# DataFrame.as_matrix(columns=...) has been removed from pandas; selecting the
# columns and taking .values yields the same 2-D array.
first_two_columns = df.columns[0:2]

np_sad_train = sad_train[first_two_columns].values
np_happy_train = happy_train[first_two_columns].values

np_sad_test = sad_test[first_two_columns].values
np_happy_test = happy_test[first_two_columns].values

# Sanity check: row counts per class for the test and train subsets.
print(len(np_sad_test), len(np_happy_test))
print(len(np_sad_train), len(np_happy_train))

1543 1543
3612 3612


In [68]:
# set the number of columns and rows used when allocating the id matrix
# maxSeqLength: length in characters (str.len) of the longest training tweet. vectorize()
# uses it as the per-tweet cap on word positions; a tweet cannot have more words than
# characters, so this is an upper bound rather than the exact maximum word count.
# numFiles: number of rows allocated for the id matrix and the label vectors.
# NOTE(review): 10362 is hard-coded and does not obviously match the subset sizes
# printed above -- confirm where it comes from.
maxSeqLength = df_train['features'].str.len().max()
numFiles = 10362

In [69]:
# Matches every run of characters that are not ASCII letters, digits or spaces.
strip_special_chars = re.compile("[^A-Za-z0-9 ]+")

def cleanSentences(string):
    """Lower-case a sentence and strip everything but letters, digits and spaces.

    HTML line breaks ("<br />") are turned into a single space first, so the
    words on either side of them stay separated; all other special characters
    are removed outright.
    """
    lowered = string.lower()
    without_breaks = lowered.replace("<br />", " ")
    return strip_special_chars.sub("", without_breaks)

def newline(string):
    """Return every newline-terminated line of `string`, newline included.

    Text after the last newline character is not part of the result.
    """
    return re.compile("[^\n]*\n").findall(string)

In [70]:
class indexedArray(np.ndarray):
    """ndarray subclass that adds a list-like `index` lookup."""

    def __new__(cls, *args, **kwargs):
        # Build a regular array from the given arguments, then re-view it as this class.
        plain = np.array(*args, **kwargs)
        return plain.view(indexedArray)

    def index(self, value):
        """Return the positions where the array equals `value`.

        The comparison is element-wise (with broadcasting), and the result is the
        tuple of index arrays, one per dimension, produced by `np.where`.
        """
        matches = self == value
        return np.where(matches)

In [71]:
def vectorize(model, max_per_class=10, unknown_id=399999, verbose=True):
    """Encode the sad and happy training tweets as rows of word ids.

    Each tweet from `np_sad_train`, then `np_happy_train`, is cleaned with
    `cleanSentences`, split on whitespace, and every word is replaced by the row
    index of its vector in `model.vectors`. Row r of the result holds the ids of
    the r-th encoded tweet; unused positions stay 0.

    Parameters
    ----------
    model : object
        Must support `model[word]` (raising KeyError for unknown words) and
        expose the 2-D array `model.vectors`.
    max_per_class : int or None, optional
        Encoding of the sad tweets stops once this many rows are filled, and
        encoding of the happy tweets stops once twice this many rows are filled.
        The default of 10 keeps the cap this function has always had.
        NOTE(review): that cap looks like a leftover debugging limit -- pass None
        to encode every tweet.
    unknown_id : int, optional
        Id stored for words the model does not know.
        NOTE(review): 399999 is not derived from this model's vocabulary --
        confirm it cannot collide with a real row index.
    verbose : bool, optional
        When True (the default, matching earlier behaviour) print every tweet and
        every id as it is written.

    Returns
    -------
    numpy.ndarray
        int32 array of shape (numFiles, maxSeqLength).

    Raises
    ------
    IndexError
        If more tweets are encoded than the matrix has rows.
    """
    ids = np.zeros((numFiles, maxSeqLength), dtype='int32')
    vectors = np.asarray(model.vectors)
    id_cache = {}  # word -> id, so each distinct word scans the vector table once

    def _word_id(word):
        # Row index of the word's vector in model.vectors, or unknown_id.
        if word not in id_cache:
            try:
                vector = model[word]
            except KeyError:
                id_cache[word] = unknown_id
            else:
                # Require the whole row to match. Taking the first element-wise
                # match can pick an earlier row that merely shares one value.
                matching_rows = np.flatnonzero((vectors == vector).all(axis=1))
                id_cache[word] = matching_rows[0]
        return id_cache[word]

    def _encode_tweets(tweets, row, stop_row):
        # Fill ids[row:] from tweets until stop_row is reached; return the next free row.
        for tweet in tweets:
            if row >= ids.shape[0]:
                raise IndexError(
                    'more tweets than rows in the id matrix (numFiles=%d)' % ids.shape[0])
            if verbose:
                print(tweet)
            words = cleanSentences(tweet[0]).split()
            # Words beyond maxSeqLength are dropped. The position advances once per
            # word for both classes, so words never overwrite each other.
            for position, word in enumerate(words[:maxSeqLength]):
                ids[row][position] = _word_id(word)
                if verbose:
                    print(ids[row][position])
            row += 1
            if row == stop_row:
                break
        return row

    sad_stop = max_per_class
    happy_stop = None if max_per_class is None else 2 * max_per_class

    next_row = _encode_tweets(np_sad_train, 0, sad_stop)
    _encode_tweets(np_happy_train, next_row, happy_stop)
    return ids

In [72]:
# Build the id matrix: one row per encoded tweet, one column per word position.
ids = vectorize(model)

['Layin n bed with a headache  ughhhh...waitin on your call...' 'sadness']
399999
157
4916
24
6
16030
399999
25
772
969
[ "I should be sleep, but im not! thinking about an old friend who I want. but he's married now. damn, &amp; he wants me 2! scandalous! "
 'sadness']
72
280
32
4488
42
7766
38
2854
79
31
246
1620
57
72
2151
42
53181
882
163
23280
11826
36
6286
746
399999
21936
['@charviray Charlene my love. I miss you ' 'sadness']
399999
399999
609
611
72
5148
207
['Ugh! I have to beat this stupid song to get to the next  rude!' 'sadness']
399999
72
39
7
3251
33
13147
584
7
689
7
1
479
16545
['The storm is here and the electricity is gone ' 'sadness']
1
3828
11
622
3
1
2056
11
3113
[ 'How are YOU convinced that I have always wanted you? What signals did I give off...damn I think I just lost another friend '
 'sadness']
311
26
207
4767
20
72
39
522
1783
207
154
2943
216
72
791
399999
72
1811
72
325
543
167
1620
[ "@IsaacMascote  i'm sorry people are so rude to you, isaac, they should g

In [73]:
# Write the id matrix to disk and read it straight back.
# np.save appends the .npy extension, which is why the load uses 'idsMatrix.npy'.
# Presumably this lets a later session reload the matrix instead of re-running vectorize.
np.save('idsMatrix', ids)
ids = np.load('idsMatrix.npy')

In [74]:
# Training features are the rows of the id matrix.
train_arrays = ids

# create a numpy array of 0s and 1s for neg and pos
# 0 = sadness, 1 = happiness. vectorize() encodes the sad tweets first and the happy
# tweets after them, so (assuming every training tweet is encoded) the happy rows
# start right after the last sad row -- at len(np_sad_train), not len(np_happy_train).
n_sad_train = len(np_sad_train)
n_happy_train = len(np_happy_train)
if n_sad_train + n_happy_train > numFiles:
    raise IndexError('numFiles (%d) is smaller than the number of training tweets (%d)'
                     % (numFiles, n_sad_train + n_happy_train))

train_labels = np.zeros(numFiles)
train_labels[n_sad_train:n_sad_train + n_happy_train] = 1

3612 3612
7224
14548


In [None]:
# NOTE(review): `ids` is built by vectorize(), which only iterates np_sad_train and
# np_happy_train, so test_arrays currently aliases the *training* features. The test
# tweets still need their own id matrix.
test_arrays = ids

# create a numpy array of 0s and 1s for neg and pos
# 0 = sadness, 1 = happiness, sad rows first and happy rows directly after them.
# The labels are written into test_labels; writing them into train_labels would
# overwrite the training labels and leave test_labels all zeros.
n_sad_test = len(np_sad_test)
n_happy_test = len(np_happy_test)
if n_sad_test + n_happy_test > numFiles:
    raise IndexError('numFiles (%d) is smaller than the number of test tweets (%d)'
                     % (numFiles, n_sad_test + n_happy_test))

test_labels = np.zeros(numFiles)
test_labels[n_sad_test:n_sad_test + n_happy_test] = 1

In [75]:
from sklearn.linear_model import LogisticRegression