In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import tensorflow as tf
import word2vec
import re
from sklearn.linear_model import LogisticRegression

In [2]:
# train word2vec with sample word list dataset
# uncomment this to run training, takes a minute or so

# vecfile = '/home/claire/Documents/jobs/job-hunting/massively/text8'
# word2vec.word2vec(vecfile, '/home/claire/Documents/jobs/job-hunting/massively/text8.bin', size=100, verbose=True)

In [3]:
# load the vector model
model = word2vec.load('/home/claire/Documents/jobs/job-hunting/massively/text8.bin')

In [4]:
# import the sentiment data
df = pd.DataFrame.from_csv('/home/claire/Documents/jobs/job-hunting/massively/sentiment_analysis/dfe_happysad_utf.csv', header=0, sep=',', index_col=0)
df

Unnamed: 0_level_0,features,label
id_nfpu,Unnamed: 1_level_1,Unnamed: 2_level_1
1956967666,Layin n bed with a headache ughhhh...waitin o...,sadness
1956967696,Funeral ceremony...gloomy friday...,sadness
1956968487,"I should be sleep, but im not! thinking about ...",sadness
1956969035,@charviray Charlene my love. I miss you,sadness
1956969172,@kelcouch I'm sorry at least it's Friday?,sadness
1956970047,Ugh! I have to beat this stupid song to get to...,sadness
1956970424,@BrodyJenner if u watch the hills in london u ...,sadness
1956971077,The storm is here and the electricity is gone,sadness
1956971206,So sleepy again and it's not even that late. I...,sadness
1956971586,How are YOU convinced that I have always wante...,sadness


In [5]:
# split the data into test and train
df['split'] = np.random.randn(df.shape[0], 1)

msk = np.random.rand(len(df)) <= 0.7

df_train = df[msk]
df_test = df[~msk]

In [6]:
# split training and testing tweets by sadnesss and happiness
sad_train = df_train.loc[df_train['label'] == 'sadness']
happy_train = (df_train.loc[df_train['label'] == 'happiness'])

sad_test = df_test.loc[df_test['label'] == 'sadness']
happy_test = (df_test.loc[df_test['label'] == 'happiness'])

print(sad_train['features'])
print(sad_test['features'])

id_nfpu
1956967666    Layin n bed with a headache  ughhhh...waitin o...
1956967696                 Funeral ceremony...gloomy friday... 
1956969035             @charviray Charlene my love. I miss you 
1956969172           @kelcouch I'm sorry  at least it's Friday?
1956970047    Ugh! I have to beat this stupid song to get to...
1956970424    @BrodyJenner if u watch the hills in london u ...
1956971077       The storm is here and the electricity is gone 
1956971586    How are YOU convinced that I have always wante...
1956972359    so tired and i think i'm definitely going to g...
1956972557    @IsaacMascote  i'm sorry people are so rude to...
1956973598    Fudge.... Just BS'd that whole paper.... So ti...
1956976312    @ether_radio yeah :S i feel all funny cause i ...
1956982383    @maternitytees Aww  Onward and upwards now, ya...
1956982576    diesel yaris... 70mpg  so sad its not availabl...
1956982605    I want to buy this great album but unfortunate...
1956983171    Ok ... the passeng

In [7]:
# create numpy arrays
np_sad_train = sad_train.as_matrix(columns=df.columns[0:2])
np_happy_train = happy_train.as_matrix(columns=df.columns[0:2])

np_sad_test = sad_test.as_matrix(columns=df.columns[0:2])
np_happy_test = happy_test.as_matrix(columns=df.columns[0:2])

print(len(np_sad_test), len(np_happy_test))
print(len(np_sad_train), len(np_happy_train))

print(np_sad_train[0], np_sad_test[0])

1530 1532
3625 3675
['Layin n bed with a headache  ughhhh...waitin on your call...' 'sadness'] [ "I should be sleep, but im not! thinking about an old friend who I want. but he's married now. damn, &amp; he wants me 2! scandalous! "
 'sadness']


In [8]:
# set maximum sentence length and number of dimension for each word vector
maxSeqLength = df_train['features'].str.len().max()
numFiles = 10362

In [9]:
strip_special_chars = re.compile("[^A-Za-z0-9 ]+")

def cleanSentences(string):
    string = string.lower().replace("<br />", " ")
    return re.sub(strip_special_chars, "", string.lower())

def newline(string):
    newline_pattern = re.compile("[^\n]*\n")
    return re.findall(newline_pattern, string)

In [10]:
class indexedArray(np.ndarray):
    def __new__(cls, *args, **kwargs):
        return np.array(*args, **kwargs).view(indexedArray)
    def index(self, value):
        return np.where(self==value)

In [11]:
def vectorizeTrain(model):
    
    ids = np.zeros((numFiles, maxSeqLength), dtype='int32')
    vectors = indexedArray(model.vectors)
    fileCounter = 0
    
    for tweet in np_sad_train:
        print(tweet)
        indexCounter = 0
        cleaned = cleanSentences(tweet[0])
        split = cleaned.split()
        for word in split:
            try:
                index = vectors.index(model[word])
                tuples = [x[0] for x in index]
                ids[fileCounter][indexCounter] = tuples[0]
                print(ids[fileCounter][indexCounter])
            except KeyError:
                ids[fileCounter][indexCounter] = 399999 #Vector for unknown words
                print(ids[fileCounter][indexCounter])
            indexCounter = indexCounter + 1
            if indexCounter >= maxSeqLength:
                break
        fileCounter = fileCounter + 1
        if fileCounter == 10:
            break
            
    for tweet in np_happy_train:
        print(tweet)
        indexCounter = 0
        cleaned = cleanSentences(tweet[0])
        split = cleaned.split()
        for word in split:
            try:
                index = vectors.index(model[word])
                tuples = [x[0] for x in index]
                ids[fileCounter][indexCounter] = tuples[0]
                print(ids[fileCounter][indexCounter])
            except KeyError:
                ids[fileCounter][indexCounter] = 399999 #Vector for unknown words
                print(ids[fileCounter][indexCounter])
        indexCounter = indexCounter + 1
        if indexCounter >= maxSeqLength:
            break
        fileCounter = fileCounter + 1
        if fileCounter == 20:
            break
    return ids

In [12]:
ids = vectorizeTrain(model)
np.save('idsMatrix', ids)
ids = np.load('idsMatrix.npy')

['Layin n bed with a headache  ughhhh...waitin on your call...' 'sadness']
399999
157
4916
24
6
16030
399999
25
772
969
['Funeral ceremony...gloomy friday... ' 'sadness']
4131
399999
6460
['@charviray Charlene my love. I miss you ' 'sadness']
399999
399999
609
611
72
5148
207
["@kelcouch I'm sorry  at least it's Friday?" 'sadness']
399999
7766
14040
35
516
45
6460
['Ugh! I have to beat this stupid song to get to the next  rude!' 'sadness']
399999
72
39
7
3251
33
13147
584
7
689
7
1
479
16545
[ '@BrodyJenner if u watch the hills in london u will realise what tourture it is because were weeks and weeks late  i just watch itonlinelol'
 'sadness']
399999
88
146
3433
1
3112
5
339
146
97
22201
154
399999
27
11
125
40
1997
3
1997
329
72
325
3433
399999
['The storm is here and the electricity is gone ' 'sadness']
1
3828
11
622
3
1
2056
11
3113
[ 'How are YOU convinced that I have always wanted you? What signals did I give off...damn I think I just lost another friend '
 'sadness']
311
26
207
4

In [13]:
def vectorizeTest(model):
    
    test_ids = np.zeros((numFiles, maxSeqLength), dtype='int32')
    vectors = indexedArray(model.vectors)
    fileCounter = 0
    
    for tweet in np_sad_test:
        print(tweet)
        indexCounter = 0
        cleaned = cleanSentences(tweet[0])
        split = cleaned.split()
        for word in split:
            try:
                index = vectors.index(model[word])
                tuples = [x[0] for x in index]
                test_ids[fileCounter][indexCounter] = tuples[0]
                print(test_ids[fileCounter][indexCounter])
            except KeyError:
                test_ids[fileCounter][indexCounter] = 399999 #Vector for unknown words
                print(test_ids[fileCounter][indexCounter])
            indexCounter = indexCounter + 1
            if indexCounter >= maxSeqLength:
                break
        fileCounter = fileCounter + 1
        if fileCounter == 10:
            break
            
    for tweet in np_happy_test:
        print(tweet)
        indexCounter = 0
        cleaned = cleanSentences(tweet[0])
        split = cleaned.split()
        for word in split:
            try:
                index = vectors.index(model[word])
                tuples = [x[0] for x in index]
                test_ids[fileCounter][indexCounter] = tuples[0]
                print(test_ids[fileCounter][indexCounter])
            except KeyError:
                test_ids[fileCounter][indexCounter] = 399999 #Vector for unknown words
                print(test_ids[fileCounter][indexCounter])
        indexCounter = indexCounter + 1
        if indexCounter >= maxSeqLength:
            break
        fileCounter = fileCounter + 1
        if fileCounter == 20:
            break
    return test_ids

In [14]:
test_ids = vectorizeTest(model)
np.save('test_idsMatrix', test_ids)
test_ids = np.load('test_idsMatrix.npy')

[ "I should be sleep, but im not! thinking about an old friend who I want. but he's married now. damn, &amp; he wants me 2! scandalous! "
 'sadness']
72
280
32
4488
42
7766
38
2854
79
31
246
1620
57
72
2151
42
53181
882
163
23280
11826
36
6286
746
399999
21936
["So sleepy again and it's not even that late. I fail once again. "
 'sadness']
95
26008
431
3
45
38
139
20
329
72
5516
424
431
[ "I'm having a problem with my photo here in twitter amf!!!...can't see my face! "
 'sadness']
7766
384
6
599
24
609
3709
622
5
399999
399999
68
609
1249
['@Pokinatcha  in all honesty...pain   blech.' 'sadness']
399999
5
50
399999
399999
[ '@gcrush @nopantsdance i was just thinking about how excited i am for you guys to move, but then i realized how sad i am to see you go. '
 'sadness']
399999
399999
72
18
325
2854
79
311
10615
72
1226
14
207
12506
7
984
42
104
72
4715
311
11015
72
1226
7
68
207
686
['Allergies suck ducks nuts.     &lt;=====8@8=====&gt;' 'sadness']
28487
34161
12411
12036
399999
["*sigh

In [34]:
train_arrays = ids
train_labels = np.zeros(numFiles)

# create a numpy array of 0s and 1s for neg and pos

for i in range(len(np_sad_train)):
    train_labels[i] = 0
    train_labels[len(np_happy_train) + i] = 1

In [35]:
test_arrays = test_ids
test_labels = np.zeros(numFiles)

# create a numpy array of 0s and 1s for neg and pos

for i in range(len(np_sad_test)):
    test_labels[i] = 0
    test_labels[len(np_happy_test) + i] = 1

In [36]:
clf = LogisticRegression()
clf.fit(train_arrays, train_labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [37]:
clf.score(test_arrays, test_labels)

0.85224860065624397