## Setup data

In [1]:
import pandas as pd

In [2]:
PATH = "data/sentiment_data/"

In [3]:
raw = pd.read_csv(f'{PATH}training.csv', 
                  low_memory=False, 
                  encoding='latin1', 
                  header=None, 
                  names=['sentiment', 'tweet_id', 'date', 'no_query', 'user', 'tweet'])

In [4]:
raw.head()

Unnamed: 0,sentiment,tweet_id,date,no_query,user,tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [5]:
raw.count()

sentiment    1600000
tweet_id     1600000
date         1600000
no_query     1600000
user         1600000
tweet        1600000
dtype: int64

In [6]:
raw['sentiment'] = raw['sentiment'].astype('category')

In [7]:
raw['sentiment'].describe()

count     1600000
unique          2
top             4
freq       800000
Name: sentiment, dtype: int64

In [8]:
raw.sentiment.unique()

[0, 4]
Categories (2, int64): [0, 4]

In [9]:
raw.sentiment.replace(to_replace={0: 0, 4: 1}, inplace=True)

In [10]:
raw.sentiment = raw.sentiment.astype('category')

In [11]:
raw.sentiment.unique()

[0, 1]
Categories (2, int64): [0, 1]

In [12]:
raw.drop(['date', 'no_query', 'user'], axis=1, inplace=True)

In [13]:
raw.head()

Unnamed: 0,sentiment,tweet_id,tweet
0,0,1467810369,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,is upset that he can't update his Facebook by ...
2,0,1467810917,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,my whole body feels itchy and like its on fire
4,0,1467811193,"@nationwideclass no, it's not behaving at all...."


In [14]:
raw = raw.sample(frac=1).reset_index(drop=True)

In [15]:
raw.head()

Unnamed: 0,sentiment,tweet_id,tweet
0,0,2050293907,@sabeeh90 boy stuff...i'd rather not discuss d...
1,0,1881730471,has a poorle baby to look after
2,0,1973760332,Ahhh twits I'm charging my phone in the bathro...
3,0,2064532491,I feel icky Exams tomorrow.
4,0,2180351236,"aww, man! I just realized the Moaning Myrtles ..."


## Preprocess data

In [16]:
tweets = raw.tweet.copy()
type(tweets)

pandas.core.series.Series

In [17]:
import numpy as np
labels = np.array(raw.sentiment.copy())
labels.shape

(1600000,)

In [18]:
raw = None

In [19]:
for i, t in enumerate(tweets):
    if i % 100000 == 0: print(i)
    tweets[i] = t.encode('latin1').decode('utf-8', errors='ignore')

0
100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000
1300000
1400000
1500000


In [20]:
from tep.tweetPreprocessor import tokenize
tokenized_texts = []
for t in tweets:
    tokenized_texts.append(tokenize(t))
tweets = None

In [21]:
from tep.utils import save_array, save_as_text
save_array(labels, f'{PATH}labels.bc')
save_as_text(tokenized_texts, f'{PATH}tweets.txt')

## Create word embedding

In [22]:
from tep.embeddingGenerator import EmbeddingGenerator
eg = EmbeddingGenerator()

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [23]:
word_index = eg.generate_word_index(tokenized_texts)
len(word_index)

432240

In [24]:
seq_32 = eg.generate_sequences(texts=tokenized_texts, maxlen=32)
seq_48 = eg.generate_sequences(texts=tokenized_texts, maxlen=48)
print(seq_32.shape)
print(seq_48.shape)

(1600000, 32)
(1600000, 48)


In [25]:
seq_32[:5]

array([[     0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      2,    558,    303,      1,
          9088,    792,     31,   5231,   2235,     20,    113,      1],
       [     0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,
             0,    106,      9, 150999,    289,      5,    222,    160],
       [     0,      0,      0,      0,      0,    320,     33,   3439,
            25,   5415,     10,    221,     16,      6,   2393,    123,
            36,     30,      6,   1154,    610,      3,     94,   3831,
            26,     34,     13,      5,    120,    117,    567,     59],
       [     0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      

In [26]:
save_array(seq_32, f'{PATH}seq_32.bc')
save_array(seq_48, f'{PATH}seq_48.bc')

In [27]:
# create 100-dimensional embedding
emb_index = eg.load_pretrained_embedding(filename="glove/glove.twitter.27B.100d.txt")
print(len(emb_index))
emb_matrix = eg.generate_embedding_matrix()
print(emb_matrix.shape)
save_array(array=emb_matrix, filename=f'{PATH}emb_mat_100dim.bc')

1193514
(432241, 100)


In [28]:
import json
with open(f'{PATH}word_index.json', 'w') as fp:
    json.dump(word_index, fp)

## Build sentiment model

In [29]:
from tep.utils import load_array
seqs = load_array(f'{PATH}seq_32.bc')
print(seqs.shape)
emb_mat = load_array(f'{PATH}emb_mat_100dim.bc')
print(emb_mat.shape)
labels = load_array(f'{PATH}labels.bc')
print(labels.shape)

(1600000, 32)
(432241, 100)
(1600000,)
