In [1]:
import os
import matplotlib.pyplot as plt
%matplotlib inline
import tensorflow as tf
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist

from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, GRU, Embedding
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

  from ._conv import register_converters as _register_converters


In [2]:
data_folder = './twitter-datasets/'

# Reading the data
positive_path = os.path.join(data_folder,'train_pos.txt')
negative_path = os.path.join(data_folder,'train_neg.txt')

In [3]:
lines_positive = [line.rstrip('\n') for line in open(positive_path)]
lines_negative = [line.rstrip('\n') for line in open(negative_path)]

# Data preparation

## Create DF

In [4]:
# Create dataFrame from positive tweets and give them value 1 as a sentiment
data_pos = pd.DataFrame({"tweets": lines_positive,
                      "sentiment":np.ones(len(lines_positive))
                      })

# Create dataFrame from negative tweets and give them value 0 as a sentiment
data_neg = pd.DataFrame({"tweets": lines_negative,
                      "sentiment":np.zeros(len(lines_negative))
                      })
# Concat both of them
data = pd.concat([data_pos,data_neg],axis=0).reset_index().drop(columns=['index'])

# Shuffle everything so that we don't have all the positives in one cluster and all the negatives in another
data = data.sample(frac=1).reset_index(drop=True)
data.head()

Unnamed: 0,tweets,sentiment
0,<user> the fame this way,1.0
1,<user> what's the matter darling ? ?,0.0
2,they sending me home,1.0
3,had a great day catching up with friends yeste...,0.0
4,reflections on the psalms ( harvest book ) ( p...,0.0


## Clean up the text

In [24]:
# Remove <anything> from tweets.
data['tweets'].replace(regex=True,inplace=True,to_replace=r'<.*?>',value=r'')
data.head()

Unnamed: 0,tweets,sentiment
0,the fame this way,1.0
1,what's the matter darling ? ?,0.0
2,they sending me home,1.0
3,had a great day catching up with friends yeste...,0.0
4,reflections on the psalms ( harvest book ) ( p...,0.0


In [25]:
X = data['tweets'].tolist()
y =  data['sentiment']
# Split train test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Use tokenizer
### From words to numbers

In [26]:
# map words to numbers
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)

In [52]:
num_words = len(tokenizer.word_index)
num_words

103093

In [28]:
tokenizer.word_index

{'i': 1,
 'the': 2,
 'to': 3,
 'you': 4,
 'a': 5,
 'and': 6,
 'my': 7,
 'me': 8,
 'of': 9,
 'is': 10,
 'for': 11,
 'in': 12,
 'it': 13,
 'this': 14,
 'so': 15,
 'with': 16,
 'on': 17,
 'that': 18,
 'be': 19,
 "i'm": 20,
 'have': 21,
 'but': 22,
 'just': 23,
 'rt': 24,
 'love': 25,
 'your': 26,
 'all': 27,
 'not': 28,
 'was': 29,
 'at': 30,
 'are': 31,
 'like': 32,
 'get': 33,
 '3': 34,
 'up': 35,
 'frame': 36,
 'lol': 37,
 'good': 38,
 'know': 39,
 'u': 40,
 'do': 41,
 'now': 42,
 'one': 43,
 'when': 44,
 'if': 45,
 'we': 46,
 'follow': 47,
 'no': 48,
 'can': 49,
 'go': 50,
 'what': 51,
 "don't": 52,
 'x': 53,
 "'": 54,
 'out': 55,
 'will': 56,
 'day': 57,
 '2': 58,
 'please': 59,
 '1': 60,
 'from': 61,
 'see': 62,
 'too': 63,
 'want': 64,
 'there': 65,
 'back': 66,
 "it's": 67,
 'today': 68,
 'about': 69,
 'really': 70,
 'how': 71,
 'got': 72,
 'thanks': 73,
 'time': 74,
 "can't": 75,
 'its': 76,
 'think': 77,
 'im': 78,
 'haha': 79,
 'going': 80,
 'he': 81,
 'as': 82,
 'miss': 83,
 '

In [32]:
X_train_tokenized = tokenizer.texts_to_sequences(X_train)
X_test_tokenized = tokenizer.texts_to_sequences(X_test)

In [33]:
print(X_train[0],' -> ',X_train_tokenized[0])

awesome ! thank you tv guide and yes - let's get dirty !   ->  [313, 108, 4, 647, 415, 6, 179, 491, 33, 1730]


In [53]:
data['length'] = data['tweets'].str.split().apply(len)
max_tokens = data['length'].max()
max_tokens

62

The maximum number of words used in a tweet is 64, we will pad every tweet to make it match this length

In [42]:

X_train_padded = pad_sequences(X_train_tokenized, maxlen=max_tokens,padding='pre', truncating='pre')
X_test_padded = pad_sequences(X_test_tokenized, maxlen=max_tokens,padding='pre', truncating='pre')


### From numbers to words

In [48]:
def numbers_to_string(number_array, tokenizer):
    
    ''' 
    Transforms tokens to words
    :param number_array: The numbers array to transform
    :param tokenizer: The tokenizer used

    :return: the original text
    :rtype: String
    '''
        
    indices = tokenizer.word_index
    # Create a dict that mapes numbers to their respective words
    inverse_map = dict(zip(indices.values(), indices.keys()))
        
    # Maps the numbers back to words.
    words = []
    for number in number_array:
        if number != 0: # !=0 is to remove the padding
            words.append(inverse_map[number])
    
    # Concatenate all words.
    text = " ".join(words)

    return text

In [49]:
numbers_to_string(X_train_tokenized[0],tokenizer)

"awesome thank you tv guide and yes let's get dirty"

# Neural Network

In [55]:
model = Sequential()
embedding_size = 8
model.add(Embedding(input_dim=num_words,
                    output_dim=embedding_size,
                    input_length=max_tokens,
                    name='layer_embedding'))
model.add(GRU(units=16, return_sequences=True))
model.add(GRU(units=8, return_sequences=True))
model.add(GRU(units=4))
model.add(Dense(1, activation='sigmoid'))
optimizer = Adam(lr=1e-3)
model.compile(loss='binary_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
layer_embedding (Embedding)  (None, 62, 8)             824744    
_________________________________________________________________
gru_1 (GRU)                  (None, 62, 16)            1200      
_________________________________________________________________
gru_2 (GRU)                  (None, 62, 8)             600       
_________________________________________________________________
gru_3 (GRU)                  (None, 4)                 156       
_________________________________________________________________
dense (Dense)                (None, 1)                 5         
Total params: 826,705
Trainable params: 826,705
Non-trainable params: 0
_________________________________________________________________


In [56]:
model.fit(X_train_padded, y_train,
          validation_split=0.1, epochs=100, batch_size=64)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 144000 samples, validate on 16000 samples
Epoch 1/100
  3136/144000 [..............................] - ETA: 17:45 - loss: 0.6916 - acc: 0.5293

KeyboardInterrupt: 