In [41]:
import os
import matplotlib.pyplot as plt
%matplotlib inline
import tensorflow as tf
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist

from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, GRU, Embedding
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

In [42]:
data_folder = './twitter-datasets/'

# Reading the data
positive_path = os.path.join(data_folder,'train_pos.txt')
negative_path = os.path.join(data_folder,'train_neg.txt')

In [43]:
lines_positive = [line.rstrip('\n') for line in open(positive_path)]
lines_negative = [line.rstrip('\n') for line in open(negative_path)]

# Data preparation

## Create DF

In [44]:
# Create dataFrame from positive tweets and give them value 1 as a sentiment
data_pos = pd.DataFrame({"tweets": lines_positive,
                      "sentiment":np.ones(len(lines_positive))
                      })

# Create dataFrame from negative tweets and give them value 0 as a sentiment
data_neg = pd.DataFrame({"tweets": lines_negative,
                      "sentiment":np.zeros(len(lines_negative))
                      })
# Concat both of them
data = pd.concat([data_pos,data_neg],axis=0).reset_index().drop(columns=['index'])

# Shuffle everything so that we don't have all the positives in one cluster and all the negatives in another
data = data.sample(frac=1).reset_index(drop=True)
data.head()

Unnamed: 0,tweets,sentiment
0,<user> <user> ohhh <user> wants to know when t...,1.0
1,this was such a fun makeup to do - loved this ...,1.0
2,<user> oh i did thumbs up,0.0
3,glad to be in my air conditioned home,1.0
4,<user> i love you claudia kay ! keep your pret...,1.0


## Clean up the text

In [45]:
# Remove <user> from tweets.
data['tweets'].replace(regex=True,inplace=True,to_replace=r'<user>',value=r'')
data.head()

Unnamed: 0,tweets,sentiment
0,ohhh wants to know when the baby shower is,1.0
1,this was such a fun makeup to do - loved this ...,1.0
2,oh i did thumbs up,0.0
3,glad to be in my air conditioned home,1.0
4,i love you claudia kay ! keep your pretty hea...,1.0


## Use tokenizer

In [57]:
# map words to numbers
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['tweets'])

In [58]:
len(tokenizer.word_index)

103130

In [60]:
tokenizer.word_index

{'i': 1,
 'the': 2,
 'to': 3,
 'you': 4,
 'url': 5,
 'a': 6,
 'and': 7,
 'my': 8,
 'me': 9,
 'of': 10,
 'is': 11,
 'for': 12,
 'in': 13,
 'it': 14,
 'this': 15,
 'so': 16,
 'with': 17,
 'on': 18,
 'that': 19,
 'be': 20,
 'have': 21,
 "i'm": 22,
 'but': 23,
 'just': 24,
 'rt': 25,
 'love': 26,
 'your': 27,
 'all': 28,
 'not': 29,
 'was': 30,
 'at': 31,
 'are': 32,
 'like': 33,
 '3': 34,
 'get': 35,
 'up': 36,
 'frame': 37,
 'lol': 38,
 'good': 39,
 'know': 40,
 'u': 41,
 'do': 42,
 'now': 43,
 'one': 44,
 'when': 45,
 'if': 46,
 'we': 47,
 'follow': 48,
 'no': 49,
 'can': 50,
 'go': 51,
 'what': 52,
 "don't": 53,
 'x': 54,
 "'": 55,
 'out': 56,
 'will': 57,
 'day': 58,
 '2': 59,
 'please': 60,
 '1': 61,
 'from': 62,
 'see': 63,
 'too': 64,
 'want': 65,
 'there': 66,
 'back': 67,
 "it's": 68,
 'today': 69,
 'about': 70,
 'really': 71,
 'how': 72,
 'got': 73,
 'thanks': 74,
 'time': 75,
 "can't": 76,
 'its': 77,
 'think': 78,
 'im': 79,
 'haha': 80,
 'going': 81,
 'he': 82,
 'as': 83,
 'm