# Predict the sentiment of various tweets using RNN
- Use recurrent neural networks (RNNs) to predict the sentiment of tweets, classifying each tweet as positive or negative. The full dataset contains around 1,600,000 tweets for training.


### Setup Environment

In [None]:
# Mount Google Drive so the files under /content/drive (dataset CSV, GloVe archive) can be read.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# import all libraries
import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.layers.convolutional import Conv1D
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
from zipfile import ZipFile

import spacy
nlp=spacy.load("en")

Using TensorFlow backend.


In [None]:
# Report how many GPU devices TensorFlow can see in this runtime.
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  1


### Load Data

In [None]:
!ls -l 'drive/My Drive/AIML/SequentialModels'

total 744130
-rw------- 1 root root      2182 Apr 12 03:41  airline-passengers.csv
-rw------- 1 root root 743069095 Jun 26 12:31  glove.twitter.27B.200d.rar
-rw------- 1 root root    183904 Apr 12 07:48 'google (1).csv'
-rw------- 1 root root    280087 Jul  3 14:31  Lab0_Word2vec_gensim_movie_data.ipynb
-rw------- 1 root root    215350 Jun  1 05:37 'Lab1_TimeSeries - Air Passengers Traffic.ipynb'
-rw------- 1 root root    186016 Apr 12 03:41  Lab2_stockPriceDirectionPrediction_v2.ipynb
-rw------- 1 root root  13585269 Dec 10  2019  labeledTrainData.tsv.zip
-rw------- 1 root root      8544 Apr 12 03:41 'LSTM Sentiment Analysis Kaggle.ipynb'
-rw------- 1 root root    459888 Jun  1 05:54  Predict_Air_Passengers_Traffic_TS.ipynb
-rw------- 1 root root    469419 Jun  9 05:26  Predict_TractorSales_TS.ipynb
-rw------- 1 root root    481330 Jun  3 17:25  Predict_WSBSales_TS.ipynb
drwx------ 2 root root      4096 Jun  1 03:23  savedModels
-rw------- 1 root root      1763 Apr 12 07:48  Tractor-S

In [None]:
# The CSV has no header row, so the column names are supplied explicitly.
# Without header=None pandas would consume the first tweet as the header and drop it from the data.
# Columns are still accessible by position (0 = sentiment label, 5 = tweet text).
train = pd.read_csv(
    '/content/drive/My Drive/AIML/SequentialModels/training.1600000.processed.noemoticon.csv',
    encoding="latin-1",
    header=None,
    names=['target', 'ids', 'date', 'flag', 'user', 'text'],
)
train.shape

(19999, 6)

In [None]:
train.head(2)

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...


### Data Preprocessing

In [None]:
train.describe()

Unnamed: 0,0,1467810369
count,19999.0,19999.0
mean,2.0003,1836369000.0
std,2.00005,356596300.0
min,0.0,1467811000.0
25%,0.0,1468964000.0
50%,4.0,2191325000.0
75%,4.0,2192272000.0
max,4.0,2193602000.0


In [None]:
# We need only the first and sixth column:
# column 0 holds the sentiment label (values 0 and 4) and column 5 holds the tweet text.
Y_train = train[train.columns[0]]
X_train = train[train.columns[5]]

In [None]:
X_train.head(2)

0    is upset that he can't update his Facebook by ...
1    @Kenichan I dived many times for the ball. Man...
Name: @switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D, dtype: object

In [None]:
Y_train.head(2)

0    0
1    0
Name: 0, dtype: int64

In [None]:
# Check for null values
X_train.isnull().sum(), Y_train.isnull().sum()

(0, 0)

In [None]:
# Check No of Positive and negative tweets and see if dataset is balanced

print(Y_train.value_counts())

4    10001
0     9998
Name: 0, dtype: int64


- The dataset is balanced: both classes contain almost the same number of tweets, so the classifier is trained on an even class distribution.

In [None]:
# Count identical tweets: look for generic texts such as "Ok", "Great" or "NA" that occur very often.
# This also helps to find duplicated or biased data.

X_train.value_counts()

At work                                                                                                                                    5
boys! check out my url for more pics n vids n stuff about me! maybe we can go on a date one day                                            4
Good morning!                                                                                                                              4
Homework                                                                                                                                   3
nothing                                                                                                                                    3
                                                                                                                                          ..
@forsoothed  if i had it my way id spell it wensday but unfortunatly im not a roman emperor/viking god..so i have no say in the matter     1
@CandelaCande

- There are no generic tweets with a high number of occurrences.

### Split Data

In [None]:
# Split the data with a fixed seed; test_size=0.02 puts 2% of the rows into trainset2x / trainset2y.
# NOTE: the following cells continue with that small 2% part only (trainset2x / trainset2y).
# Original note: also try building the model with 85:15 and 70:30 splits.

trainset1x, trainset2x, trainset1y, trainset2y = train_test_split(X_train.values, Y_train.values, test_size=0.02,random_state=42)

In [None]:
# print shapes of test column and target column of the train and test datasets

print('trainset1x Size : {}\t trainset1y Size: {}'.format(trainset1x.shape, trainset1y.shape))
print('trainset2x Size : {}\t trainset2y Size: {}'.format(trainset2x.shape, trainset2y.shape))

trainset1x Size : (19599,)	 trainset1y Size: (19599,)
trainset2x Size : (400,)	 trainset2y Size: (400,)


In [None]:
trainset2x[0:3]

array(['2 more Exams ', 'The first serial about my city so stupid... ',
       "@theroser You guys should do your acostic performance at 5! I have a championship meet tomorrow that won't end til then.  -Cami"],
      dtype=object)

In [None]:
# One-hot encode the target column trainset2y: one indicator column per label value.
# The result is a DataFrame (the input was a NumPy array of labels).

trainset2y = pd.get_dummies(trainset2y)

In [None]:
trainset2y.head(2)

Unnamed: 0,0,4
0,0,1
1,1,0


### Build Input and Output data
for both the training and validation sets, with every tweet padded or truncated to a fixed sequence length

In [None]:
# function to remove stopwords
def stopwords(sentence):
  """Return `sentence` with stop words and punctuation tokens removed.

  The text is tokenized by the notebook-level spaCy pipeline `nlp`. Tokens
  flagged `is_stop` or tagged `PUNCT` are dropped, and the remaining token
  texts are joined with single spaces.

  Args:
    sentence: raw text to filter.

  Returns:
    The filtered text as a string. An empty string is returned when no token
    survives, including when the input produces no tokens at all.
  """
  kept = [
      tok.text.strip()
      for tok in nlp(sentence)
      if not tok.is_stop and tok.pos_ != "PUNCT"
  ]
  # Joining outside the loop keeps the result defined for token-less input.
  return " ".join(kept)

In [None]:
# function to lemmatize the tweets
def lemmatize(sentence):
    """Lemmatize `sentence` with the notebook-level spaCy pipeline `nlp`.

    Each token is replaced by its lemma, the lemmas are joined with single
    spaces, and the joined text is parsed again with `nlp`.

    Args:
        sentence: raw text to lemmatize.

    Returns:
        The result of `nlp` applied to the space-joined lemmas.
    """
    lemmas = []
    for token in nlp(sentence):
        lemmas.append(token.lemma_)
    return nlp(" ".join(lemmas))

In [None]:
# loading the glove model

def loadGloveModel(gloveFile):
    """Load GloVe word vectors from a whitespace-separated text file.

    Each non-blank line is expected to hold a token followed by the components
    of its vector.

    Args:
        gloveFile: path to the GloVe text file.

    Returns:
        dict mapping each token (str) to its embedding as a list of floats.

    Raises:
        ValueError: if a vector component cannot be parsed as a float.
    """
    print("Loading Glove Model")
    model = {}
    # Explicit UTF-8 keeps decoding independent of the platform default, and
    # the context manager closes the file even if parsing fails.
    with open(gloveFile, 'r', encoding='utf-8') as f:
        for line in f:
            splitLine = line.split()
            if not splitLine:
                continue  # skip blank lines instead of raising IndexError
            word = splitLine[0]
            model[word] = [float(val) for val in splitLine[1:]]
    print ("Done.", len(model), " words loaded!")
    return model

In [None]:
# Local working directory on the Colab VM (not on Google Drive) that receives the GloVe archive.
# NOTE(review): project_path is not referenced by the commands visible below, which spell the path out.
project_path = "/content/sample_data/"

# Copy the GloVe rar archive from Google Drive to the local working directory.
!cp '/content/drive/My Drive/AIML/SequentialModels/glove.twitter.27B.200d.rar' '/content/sample_data/'

In [None]:
# Extract glovemodel ... use unrar command in linux
!ls -l '/content/sample_data'
!unrar x '/content/sample_data/glove.twitter.27B.200d.rar' '/content/sample_data/'
!ls -l '/content/sample_data'

total 2790536
-rwxr-xr-x 1 root root       1697 Jan  1  2000 anscombe.json
-rw-r--r-- 1 root root     301141 Jun 26 16:26 california_housing_test.csv
-rw-r--r-- 1 root root    1706430 Jun 26 16:26 california_housing_train.csv
-rw------- 1 root root  743069095 Jul  4 12:28 glove.twitter.27B.200d.rar
-rw-r--r-- 1 root root 2057590469 Oct  2  2019 glove.twitter.27B.200d.txt
-rw-r--r-- 1 root root   18289443 Jun 26 16:26 mnist_test.csv
-rw-r--r-- 1 root root   36523880 Jun 26 16:26 mnist_train_small.csv
-rwxr-xr-x 1 root root        930 Jan  1  2000 README.md

UNRAR 5.50 freeware      Copyright (c) 1993-2017 Alexander Roshal


Extracting from /content/sample_data/glove.twitter.27B.200d.rar


Would you like to replace the existing file /content/sample_data/glove.twitter.27B.200d.txt
2057590469 bytes, modified on 2019-10-02 11:14
with a new one
2057590469 bytes, modified on 2019-10-02 11:14

[Y]es, [N]o, [A]ll, n[E]ver, [R]ename, [Q]uit Y

Extracting  /content/sample_data/glove.twitter.27B.2

In [None]:
# Load the GloVe vectors into a dict mapping each word to its embedding (a list of floats).
model=loadGloveModel("/content/sample_data/glove.twitter.27B.200d.txt")

Loading Glove Model
Done. 1193514  words loaded!


In [None]:
# Preview a few vocabulary entries instead of rendering every key of the embedding dict.
list(model)[:10]

In [None]:
# vectorising the sentences

def sent_vectorizer(sent, model, dim=200):
  """Sum the embeddings of the whitespace-separated words of `sent`.

  Args:
    sent: text whose words are looked up in `model`.
    model: mapping from word to an embedding of length `dim`.
    dim: embedding length; defaults to 200, the size used in this notebook.

  Returns:
    NumPy array of shape (dim,) holding the element-wise sum of the embeddings
    that were found (all zeros when no word is in `model`). The sum is not
    divided by the number of words.
  """
  sent_vec = np.zeros(dim)
  numw = 0
  for w in sent.split():
    try:
      sent_vec = np.add(sent_vec, model[str(w)])
    except (KeyError, ValueError):
      # Best effort: skip out-of-vocabulary words and stored vectors whose
      # length does not match dim; any other error is a real bug and surfaces.
      continue
    numw += 1
  print('sent_vector', len(sent_vec), numw)
  return sent_vec

In [None]:
trainset2x.shape[0], len(model[str('serial')])

(400, 200)

In [None]:
# Build one sentence vector per tweet: lowercase, lemmatize, then sum the GloVe
# vectors of the words with sent_vectorizer.
cleanvector=[]
wordcount = 0  # not updated by the loop below
for i in range(trainset2x.shape[0]):
  document=trainset2x[i]
  document=document.lower()
  document=lemmatize(document)
  document=str(document)  # lemmatize returns the output of nlp(...); convert it back to plain text
  print('document:',len(document))  # character length of the lemmatized text
  cleanvector.append(sent_vectorizer(document,model))

document: 11
sent_vector 200 2
document: 48
sent_vector 200 7
document: 137
sent_vector 200 19
document: 60
sent_vector 200 14
document: 11
sent_vector 200 2
document: 55
sent_vector 200 7
document: 41
sent_vector 200 4
document: 39
sent_vector 200 5
document: 154
sent_vector 200 25
document: 55
sent_vector 200 10
document: 109
sent_vector 200 19
document: 132
sent_vector 200 29
document: 164
sent_vector 200 18
document: 139
sent_vector 200 18
document: 119
sent_vector 200 17
document: 15
sent_vector 200 3
document: 67
sent_vector 200 12
document: 116
sent_vector 200 17
document: 146
sent_vector 200 21
document: 45
sent_vector 200 7
document: 32
sent_vector 200 6
document: 20
sent_vector 200 3
document: 45
sent_vector 200 6
document: 100
sent_vector 200 14
document: 54
sent_vector 200 11
document: 98
sent_vector 200 14
document: 58
sent_vector 200 12
document: 46
sent_vector 200 3
document: 50
sent_vector 200 8
document: 37
sent_vector 200 7
document: 128
sent_vector 200 26
document: 4

In [None]:
print(len(cleanvector[0]), len(cleanvector))

200 400


In [None]:
cleanvector[0]

array([ 6.816300e-01, -2.447040e-01, -6.870000e-03,  4.331600e-01,
       -5.716700e-01,  5.122970e-01,  1.088720e+00,  8.401300e-01,
       -2.249460e-01,  2.337340e-01, -7.165000e-02, -1.013750e+00,
       -9.980880e-01, -4.669200e-01,  2.497200e-01, -5.321300e-01,
       -3.295340e-01, -5.199400e-02, -6.254940e-01,  2.776020e-01,
        1.940420e-01,  5.663500e-01, -3.304300e-02, -4.104510e-01,
       -6.340900e-01,  1.972100e+00,  8.124000e-02, -3.340700e-01,
        3.536710e-01,  4.532300e-01, -8.080000e-02,  4.568500e-02,
       -1.834900e-01,  2.230360e-01, -5.147600e-01, -4.783600e-02,
       -4.131900e-01,  1.167380e+00,  5.694200e-01,  2.634100e-01,
       -8.092300e-01, -6.390440e-01,  1.627730e-01, -8.994700e-01,
        9.904910e-01,  1.032830e-01, -3.475490e-01, -3.658410e-01,
        9.721000e-02,  1.864500e-01, -3.549790e-01,  5.843630e-01,
        8.601800e-02,  1.277450e-01, -2.143700e-01, -2.402200e-01,
       -1.996200e-02, -6.345300e-01, -3.206160e-01,  4.457300e

In [None]:
# Getting the input and output in proper shape
cleanvector=np.array(cleanvector)

# Reshape the sentence vectors to 3-D, the rank Keras recurrent layers expect:
# (samples, timesteps, features) = (400, 200, 1).
# NOTE(review): in this layout each of the 200 embedding components acts as one timestep with a
# single feature. The model cells that follow are fed `data` (padded token ids), not cleanvector.

cleanvector =cleanvector.reshape(len(cleanvector),200,1)
cleanvector.shape

(400, 200, 1)

In [None]:
# Tokenize the raw tweets in trainset2x (not the lemmatized text) into sequences of integer word ids,
# then pad (at the end, padding="post") or truncate every sequence to exactly 15 ids.
tokenizer = Tokenizer(num_words=16000)
tokenizer.fit_on_texts(trainset2x)
sequences = tokenizer.texts_to_sequences(trainset2x)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
data = pad_sequences(sequences, maxlen=15, padding="post")
print(data.shape)

Found 1873 unique tokens.
(400, 15)


In [None]:
# Add a trailing axis so every sample has shape (15, 1), the input_shape declared in modelbuild.
# The row count is taken from cleanvector, which was built from the same tweets as data.

data=data.reshape(len(cleanvector),15,1)
print(data.shape)

(400, 15, 1)


In [None]:
# Split the padded sequences and one-hot labels into training and validation sets (70:30, fixed seed).

trainx, validx, trainy, validy = train_test_split(data, trainset2y, test_size=0.3,random_state=42 )
print(trainx.shape, trainy.shape)
print(validx.shape, validy.shape)

(280, 15, 1) (280, 2)
(120, 15, 1) (120, 2)


In [None]:
# Number of rows for the embedding matrix: one per word in the tokenizer index,
# plus one extra row (presumably for index 0, which no word uses — confirm).

nb_words=len(tokenizer.word_index)+1
print(nb_words)

1874


In [None]:
# Build the embedding matrix: row i receives the GloVe vector of the word whose tokenizer index is i.
# Words that are missing from the GloVe dict keep their all-zero row.

embedding_matrix = np.zeros((nb_words, 200))

for word, i in word_index.items():
  embedding_vector = model.get(word)
  if embedding_vector is not None:
    embedding_matrix[i] = embedding_vector
# Counts the rows whose components sum to exactly zero, i.e. rows that were never filled.
print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

Null word embeddings: 346


In [None]:
# Convert the one-hot label frames to plain NumPy arrays before fitting.
trainy=np.array(trainy)
validy=np.array(validy)

In [None]:
#building a simple RNN model
def modelbuild():
  """Build and compile the SimpleRNN sentiment classifier.

  Reads the notebook-level `nb_words` (vocabulary size) and `embedding_matrix`
  (pre-trained GloVe weights, one row per token id).

  Architecture: token ids of shape (15, 1) -> frozen GloVe embedding ->
  SimpleRNN(100) -> Dense(1000) -> Dense(500) -> 2-way softmax.

  Returns:
    A compiled keras Sequential model (categorical cross-entropy loss, Adam
    optimizer, accuracy metric).
  """
  embedding_dim = embedding_matrix.shape[1]
  net = Sequential()

  # Inputs keep the (15, 1) shape the notebook feeds in; the trailing axis is
  # dropped so the embedding layer receives a plain sequence of 15 token ids.
  net.add(keras.layers.InputLayer(input_shape=(15,1)))
  net.add(keras.layers.Reshape((15,)))

  # Frozen pre-trained embedding. The layer has to be passed to add(): merely
  # constructing it leaves it out of the network. Its output size must equal
  # the width of embedding_matrix for the weights to load.
  net.add(keras.layers.Embedding(nb_words, embedding_dim, weights=[embedding_matrix],
                                 input_length=15, trainable=False))

  # RNN layer
  net.add(keras.layers.SimpleRNN(units = 100, activation='relu', use_bias=True))

  # Fully connected head; each layer infers its input size from the previous one.
  net.add(keras.layers.Dense(units=1000, activation='sigmoid'))
  net.add(keras.layers.Dense(units=500, activation='relu'))
  net.add(keras.layers.Dense(units=2, activation='softmax'))

  # compile the model
  net.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

  return net

In [None]:
# Build and compile the model, then fit it for 10 epochs in batches of 120,
# evaluating on (validx, validy) after every epoch.

finalmodel = modelbuild()
history = finalmodel.fit(trainx, trainy, epochs=10, batch_size=120,validation_data=(validx,validy))

Train on 280 samples, validate on 120 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Sanity-check the model on ten samples.
# NOTE: these rows are sliced from trainx, i.e. data the model was fitted on,
# so this is not a held-out evaluation.

test = trainx[100:110]
test.shape

(10, 15, 1)

In [None]:
trainy[100:110]

array([[1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [0, 1],
       [0, 1],
       [1, 0],
       [1, 0],
       [1, 0],
       [0, 1]], dtype=uint8)

In [None]:
# Prediction using trained model

pred = finalmodel.predict(test)
pred

array([[0.73437774, 0.26562226],
       [0.44621414, 0.55378586],
       [0.55426884, 0.4457312 ],
       [0.4776404 , 0.52235955],
       [0.26188812, 0.7381119 ],
       [0.5367165 , 0.46328348],
       [0.8949959 , 0.10500411],
       [0.51098126, 0.48901874],
       [0.4639218 , 0.5360782 ],
       [0.39014342, 0.6098566 ]], dtype=float32)

In [None]:
# Display the classes

# Predicted class index (position of the largest score) for every row of pred.
# The length comes from pred itself, so the cell works for any number of predictions.
n = len(pred)
a = [np.argmax(pred[i]) for i in range(n)]
a

[0, 1, 0, 1, 1, 0, 0, 0, 1, 1]

In [None]:
# Predicted class index for each prediction row, collected in a single pass.
b = [np.argmax(row) for row in pred]
b

[0, 1, 0, 1, 1, 0, 0, 0, 1, 1]