# LSTM
The aim of this notebook is to implement an LSTM for sentiment analysis of tweets.



# 1. Setup

## 1.1 Imports

In [19]:
# Imports 

import os
import re

import numpy as np
import pandas as pd

import tensorflow_datasets as tfds
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.utils.np_utils import to_categorical
from sklearn.model_selection import train_test_split

## 1.2 Define some useful functions

In [5]:
def load_tweets(filename, label):
    """Read one tweet per line from `filename` and record each with `label`.

    Every line (trailing whitespace removed) is appended to the notebook-level
    `tweets` list, and `label` is appended once per line to the notebook-level
    `labels` list. Both names must therefore exist as lists before the call.

    Returns a NumPy array built from the whole `tweets` list accumulated so
    far, not only from the lines read by this call.
    """
    with open(filename, 'r', encoding='utf-8') as tweet_file:
        for stripped_line in map(str.rstrip, tweet_file):
            tweets.append(stripped_line)
            labels.append(label)
    return np.array(tweets)

In [6]:
def load_test_tweets(filename):
    """Return the lines of `filename` as a list of strings.

    The file is read as UTF-8 and trailing whitespace (including the newline)
    is removed from every line. Blank lines are kept as empty strings.
    """
    with open(filename, 'r', encoding='utf-8') as test_file:
        return [raw_line.rstrip() for raw_line in test_file]

## 1.2 Setup input and output paths

In [7]:
# Directory prefix of the preprocessed tweet files read by this notebook
# (file names are appended to it by plain string concatenation).
input_data_path = '../input/cil-processed-dataset/'
# Directory prefix used when the submission CSV is written.
output_data_path = 'submissions/'

## 1.3 Set an LSTM type

In [8]:
# Number of units passed to every LSTM layer built in this notebook.
lstm_out = 196

In [14]:
# Recurrent layer flavour:
#   bidirectional_LSTM = True  -> each LSTM is wrapped in a Bidirectional layer
#   bidirectional_LSTM = False -> plain (vanilla) LSTM
bidirectional_LSTM = True

# Set stacked_LSTM = True to stack two LSTM layers of the flavour chosen above.
stacked_LSTM = False

# Two layer objects are always prepared:
#   LSTMLayer         -> returns only the last output (feeds the classifier head)
#   LSTMLayer_stacked -> returns the full sequence (only used when stacking)
if not bidirectional_LSTM:
    LSTMLayer = LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2)
    LSTMLayer_stacked = LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2, return_sequences=True)
else:
    last_output_lstm = tf.keras.layers.LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2)
    LSTMLayer = tf.keras.layers.Bidirectional(last_output_lstm)
    full_sequence_lstm = tf.keras.layers.LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2, return_sequences=True)
    LSTMLayer_stacked = tf.keras.layers.Bidirectional(full_sequence_lstm)

## 1.4 Load tweets from the dataset

In [15]:
# load_tweets() appends to these two notebook-level lists, so they are reset
# here to keep the cell re-runnable.
tweets, labels = [], []

# Negative tweets get label 0, positive tweets get label 1.
for tweet_file, sentiment_label in (('processed_neg.txt', 0), ('processed_pos.txt', 1)):
    load_tweets(input_data_path + tweet_file, sentiment_label)

# Convert to NumPy arrays
tweets = np.array(tweets)
labels = np.array(labels)

print(f'{len(tweets)} tweets loaded')

Now check the number of tweets loaded.

In particular, recall that:
- The small training set consists of 200,000 tweets
- The complete training set consists of 2,500,000 tweets

In [16]:
# Build the frame column by column so each column keeps its own dtype.
# Stacking `tweets` and `labels` into a single 2-D NumPy array coerces the
# integer labels to strings and allocates a second fixed-width unicode array
# sized by the longest tweet, which is wasteful on the full dataset.
# The 'sentiment' column now holds the integers 0/1; pd.get_dummies on that
# column still yields the same two one-hot columns in the same order.
data = pd.DataFrame({'text': tweets, 'sentiment': labels})

# 2. Tokenize the data

In [17]:
# Vocabulary size: only the `max_features` most frequent words are kept when
# texts are converted to integer sequences.
max_features = 5000

tweet_texts = data['text'].values
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(tweet_texts)

# X is a list with one sequence of word indices per tweet.
X = tokenizer.texts_to_sequences(tweet_texts)


In [20]:
# Distribution of tokenized tweet lengths, used to choose the padding length.
lens = [len(token_ids) for token_ids in X]

sns.displot(np.array(lens), color = 'orange')

In [21]:
# Bring every tokenized tweet to a fixed length of 25 token ids; X is rebound
# from the list of sequences to the padded array whose second dimension
# (X.shape[1]) is used later as the model's input length.
X = pad_sequences(X, maxlen = 25)

# 3. Define the model

In [17]:
embed_dim = 100
input_length = X.shape[1]  # length of the padded token sequences

# Architecture: Embedding -> SpatialDropout1D -> [optional sequence-returning
# LSTM] -> LSTM -> 2-way softmax.
# NOTE: LSTMLayer / LSTMLayer_stacked are instantiated in the configuration
# cell, so re-running only this cell reuses the same layer objects; re-run the
# configuration cell first to start from freshly created recurrent layers.
model = Sequential()
model.add(Embedding(max_features, embed_dim, input_length = input_length))
model.add(SpatialDropout1D(0.4))
if stacked_LSTM:
    model.add(LSTMLayer_stacked)
model.add(LSTMLayer)
model.add(Dense(2,activation='softmax'))

model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])

# summary() prints the table itself and returns None, so it must not be
# wrapped in print() (that emitted a stray "None" after the table).
model.summary()

In [18]:
# One-hot encode the sentiment column: one output column per distinct label.
Y = pd.get_dummies(data['sentiment']).values

# Hold out 10% of the samples for validation; the fixed seed keeps the split
# repeatable across runs.
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.1, random_state=42)

for split_inputs, split_targets in ((X_train, Y_train), (X_val, Y_val)):
    print(split_inputs.shape, split_targets.shape)

# 4. Train the model 

In [None]:
batch_size = 32

# validation_data is passed as a tuple (x_val, y_val), the form documented by
# Keras; a list can be interpreted as a multi-input `x` by some versions.
# NOTE: with steps_per_epoch = 100 and batch_size = 32 each epoch consumes
# only 100 batches (3,200 samples), not a full pass over X_train. Raise or
# remove steps_per_epoch to train on the whole training set.
history = model.fit(X_train, Y_train, epochs = 3, validation_data = (X_val, Y_val),
                    steps_per_epoch = 100, batch_size = batch_size, verbose = 1)

In [None]:
def plot_graphs(history, metric):
    """Plot the training and validation curves of `metric` on the current axes.

    `history` is the object returned by `model.fit`; its `history` dict is
    expected to contain both `metric` and `'val_' + metric`, which is the case
    when the model is fitted with validation data.
    """
    plt.plot(history.history[metric])
    plt.plot(history.history['val_' + metric])
    plt.xlabel('Epochs')
    plt.ylabel(metric)
    plt.legend([metric, 'val_' + metric])


# Left panel: accuracy, right panel: loss (training vs. validation).
plt.figure(figsize=(16, 8))
plt.subplot(1, 2, 1)
plot_graphs(history, 'accuracy')
plt.ylim(None, 1)
plt.subplot(1, 2, 2)
plot_graphs(history, 'loss')
plt.ylim(0, None)
#plt.savefig('training.png')

# 5. Make submissions

In [None]:
# Load the test tweets and push them through the same tokenizer and padding
# length that were used for the training data.
test_tweets_path = input_data_path + 'processed_test_no_idx.txt'
X_test = load_test_tweets(test_tweets_path)

tokenized_test = tokenizer.texts_to_sequences(X_test)
sequence_length = X.shape[1]
padded = pad_sequences(tokenized_test, maxlen=sequence_length, dtype='int32', value=0)

In [None]:
# Softmax outputs for every test tweet (one row per tweet, one column per
# class). batch_size equals the number of test tweets, so the whole test set
# is processed as a single batch.
sentiments = model.predict(padded,batch_size=len(X_test))

In [None]:
# Turn each row of class scores into a submission label:
# class 0 (the label given to the negative tweets) -> -1, otherwise -> 1.
predicted_class = np.argmax(sentiments, axis=1)
predictions = np.where(predicted_class == 0, -1, 1)

df = pd.DataFrame(predictions, columns=["Prediction"])
df.index.name = "Id"
df.index += 1  # submission ids start at 1, not 0

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(output_data_path, exist_ok=True)
df.to_csv(output_data_path + "submission_LSTM.csv")