IPython notebook for emotion detection from text using an LSTM (Long Short-Term Memory) network.
The first step is preprocessing the CrowdFlower dataset.


In [None]:
#Importing the required libraries
import pandas as pd
import keras
import numpy as np
from keras.models import Sequential,Model
from keras.layers import Dense,Dropout,LSTM,Input,Bidirectional
from sklearn.model_selection import cross_val_score 
from nltk.tokenize import word_tokenize,sent_tokenize
import nltk
import re

Mounting the google drive

In [None]:
# Expose Google Drive inside the Colab runtime under /content/drive.
# Later cells read the dataset and the GloVe file from that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Importing the datasets from the drive

In [None]:
# Read the tweet/emotion CSV from the mounted Drive and report how many rows it has.
df = pd.read_csv('/content/drive/My Drive/text_emotion.csv')
print(df.shape[0])

40000


Visualising the dataset

In [None]:
# Show the first rows of the raw frame as plain text.
print(df.head())

     tweet_id  ...                                            content
0  1956967341  ...  @tiffanylue i know  i was listenin to bad habi...
1  1956967666  ...  Layin n bed with a headache  ughhhh...waitin o...
2  1956967696  ...                Funeral ceremony...gloomy friday...
3  1956967789  ...               wants to hang out with friends SOON!
4  1956968416  ...  @dannycastillo We want to trade with someone w...

[5 rows x 4 columns]


The tweet_id and author columns are of no use to us, so we drop them.

In [None]:
# Only the label and the tweet text are used from here on, so discard the
# identifier columns before any text processing.
df = df.drop(columns=['tweet_id', 'author'])
print(df.head())

    sentiment                                            content
0       empty  @tiffanylue i know  i was listenin to bad habi...
1     sadness  Layin n bed with a headache  ughhhh...waitin o...
2     sadness                Funeral ceremony...gloomy friday...
3  enthusiasm               wants to hang out with friends SOON!
4     neutral  @dannycastillo We want to trade with someone w...


Now we have only the required columns. Next, we preprocess the text using NLP techniques.

In [None]:
# Removing URLs from the tweets.
# regex=True is passed explicitly: pandas releases whose Series.str.replace
# defaults to literal matching reject a compiled pattern unless regex=True.
# The raw string avoids the invalid "\S" escape warning of a plain literal.
df['content'] = df['content'].str.replace(r'http\S+', '', regex=True)

In [None]:
# Removing words which start with '@' (mentioning a user or page) in the tweets.
# regex=True is required for the pattern to be treated as a regular expression
# on pandas releases whose Series.str.replace defaults to literal matching.
# The raw string avoids the invalid "\w" escape warning.
df['content'] = df['content'].str.replace(r'@\w+', '', regex=True)

In [None]:
# Removing words which start with '#' (representing a trend) in the tweets.
# regex=True is required for the pattern to be treated as a regular expression
# on pandas releases whose Series.str.replace defaults to literal matching.
# The raw string avoids the invalid "\w" escape warning.
df['content'] = df['content'].str.replace(r'#\w+', '', regex=True)

In [None]:
# phrases cleaning & punctuation removal

import re
def sentence_cleaning(sentence):
    """Lower-case a sentence, expand common contractions and strip punctuation.

    Args:
        sentence: Raw text of one tweet.

    Returns:
        The lower-cased text with contractions expanded and every
        non-alphanumeric character replaced by a space. Runs of spaces are
        not collapsed.
    """
    # Lower-case first so capitalised contractions ("Won't", "CAN'T") are
    # matched by the patterns below.
    sentence = sentence.lower()

    # Irregular contractions have to be expanded before the generic "n't"
    # rule; otherwise "won't" / "can't" degrade to "wo not" / "ca not" and
    # the dedicated rules never match.
    sentence = re.sub(r"\bwon't\b", 'will not', sentence)
    sentence = re.sub(r"\bcan't\b", 'cannot', sentence)
    sentence = re.sub(r"\bain't\b", 'am not', sentence)

    # Generic suffix contractions. The trailing \b keeps the rules from firing
    # on an apostrophe that merely opens a quoted word (e.g. 'dog').
    sentence = re.sub(r"n't\b", ' not', sentence)
    sentence = re.sub(r"'d\b", ' would', sentence)
    sentence = re.sub(r"'ll\b", ' will', sentence)
    sentence = re.sub(r"'ve\b", ' have', sentence)
    sentence = re.sub(r"'s\b", ' is', sentence)

    # Replace every remaining non-alphanumeric character (punctuation) with a space.
    sentence = re.sub(r'\W', ' ', sentence)
    return sentence

In [None]:
# Run the contraction/punctuation cleaner over every tweet, then preview the result.
df['content'] = df['content'].map(sentence_cleaning)
df.head()

Unnamed: 0,sentiment,content
0,empty,i know i was listenin to bad habit earlier a...
1,sadness,layin n bed with a headache ughhhh waitin o...
2,sadness,funeral ceremony gloomy friday
3,enthusiasm,wants to hang out with friends soon
4,neutral,we want to trade with someone who has houston...


 Removing stopwords from the tweets and reducing each word to its lemma

In [None]:
# Fetch the NLTK resources used by the cells below: the stop-word lists, the
# tokenizer models and WordNet for lemmatisation.
# 'punkt_tab' is included because recent NLTK releases load the tokenizer
# models from that package rather than from 'punkt'.
for resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(resource)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from tqdm import tqdm

In [None]:
# Stop-word sets keyed by language, filled on first use so the stop-word list
# is built once instead of once for every token.
_STOP_WORD_SETS = {}


def stop_word_removal(words, language='english'):
    """Return the tokens of ``words`` that are not stop words.

    Args:
        words: Iterable of token strings.
        language: Name of the NLTK stop-word list to filter against.
            Defaults to 'english', the list this notebook uses.

    Returns:
        A new list holding the surviving tokens in their original order.
    """
    stop_set = _STOP_WORD_SETS.get(language)
    if stop_set is None:
        # A frozenset gives constant-time membership tests; the list returned
        # by stopwords.words() would be scanned linearly for each token.
        stop_set = frozenset(stopwords.words(language))
        _STOP_WORD_SETS[language] = stop_set
    return [token for token in words if token not in stop_set]

In [None]:
# Tokenise, lemmatise and strip stop words from every tweet; the cleaned
# tweets are collected in n_phrase in the same order as the rows of df.
n_phrase = []

# The lemmatizer does not depend on the tweet being processed, so it is
# created once here rather than on every loop iteration.
lemmatizer = WordNetLemmatizer()

for x in tqdm(df['content']):
    word_tokens = word_tokenize(x)

    # lemmatizing each word in the list
    lemma = [lemmatizer.lemmatize(i) for i in word_tokens]

    # stop word removal
    cleaned_text = stop_word_removal(lemma)

    text = " ".join(cleaned_text)
    n_phrase.append(text)

100%|██████████| 40000/40000 [00:58<00:00, 687.76it/s]


In [None]:
# Overwrite the text column with the lemmatised, stop-word-free tweets and preview.
df['content'] = n_phrase
df.head()

Unnamed: 0,sentiment,content
0,empty,know wa listenin bad habit earlier started fre...
1,sadness,layin n bed headache ughhhh waitin call
2,sadness,funeral ceremony gloomy friday
3,enthusiasm,want hang friend soon
4,neutral,want trade someone ha houston ticket one


## Text Vectorization

Creating the word-to-index mapping

In [None]:
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding

In [None]:
# The cleaned tweet texts; this is the corpus the tokenizer is fitted on below.
texts = df['content']

In [None]:
# Build the vocabulary from the cleaned tweets, then encode every tweet as a
# list of integer word ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

word_map = tokenizer.word_index                     # word -> integer id
word_indices = tokenizer.texts_to_sequences(texts)  # one id list per tweet

In [None]:
# Token count of the longest encoded tweet, printed for reference.
longest = max(len(seq) for seq in word_indices)
print(longest)

# maximum length of any sequence of words be 50
seq_length = 50

40


In [None]:
# padding words
# Bring every id sequence to seq_length entries; padding='pre' places the
# padding in front of the sequence. X_data is the model input matrix.

X_data = pad_sequences(word_indices,padding='pre',maxlen=seq_length)

# print(X_data[0])

In [None]:
# Preparing the output data

In [None]:
# Distinct emotion labels present in the dataset (13 of them, see the output below).
y = df['sentiment'].unique()
print(y)

['empty' 'sadness' 'enthusiasm' 'neutral' 'worry' 'surprise' 'love' 'fun'
 'hate' 'happiness' 'boredom' 'relief' 'anger']


In [None]:
# Assign each emotion label an integer code equal to its position in y.
y_map = {label: code for code, label in enumerate(y)}

print(y_map)

{'empty': 0, 'sadness': 1, 'enthusiasm': 2, 'neutral': 3, 'worry': 4, 'surprise': 5, 'love': 6, 'fun': 7, 'hate': 8, 'happiness': 9, 'boredom': 10, 'relief': 11, 'anger': 12}


In [None]:
# Replace every emotion label with its integer code from y_map.
# NOTE(review): this overwrites the column, so the cell is meant to run once
# per loaded frame.
df['sentiment'] = df['sentiment'].map(y_map)

In [None]:
# Generating one hot encoded data for the sentiment from the above labeled data
from sklearn.preprocessing import OneHotEncoder

In [None]:
# One-hot encode the integer emotion codes: one output column per class.
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(df[['sentiment']])

# transform() yields a sparse matrix; convert it to a dense array.
res = enc.transform(df[['sentiment']]).toarray()

print(res.shape)
res

(40000, 13)


array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
# The one-hot label matrix is the training target; Y_data is the name the
# train/test split below uses for it.
Y_data = res
print(Y_data)

[[1. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


## Loading Pre-trained GloVe

We are using the 50-dimensional GloVe vectors, hence each word is represented in a 50-dimensional embedding space.

In [None]:
# to store GloVe vectors: word -> float32 vector
embeddings_dict = {}

# Decode explicitly as UTF-8 rather than relying on the platform's default
# encoding, so the vocabulary is read identically on every machine.
with open("/content/drive/My Drive/glove.6B.50d.txt", 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        vector = np.asarray(values[1:], dtype="float32")
        embeddings_dict[word] = vector
# No explicit f.close(): the with-statement has already closed the file.

print('====successfully loaded======')



Building Word Embeddings

In [None]:
# Row i of the matrix receives the GloVe vector of the word whose tokenizer id
# is i. Words with no GloVe entry keep an all-zero row. One extra row is
# allocated beyond the vocabulary size (presumably because word ids start at
# 1 and 0 is the padding value - TODO confirm).
vocab_rows = len(word_map) + 1
embedding_matrix = np.zeros((vocab_rows, 50))

for token, row in word_map.items():
    if token in embeddings_dict:
        embedding_matrix[row] = embeddings_dict[token]

print("matrix shape : ",embedding_matrix.shape)

matrix shape :  (27429, 50)


In [None]:
# Embedding layer sized to match embedding_matrix (len(word_map) + 1 rows,
# 50 columns) and initialised from it; it consumes sequences of seq_length ids.
# NOTE(review): no trainable flag is set here, and the model summary below
# reports these weights as trainable - confirm that fine-tuning is intended.
embedding_layer = Embedding(len(word_map) + 1,50, weights=[embedding_matrix],input_length=seq_length)

In [None]:
# implementing the model

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,LSTM,Flatten

In [None]:
# Embedding -> LSTM (returning the full sequence) -> Flatten -> two dense
# ReLU layers -> softmax over the emotion classes.
model = Sequential()
model.add(embedding_layer)
model.add(LSTM(units=100, return_sequences=True))
model.add(Flatten())
model.add(Dense(units=520, activation='relu'))
model.add(Dense(units=260, activation='relu'))

# The output width must equal the number of emotion classes, i.e. the number
# of columns of the one-hot label matrix. The previous len(y_data) referenced
# a name that is never defined in this notebook.
model.add(Dense(Y_data.shape[1], activation="softmax"))
model.compile(loss="categorical_crossentropy", optimizer="sgd", metrics=["accuracy"])

In [None]:
# summary() prints the table itself and returns None, so wrapping it in
# print() would add a stray "None" line after the table.
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 50, 50)            1371450   
_________________________________________________________________
lstm_3 (LSTM)                (None, 50, 100)           60400     
_________________________________________________________________
flatten_3 (Flatten)          (None, 5000)              0         
_________________________________________________________________
dense_8 (Dense)              (None, 520)               2600520   
_________________________________________________________________
dense_9 (Dense)              (None, 260)               135460    
_________________________________________________________________
dense_10 (Dense)             (None, 13)                3393      
Total params: 4,171,223
Trainable params: 4,171,223
Non-trainable params: 0
____________________________________________

In [None]:
# splitting the dataset for training 
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the samples for testing (train_size=0.8); random_state=0
# fixes the shuffle so the split is reproducible.
x_train,x_test,y_train,y_test=train_test_split(X_data,Y_data,train_size=0.8,random_state=0)

In [None]:
# Report the shape of every array produced by the split.
for caption, split in (
    ("X Training data: ", x_train),
    ("Y training data: ", y_train),
    ("X test data", x_test),
    ("Y test data", y_test),
):
    print(caption, split.shape)

X Training data:  (32000, 50)
Y training data:  (32000, 13)
X test data (8000, 50)
Y test data (8000, 13)


In [None]:
# Carve the first 8000 training samples off as the validation set and keep
# the remainder for training. Each right-hand side is evaluated before
# x_train / y_train are rebound.
x_val, x_train = x_train[:8000, :], x_train[8000:, :]
y_val, y_train = y_train[:8000, :], y_train[8000:, :]
print("validation input data:", x_val.shape)

validation input data: (8000, 50)


In [None]:
# Train for 50 epochs in mini-batches of 128, evaluating on the held-out
# validation arrays after every epoch.
model.fit(x_train,y_train,validation_data = (x_val,y_val),batch_size = 128,epochs = 50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7f38c2239d30>

In [None]:
# evaluate() returns the loss followed by the compiled metrics; accuracy is
# the only metric, so it sits at index 1.
scores = model.evaluate(x_test, y_test, verbose=0)
test_accuracy = scores[1]
print('Test accuracy:', test_accuracy)

Test accuracy: 0.7926250100135803


In [None]:
# Preparing input to predict

def prepare(s):
  """Predict the emotion class index for one raw sentence.

  The sentence goes through the same cleaning steps as the training tweets
  (URL / mention / hashtag removal, contraction and punctuation cleaning,
  lemmatisation, stop-word removal). It is then encoded with the tokenizer
  that was fitted on the training corpus, so its word ids line up with the
  rows of the embedding matrix.

  Args:
    s: Raw input text.

  Returns:
    The index of the highest-scoring class in the model output; y_map gives
    the label that corresponds to each index.
  """
  # Same regex cleanup as applied to df['content'] before training.
  text = re.sub(r'http\S+', '', s)
  text = re.sub(r'@\w+', '', text)
  text = re.sub(r'#\w+', '', text)
  text = sentence_cleaning(text)

  lemmatizer = WordNetLemmatizer()
  tokens = [lemmatizer.lemmatize(tok) for tok in word_tokenize(text)]
  text = " ".join(stop_word_removal(tokens))

  # Use the module-level tokenizer fitted on the training texts. Fitting a
  # fresh Tokenizer on the input alone would number its words 1, 2, 3, ...,
  # ids that have nothing to do with the vocabulary the model was trained on.
  word_indices = tokenizer.texts_to_sequences([text])
  X = pad_sequences(word_indices,padding='pre',maxlen=seq_length)
  arr = model.predict(X)
  return arr.argmax()

In [None]:
# Read one sentence from the user and print the predicted class index
# (y_map gives the label that corresponds to each index).
s = input()
out = prepare(s)
print(out)

In [None]:
# Display the label -> index mapping so the predicted index can be read back
# as an emotion. ("fuy_map" was a typo for y_map and raised NameError.)
y_map

{'anger': 12,
 'boredom': 10,
 'empty': 0,
 'enthusiasm': 2,
 'fun': 7,
 'happiness': 9,
 'hate': 8,
 'love': 6,
 'neutral': 3,
 'relief': 11,
 'sadness': 1,
 'surprise': 5,
 'worry': 4}