IPython notebook for emotion detection from text using an LSTM (Long Short-Term Memory) network.
The first step is preprocessing the CrowdFlower dataset.


In [None]:
#Importing the required libraries
import pandas as pd
import keras
import numpy as np
from keras.models import Sequential,Model
from keras.layers import Dense,Dropout,LSTM,Input,Bidirectional
from sklearn.model_selection import cross_val_score 
from nltk.tokenize import word_tokenize,sent_tokenize
import nltk
import re

Mounting the google drive

In [None]:
# Colab-only step: mount Google Drive at /content/drive. The dataset and the
# GloVe file read later in this notebook are both loaded from paths under
# '/content/drive/My Drive/'.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Importing the datasets from the drive

In [None]:
# Load the tweet emotion dataset from the mounted Drive into `df`, which every
# later preprocessing cell reads and updates.
df=pd.read_csv('/content/drive/My Drive/text_emotion.csv')
# Row count as a quick sanity check (40000 in the recorded run).
print(len(df))

40000


Visualising the dataset

In [None]:
# Show the first five rows of the raw frame; the recorded output reports
# four columns, with the tweet text in `content`.
print(df.head())

     tweet_id  ...                                            content
0  1956967341  ...  @tiffanylue i know  i was listenin to bad habi...
1  1956967666  ...  Layin n bed with a headache  ughhhh...waitin o...
2  1956967696  ...                Funeral ceremony...gloomy friday...
3  1956967789  ...               wants to hang out with friends SOON!
4  1956968416  ...  @dannycastillo We want to trade with someone w...

[5 rows x 4 columns]


The tweet_id and author columns are of no use to us, so we drop them.

In [None]:
# Discard the identifier and author columns so that only the emotion label
# (`sentiment`) and the tweet text (`content`) remain.
df = df.drop(columns=['tweet_id', 'author'])
print(df.head())

    sentiment                                            content
0       empty  @tiffanylue i know  i was listenin to bad habi...
1     sadness  Layin n bed with a headache  ughhhh...waitin o...
2     sadness                Funeral ceremony...gloomy friday...
3  enthusiasm               wants to hang out with friends SOON!
4     neutral  @dannycastillo We want to trade with someone w...


We now have all the required columns. Next, we preprocess the tweet text using NLP techniques.

In [None]:
# Remove URLs from the tweets.
#
# The pattern is a raw string so that `\S` reaches the regex engine unchanged,
# and regex=True is passed explicitly: pandas 2.0 changed the default of
# Series.str.replace to regex=False, under which the pattern would be treated
# literally (and a pre-compiled pattern is rejected).
df['content'] = df['content'].str.replace(r'http\S+', '', regex=True)

In [None]:
# Remove words that start with '@' (a mention of a user or page) from the tweets.
#
# Raw-string pattern plus an explicit regex=True: pandas 2.0 changed the
# default of Series.str.replace to regex=False, so the regex intent has to be
# stated for this to work across pandas versions.
df['content'] = df['content'].str.replace(r'@\w+', '', regex=True)

In [None]:
# Remove words that start with '#' (hashtags marking a trend) from the tweets.
#
# Raw-string pattern plus an explicit regex=True: pandas 2.0 changed the
# default of Series.str.replace to regex=False, so the regex intent has to be
# stated for this to work across pandas versions.
df['content'] = df['content'].str.replace(r'#\w+', '', regex=True)

In [None]:
# phrases cleaning & punctuation removal

import re

# (pattern, replacement) pairs applied in order by sentence_cleaning.
# The irregular contractions come first: if the generic "n't" rule ran before
# them, "won't" would already have become "wo not" and the dedicated rules
# could never match.
_CONTRACTION_RULES = (
    (r"\bwon't\b", "will not"),
    (r"\bcan't\b", "cannot"),
    (r"\bain't\b", "am not"),
    (r"n't", " not"),
    (r"'d", " would"),
    (r"'ll", " will"),
    (r"'ve", " have"),
    (r"'s", " is"),
)


def sentence_cleaning(sentence):
    """Expand common English contractions and strip punctuation.

    Parameters
    ----------
    sentence : str
        Raw tweet text.

    Returns
    -------
    str
        The lower-cased text with contractions expanded and every
        non-alphanumeric character (``\\W``) replaced by a single space.
        Runs of spaces are not collapsed.
    """
    # Lower-case first so that capitalised contractions ("Won't", "CAN'T")
    # are matched by the lower-case rules below.
    sentence = sentence.lower()
    for pattern, replacement in _CONTRACTION_RULES:
        sentence = re.sub(pattern, replacement, sentence)
    # '\W' removes all remaining non-alphanumeric characters (punctuation).
    return re.sub(r'\W', ' ', sentence)

In [None]:
# Run every tweet through sentence_cleaning (element-wise) and store the
# result back in the `content` column, then preview the first rows.
df['content'] = df['content'].map(sentence_cleaning)
df.head()

Unnamed: 0,sentiment,content
0,empty,i know i was listenin to bad habit earlier a...
1,sadness,layin n bed with a headache ughhhh waitin o...
2,sadness,funeral ceremony gloomy friday
3,enthusiasm,wants to hang out with friends soon
4,neutral,we want to trade with someone who has houston...


 Removing stopwords from the tweets and reducing each word to its lemma

In [None]:
# Fetch the NLTK resources used by the cells below. The 'stopwords' corpus is
# read by stop_word_removal; 'punkt' and 'wordnet' are presumably what
# word_tokenize and WordNetLemmatizer rely on -- confirm against the installed
# NLTK version.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from tqdm import tqdm

In [None]:
_ENGLISH_STOP_WORDS = None  # filled lazily by _english_stop_words()


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def stop_word_removal(words, stop_words=None):
    """Return the tokens of ``words`` that are not stop words, in order.

    Parameters
    ----------
    words : iterable of str
        Tokens of a single tweet.
    stop_words : collection of str, optional
        Words to filter out. Defaults to NLTK's English stop-word list.

    Returns
    -------
    list of str
        The tokens that are not in ``stop_words``.
    """
    if stop_words is None:
        # Previously stopwords.words('english') was re-evaluated and scanned
        # as a list for every single token; a cached set makes each membership
        # test constant-time and loads the list only once.
        stop_words = _english_stop_words()
    return [word for word in words if word not in stop_words]

In [None]:
# Cleaned tweets, in the same order as df['content'].
n_phrase = []

# The lemmatizer does not depend on the tweet being processed, so it is
# created once here instead of once per loop iteration.
lemmatizer = WordNetLemmatizer()

for x in tqdm(df['content']):
    word_tokens = word_tokenize(x)

    # lemmatizing each word in the list
    lemma = [lemmatizer.lemmatize(i) for i in word_tokens]

    # stop word removal
    cleaned_text = stop_word_removal(lemma)

    # Re-join the surviving tokens into a single space-separated string.
    text = " ".join(cleaned_text)
    n_phrase.append(text)

100%|██████████| 40000/40000 [01:07<00:00, 588.89it/s]


In [None]:
# Overwrite the `content` column with the lemmatised, stop-word-free tweets
# collected in n_phrase (one entry per row, in row order), then preview.
df.loc[:,'content'] = n_phrase
df.head()

Unnamed: 0,sentiment,content
0,empty,know wa listenin bad habit earlier started fre...
1,sadness,layin n bed headache ughhhh waitin call
2,sadness,funeral ceremony gloomy friday
3,enthusiasm,want hang friend soon
4,neutral,want trade someone ha houston ticket one


## Text Vectorization

Creating the word-to-index mapping

In [None]:
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding

In [None]:
# The cleaned tweet strings; this is the corpus the tokenizer is fitted on.
texts = df['content']

In [None]:
# Indexing words: fit a Keras Tokenizer on the cleaned tweets and encode them.

tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
# One list of integer word ids per tweet.
word_indices = tokenizer.texts_to_sequences(texts)
# Mapping word -> integer id; used later to fill the embedding matrix row by row.
word_map = tokenizer.word_index

# Debugging aid (very large output -- inspect a slice such as word_indices[:3]):
# print(word_indices)
# print(word_map)

[[1, 56, 1, 26, 3153, 2, 120, 4441, 817, 9, 1, 588, 1124, 27, 176, 535], [8879, 259, 142, 23, 5, 415, 3385, 2170, 16, 41, 282], [2643, 4009, 1698, 228], [426, 2, 681, 33, 23, 206, 168], [48, 82, 2, 3154, 23, 244, 166, 100, 2394, 623, 19, 38, 53, 30], [94, 5813, 113, 67, 10, 7, 40, 2, 831, 1350, 6, 1456, 67, 10, 37, 6, 206], [1, 126, 25, 133, 19, 60, 10, 358, 61, 91, 224, 226, 166, 1, 82, 19, 88, 4, 1544, 32, 236, 66, 88, 426, 17, 83, 8880], [903, 4, 171], [12804, 6, 44, 1, 93, 7], [1, 20, 119, 27, 337, 8, 4, 228], [180, 771, 611], [6973, 16, 101, 8881], [276, 1, 13, 2, 1077, 31, 371, 290, 2, 35, 2, 3, 144, 1928], [73, 55, 210, 3, 2086, 11, 742, 55, 30, 2786, 50, 12805, 8, 4, 196, 143, 435, 9, 435, 318, 1, 21, 210, 12806], [45, 3, 489], [3, 1457, 4, 90, 9, 3, 4442, 4, 302], [2504], [18, 935, 110, 9, 8, 4, 10, 158, 15, 318, 1, 589, 504, 110], [718, 2787, 1792, 61, 10, 172, 1587, 124, 101, 437, 12807, 21, 18, 7, 56], [77, 36, 7, 3155, 15, 1, 13, 187, 363, 7, 50, 12808, 67, 1, 332, 87, 236

In [None]:
# Report the length of the longest encoded tweet. It is kept under its own
# name so that `seq_length` only ever holds an integer length (previously it
# briefly held the longest sequence itself, a list).
longest_sequence = max(word_indices, key=len)
print(len(longest_sequence))

# Maximum length of any sequence of words is fixed at 50, which leaves some
# headroom over the longest tweet reported above.
seq_length = 50

45


In [None]:
# Padding: bring every encoded tweet to exactly seq_length entries.
# padding='pre' puts the filler zeros in front of the word ids, as the
# recorded output for the first tweet shows.

X_data = pad_sequences(word_indices,padding='pre',maxlen=seq_length)

# Debugging aid: show the padded encoding of the first tweet.
# print(X_data[0])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    1   56    1   26 3153    2  120 4441
  817    9    1  588 1124   27  176  535]


## Loading Pre-trained GloVe

We are using the 50D GloVe vectors, hence each word is represented in a 50D embedding space.

In [None]:
# to store GloVe vectors: word -> float32 numpy vector
embeddings_dict = {}

# The encoding is given explicitly so that reading does not depend on the
# platform's default encoding.
with open("/content/drive/My Drive/glove.6B.50d.txt", 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        # Each line is "<word> <v1> <v2> ...": the first field is the token,
        # the remaining fields are the components of its vector.
        word = values[0]
        vector = np.asarray(values[1:], dtype="float32")
        embeddings_dict[word] = vector
# No explicit f.close() needed: the `with` block has already closed the file.

print('====successfully loaded======')



Building Word Embeddings

In [None]:
# Build the embedding lookup table: one 50-dimensional row per tokenizer id,
# plus one extra row so that index 0 (not assigned to any word) stays all-zero.
vocab_rows = len(word_map) + 1
embedding_matrix = np.zeros((vocab_rows, 50))

for word, idx in word_map.items():
    vector = embeddings_dict.get(word)
    if vector is None:
        # No pre-trained GloVe vector for this word: leave its row as zeros.
        continue
    embedding_matrix[idx] = vector

print("matrix shape : ", embedding_matrix.shape)

matrix shape :  (29962, 50)


In [None]:
# Size the layer from the matrix it is initialised with. The original call
# used `word_index`, a name that is never defined in this notebook (the
# tokenizer vocabulary is stored in `word_map`), so it raised NameError on a
# fresh kernel. Reading the shape also keeps the layer dimensions in step
# with the pre-trained weights.
vocab_size, embedding_dim = embedding_matrix.shape
embedding_layer = Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], input_length=seq_length)

In [None]:
# implementing the model