IPython notebook for emotion detection from text using an LSTM (Long Short-Term Memory) network.
The first step is preprocessing the CrowdFlower dataset.


In [None]:
#Importing the required libraries
import pandas as pd
import keras
import numpy as np
from keras.models import Sequential,Model
from keras.layers import Dense,Dropout,LSTM,Input,Bidirectional
from sklearn.model_selection import cross_val_score 
from nltk.tokenize import word_tokenize,sent_tokenize
import nltk
import re

Mounting Google Drive

In [None]:
# Mount Google Drive at /content/drive so the dataset file can be read below.
# google.colab is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Loading the dataset from Google Drive

In [None]:
# Read the emotion dataset CSV from the mounted Drive and print its row count.
df=pd.read_csv('/content/drive/My Drive/text_emotion.csv')
print(len(df))

40000


Visualising the dataset

In [None]:
# Preview the first rows of the raw data (plain-text rendering via print).
print(df.head())

     tweet_id  ...                                            content
0  1956967341  ...  @tiffanylue i know  i was listenin to bad habi...
1  1956967666  ...  Layin n bed with a headache  ughhhh...waitin o...
2  1956967696  ...                Funeral ceremony...gloomy friday...
3  1956967789  ...               wants to hang out with friends SOON!
4  1956968416  ...  @dannycastillo We want to trade with someone w...

[5 rows x 4 columns]


The `tweet_id` and `author` columns are of no use to us, so we drop them.

In [None]:
# Discard the identifier columns; only the label and the tweet text remain.
unused_columns = ['tweet_id', 'author']
df = df.drop(columns=unused_columns)
print(df.head())

    sentiment                                            content
0       empty  @tiffanylue i know  i was listenin to bad habi...
1     sadness  Layin n bed with a headache  ughhhh...waitin o...
2     sadness                Funeral ceremony...gloomy friday...
3  enthusiasm               wants to hang out with friends SOON!
4     neutral  @dannycastillo We want to trade with someone w...


We now have all the required columns. Next, we apply NLP techniques to preprocess the text.

In [None]:
# Remove URLs from the tweets: "http" followed by any run of non-whitespace.
# regex=True is passed explicitly because pandas >= 2.0 treats the pattern as
# a literal string by default (and rejects compiled patterns in that mode).
# The raw string keeps "\S" from being parsed as a string escape.
df['content'] = df['content'].str.replace(r'http\S+', '', regex=True)

In [None]:
# Remove words that start with '@' (a user or page mention) from the tweets.
# regex=True is passed explicitly because pandas >= 2.0 treats the pattern as
# a literal string by default (and rejects compiled patterns in that mode).
# The raw string keeps "\w" from being parsed as a string escape.
df['content'] = df['content'].str.replace(r'@\w+', '', regex=True)

In [None]:
# Remove words that start with '#' (a hashtag / trend marker) from the tweets.
# Note that the whole tag is dropped, not just the '#' character.
# regex=True is passed explicitly because pandas >= 2.0 treats the pattern as
# a literal string by default (and rejects compiled patterns in that mode).
# The raw string keeps "\w" from being parsed as a string escape.
df['content'] = df['content'].str.replace(r'#\w+', '', regex=True)

In [None]:
# phrases cleaning & punctuation removal

import re
# Ordered (pattern, replacement) pairs applied by sentence_cleaning.
# Order matters: the irregular contractions must be expanded BEFORE the
# generic "n't" rule, otherwise "won't" is rewritten to "wo not" and the
# dedicated rule can never match. The trailing \b on the suffix rules stops
# them from firing on an opening quote, e.g. 'super' -> " isuper".
_CONTRACTION_RULES = (
    (r"\bwon't\b", "will not"),
    (r"\bcan't\b", "cannot"),
    (r"\bain't\b", "am not"),
    (r"n't\b", " not"),
    (r"'d\b", " would"),
    (r"'ll\b", " will"),
    (r"'ve\b", " have"),
    # NOTE(review): this also rewrites possessives ("john's" -> "john is");
    # kept because the two cases cannot be told apart by pattern alone.
    (r"'s\b", " is"),
)


def sentence_cleaning(sentence):
    """Expand common English contractions, strip punctuation and lowercase.

    Parameters
    ----------
    sentence : str
        Raw text of a single tweet.

    Returns
    -------
    str
        The text with contractions expanded, every non-word character
        replaced by a single space, and all letters lowercased. Runs of
        spaces are not collapsed.
    """
    for pattern, replacement in _CONTRACTION_RULES:
        # IGNORECASE so "Can't" / "I'D" are expanded too; the final lower()
        # normalises the casing of whatever text remains.
        sentence = re.sub(pattern, replacement, sentence, flags=re.IGNORECASE)

    # '\W' matches every character that is not a letter, digit or underscore,
    # so this removes punctuation (including any leftover apostrophes).
    sentence = re.sub(r'\W', ' ', sentence)
    return sentence.lower()

In [None]:
# Run the contraction / punctuation cleaner over every tweet, element by element.
cleaned_content = df['content'].map(sentence_cleaning)
df['content'] = cleaned_content
df.head()

Unnamed: 0,sentiment,content
0,empty,i know i was listenin to bad habit earlier a...
1,sadness,layin n bed with a headache ughhhh waitin o...
2,sadness,funeral ceremony gloomy friday
3,enthusiasm,wants to hang out with friends soon
4,neutral,we want to trade with someone who has houston...


 Removing stopwords from the tweets and reducing each word to its lemma

In [None]:
# Fetch the NLTK resources used below: the stop-word list, the tokenizer
# model and WordNet (for the lemmatizer). These calls must stay active so the
# notebook runs end-to-end on a fresh runtime; nltk.download skips packages
# that are already up to date.
# NOTE(review): newer NLTK releases may additionally need 'punkt_tab' for
# word_tokenize -- confirm against the installed version.
for resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(resource, quiet=True)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from tqdm import tqdm

In [None]:
# Lazily-built cache of the English stop-word list. Filled on first use so
# the corpus is loaded once rather than once per token.
_ENGLISH_STOPWORDS = None


def stop_word_removal(words, stop_words=None):
    """Return the tokens of ``words`` that are not stop words, in order.

    Parameters
    ----------
    words : iterable of str
        Tokens of a single sentence.
    stop_words : container of str, optional
        Words to filter out. When omitted, NLTK's English stop-word list is
        used (loaded once and cached as a frozenset for fast membership tests).

    Returns
    -------
    list of str
        The tokens that survived the filter.
    """
    global _ENGLISH_STOPWORDS
    if stop_words is None:
        if _ENGLISH_STOPWORDS is None:
            # The original code re-evaluated stopwords.words('english') and
            # scanned the resulting list for every single token.
            _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
        stop_words = _ENGLISH_STOPWORDS
    return [word for word in words if word not in stop_words]

In [None]:
n_phrase = []

# Created once and reused for every tweet instead of being rebuilt on each
# loop iteration.
lemmatizer = WordNetLemmatizer()

for x in tqdm(df['content']):
    word_tokens = word_tokenize(x)

    # Stop-word removal runs BEFORE lemmatization. Lemmatizing first rewrites
    # stop words such as "was" / "has" to "wa" / "ha", which are no longer in
    # the stop-word list and therefore leak into the cleaned text.
    cleaned_text = stop_word_removal(word_tokens)

    # Reduce each remaining token to its lemma.
    lemma = [lemmatizer.lemmatize(i) for i in cleaned_text]

    text = " ".join(lemma)
    n_phrase.append(text)

100%|██████████| 40000/40000 [01:08<00:00, 582.21it/s]


In [None]:
# Overwrite the 'content' column with the processed phrases built in the loop
# above (n_phrase holds one string per row, in row order), then preview.
df.loc[:,'content'] = n_phrase
df.head()

Unnamed: 0,sentiment,content
0,empty,know wa listenin bad habit earlier started fre...
1,sadness,layin n bed headache ughhhh waitin call
2,sadness,funeral ceremony gloomy friday
3,enthusiasm,want hang friend soon
4,neutral,want trade someone ha houston ticket one
