In [1]:
#import libraries
from DiscoverData import DiscovData
from DataPreparation import DataPreparation

from DLModeling import Modeling
from MLModeling import DecisionTreeModel
from MLModeling import LogisticRegressionModel

import pandas as pd # For DataFrame and handling
import seaborn as sns # High level plotting

#to split the dataset into random train and test subsets
from sklearn.model_selection import train_test_split

#NLP
import nltk
#from textblob import TextBlob

#Keras
import keras
from  keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

#Modeling
from keras.models import Sequential
from keras.layers import LSTM,Bidirectional,Dense,Embedding,Dropout
 

In [2]:
#Loading data
#https://www.kaggle.com/wcukierski/enron-email-dataset
emails = pd.read_csv('emails.csv', skiprows=lambda x:x%9)


In [3]:
#to see how emails look like
print(emails['message'][1])



Message-ID: <7391389.1075855378477.JavaMail.evans@thyme>
Date: Fri, 4 May 2001 11:26:00 -0700 (PDT)
From: phillip.allen@enron.com
To: tim.heizenrader@enron.com
Subject: 
Mime-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit
X-From: Phillip K Allen
X-To: Tim Heizenrader <Tim Heizenrader/Enron@EnronXGate>
X-cc: 
X-bcc: 
X-Folder: \Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Sent Mail
X-Origin: Allen-P
X-FileName: pallen (Non-Privileged).pst

Tim,

mike grigsby is having problems with accessing the west power site.  Can you please make sure he has an active password.  

Thank you,

Phillip


In [4]:
#object of class DataPreparation
prep = DataPreparation(emails)

#to extract emails' body then add it to a new column
emails['Email'] = prep.bodyExtraction(emails['message'])

#labeling, adding label to each column.
emails['Sentiment'] = prep.labeling(emails['Email'])

#creating a separate dataset with just two columns body and sentiment
df = prep.newData(emails['Email'], emails['Sentiment'])


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57489 entries, 0 to 57488
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Email      57489 non-null  object
 1   Sentiment  57489 non-null  object
dtypes: object(2)
memory usage: 898.4+ KB


In [6]:
#after extracting the body
print(df['Email'][1])

Tim,

mike grigsby is having problems with accessing the west power site.  Can you please make sure he has an active password.  

Thank you,

Phillip


In [7]:
#split into train test sets
X = df['Email'] #Extracting data attributes
y = df['Sentiment'] # Extracting target/class labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)


In [8]:
# Create an object of DiscovData class inside DiscoverData.py file.
dis = DiscovData(df)

In [9]:
# To figure out the target distribution
dis.targDistribut(y_train)

normal      16477
joy         12845
anger        8464
sadness      4337
fear          625
love          187
surprise      181
Name: Sentiment, dtype: int64

In [25]:
# to plot training and testing data 
dis.sentimentPlolt(y_train, y_test)

NameError: name 'plt' is not defined

In [10]:
# to find missing values:
dis.isNan()

Email        0
Sentiment    0
dtype: int64

In [37]:
# print head
dis.firstObs()

Unnamed: 0,Email,Sentiment
0,1. login: pallen pw: ke9davis\n\n I don't thi...,normal
1,"Tim,\n\nmike grigsby is having problems with a...",sadness
2,---------------------- Forwarded by Phillip K ...,anger
3,---------------------- Forwarded by Phillip K ...,normal
4,"Jeff,\n\n I need to see the site plan for Burn...",joy


In [38]:
#statistics:
dis.discoverData(X_train)

Unnamed: 0,0
count,43116.0
mean,1839.024
std,11347.88
min,1.0
25%,285.0
50%,774.0
75%,1762.0
max,2011422.0


In [11]:
#Sentiment Analysis Preparing 
#Using The tokenizer Class to convert the sentences into word vectors

tokenizer=Tokenizer(199431,lower=True,oov_token='UNK')
tokenizer.fit_on_texts(X_train)
len(tokenizer.word_index)

# training preparation:
Xtrain =  tokenizer.texts_to_sequences(X_train)
X_train_pad = pad_sequences(Xtrain, maxlen=80,padding='post')
ytrain = y_train.replace({'joy':0,'anger':1,'love':2,'sadness':3,'fear':4,'surprise':5, 'normal':6})
Ytrain= ytrain.values

#One hot Encoding the Emotion Values
Y_train_f=to_categorical(Ytrain) #Converts a class vector (integers) to binary class matrix.

# valedation preparation:
ytest= y_test.replace({'joy':0,'anger':1,'love':2,'sadness':3,'fear':4,'surprise':5, 'normal':6})
X_val_f=tokenizer.texts_to_sequences(X_test)
X_val_pad=pad_sequences(X_val_f,maxlen=80,padding='post')
Y_val_f=to_categorical(ytest)

In [12]:
#Deep Learning
dModel = Modeling(df)
history = dModel.LSTM( X_train_pad,Y_train_f, X_val_pad,Y_val_f)


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 48506, 64)         12798272  
_________________________________________________________________
dropout (Dropout)            (None, 48506, 64)         0         
_________________________________________________________________
bidirectional (Bidirectional (None, 48506, 160)        92800     
_________________________________________________________________
bidirectional_1 (Bidirection (None, 320)               410880    
_________________________________________________________________
dense (Dense)                (None, 7)                 2247      
Total params: 13,304,199
Trainable params: 13,304,199
Non-trainable params: 0
_________________________________________________________________
None


In [24]:
dModel.predict(tokenizer, str(input('Enter a sentence : ')))

The emotion predicted is normal


In [18]:
#Machain Learning
mModelDT = DecisionTreeModel(df)
mModelDT.model(X_train_pad, Y_train_f, X_val_pad, Y_val_f)

(('Accuracy Score on train data: ', 0.9687818907134242),
 ('Accuracy Score on test data: ', 0.527586446809991))

In [19]:
mModelLR = LogisticRegressionModel(df)
mModelLR.model(X_train_pad, ytrain, X_val_pad, ytest)

(('Accuracy Score on train data: ', 0.4023796270526023),
 ('Accuracy Score on test data: ', 0.4001252348152787))