## 1. Importing the libraries

In [1]:
import pandas as pd
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords 
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import WordNetLemmatizer
import warnings
warnings.filterwarnings('ignore')
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.neural_network import MLPClassifier
import numpy as np
import gensim

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\easha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## 2. Importing the data

In [2]:
#Training dataset
train=pd.read_csv(r'C:\Users\easha\Desktop\CMU\Fall2020\Intro to AI\HW\HW4\empatheticdialogues\train.csv',error_bad_lines=False)

b'Skipping line 2355: expected 8 fields, saw 10\nSkipping line 36628: expected 8 fields, saw 12\nSkipping line 49433: expected 8 fields, saw 10\nSkipping line 56957: expected 8 fields, saw 10\nSkipping line 65019: expected 8 fields, saw 10\n'


In [3]:
#Testing dataset
test=pd.read_csv(r'C:\Users\easha\Desktop\CMU\Fall2020\Intro to AI\HW\HW4\empatheticdialogues\test.csv',error_bad_lines=False)

b'Skipping line 3: expected 8 fields, saw 9\nSkipping line 5: expected 8 fields, saw 9\nSkipping line 8: expected 8 fields, saw 9\nSkipping line 10: expected 8 fields, saw 9\nSkipping line 12: expected 8 fields, saw 9\nSkipping line 14: expected 8 fields, saw 9\nSkipping line 16: expected 8 fields, saw 9\nSkipping line 18: expected 8 fields, saw 9\nSkipping line 20: expected 8 fields, saw 9\nSkipping line 22: expected 8 fields, saw 9\nSkipping line 25: expected 8 fields, saw 9\nSkipping line 27: expected 8 fields, saw 9\nSkipping line 30: expected 8 fields, saw 9\nSkipping line 32: expected 8 fields, saw 9\nSkipping line 35: expected 8 fields, saw 9\nSkipping line 37: expected 8 fields, saw 9\nSkipping line 39: expected 8 fields, saw 9\nSkipping line 41: expected 8 fields, saw 9\nSkipping line 43: expected 8 fields, saw 9\nSkipping line 45: expected 8 fields, saw 9\nSkipping line 48: expected 8 fields, saw 9\nSkipping line 50: expected 8 fields, saw 9\nSkipping line 53: expected 8 fiel

## 3. Data Transformation

In [4]:
#original shape
print('Train Dataset:', train.shape)
print('Test Dataset:',test.shape)

Train Dataset: (76668, 8)
Test Dataset: (5701, 8)


**We do the following below:**
1. Keep only 2 columns: context and utterance. We remove all other columns
2. For the purposes of this analysis, we will only consider the following list of sentiments: sad, jealous, joyful, terrified. We drop all other sentiment values. 


In [5]:
train=train.loc[train['context'].isin(['sad','jealous','joyful','terrified']),['context','utterance']]
test=test.loc[test['context'].isin(['sad','jealous','joyful','terrified']),['context','utterance']]
#updated shape
print('Train Dataset:',train.shape)
print('Test Dataset:',test.shape)

Train Dataset: (9796, 2)
Test Dataset: (720, 2)


## 4. Data Pre-processing

**In this phase, we do the following:**
1. Remove punctuations
2. Convert words to lowercase
3. Lemmatization

**Additional considerations: We can also perform lemmatization here**

In [6]:
# Remove punctuations
train['utterance_cleaned'] = train['utterance'].map(lambda x: re.sub('[*_,\.!?\']', ' ', x))# https://medium.com/@arunm8489/getting-started-with-natural-language-processing-6e593e349675
# https://machinelearningmastery.com/clean-text-machine-learning-python/

test['utterance_cleaned'] = test['utterance'].map(lambda x: re.sub('[*_,\.!?\']', ' ', x))# https://medium.com/@arunm8489/getting-started-with-natural-language-processing-6e593e349675


In [7]:
# Convert words to lowercase
train['utterance_cleaned']=train['utterance_cleaned'].str.lower()
test['utterance_cleaned']=test['utterance_cleaned'].str.lower()

In [9]:
# Lemmatization
lemmatizer = WordNetLemmatizer()
def lemmatizeString(valString):
    if valString != None: 
        arrayOfWords = valString.split(' ')
        arrayOfWordsLemm=[lemmatizer.lemmatize(word) for word in arrayOfWords]
        return (" ").join(arrayOfWordsLemm)

In [10]:
#calling the lemmatization function
train['utterance_cleaned']=train['utterance_cleaned'].map(lambda x: lemmatizeString(x))
test['utterance_cleaned']=test['utterance_cleaned'].map(lambda x: lemmatizeString(x))

On some analysis of the lemmatization outputs, I noted that lemmatization does not work if we don't give context i.e. use POS functionality in lemmatization. To tackle this, we can do two things:
1. Either use stemming to simply cut the words to their root forms (even if they aren't valid words in the english language). 
2. Use POS functionality in the lemmatization function above. 

Even though the 2nd option makes more sense, it is more complex to understand and verify. But at the same time, stemming can also lead to creation of words that do not make semantic sense, like converting always to alway, nervous to nervou. For now, I will continue without performing either. 

In [11]:
train_cleaned=train.loc[:,['context','utterance_cleaned']]
test_cleaned=test.loc[:,['context','utterance_cleaned']]

In [12]:
train_cleaned=train_cleaned[train_cleaned['utterance_cleaned']!='']
test_cleaned=test_cleaned[test_cleaned['utterance_cleaned']!='']

**Creating a sparse bag of words**
We can do this in either of the following ways:
1. Create a bag of words using count vectorizer and then convert that to a sparse matrix (OHE)
2. Directly use the sklearn library for one hot encoding

In [14]:
#Before creating a sparse vector
print('*'*5,'Train Dataset','*'*5)
print("**Before creating a sparse vector**\nNumber of features:",train_cleaned.shape[1])

***** Train Dataset *****
**Before creating a sparse vector**
Number of features: 2


In [15]:
#Train
train_count_vectorizer = CountVectorizer()
X = train_count_vectorizer.fit_transform(train_cleaned['utterance_cleaned'])
encoding = X.toarray()

In [16]:
#After creating a sparse vector
print("After creating a sparse vector:\n Number of features (including the response variable):",encoding.shape[1]+1)

After creating a sparse vector:
 Number of features (including the response variable): 6989


**We can see that using a count vectorizer led to a explosion in the dimensionality of the dataset from 2 features (one of which is the label), to 11000 features (again, one of which is the label).**

In [17]:
X_train=pd.DataFrame(encoding,columns=train_count_vectorizer.get_feature_names())
Y_train=train_cleaned['context']

**Problem with this approach:**
1. Without removing the stop words, we noticed here that words like 'is', 'and' that occur a lot and have no meaning/ contribution to our problem
2. High dimensionality of the feature set
3. We weight each word the same. This means that not just the stop words, but also the rare words (occur very rarely but may have high value/importance) and frequent words (words that occur fairly enough times and has some value for our task.

**For this we will do the following:**
1. Remove stop words and remake the BOW matrix of features
2. Perform tf-idf vectorization

### Step 1: Remove stop words and remake the matrix of BOW Features

Important Note: The word comma occurs frequently and has no contribution to our problem/solution for that matter. Therefore, we can add this to the list of stop words. 

In [18]:
stopwords_list = list(set(stopwords.words('english')))
stopwords_list.extend(['comma', ''])  

In [19]:
# Remove stop words
def removeStopWords(valString):
    arrayOfWords=valString.split(" ") #splitting the sentence into an array of words
    arrayOfWordsCleaned=[word for word in arrayOfWords if word not in stopwords_list]
    return (' ').join(arrayOfWordsCleaned) #joining the cleaned words into a sentence separated by spaces. 

In [20]:
#calling the function to remove these words
train_cleaned['utterance_cleaned']=train_cleaned['utterance_cleaned'].map(lambda x: removeStopWords(x))
test_cleaned['utterance_cleaned']=test_cleaned['utterance_cleaned'].map(lambda x: removeStopWords(x))

In [21]:
#using count vectorizer again to produce the vector of features
train_count_vectorizer2 = CountVectorizer()
X2 = train_count_vectorizer2.fit_transform(train_cleaned['utterance_cleaned'])
encoding2 = X2.toarray()



X2_test = train_count_vectorizer2.transform(test_cleaned['utterance_cleaned'])
encoding2_test = X2_test.toarray()

In [22]:
X_train2=pd.DataFrame(encoding2,columns=train_count_vectorizer2.get_feature_names())
Y_train2=train_cleaned['context']

X_test2=pd.DataFrame(encoding2_test,columns=train_count_vectorizer2.get_feature_names())
Y_test2=test_cleaned['context']

In [23]:
print('*'*5,'Train','*'*5)
print('X:',X_train2.shape,',Y:',Y_train2.shape)

print('*'*5,'Test','*'*5)
print('X:',X_test2.shape,',Y:',Y_test2.shape)

***** Train *****
X: (9796, 6891) ,Y: (9796,)
***** Test *****
X: (720, 6891) ,Y: (720,)


In [24]:
print("Removing the stop words led to a decrease in feature dimensionality by",X_train.shape[1]-X_train2.shape[1],"features in the training set.")

Removing the stop words led to a decrease in feature dimensionality by 97 features in the training set.


### Step 2: Perform TF-IDF vectorization

In [25]:
#train
train_tfidf_transformer = TfidfTransformer(smooth_idf=True,use_idf=True)
train_embedding_tfidf_transformer = train_tfidf_transformer.fit_transform(X2) #passing the count vector transformed object 
encoding3=train_embedding_tfidf_transformer.toarray()

#test
test_embedding_tfidf_transformer = train_tfidf_transformer.transform(X2_test) #passing the count vector transformed object 
encoding3_test=test_embedding_tfidf_transformer.toarray()

In [26]:
#final TF-IDF train set ready
X_train3=pd.DataFrame(encoding3,columns=train_count_vectorizer2.get_feature_names())
Y_train3=train_cleaned['context']

#final TF-IDF test set ready
X_test3=pd.DataFrame(encoding3_test,columns=train_count_vectorizer2.get_feature_names(),index=test_cleaned.index)
Y_test3=test_cleaned['context']

In [27]:
print('*'*5,'Train','*'*5)
print('X:',X_train3.shape,',Y:',Y_train3.shape)
print('*'*5,'Test','*'*5)
print('X:',X_test3.shape,',Y:',Y_test3.shape)

***** Train *****
X: (9796, 6891) ,Y: (9796,)
***** Test *****
X: (720, 6891) ,Y: (720,)


## Stochastic Gradient Descent Model Training and Testing

In [28]:
#fitting a stochastic gradient descent model
clf=SGDClassifier(max_iter=1000, penalty='l1')
clf.fit(X_train3,Y_train3)
test_predicted_labels=clf.predict(X_test3)

In [29]:
print('Test accuracy :', np.mean(Y_test3 == test_predicted_labels))
f1_score_vector = f1_score(Y_test3, test_predicted_labels, average=None)
print('F1 score :', np.mean(Y_test3 == test_predicted_labels))
print('Confusion matrix :', confusion_matrix(Y_test3, test_predicted_labels))
print('f1 score using SGD classifier is :', np.mean(f1_score_vector))

Test accuracy : 0.6361111111111111
F1 score : 0.6361111111111111
Confusion matrix : [[106  33  25  19]
 [ 18 127  28  14]
 [ 28  31 112  24]
 [ 11  16  15 113]]
f1 score using SGD classifier is : 0.6375261906212876


In [30]:
#Printing confusion matrix as a dataframe
conf=pd.DataFrame(confusion_matrix(Y_test3, test_predicted_labels), index=clf.classes_, columns=clf.classes_)

In [31]:
totalSum=conf.sum(axis=1)
totalSum

jealous      183
joyful       187
sad          195
terrified    155
dtype: int64

In [32]:
for i in conf.index:
    conf.loc[i,:]/=totalSum[i]

In [33]:
conf

Unnamed: 0,jealous,joyful,sad,terrified
jealous,0.579235,0.180328,0.136612,0.103825
joyful,0.096257,0.679144,0.149733,0.074866
sad,0.14359,0.158974,0.574359,0.123077
terrified,0.070968,0.103226,0.096774,0.729032


**Interpreting the confusion matrix:**
We note that the model does an okay job (subjective), since we see that the true positives are correctly identified as true positive 54%-80% of the times (depending the label). The classification of the label 'terrified' is the most accurate (correctly identifying 80% of Positives as Positives) and that of 'jealous' is the least (identifying 56% of the Positives as Positives). 

In [34]:
#examples of misclassification
labelTable=pd.DataFrame({'Test Prediction':test_predicted_labels,'Actual Label':Y_test3}, index=Y_test3.index)
misclassification_cases=labelTable.loc[labelTable['Test Prediction']!=labelTable['Actual Label'],:]

In [35]:
misclassification_cases

Unnamed: 0,Test Prediction,Actual Label
10,joyful,sad
11,terrified,sad
81,jealous,sad
95,jealous,sad
96,jealous,sad
...,...,...
5613,jealous,sad
5614,jealous,sad
5648,joyful,jealous
5666,terrified,jealous


In [36]:
listOfMisclassifiedIndex=list(misclassification_cases.index)

In [37]:
word2vec_Misclassified=pd.DataFrame(X_test3,index=listOfMisclassifiedIndex)

In [38]:
originalDataMisclassified=pd.DataFrame(test_cleaned,index=listOfMisclassifiedIndex)

In [39]:
originalDataMisclassified['Prediction']=misclassification_cases['Test Prediction']

In [40]:
originalDataMisclassified.head(10)

Unnamed: 0,context,utterance_cleaned,Prediction
10,sad,wa born premature home hard time breathing ins...,joyful
11,sad,yes believe god prayer goodness gracious pleas...,terrified
81,sad,wa around 11 took hard went ahead let stay hom...,jealous
95,sad,one saddest thing people underestimate and/or ...,jealous
96,sad,perfectly natural sound like kind person thoug...,jealous
111,jealous,met old flame recently go expected,sad
113,jealous,know right warning anything turned nice guy th...,terrified
160,joyful,ha true ;-) also year bug life came always tho...,terrified
183,sad,today went outside check new banana plant noti...,joyful
184,sad,think problem worried drainage issue going che...,terrified


In [41]:
listOfMisclassifiedUtterance=list(originalDataMisclassified['utterance_cleaned'])
listOfMisclassifiedUtterance[:10]

['wa born premature home hard time breathing instead taking doctor parent praying',
 'yes believe god prayer goodness gracious please take child hospital let god heal doctor',
 'wa around 11 took hard went ahead let stay home school knew well day',
 'one saddest thing people underestimate and/or capable',
 'perfectly natural sound like kind person though quickly regained bearing disappointment like usually come good lesson',
 'met old flame recently go expected',
 'ha true ;-) also year bug life came always thought young one "dot" spunky little girl ant movie',
 'today went outside check new banana plant noticed new leaf wa turning brown',
 'think problem worried drainage issue going check back couple day']

It's difficult to identify at an immediate glance what the issue in terms of why the model made wrong predictions in these cases. Taking a few examples :
1. **Example 1**: Last value in the dataframe above is actually sad, but classified as terrified. The utterance is as follows: 'think problem worried drainage issue going check back couple day'. It classified this label as terrified possibly due to the presence of certain words like worried, problem, issue that indicate a connotation more negative than sad i.e. terrified. 
2. **Example 2**: Index 160 in the dataframe above, where the context was actually joyful, but was predicted as terrified. The utterance was : 'ha true ;-) also year bug life came always thought young one "dot" spunky little girl ant movie'. This could be because the model failed to recognize the smiley and it's context/meaning. 

Problem is that our model fails to take into consideration context i.e. words before and after the word in consideration. This is where word2vec can help us. It can try to identify word from context or context from word. This will help improve our model. 

## Word2Vec

Now, we will perform transformation of data using word2vec word embeddings. We will then use this data as input to train and test our Multi layer perceptron model. 

In [42]:
# Loading the pretrained word2vec model from Google
word2vec = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [43]:
train_cleaned=train_cleaned[train_cleaned['utterance_cleaned']!='']
test_cleaned=test_cleaned[test_cleaned['utterance_cleaned']!='']

In [44]:
def word2vec_transformation(df):
    w2v_processed_text = pd.DataFrame((df['utterance_cleaned'].apply(lambda x: [word2vec[word] if word in word2vec else np.zeros(300) for word in x.split()])).apply(lambda x: sum(x)/len(x)).apply(lambda x: np.array(x)).values.tolist(),index=df.index)
    return w2v_processed_text

In [45]:
Xtrain_data_word2vec=pd.DataFrame(word2vec_transformation(train_cleaned))
Xtest_data_word2vec=pd.DataFrame(word2vec_transformation(test_cleaned))

In [46]:
Y_train_word2vec= train_cleaned['context']
Y_test_word2vec= test_cleaned['context']

In [47]:
mlp_clf=MLPClassifier()
mlp_clf.fit(Xtrain_data_word2vec,Y_train_word2vec)
test_predicted_labels_mlp=mlp_clf.predict(Xtest_data_word2vec)

In [48]:
print('Test accuracy :', np.mean(Y_test_word2vec == test_predicted_labels_mlp))
f1_score_vector = f1_score(Y_test_word2vec, test_predicted_labels_mlp, average=None)
print('F1 score :', np.mean(Y_test_word2vec == test_predicted_labels_mlp))
print('f1 score using MLP classifier is :', np.mean(f1_score_vector))

Test accuracy : 0.6416666666666667
F1 score : 0.6416666666666667
f1 score using MLP classifier is : 0.6424266242085908


In [49]:
#Printing confusion matrix as a dataframe
conf=pd.DataFrame(confusion_matrix(Y_test_word2vec, test_predicted_labels_mlp), index=mlp_clf.classes_, columns=mlp_clf.classes_)

In [50]:
#Confusion matrix
conf

Unnamed: 0,jealous,joyful,sad,terrified
jealous,102,39,23,19
joyful,18,112,36,21
sad,21,23,131,20
terrified,9,11,18,117
