In [None]:
import gensim.downloader as api
import nltk
# Download the 'punkt' resource through NLTK's downloader.
# NOTE(review): presumably the tokenizer data word_tokenize relies on — confirm.
nltk.download('punkt')
from nltk.tokenize import word_tokenize

In [None]:
import json
import pandas as pd
import numpy as np
import warnings

In [None]:
# Read the dataset from disk. The encoding is pinned to UTF-8 so that decoding
# does not depend on the platform's default locale encoding.
with open("/content/goemotions.json", encoding="utf-8") as f:
    data = json.load(f)

# Import the parsed JSON into a dataframe with one column per field.
# NOTE(review): assumes every record holds exactly (post, emotion, sentiment) in
# that order — confirm against the JSON file.
df = pd.DataFrame(data, columns = ['Post', 'Emotion', 'Sentiment'])

In [None]:
# Pull each dataframe column out as a plain Python list.
# (Despite the "Dict" suffix, all three names are bound to lists.)
postsDict, emotionDict, sentimentDict = (
    df[column].values.tolist() for column in ('Post', 'Emotion', 'Sentiment')
)

In [None]:
# 3.1
# Load the 'word2vec-google-news-300' model through the gensim downloader API.
corpusNews = api.load('word2vec-google-news-300')

In [None]:
# Tokenize every post; tokenPost keeps one token list per post, in post order
tokenPost = [word_tokenize(post) for post in postsDict]

# Total number of tokens over all posts
tokenCount = sum(len(postTokens) for postTokens in tokenPost)
# 3.2
print(f'The total number of tokens is {tokenCount}')

In [None]:
# Build one embedding per post by averaging the word2vec vectors of its tokens.
# `failed` counts every token (punctuation or word) that is not found in the model.
failed = 0
averageEmbAll = []

# Read the vector width from the loaded model instead of hard-coding 300, so the
# zero-vector fallback always has the same width as the real embeddings.
embeddingSize = corpusNews.vector_size

for postTokens in tokenPost:
  # Fresh list for every post, so vectors can never carry over between posts
  averageEmb = []
  for x in postTokens:
    try:
      averageEmb.append(corpusNews[x])
    except KeyError:
      # Token not found in the model: count it and move on
      failed += 1
  if averageEmb:
    averageEmbAll.append(np.average(averageEmb, axis = 0))
  else:
    # None of this post's tokens were found: fall back to an all-zero vector
    averageEmbAll.append(np.zeros(embeddingSize))
# 3.3
print(f'The failed count is: {failed}')

# 3.4
# Percentage of tokens that were found in the model. Guarded so that an empty
# corpus (tokenCount == 0) reports 0.0 instead of raising ZeroDivisionError.
hitRate = ((tokenCount - failed) / tokenCount) * 100 if tokenCount else 0.0
print(f'The hit rate is as follows: {hitRate}')

In [None]:
# Open the report file for the Base MLP results in write mode ("w" truncates an existing file).
# NOTE(review): no matching close() is visible in this part of the notebook — confirm the handle is closed later.
# NOTE(review): the ".text" extension differs from "performanceTop.txt" used for the Top report — confirm it is intended.
performanceTxt = open("performanceTxt.text", "w")

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Splitting data 80% train 20% test.
# The three inputs are split with the same shuffle, so embeddings, emotion labels
# and sentiment labels stay aligned row by row.
# The result is unpacked directly: the previous chained assignment also rebound the
# name `train_test_split` to the returned list, which made the cell fail on re-run.
# No random_state is passed, so the split differs between runs.
post_train, post_test, emotion_train, emotion_test, sentiment_train, sentiment_test = train_test_split(
    averageEmbAll, emotionDict, sentimentDict, test_size = 0.2
)

In [None]:
# 3.5 Base-MLP with the default parameters
from sklearn.neural_network import MLPClassifier
# Apart from early_stopping=True, every hyper-parameter is left at its default.
mlp = MLPClassifier(early_stopping=True)
# Time one training run per target. Both runs reuse the same `mlp` object, so after
# this cell it holds the sentiment fit only (scikit-learn's fit() starts from scratch
# on each call unless warm_start is enabled).
%time mlp.fit(post_train, emotion_train)
%time mlp.fit(post_train, sentiment_train)

In [None]:
from sklearn import metrics

In [None]:
# Emotion and Sentiment prediction for BASE.
# The single `mlp` estimator is refitted on each target immediately before predicting;
# once this cell finishes, `mlp` holds the sentiment fit.
mlp.fit(post_train, emotion_train)
emotion_pred = mlp.predict(post_test)

mlp.fit(post_train, sentiment_train)
sentiment_pred = mlp.predict(post_test)

In [None]:
# Write the accuracy and the classification report of the Base MLP to the report file.
# Accuracy is computed from the stored predictions instead of mlp.score(): `mlp` is one
# shared estimator that was last fitted on the sentiment labels, so scoring it against
# emotion_test would compare sentiment predictions with emotion labels.
performanceTxt.write(f" The following is the information for the Base MLP Classifier \n  " )
performanceTxt.write(f" This is the Accuracy of the Tests for the emotion model \n {metrics.accuracy_score(emotion_test, emotion_pred) * 100} \n  " )
performanceTxt.write(f" This is the Accuracy of the Tests for the sentiment model \n {metrics.accuracy_score(sentiment_test, sentiment_pred) * 100} \n  " )
performanceTxt.write(f" This is the Classification Report for the emotion model \n {metrics.classification_report(emotion_test, emotion_pred,labels=np.unique(emotion_test), zero_division=0)} \n  " )
performanceTxt.write(f" This is the Classification Report for the sentiment model \n {metrics.classification_report(sentiment_test, sentiment_pred,labels=np.unique(sentiment_test), zero_division=0)} \n  " )
# Push the buffered text to disk now; the handle stays open for any later writes.
performanceTxt.flush()

In [None]:
# 3.6 Top-MLP using GridSearchCV
from sklearn.exceptions import ConvergenceWarning
warnings.simplefilter('ignore', ConvergenceWarning)
from sklearn.model_selection import GridSearchCV
param = {'activation' : ['logistic', 'tanh', 'relu', 'identity'], 'hidden_layer_sizes' : [(10,30,10), (50, 30)], 'solver' : ['adam', 'sgd']}
mlp_gscv = GridSearchCV(MLPClassifier(early_stopping=True, max_iter=15, verbose=True), param)
%time mlp_gscv.fit(post_train, emotion_train)
%time mlp_gscv.fit(post_train, sentiment_train)

In [None]:
# Emotion and Sentiment prediction for TOP.
# The grid search is run again on each target immediately before predicting;
# once this cell finishes, `mlp_gscv` holds the sentiment search.
mlp_gscv.fit(post_train, emotion_train)
emotion_pred = mlp_gscv.predict(post_test)

mlp_gscv.fit(post_train, sentiment_train)
sentiment_pred = mlp_gscv.predict(post_test)

In [None]:
# Open the report file for the Top MLP results in write mode ("w" truncates an existing file).
# NOTE(review): no matching close() is visible in this part of the notebook — confirm the handle is closed later.
performanceTop = open("performanceTop.txt", "w")

In [None]:
# Write the accuracy and the classification report of the Top MLP to the report file.
# Accuracy is computed from the predictions of the grid-searched model. Calling
# mlp.score() here would report the Base model's numbers in the Top report, and would
# score a sentiment-fitted model against the emotion labels.
performanceTop.write(f" The following is the information for the Top MLP Classifier \n  " )
performanceTop.write(f" This is the Accuracy of the Tests for the emotion model \n {metrics.accuracy_score(emotion_test, emotion_pred) * 100} \n  " )
performanceTop.write(f" This is the Accuracy of the Tests for the sentiment model \n {metrics.accuracy_score(sentiment_test, sentiment_pred) * 100} \n  " )
performanceTop.write(f" This is the Classification Report for the emotion model \n {metrics.classification_report(emotion_test, emotion_pred, labels=np.unique(emotion_test), zero_division=0)} \n  " )
performanceTop.write(f" This is the Classification Report for the sentiment model \n {metrics.classification_report(sentiment_test, sentiment_pred, labels=np.unique(sentiment_test), zero_division=0)} \n  " )
# Push the buffered text to disk now; the handle stays open for any later writes.
performanceTop.flush()