In [None]:
import gensim.downloader as api
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
import json
import pandas as pd
import numpy as np
import warnings

In [None]:
# Read the GoEmotions dump from disk.
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's default encoding (which differs between systems).
with open("/content/goemotions.json", encoding="utf-8") as f:
    data = json.load(f)

# Turn the parsed records into a three-column dataframe.
# NOTE(review): assumes every record is a [post, emotion, sentiment] triple — confirm against the file.
df = pd.DataFrame(data, columns = ['Post', 'Emotion', 'Sentiment'])

In [None]:
# Extract each dataframe column as a plain Python list (despite the *Dict
# names, all three are lists); the unpacking order follows the column tuple.
postsDict, emotionDict, sentimentDict = (
    df[column].values.tolist() for column in ('Post', 'Emotion', 'Sentiment')
)

In [None]:
# 3.1
# Load the embedding model through gensim's downloader API; the identifier
# below selects which model is fetched.
pretrainedModelName = 'word2vec-google-news-300'
corpusNews = api.load(pretrainedModelName)



In [None]:
# Tokenize every post once; tokenPost[i] holds the tokens of postsDict[i].
tokenPost = [word_tokenize(post) for post in postsDict]

# Total number of tokens across all posts.
tokenCount = sum(len(postTokens) for postTokens in tokenPost)
# 3.2
print(f'The total number of tokens is {tokenCount}')

The total number of tokens is 2642128


In [None]:
# Build one embedding per post by averaging the vectors of its tokens.
# Tokens that the model has no vector for (punctuation, unknown words) are
# skipped and counted in `failed`.
failed = 0          # number of tokens without a vector in corpusNews
averageEmb = []     # scratch buffer: vectors found for the post being processed
averageEmbAll = []  # one averaged vector per post, in the same order as tokenPost

# Take the dimensionality from the loaded model rather than hard-coding 300,
# so the zero-vector fallback always matches the real embedding size.
embeddingSize = corpusNews.vector_size

for postTokens in tokenPost:
  for token in postTokens:
    try:
      averageEmb.append(corpusNews[token])
    except KeyError:
      # Token is not in the model's vocabulary.
      failed += 1
  if averageEmb:
    # Element-wise mean over all vectors found for this post.
    averageEmbAll.append(np.mean(averageEmb, axis=0))
  else:
    # No token of this post was found: fall back to an all-zero float vector
    # so every row has the same length and a floating-point dtype.
    averageEmbAll.append(np.zeros(embeddingSize, dtype=np.float32))
  # Reset the buffer for the next post (np.mean already copied the data).
  averageEmb.clear()
# 3.3
print(f'The failed count is: {failed}')

# 3.4
# Share of tokens that had a vector, in percent. Guarded so an empty dataset
# reports 0.0 instead of raising ZeroDivisionError.
hitRate = ((tokenCount - failed) / tokenCount) * 100 if tokenCount else 0.0
print(f'The hit rate is as follows: {hitRate}')

The failed count is: 595783
The hit rate is as follows: 77.45063827339175


In [None]:
# Create (or truncate) the text file that receives the BASE model results.
# The handle stays open so a later cell can write to it.
performanceTxt = open(file="performanceTxt.text", mode="w")

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Splitting data 80% train 20% test.
# The embeddings and both label lists are split together so rows stay aligned.
# The result is only unpacked into the six variables: it must NOT also be
# assigned to `train_test_split`, because that would replace the imported
# function with a list and make this cell fail the next time it is run.
# A fixed random_state makes the split (and every score derived from it)
# reproducible across kernel restarts.
(post_train, post_test,
 emotion_train, emotion_test,
 sentiment_train, sentiment_test) = train_test_split(
    averageEmbAll, emotionDict, sentimentDict,
    test_size = 0.2,
    random_state = 42,
)

In [None]:
# 3.5 Base-MLP with the default parameters
# (early_stopping=True is the only argument passed explicitly.)
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(early_stopping=True)
# Time one training run on the emotion labels.
%time mlp.fit(post_train, emotion_train)
# Time one training run of the SAME estimator object on the sentiment labels.
# NOTE(review): with scikit-learn's default warm_start=False a second fit()
# presumably discards the emotion model trained on the line above, so after
# this cell `mlp` is a sentiment model only — confirm before scoring with it.
%time mlp.fit(post_train, sentiment_train)

CPU times: user 1min 50s, sys: 55 s, total: 2min 45s
Wall time: 1min 27s
CPU times: user 1min 50s, sys: 54.1 s, total: 2min 44s
Wall time: 1min 25s


MLPClassifier(early_stopping=True)

In [None]:
from sklearn import metrics

In [None]:
# BASE model predictions on the held-out posts, one task after the other.
# The single `mlp` estimator is trained on the emotion labels, used to
# predict, then trained again on the sentiment labels and used to predict.
mlp.fit(post_train, emotion_train)
emotion_pred = mlp.predict(post_test)

mlp.fit(post_train, sentiment_train)
sentiment_pred = mlp.predict(post_test)

In [None]:
# Write the accuracy and the classification report of the BASE MLP to its results file.
#
# Accuracy is computed from the stored predictions of each task. Calling
# mlp.score() here would be wrong for the emotion task: `mlp` was last fitted
# on the sentiment labels, so scoring it against emotion_test compares
# sentiment predictions with emotion labels.
emotionAccuracy = metrics.accuracy_score(emotion_test, emotion_pred) * 100
sentimentAccuracy = metrics.accuracy_score(sentiment_test, sentiment_pred) * 100

# Restrict each report to the labels that actually occur in the test split;
# zero_division=0 reports 0 for classes that were never predicted.
emotionReport = metrics.classification_report(emotion_test, emotion_pred, labels=np.unique(emotion_test), zero_division=0)
sentimentReport = metrics.classification_report(sentiment_test, sentiment_pred, labels=np.unique(sentiment_test), zero_division=0)

# Using the already-open handle as a context manager flushes and closes it
# when the block ends (even on error), so the report is actually on disk.
# Re-run the cell that opens the file before running this cell again.
with performanceTxt:
    performanceTxt.write(" The following is the information for the Base MLP Classifier \n  ")
    performanceTxt.write(f" This is the Accuracy of the Tests for the emotion model \n {emotionAccuracy} \n  ")
    performanceTxt.write(f" This is the Accuracy of the Tests for the sentiment model \n {sentimentAccuracy} \n  ")
    performanceTxt.write(f" This is the Classification Report for the emotion model \n {emotionReport} \n  ")
    performanceTxt.write(f" This is the Classification Report for the sentiment model \n {sentimentReport} \n  ")
print("Wrote everything to the file good job!")

Wrote everything to the file good job!


In [None]:
# 3.6 Top-MLP using GridSearchCV
from sklearn.exceptions import ConvergenceWarning
# Suppress ConvergenceWarning for the rest of the session
# (presumably raised often because training is capped at max_iter=15 below).
warnings.simplefilter('ignore', ConvergenceWarning)
from sklearn.model_selection import GridSearchCV
# Hyper-parameter grid: 4 activations x 2 hidden-layer layouts x 2 solvers = 16 candidates.
param = {'activation' : ['logistic', 'tanh', 'relu', 'identity'], 'hidden_layer_sizes' : [(10,30,10), (50, 30)], 'solver' : ['adam', 'sgd']}
# Grid search over an early-stopping MLP; verbose=True prints the per-iteration
# loss and validation score of every fit into the cell output.
mlp_gscv = GridSearchCV(MLPClassifier(early_stopping=True, max_iter=15, verbose=True), param)
# Time a full grid search on the emotion labels.
%time mlp_gscv.fit(post_train, emotion_train)
# Time a full grid search of the SAME search object on the sentiment labels.
# NOTE(review): this second fit() presumably replaces the emotion search
# results, leaving `mlp_gscv` holding the sentiment search only — confirm.
%time mlp_gscv.fit(post_train, sentiment_train)

Iteration 1, loss = 2.84003125
Validation score: 0.330363
Iteration 2, loss = 2.71751437
Validation score: 0.330363
Iteration 3, loss = 2.70527536
Validation score: 0.330363
Iteration 4, loss = 2.61776422
Validation score: 0.344821
Iteration 5, loss = 2.56422390
Validation score: 0.347367
Iteration 6, loss = 2.54041863
Validation score: 0.348186
Iteration 7, loss = 2.51928924
Validation score: 0.349095
Iteration 8, loss = 2.50405409
Validation score: 0.351096
Iteration 9, loss = 2.49078832
Validation score: 0.351641
Iteration 10, loss = 2.47880161
Validation score: 0.352278
Iteration 11, loss = 2.46730088
Validation score: 0.354006
Iteration 12, loss = 2.45710518
Validation score: 0.355370
Iteration 13, loss = 2.44796014
Validation score: 0.356734
Iteration 14, loss = 2.43981158
Validation score: 0.359371
Iteration 15, loss = 2.43276500
Validation score: 0.360553
Iteration 1, loss = 2.87070120
Validation score: 0.320178
Iteration 2, loss = 2.71605607
Validation score: 0.320178
Iteratio

GridSearchCV(estimator=MLPClassifier(early_stopping=True, max_iter=15,
                                     verbose=True),
             param_grid={'activation': ['logistic', 'tanh', 'relu', 'identity'],
                         'hidden_layer_sizes': [(10, 30, 10), (50, 30)],
                         'solver': ['adam', 'sgd']})

In [None]:
# TOP model predictions on the held-out posts, one task after the other.
# The single grid-search object is fitted on the emotion labels, used to
# predict, then fitted again on the sentiment labels and used to predict.
mlp_gscv.fit(post_train, emotion_train)
emotion_pred = mlp_gscv.predict(post_test)

mlp_gscv.fit(post_train, sentiment_train)
sentiment_pred = mlp_gscv.predict(post_test)

Iteration 1, loss = 2.82949095
Validation score: 0.322452
Iteration 2, loss = 2.71665939
Validation score: 0.322452
Iteration 3, loss = 2.71313658
Validation score: 0.322452
Iteration 4, loss = 2.68847530
Validation score: 0.322452
Iteration 5, loss = 2.62642297
Validation score: 0.334546
Iteration 6, loss = 2.57916376
Validation score: 0.336910
Iteration 7, loss = 2.55652699
Validation score: 0.340184
Iteration 8, loss = 2.54148749
Validation score: 0.340729
Iteration 9, loss = 2.52415925
Validation score: 0.342093
Iteration 10, loss = 2.50769185
Validation score: 0.344367
Iteration 11, loss = 2.49421599
Validation score: 0.345094
Iteration 12, loss = 2.48220457
Validation score: 0.346276
Iteration 13, loss = 2.47091252
Validation score: 0.348641
Iteration 14, loss = 2.45981777
Validation score: 0.351369
Iteration 15, loss = 2.45027904
Validation score: 0.351641
Iteration 1, loss = 2.83703532
Validation score: 0.314631
Iteration 2, loss = 2.71367050
Validation score: 0.314631
Iteratio

In [None]:
# Create (or truncate) the text file that receives the TOP model results.
# The handle stays open so a later cell can write to it.
performanceTop = open(file="performanceTop.txt", mode="w")

In [None]:
# Write the accuracy and the classification report of the TOP (grid-searched) MLP to its results file.
#
# Accuracy is computed from the stored predictions of each task. Scoring with
# `mlp` here would report the BASE model instead of the grid-searched one, and
# a single re-fitted estimator can only be scored against the labels it was
# last trained on, so the predictions are the reliable source for both tasks.
emotionAccuracy = metrics.accuracy_score(emotion_test, emotion_pred) * 100
sentimentAccuracy = metrics.accuracy_score(sentiment_test, sentiment_pred) * 100

# Restrict each report to the labels that actually occur in the test split;
# zero_division=0 reports 0 for classes that were never predicted.
emotionReport = metrics.classification_report(emotion_test, emotion_pred, labels=np.unique(emotion_test), zero_division=0)
sentimentReport = metrics.classification_report(sentiment_test, sentiment_pred, labels=np.unique(sentiment_test), zero_division=0)

# Using the already-open handle as a context manager flushes and closes it
# when the block ends (even on error), so the report is actually on disk.
# Re-run the cell that opens the file before running this cell again.
with performanceTop:
    performanceTop.write(" The following is the information for the Top MLP Classifier \n  ")
    performanceTop.write(f" This is the Accuracy of the Tests for the emotion model \n {emotionAccuracy} \n  ")
    performanceTop.write(f" This is the Accuracy of the Tests for the sentiment model \n {sentimentAccuracy} \n  ")
    performanceTop.write(f" This is the Classification Report for the emotion model \n {emotionReport} \n  ")
    performanceTop.write(f" This is the Classification Report for the sentiment model \n {sentimentReport} \n  ")
print("Wrote everything to the file good job!")

Wrote everything to the file good job!
