In [2]:
import pandas as pd
import numpy as np

import re
import nltk
# nltk.download('stopwords')  # uncomment and run once if the NLTK stopwords corpus is not available locally
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
# NLTK's English stopword list as a plain Python list; passed to TfidfVectorizer(stop_words=...) in the model cells.
stop_words = list(stopwords.words('english'))

import spacy
# spaCy's small English pipeline, loaded with the tagger, parser and NER components disabled.
# NOTE(review): `nlp` is not referenced in the visible cells -- confirm whether it is still needed.
nlp = spacy.load("en_core_web_sm", disable=["tagger","parser", "ner"])


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import recall_score, accuracy_score, classification_report
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.inspection import permutation_importance

import matplotlib.pyplot as plt

## Import pre-processed data

**TODO:** wrap this step in a function that imports the pre-processed data for modelling. For now, the data is read directly from a previously saved CSV file.


In [3]:
# Read the pre-processed development set that an earlier step saved to disk.
data = pd.read_csv("data/lemmatized_dev_data.csv")

# Distinct emotion labels in ascending order; reused as the `labels=` argument
# of the classification reports and as column names of the feature table.
emotions = sorted(data['label'].unique().tolist())
emotions

['anger', 'fear', 'joy', 'love', 'sadness', 'surprise']

In [14]:
# Hold out 30% of the rows for evaluation. Stratifying on the label keeps the
# class proportions the same in both partitions, and the fixed seed makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data['process_text'],
    data['label'],
    test_size=0.3,
    stratify=data['label'],
    random_state=0,
)

### TF IDF Vectorization

In [15]:
# Create a DenseTransformer since TF-IDF vectorization returns sparse matrices
class DenseTransformer(TransformerMixin, BaseEstimator):
    """Stateless pipeline step that turns a sparse feature matrix into a dense array.

    The class derives from ``BaseEstimator`` (in addition to ``TransformerMixin``)
    so that it exposes ``get_params`` / ``set_params``.  Without them a pipeline
    containing this step cannot be cloned, which breaks ``GridSearchCV``,
    ``cross_val_score`` and similar model-selection utilities.
    """

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned from the data; return ``self`` so calls can be chained."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return ``X`` as a dense array.

        Objects exposing ``toarray`` (e.g. SciPy sparse matrices) are densified
        with it; anything else is assumed to be dense already and is passed
        through ``np.asarray`` instead of raising ``AttributeError``.
        """
        if hasattr(X, "toarray"):
            return X.toarray()
        return np.asarray(X)

    def fit_transform(self, X, y=None, **fit_params):
        """Fit (a no-op) and densify ``X`` in a single call."""
        return self.fit(X, y, **fit_params).transform(X)

### Naive Bayes (Multinomial NB)

In [16]:
# Pipeline: TF-IDF over unigrams and bigrams -> dense array -> multinomial Naive Bayes.
# With float values, min_df / max_df are document-frequency proportions:
# terms in fewer than 0.1% or more than 60% of the training documents are dropped.
nb_pipe = Pipeline([
    ('tfidf', TfidfVectorizer(min_df=1e-3, max_df=0.6, stop_words=stop_words, ngram_range=(1, 2))),
    ('dense', DenseTransformer()),
    ('clf', MultinomialNB()),
])
nb_pipe.fit(X_train, y_train)
y_pred = nb_pipe.predict(X_test)

# scikit-learn metrics take (y_true, y_pred); keep the same order in both calls.
print(f"Naive Bayes accuracy is: {accuracy_score(y_test, y_pred):0.2f}.\n")
print(classification_report(y_test, y_pred, labels=emotions))

Naive Bayes accuracy is: 0.86.

              precision    recall  f1-score   support

       anger       0.94      0.81      0.87     13889
        fear       0.88      0.75      0.81     11776
         joy       0.82      0.96      0.88     33625
        love       0.91      0.57      0.70      8144
     sadness       0.88      0.95      0.91     28916
    surprise       0.88      0.46      0.61      3530

    accuracy                           0.86     99880
   macro avg       0.88      0.75      0.80     99880
weighted avg       0.87      0.86      0.86     99880



### Logistic Regression

In [20]:
# Pipeline: TF-IDF over unigrams and bigrams -> dense array -> one-vs-rest logistic regression.
# The vectorizer settings are the same as in the Naive Bayes pipeline so the two
# models are compared on identical features.
logreg_pipe = Pipeline([
    ('tfidf', TfidfVectorizer(min_df=1e-3, max_df=0.6, stop_words=stop_words, ngram_range=(1, 2))),
    # Kept dense: the permutation-importance cell feeds logreg_pipe[:-1] output
    # straight into permutation_importance.
    ('dense', DenseTransformer()),
    ('clf', OneVsRestClassifier(LogisticRegression())),
])
logreg_pipe.fit(X_train, y_train)
y_pred = logreg_pipe.predict(X_test)

# scikit-learn metrics take (y_true, y_pred); keep the same order in both calls.
print(f"Multi-class Logistic Regression accuracy is: {accuracy_score(y_test, y_pred):0.2f}.\n")
print(classification_report(y_test, y_pred, labels=emotions))


Multi-class Logistic Regression accuracy is: 0.89.

              precision    recall  f1-score   support

       anger       0.91      0.86      0.88     13889
        fear       0.86      0.80      0.83     11776
         joy       0.87      0.94      0.90     33625
        love       0.82      0.77      0.80      8144
     sadness       0.93      0.93      0.93     28916
    surprise       0.79      0.73      0.76      3530

    accuracy                           0.89     99880
   macro avg       0.87      0.84      0.85     99880
weighted avg       0.89      0.89      0.89     99880



In [21]:
feature_names = logreg_pipe.named_steps['tfidf'].get_feature_names_out()
ovr_clf = logreg_pipe.named_steps['clf']

# Number of top-weighted terms to list per emotion.
n_max = 15

# estimators_[k] is the binary model for classes_[k].  Pairing the two through
# classes_ (rather than through the separately built `emotions` list) keeps each
# column labelled with the class its coefficients actually belong to.
# For every class, take the n_max features with the largest positive
# coefficients, i.e. the terms that push hardest towards that emotion.
res = pd.DataFrame({
    label: feature_names[np.argsort(-est.coef_.ravel(), kind="stable")[:n_max]]
    for label, est in zip(ovr_clf.classes_, ovr_clf.estimators_)
})

res

Unnamed: 0,anger,fear,joy,love,sadness,surprise
0,dissatisfied,apprehensive,acceptable,tender,groggy,dazed
1,greedy,shaken,resolved,sympathetic,disheartened,shocked
2,dangerous,frantic,mellow,horny,needy,stunned
3,envious,hesitant,sincere,delicate,abused,impressed
4,irritable,pressured,smug,naughty,discontent,amazed
5,distracted,distressed,triumphant,longing,jaded,curious
6,rebellious,reluctant,trusting,loyal,vain,surprised
7,petty,fearful,casual,gentle,gloomy,funny
8,bothered,intimidated,virtuous,devoted,homesick,overwhelmed
9,bitchy,shaky,intelligent,caring,rotten,amazing


In [19]:
# Run the feature steps once so the permutation loop works on the TF-IDF matrix
# and only the classifier is re-scored for every shuffled column.
tmp_x_train = logreg_pipe[:-1].transform(X_train)

# Shuffling every feature over the complete training matrix does not finish in
# reasonable time (the previous run was interrupted), so each permutation is
# scored on a random subsample, features are processed in parallel, and the
# seed is fixed so the importances are reproducible.
n_perm_samples = min(5000, tmp_x_train.shape[0])
results = permutation_importance(
    logreg_pipe[-1], tmp_x_train, y_train, scoring='accuracy',
    n_repeats=1, max_samples=n_perm_samples, random_state=0, n_jobs=-1,
)
# # get important features:
# important_features = results.importances_mean
# # list all features:
# # for i,v in enumerate(important_features):
# #     print('Feature: %0d, Score: %.5f' % (i,v))

KeyboardInterrupt: 