In [2]:
import pandas as pd
import numpy as np

import re
import nltk
# nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
stop_words = list(stopwords.words('english'))

import spacy
nlp = spacy.load("en_core_web_sm", disable=["tagger","parser", "ner"])


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import recall_score, accuracy_score, classification_report
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.inspection import permutation_importance

import matplotlib.pyplot as plt

## Import pre-processed data

**TODO:** wrap this step in a function that imports the pre-processed data for modelling. For now, the data is simply read from a previously saved CSV file.


In [3]:
# Load the previously saved, pre-processed dataset from disk.
data = pd.read_csv("data/lemmatized_dev_data.csv")

# Distinct values of the 'label' column, in ascending order; displayed as the cell output.
emotions = sorted(data['label'].unique().tolist())
emotions

['anger', 'fear', 'joy', 'love', 'sadness', 'surprise']

In [4]:
# Hold-out split of the processed text and its labels, stratified on the label
# column and seeded so the same split is produced on every run.
# NOTE(review): test_size=0.9 sends 90% of the rows to the test split and keeps
# only 10% for training -- confirm this is intentional.
text_column = data['process_text']
label_column = data['label']
X_train, X_test, y_train, y_test = train_test_split(
    text_column,
    label_column,
    test_size=0.9,
    stratify=label_column,
    random_state=0,
)

### TF-IDF Vectorization

In [5]:
# TF-IDF vectorization returns sparse matrices; this step converts them to dense
# arrays for pipeline stages that need dense input.
class DenseTransformer(TransformerMixin, BaseEstimator):
    """Stateless pipeline step that turns a sparse matrix into a dense array.

    Inheriting from ``BaseEstimator`` (listed after the mixin, following the
    scikit-learn convention) supplies ``get_params`` / ``set_params`` and the
    estimator tags, so the step can be cloned, inspected and rendered like any
    other scikit-learn estimator.
    """

    def fit(self, X, y=None, **fit_params):
        """Do nothing and return ``self``; the transformer learns no state."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return ``X`` as a dense array.

        Objects exposing ``toarray`` (e.g. SciPy sparse matrices) are
        densified with it; anything else is passed through ``np.asarray`` so
        input that is already dense does not raise ``AttributeError``.
        """
        if hasattr(X, "toarray"):
            return X.toarray()
        return np.asarray(X)

    def fit_transform(self, X, y=None, **fit_params):
        """Fit (a no-op) and return the dense version of ``X``."""
        return self.fit(X, y, **fit_params).transform(X)

### Naive Bayes (Multinomial NB)

In [11]:
# Multinomial Naive Bayes on TF-IDF unigram/bigram features.
# MultinomialNB accepts sparse matrices, so the TF-IDF output is passed to it
# directly; densifying the train and test matrices first only costs memory.
nb_pipe = Pipeline([
    ('tfidf', TfidfVectorizer(min_df=1e-3, max_df=0.6, stop_words=stop_words, ngram_range=(1, 2))),
    ('clf', MultinomialNB()),
])
nb_pipe.fit(X_train, y_train)
y_pred = nb_pipe.predict(X_test)

# scikit-learn metrics take their arguments as (y_true, y_pred).
print(f"Naive Bayes accuracy is: {accuracy_score(y_test, y_pred):0.2f}.\n")
print(classification_report(y_test, y_pred, labels=emotions))

Naive Bayes accuracy is: 0.84.

              precision    recall  f1-score   support

       anger       0.93      0.77      0.84     41666
        fear       0.87      0.71      0.78     35328
         joy       0.79      0.96      0.87    100875
        love       0.92      0.50      0.65     24431
     sadness       0.85      0.94      0.89     86749
    surprise       0.94      0.32      0.48     10591

    accuracy                           0.84    299640
   macro avg       0.88      0.70      0.75    299640
weighted avg       0.85      0.84      0.83    299640



### Logistic Regression

In [12]:
# One-vs-rest logistic regression on TF-IDF unigram/bigram features.
# The 'dense' step is kept here because the permutation-importance cell further
# down consumes the output of logreg_pipe[:-1].
logreg_pipe = Pipeline([
    ('tfidf', TfidfVectorizer(min_df=1e-3, max_df=0.75, stop_words=stop_words, ngram_range=(1, 2))),
    ('dense', DenseTransformer()),
    ('clf', OneVsRestClassifier(LogisticRegression())),
])
logreg_pipe.fit(X_train, y_train)
y_pred = logreg_pipe.predict(X_test)

# scikit-learn metrics take their arguments as (y_true, y_pred).
print(f"Multi-class Logistic Regression accuracy is: {accuracy_score(y_test, y_pred):0.2f}.\n")
print(classification_report(y_test, y_pred, labels=emotions))


Multi-class Logistic Regression accuracy is: 0.88.

              precision    recall  f1-score   support

       anger       0.91      0.83      0.87     41666
        fear       0.87      0.78      0.82     35328
         joy       0.85      0.95      0.90    100875
        love       0.85      0.72      0.78     24431
     sadness       0.90      0.93      0.92     86749
    surprise       0.85      0.64      0.73     10591

    accuracy                           0.88    299640
   macro avg       0.87      0.81      0.84    299640
weighted avg       0.88      0.88      0.87    299640



In [13]:
# Table of the n_max terms with the largest positive coefficient for each class.
ovr_clf = logreg_pipe.named_steps['clf']
feature_names = logreg_pipe.named_steps['tfidf'].get_feature_names_out()

n_max = 15

# Pair every binary estimator with its label through classes_ rather than by
# position in `emotions`, so the table stays correct even if the two orderings
# ever diverge.
top_terms = {}
for class_label, binary_clf in zip(ovr_clf.classes_, ovr_clf.estimators_):
    weights = binary_clf.coef_.ravel()
    # Stable sort on the negated weights: descending order, ties kept in feature order.
    top_idx = np.argsort(-weights, kind="stable")[:n_max]
    top_terms[class_label] = feature_names[top_idx]

# Columns follow `emotions`; a label without a fitted estimator shows up as NaN.
res = pd.DataFrame(top_terms, columns=emotions)

res

Unnamed: 0,anger,fear,joy,love,sadness,surprise
0,mad,apprehensive,resolved,tender,dull,impressed
1,dangerous,paranoid,acceptable,longing,aching,shocked
2,bothered,pressured,trusting,nostalgic,groggy,curious
3,envious,shaky,worthwhile,sympathetic,jaded,surprised
4,distracted,reluctant,clever,naughty,shitty,dazed
5,rude,intimidated,smug,caring,homesick,funny
6,dissatisfied,hesitant,friendly,romantic,vain,amazed
7,violent,terrified,innocent,gentle,discouraged,stunned
8,petty,unsure,sincere,supportive,needy,strange
9,bitter,vulnerable,convinced,horny,gloomy,weird


In [54]:
# Permutation importance of each TF-IDF feature for the fitted classifier step.
# Apply every pipeline step except the classifier to get its input matrix.
tmp_x_train = logreg_pipe[:-1].transform(X_train)

# Every feature needs its own shuffle-and-rescore pass, which is what makes this
# cell slow. The passes are spread over all cores (n_jobs=-1) and each one scores
# a random 25% subsample of the rows (max_samples); random_state makes the
# shuffles and the subsample repeatable.
results = permutation_importance(
    logreg_pipe[-1],
    tmp_x_train,
    y_train,
    scoring='accuracy',
    n_repeats=1,
    n_jobs=-1,
    max_samples=0.25,
    random_state=0,
)

# Mean importance per feature, in the column order of tmp_x_train.
# Not printed feature by feature: there is one entry per TF-IDF term.
important_features = results.importances_mean

KeyboardInterrupt: 