In [3]:
import pandas as pd
import numpy as np

import re
import nltk
# nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
stop_words = list(stopwords.words('english'))

import spacy
nlp = spacy.load("en_core_web_sm", disable=["tagger","parser", "ner"])


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import recall_score, accuracy_score, classification_report
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.inspection import permutation_importance
from sklearn.svm import LinearSVC, SVC

import matplotlib.pyplot as plt
import seaborn as sns

from src.preprocess import clean_text

## Import pre-processed data

Write a function to import pre-processed data for modelling. Currently, just reading from a previously saved csv file.


In [25]:
# data = pd.read_csv("data/dev_data.csv")
# data['process_text'] = data['text'].apply(lambda x: clean_text(x))
 
# count = 0
# for doc in nlp.pipe(data['process_text'].str.lower(), batch_size=32, n_process=3, disable=["tagger","parser", "ner"]):
#     data.loc[count, "process_text"] = " ".join([token.lemma_ for token in doc if token not in stop_words])
#     count += 1

data = pd.read_csv("data/lemmatized_dev_data.csv")

emotions = data['label'].unique().tolist()
emotions.sort()
emotions

['anger', 'fear', 'joy', 'love', 'sadness', 'surprise']

In [28]:
# split into train and test dataset. Use only 100k samples to train and use the rest as test
X_train,X_test, y_train, y_test = train_test_split(data['process_text'], data['label'], random_state=0,
                                                   train_size= int(1e5), stratify= data['label'])

### TF IDF Vectorization

In [29]:
# Create a DenseTransformer since TF-IDF vectorization returns sparse matrices
class DenseTransformer(TransformerMixin):
    def transform(self, X, y=None, **fit_params):
        return X.toarray()

    def fit_transform(self, X, y=None, **fit_params):
        self.fit(X, y, **fit_params)
        return self.transform(X)

    def fit(self, X, y=None, **fit_params):
        return self

### Naive Bayes (Multinomial NB)

In [31]:
nb_pipe = Pipeline([
    ('tfidf', TfidfVectorizer(min_df = 1e-3, max_df= 0.8, stop_words= stop_words, ngram_range= (1,2))),
    ('dense', DenseTransformer()),
    ('clf', MultinomialNB())
])
nb_pipe.fit(X_train, y_train)
y_pred = nb_pipe.predict(X_test)

print(f"Naive Bayes accuracy is: {accuracy_score(y_pred, y_test):0.2f}.\n")
print(classification_report(y_test, y_pred, labels= emotions))

Naive Bayes accuracy is: 0.86.

              precision    recall  f1-score   support

       anger       0.93      0.80      0.86     32390
        fear       0.88      0.75      0.81     27463
         joy       0.81      0.97      0.88     78418
        love       0.92      0.56      0.69     18992
     sadness       0.88      0.94      0.91     67437
    surprise       0.89      0.42      0.57      8233

    accuracy                           0.86    232933
   macro avg       0.88      0.74      0.79    232933
weighted avg       0.87      0.86      0.85    232933



### Logistic Regression

In [32]:
logreg_pipe = Pipeline([
    ('tfidf', TfidfVectorizer(min_df = 1e-3, max_df= 0.8, stop_words= stop_words, ngram_range= (1,2))),
    ('dense', DenseTransformer()),
    ('clf', OneVsRestClassifier(LogisticRegression()))
])
logreg_pipe.fit(X_train, y_train)
y_pred = logreg_pipe.predict(X_test)

print(f"Multi-class Logistic Regression accuracy is: {accuracy_score(y_pred, y_test):0.2f}.\n")
print(classification_report(y_test, y_pred, labels= emotions))


Multi-class Logistic Regression accuracy is: 0.89.

              precision    recall  f1-score   support

       anger       0.91      0.85      0.88     32390
        fear       0.87      0.81      0.84     27463
         joy       0.87      0.94      0.90     78418
        love       0.83      0.76      0.79     18992
     sadness       0.92      0.93      0.93     67437
    surprise       0.80      0.73      0.76      8233

    accuracy                           0.89    232933
   macro avg       0.87      0.83      0.85    232933
weighted avg       0.89      0.89      0.89    232933



In [33]:
feature_names = logreg_pipe.named_steps['tfidf'].get_feature_names_out()

n_max= 15
res = pd.DataFrame(columns= emotions, index = range(0,n_max,1))

for ii in range(0,len(emotions)):
    res.loc[:,emotions[ii]] = pd.DataFrame(logreg_pipe.named_steps['clf'].estimators_[ii].coef_, columns= feature_names, 
                index= [emotions[ii]]).T.sort_values(emotions[ii], ascending= False).head(n_max).index

res

Unnamed: 0,anger,fear,joy,love,sadness,surprise
0,distracted,apprehensive,acceptable,tender,dull,impressed
1,dangerous,paranoid,resolved,sympathetic,jaded,shocked
2,rude,shaky,smug,horny,groggy,dazed
3,dissatisfied,hesitant,mellow,longing,aching,amazed
4,envious,distressed,sincere,naughty,vain,curious
5,greedy,shaken,clever,delicate,needy,stunned
6,bitchy,frantic,friendly,gentle,gloomy,surprised
7,irritable,fearful,pleasant,nostalgic,sentimental,funny
8,bothered,intimidated,innocent,caring,unfortunate,overwhelmed
9,impatient,terrified,convinced,loyal,abused,strange


In [84]:
fig, axs = plt.subplots(figsize= (15,7), ncols= 3, nrows= 2);
plt.suptitle("Top 10 features for Logistic Regression One-vs-Rest Classifier");

tmp_x_train = logreg_pipe[:-1].transform(X_train.head(1000))

ii = jj = 0;
for ee in range(0,len(emotions)):

    tmp_y_train = (y_train.head(1000) == emotions[ee]).astype(int)
    # to get permutation: 
    results = permutation_importance(logreg_pipe[-1].estimators_[ee], tmp_x_train, tmp_y_train, scoring='recall',
                                    n_repeats= 10, max_samples= 1000, n_jobs= 2, random_state= 0)

    top_features = feature_names[np.argsort(results.importances_mean)][::-1][:10]
    df_imp = pd.melt(pd.DataFrame(results.importances, index= feature_names).reindex(top_features).reset_index(), id_vars= "index")
    df_imp.columns = ['feature','run','importance']


    sns.boxplot(data= df_imp, x= "feature", y= "importance", order= top_features, color= "coral", ax= axs[ii,jj])
    axs[ii,jj].tick_params(labelrotation= 30)
    axs[ii,jj].set_yticks([])
    axs[ii,jj].set_ylabel('')
    axs[ii,jj].set_xlabel('')
    axs[ii,jj].set_title(emotions[ee])

    jj += 1
    if jj > 2:
        jj = 0
        ii += 1
    
    del df_imp

plt.tight_layout();

### Support Vector 
We use the SGDClassifier implementation, which is fast and does not require Bagging.
If we were to use SVC, the implementation scales quadratically with time, and in that case we would want to use BaggingClassfier.

In [19]:
svc_pipe = Pipeline([
    ('tfidf', TfidfVectorizer(min_df = 1e-3, max_df= 0.6, stop_words= stop_words, ngram_range= (1,2))),
    ('dense', DenseTransformer()),
    ('clf', SGDClassifier(loss= 'hinge'))
])
svc_pipe.fit(X_train, y_train)
y_pred = svc_pipe.predict(X_test)

print(f"Multi-class SVM accuracy is: {accuracy_score(y_pred, y_test):0.2f}.\n")
print(classification_report(y_test, y_pred, labels= emotions))


Multi-class SVM accuracy is: 0.88.

              precision    recall  f1-score   support

       anger       0.94      0.82      0.88     13889
        fear       0.86      0.82      0.84     11776
         joy       0.84      0.96      0.90     33625
        love       0.88      0.69      0.77      8144
     sadness       0.92      0.94      0.93     28916
    surprise       0.91      0.63      0.74      3530

    accuracy                           0.88     99880
   macro avg       0.89      0.81      0.84     99880
weighted avg       0.89      0.88      0.88     99880



### Decision Tree Classifier

In [36]:
tree_pipe = Pipeline([
    ('tfidf', TfidfVectorizer(min_df = 1e-3, max_df= 0.6, stop_words= stop_words, ngram_range= (1,2))),
    ('dense', DenseTransformer()),
    ('clf', DecisionTreeClassifier(min_samples_split= 0.05))
])
tree_pipe.fit(X_train.head(50000), y_train.head(50000))
y_pred = tree_pipe.predict(X_test)

print(f"Multi-class Logistic Regression accuracy is: {accuracy_score(y_pred, y_test):0.2f}.\n")
print(classification_report(y_test, y_pred, labels= emotions))

Multi-class Logistic Regression accuracy is: 0.85.

              precision    recall  f1-score   support

       anger       0.84      0.82      0.83     13889
        fear       0.74      0.87      0.80     11776
         joy       0.86      0.87      0.86     33625
        love       0.78      0.75      0.76      8144
     sadness       0.91      0.88      0.89     28916
    surprise       0.87      0.62      0.72      3530

    accuracy                           0.85     99880
   macro avg       0.83      0.80      0.81     99880
weighted avg       0.85      0.85      0.85     99880



In [7]:
rf_pipe = Pipeline([
    ('tfidf', TfidfVectorizer(min_df = 1e-3, max_df= 0.6, stop_words= stop_words, ngram_range= (1,2))),
    ('dense', DenseTransformer()),
    ('clf', RandomForestClassifier(n_estimators= 10, max_samples= 5000, min_samples_split= 10))
])
# rf_pipe.fit(X_train, y_train)
# y_pred = rf_pipe.predict(X_test)

rf_grid= GridSearchCV(rf_pipe, param_grid= {'clf__n_estimators': [10,20,100],
                                            'clf__max_samples':[5000],
                                            'clf__min_samples_split': [10, 50, 100, 200, 500]}, scoring= 'accuracy', cv= 3)
rf_grid.fit(X_train, y_train)
print(rf_grid.best_params_)
y_pred = rf_grid.predict(X_test)

print(f"Multi-class Random Forest accuracy is: {accuracy_score(y_pred, y_test):0.2f}.\n")
print(classification_report(y_test, y_pred, labels= emotions))

{'clf__max_samples': 5000, 'clf__min_samples_split': 50, 'clf__n_estimators': 100}
Multi-class Random Forest accuracy is: 0.87.

              precision    recall  f1-score   support

       anger       0.90      0.83      0.86     13889
        fear       0.80      0.83      0.81     11776
         joy       0.84      0.92      0.88     33625
        love       0.83      0.71      0.77      8144
     sadness       0.91      0.91      0.91     28916
    surprise       0.88      0.63      0.73      3530

    accuracy                           0.87     99880
   macro avg       0.86      0.80      0.83     99880
weighted avg       0.87      0.87      0.86     99880



### Grid Search with Gradient Descent Linear SVM

In [17]:
svc_pipe = Pipeline([
    ('tfidf', TfidfVectorizer(min_df = 1e-3, max_df= 0.6, stop_words= stop_words, ngram_range= (1,2))),
    ('dense', DenseTransformer()),
    ('clf', SGDClassifier(loss= 'hinge', max_iter= 100, penalty= 'l2'))
])
svc_grid = GridSearchCV(svc_pipe, scoring= 'accuracy', cv=3, verbose= 3, 
                        param_grid= {'clf__alpha': [0.0001, 0.001, 0.01, 0.1, 1]})
svc_grid.fit(X_train, y_train)
print(svc_grid.best_params_)

Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV 1/3] END .................clf__alpha=0.0001;, score=0.881 total time=  16.7s
[CV 2/3] END .................clf__alpha=0.0001;, score=0.883 total time=  22.7s
[CV 3/3] END .................clf__alpha=0.0001;, score=0.882 total time=  19.4s
[CV 1/3] END ..................clf__alpha=0.001;, score=0.870 total time=  19.3s
[CV 2/3] END ..................clf__alpha=0.001;, score=0.871 total time=  35.2s
[CV 3/3] END ..................clf__alpha=0.001;, score=0.871 total time=  35.8s
[CV 1/3] END ...................clf__alpha=0.01;, score=0.868 total time=  16.3s
[CV 2/3] END ...................clf__alpha=0.01;, score=0.871 total time=  18.6s
[CV 3/3] END ...................clf__alpha=0.01;, score=0.871 total time=  18.2s
[CV 1/3] END ....................clf__alpha=0.1;, score=0.869 total time=  17.3s
[CV 2/3] END ....................clf__alpha=0.1;, score=0.871 total time=  17.5s
[CV 3/3] END ....................clf__alpha=0.1;,

In [18]:
y_pred = svc_grid.predict(X_test)

print(f"Multi-class SVM accuracy is: {accuracy_score(y_pred, y_test):0.2f}.\n")
print(classification_report(y_test, y_pred, labels= emotions))

Multi-class SVM accuracy is: 0.88.

              precision    recall  f1-score   support

       anger       0.93      0.83      0.88     13889
        fear       0.87      0.81      0.83     11776
         joy       0.85      0.96      0.90     33625
        love       0.89      0.68      0.77      8144
     sadness       0.91      0.94      0.93     28916
    surprise       0.92      0.62      0.74      3530

    accuracy                           0.88     99880
   macro avg       0.89      0.81      0.84     99880
weighted avg       0.89      0.88      0.88     99880



In [22]:
nb_pipe = Pipeline([
    ('tfidf', TfidfVectorizer(min_df = 1e-3, max_df= 0.6, stop_words= stop_words, ngram_range= (1,2))),
    ('dense', DenseTransformer()),
    ('clf', MultinomialNB())
])
nb_grid = GridSearchCV(nb_pipe, scoring= 'accuracy', cv= 3, verbose= 3, 
                       param_grid= {'tfidf__max_df': [0.5, 0.6, 0.75, 0.8, 0.85, 0.9],
                                    'tfidf__min_df': [1e-5, 1e-4, 1e-3, 1e-2]})
nb_grid.fit(X_train, y_train)
y_pred = nb_grid.predict(X_test)

print(f"Naive Bayes accuracy is: {accuracy_score(y_pred, y_test):0.2f}.\n")
print(classification_report(y_test, y_pred, labels= emotions))

Fitting 3 folds for each of 24 candidates, totalling 72 fits
[CV 1/3] END tfidf__max_df=0.5, tfidf__min_df=1e-05;, score=nan total time=   3.2s
[CV 2/3] END tfidf__max_df=0.5, tfidf__min_df=1e-05;, score=nan total time=   3.3s
[CV 3/3] END tfidf__max_df=0.5, tfidf__min_df=1e-05;, score=nan total time=   3.1s
[CV 1/3] END tfidf__max_df=0.5, tfidf__min_df=0.0001;, score=0.867 total time= 1.5min
[CV 2/3] END tfidf__max_df=0.5, tfidf__min_df=0.0001;, score=0.866 total time= 1.5min
[CV 3/3] END tfidf__max_df=0.5, tfidf__min_df=0.0001;, score=0.870 total time= 1.4min
[CV 1/3] END tfidf__max_df=0.5, tfidf__min_df=0.001;, score=0.860 total time=   7.7s
[CV 2/3] END tfidf__max_df=0.5, tfidf__min_df=0.001;, score=0.861 total time=   7.2s
[CV 3/3] END tfidf__max_df=0.5, tfidf__min_df=0.001;, score=0.862 total time=   7.1s
[CV 1/3] END tfidf__max_df=0.5, tfidf__min_df=0.01;, score=0.359 total time=   5.2s
[CV 2/3] END tfidf__max_df=0.5, tfidf__min_df=0.01;, score=0.359 total time=   5.1s
[CV 3/3] 

18 fits failed out of a total of 72.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
2 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\garim\anaconda3\envs\github\lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\garim\anaconda3\envs\github\lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\garim\anaconda3\envs\github\lib\site-packages\sklearn\pipeline.py", line 471, in fit
    Xt = self._fit(X, y, routed_params)
  File "c:\Users\garim\anaconda3\envs\github\lib\site-packages\sklearn\pipeline.py", line 408, in _fit
    X, 

Naive Bayes accuracy is: 0.87.

              precision    recall  f1-score   support

       anger       0.94      0.83      0.88     13889
        fear       0.88      0.78      0.83     11776
         joy       0.83      0.97      0.90     33625
        love       0.91      0.57      0.70      8144
     sadness       0.89      0.96      0.92     28916
    surprise       0.91      0.42      0.58      3530

    accuracy                           0.87     99880
   macro avg       0.90      0.76      0.80     99880
weighted avg       0.88      0.87      0.87     99880

