In [93]:
import pandas as pd
import numpy as np

from tqdm import tqdm

import re
import nltk
# nltk.download('stopwords')
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
stop_words = list(stopwords.words('english'))

import spacy
nlp = spacy.load("en_core_web_sm", disable=["tagger","parser", "ner"])

from collections import defaultdict

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import recall_score, accuracy_score, classification_report
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.inspection import permutation_importance
from sklearn.svm import LinearSVC, SVC

import matplotlib.pyplot as plt
import seaborn as sns

import gensim
from gensim.models import Word2Vec, KeyedVectors

## Import pre-processed data

TODO: wrap the import of the pre-processed data for modelling in a function. Currently, the data is simply read from a previously saved CSV file.


In [2]:
# Clean text
# Patterns are compiled once at definition time rather than on every call.
# An HTML/XML tag: '<', optional '/', a letter, then anything up to the closing '>'.
# Requiring a letter keeps plain-text uses of '<' such as "<3" from being
# treated as the start of a tag.
_HTML_TAG_RE = re.compile(r'</?[A-Za-z][^<>]*>')
_URL_RE = re.compile(r'http\S+')
_PUNCT_RE = re.compile(r'[^\w\s]')
_DIGIT_RE = re.compile(r'[0-9]')
# After _PUNCT_RE only '_' from this class can still be present (it counts as \w).
_SPECIAL_RE = re.compile('[@_!#$%^&*()<>?/\\|}{~:]')
# NOTE(review): the last range (U+24C2-U+1F251) is very wide and also covers
# non-emoji scripts; kept unchanged to preserve existing behaviour.
_EMOJI_RE = re.compile("["
                       u"\U0001F600-\U0001F64F"  # emoticons
                       u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                       u"\U0001F680-\U0001F6FF"  # transport & map symbols
                       u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                       u"\U00002702-\U000027B0"
                       u"\U000024C2-\U0001F251"
                       "]+", flags=re.UNICODE)

# Order matters: tags and URLs must be removed while their delimiters
# ('<', '>', ':', '/') are still in the text, i.e. before punctuation is stripped.
_CLEANING_STEPS = (
    _HTML_TAG_RE,
    _URL_RE,
    _PUNCT_RE,
    _DIGIT_RE,
    _SPECIAL_RE,
    _EMOJI_RE,
)


def clean_text(text):
    """Strip markup and noise characters from a single text string.

    The following are removed, in this order: HTML tags, URLs (any
    whitespace-delimited token from ``http`` onwards), punctuation, ASCII
    digits, remaining special characters (in practice ``_``) and emoji.
    Whitespace is left untouched, so removed tokens may leave double spaces.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    for pattern in _CLEANING_STEPS:
        text = pattern.sub('', text)
    return text

In [3]:
# Read the saved dataset and run every text through clean_text.
data = pd.read_csv("data/dev_data.csv")
data['text'] = data['text'].apply(clean_text)

# Distinct labels in ascending order; left as the last expression so the
# notebook displays the list.
emotions = sorted(data['label'].unique().tolist())
emotions

['anger', 'fear', 'joy', 'love', 'sadness', 'surprise']

In [5]:
# Hold out 30% of the rows for testing. The split is stratified on the label
# column and seeded so that it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data['text'],
    data['label'],
    test_size=0.3,
    stratify=data['label'],
    random_state=0,
)

In [166]:
def preprocess(X):
    """Tokenise texts and drop stop words.

    Each text is lower-cased, split with ``word_tokenize`` and filtered
    against the module-level ``stop_words``.

    Parameters
    ----------
    X : iterable of str
        The texts to tokenise.

    Returns
    -------
    list of list of str
        One token list per input text, in input order.
    """
    # stop_words is a list; a set makes each membership test constant-time
    # instead of a scan over the whole list for every token.
    stop_set = set(stop_words)
    return [
        [token for token in word_tokenize(text.lower()) if token not in stop_set]
        for text in X
    ]

In [170]:
# Tokenise the training split. This has to run in this cell: the Word2Vec call
# below reads `w2v_train`, which is not defined anywhere else, so leaving it
# commented out raises NameError on a fresh kernel.
w2v_train = preprocess(X_train)
# w2v_test = preprocess(X_test)

# Train a Word2Vec model on the training tokens only and inspect the nearest
# neighbours of one emotion-related word as a sanity check.
model = Word2Vec(w2v_train, min_count = 1, window = 2, vector_size= 300)
model.wv.most_similar('affectionate')

[('compassionate', 0.9249340295791626),
 ('hostile', 0.9108782410621643),
 ('sympathetic', 0.8977436423301697),
 ('benevolent', 0.8855224251747131),
 ('considerate', 0.8787329792976379),
 ('spiteful', 0.8751549124717712),
 ('hateful', 0.8732966780662537),
 ('unfriendly', 0.8729376196861267),
 ('submissive', 0.8725266456604004),
 ('needy', 0.853763222694397)]

In [168]:
# Load the pre-trained Google News vectors from the local binary file
# (limit=50000 caps how many vectors are read).
goog_wordvecs = KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin',
    binary=True,
    limit=50000,
)

# List the vocabulary entries containing 'affection', then show the nearest
# neighbours of 'affectionate' for comparison with the locally trained model.
affection_keys = [key for key in goog_wordvecs.key_to_index if 'affection' in key]
print(affection_keys)
goog_wordvecs.most_similar('affectionate')

['affection', 'affectionate', 'affectionately']


[('playful', 0.6595738530158997),
 ('loving', 0.6079658269882202),
 ('endearing', 0.6056519746780396),
 ('affection', 0.5745200514793396),
 ('lovable', 0.5493666529655457),
 ('gentle', 0.5466917753219604),
 ('good_natured', 0.5455296039581299),
 ('charming', 0.5257826447486877),
 ('sarcastic', 0.523388147354126),
 ('easygoing', 0.5212149620056152)]

In [174]:
# Create a Word2Vec Transformer
class W2VEmbeddings(TransformerMixin, BaseEstimator):
    """Turn each text into the (optionally IDF-weighted) mean of its word vectors.

    Inheriting from ``BaseEstimator`` gives the transformer ``get_params`` /
    ``set_params``, which ``Pipeline`` and ``GridSearchCV`` use to clone and
    configure steps.

    Parameters
    ----------
    w2v_model : KeyedVectors-like or None
        Word vectors exposing ``key_to_index``, ``vector_size`` and ``[]``
        lookup. When None, ``fit`` trains a new ``Word2Vec`` model on the
        data it receives and stores its vectors in ``self.w2v_model``.
    weights : {"idf", None}
        ``"idf"`` scales each word vector by its IDF learned in ``fit``;
        any other value gives every word a weight of 1.
    max_len : int or None
        Kept for backward compatibility. When not given it mirrors the
        vector size of ``w2v_model`` (or None when no model is supplied yet).
    """

    def __init__(self, w2v_model= None, weights = None, max_len= None):
        self.w2v_model = w2v_model
        self.weights = weights
        self.word2weight = None

        # add max len parameter
        # Always define the attribute, so it can be read (e.g. by get_params)
        # even when neither max_len nor a model was supplied.
        if max_len is not None:
            self.max_len = max_len
        elif w2v_model is not None:
            self.max_len = w2v_model.vector_size
        else:
            self.max_len = None

    def fit(self, X, y=None, **fit_params):
        """Learn the word weights and, if no vectors were supplied, the vectors.

        Parameters
        ----------
        X : iterable of str
            Training texts.
        y : ignored

        Returns
        -------
        self
        """
        w2v_X = preprocess(X)

        # NOTE(review): once a model has been trained here it is kept in
        # self.w2v_model, so a later fit() on different data reuses it.
        if self.w2v_model is None:
            print("Fitting new Word2Vec model on training data.")
            w2v_model = Word2Vec(w2v_X, min_count = 1, window = 3, vector_size= 100)
            self.w2v_model = w2v_model.wv
            self.max_len = self.w2v_model.vector_size
            print("Done!")

        if self.weights == "idf":
            # The documents are already token lists, so the analyzer is the identity.
            tfidf = TfidfVectorizer(analyzer= lambda x: x)
            tfidf.fit(w2v_X)
            # if a word was never seen - it must be at least as infrequent as any of the known words
            # so the default idf is the max of known idf's
            max_idf = max(tfidf.idf_)
            self.word2weight = defaultdict(
                lambda: max_idf,
                [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])
            print("Fit the IDF Model")
        else:
            self.word2weight = defaultdict(lambda: 1)
        return self

    def transform(self, X, y=None, **fit_params):
        """Embed each text as the mean of its weighted in-vocabulary word vectors.

        Parameters
        ----------
        X : iterable of str
            Texts to embed.
        y : ignored

        Returns
        -------
        list of numpy.ndarray
            One vector per text. Texts with no in-vocabulary token map to a
            zero vector.
        """
        vectors = self.w2v_model
        vocabulary = vectors.key_to_index
        # The fallback must have the same dimensionality as the word vectors,
        # otherwise rows of different lengths are produced whenever max_len
        # differs from the model's vector size.
        dim = vectors.vector_size

        X_w2v = []
        for tokens in preprocess(X):
            weighted = [vectors[w] * self.word2weight[w] for w in tokens if w in vocabulary]
            if weighted:
                X_w2v.append(np.mean(weighted, axis= 0))
            else:
                X_w2v.append(np.zeros(dim))
        return X_w2v


In [175]:
# Embed the first ten training texts with IDF-weighted Google News vectors and
# display their pairwise cosine similarities, labelled by emotion.
sample_texts = X_train[:10]
sample_labels = y_train[:10]

idf_embedder = W2VEmbeddings(w2v_model= goog_wordvecs, weights= 'idf')
test_w2v = idf_embedder.fit_transform(sample_texts)

pd.DataFrame(cosine_similarity(test_w2v), index= sample_labels, columns= sample_labels)

Fit the IDF Model


label,joy,sadness,joy,anger,love,anger,sadness,anger,sadness,love
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
joy,1.0,0.66285,0.70473,0.228687,0.566304,0.50753,0.583275,0.541933,0.54662,0.711151
sadness,0.66285,1.0,0.616139,0.296046,0.499304,0.555629,0.574476,0.510531,0.581215,0.644067
joy,0.70473,0.616139,1.0,0.246109,0.588937,0.535538,0.587511,0.638521,0.578051,0.698555
anger,0.228687,0.296046,0.246109,1.0,0.314674,0.36442,0.283305,0.385081,0.298118,0.400822
love,0.566304,0.499304,0.588937,0.314674,1.0,0.566616,0.425932,0.6602,0.492389,0.630707
anger,0.50753,0.555629,0.535538,0.36442,0.566616,1.0,0.502843,0.602613,0.542405,0.59226
sadness,0.583275,0.574476,0.587511,0.283305,0.425932,0.502843,1.0,0.497679,0.605178,0.646762
anger,0.541933,0.510531,0.638521,0.385081,0.6602,0.602613,0.497679,1.0,0.623918,0.598154
sadness,0.54662,0.581215,0.578051,0.298118,0.492389,0.542405,0.605178,0.623918,1.0,0.602339
love,0.711151,0.644067,0.698555,0.400822,0.630707,0.59226,0.646762,0.598154,0.602339,1.0


In [165]:
pipe_svm = Pipeline([
    ('w2v', W2VEmbeddings(weights= None, w2v_model= goog_wordvecs)),
    ('svc', SGDClassifier(loss= 'hinge'))
])
svm_grid = GridSearchCV(pipe_svm, cv= 3, scoring= 'accuracy', param_grid= {'svc__alpha':[0.001, 0.01]}, verbose= 1)
%time svm_grid.fit(X_train[:1000], y_train.head(1000))
print(svm_grid.best_params_)

%time y_pred = svm_grid.best_estimator_.predict(X_test)

print(f"SVM accuracy is: {accuracy_score(y_test, y_pred):0.2f}.\n")
print(classification_report(y_test, y_pred))

Fitting 3 folds for each of 2 candidates, totalling 6 fits


100%|██████████| 666/666 [00:00<00:00, 5753.35it/s]
100%|██████████| 666/666 [00:00<00:00, 9495.12it/s]
100%|██████████| 334/334 [00:00<00:00, 10268.78it/s]
100%|██████████| 667/667 [00:00<00:00, 10443.06it/s]
100%|██████████| 667/667 [00:00<00:00, 6036.88it/s]
100%|██████████| 333/333 [00:00<00:00, 8192.38it/s]
100%|██████████| 667/667 [00:00<00:00, 8139.64it/s]
100%|██████████| 667/667 [00:00<00:00, 8015.08it/s]
100%|██████████| 333/333 [00:00<00:00, 7914.05it/s]
100%|██████████| 666/666 [00:00<00:00, 8019.13it/s]
100%|██████████| 666/666 [00:00<00:00, 8706.51it/s]
100%|██████████| 334/334 [00:00<00:00, 8819.22it/s]
100%|██████████| 667/667 [00:00<00:00, 9993.93it/s]
100%|██████████| 667/667 [00:00<00:00, 9217.64it/s]
100%|██████████| 333/333 [00:00<00:00, 9994.01it/s]
100%|██████████| 667/667 [00:00<00:00, 8578.73it/s]
100%|██████████| 667/667 [00:00<00:00, 9823.04it/s]
100%|██████████| 333/333 [00:00<00:00, 10192.91it/s]
100%|██████████| 1000/1000 [00:00<00:00, 6879.47it/s]
100%|██

CPU times: total: 422 ms
Wall time: 2.28 s
{'svc__alpha': 0.001}


100%|██████████| 99880/99880 [00:13<00:00, 7436.05it/s] 


CPU times: total: 8.94 s
Wall time: 18 s
SVM accuracy is: 0.57.

              precision    recall  f1-score   support

       anger       0.49      0.29      0.36     13889
        fear       0.55      0.43      0.49     11776
         joy       0.60      0.79      0.68     33625
        love       0.45      0.18      0.25      8144
     sadness       0.56      0.65      0.60     28916
    surprise       0.62      0.28      0.38      3530

    accuracy                           0.57     99880
   macro avg       0.54      0.44      0.46     99880
weighted avg       0.56      0.57      0.55     99880



In [159]:
# Unweighted mean Google News embeddings fed into a distance-weighted
# 10-nearest-neighbour classifier using the cosine metric.
knn_embedder = W2VEmbeddings(weights= None, w2v_model= goog_wordvecs)
knn_classifier = KNeighborsClassifier(n_neighbors= 10, metric= 'cosine', weights= 'distance')
pipe_knn = Pipeline([
    ('w2v', knn_embedder),
    ('knn', knn_classifier),
])

# Fit on the first 10000 training rows, then evaluate on the full test set.
pipe_knn.fit(X_train[:10000], y_train.head(10000))
y_pred = pipe_knn.predict(X_test)

print(f"KNN classification accuracy is: {accuracy_score(y_test, y_pred):0.2f}.\n")
print(classification_report(y_test, y_pred))

100%|██████████| 10000/10000 [00:01<00:00, 8761.05it/s]
100%|██████████| 10000/10000 [00:01<00:00, 9852.53it/s]
100%|██████████| 99880/99880 [00:14<00:00, 6887.97it/s] 


KNN classification accuracy is: 0.63.

              precision    recall  f1-score   support

       anger       0.63      0.48      0.54     13889
        fear       0.59      0.49      0.53     11776
         joy       0.67      0.75      0.70     33625
        love       0.55      0.37      0.44      8144
     sadness       0.61      0.73      0.66     28916
    surprise       0.63      0.30      0.40      3530

    accuracy                           0.63     99880
   macro avg       0.61      0.52      0.55     99880
weighted avg       0.62      0.63      0.62     99880

