In [7]:
import pandas as pd
import numpy as np

from tqdm import tqdm

import re
import nltk
# nltk.download('stopwords')
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
stop_words = list(stopwords.words('english'))

import spacy
nlp = spacy.load("en_core_web_sm", disable=["tagger","parser", "ner"])


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import recall_score, accuracy_score, classification_report
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.inspection import permutation_importance
from sklearn.svm import LinearSVC, SVC

import matplotlib.pyplot as plt
import seaborn as sns

import gensim
from gensim.models import Word2Vec

## Import pre-processed data

TODO: write a function to import the pre-processed data for modelling. Currently, the data is simply read from a previously saved CSV file.


In [61]:
# Clean text
# Substitution passes used by clean_text, applied in the order listed.
# They are compiled once here rather than on every call.
_CLEAN_TEXT_PATTERNS = (
    # HTML tags and URLs must be removed BEFORE punctuation: the punctuation
    # pass deletes '<', '>' and '/', after which a tag such as "<b>" has
    # already collapsed into the plain word "b" and can no longer be matched.
    re.compile(r'<.*?>'),       # html tags
    re.compile(r'http\S+'),     # urls
    # Everything that is neither a word character nor whitespace.
    re.compile(r'[^\w\s]'),     # punctuation
    re.compile(r'[0-9]'),       # numerical values
    # Most of these symbols are already gone after the punctuation pass;
    # '_' counts as a word character for \w, so it is removed here.
    re.compile('[@_!#$%^&*()<>?/\\|}{~:]'),  # special characters
    # NOTE(review): the final range (U+24C2-U+1F251) is much wider than the
    # emoji blocks and also contains non-emoji characters (CJK ideographs,
    # for example, fall inside it) - confirm that dropping them is intended.
    re.compile("["
               u"\U0001F600-\U0001F64F"  # emoticons
               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
               u"\U0001F680-\U0001F6FF"  # transport & map symbols
               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
               u"\U00002702-\U000027B0"
               u"\U000024C2-\U0001F251"
               "]+", flags=re.UNICODE),  # emoji
)


def clean_text(text):
    """Strip markup, URLs, punctuation, digits, symbols and emoji from text.

    Each pattern in ``_CLEAN_TEXT_PATTERNS`` is applied in turn and every
    match is replaced by the empty string. Whitespace is left untouched, so
    removing a token that was surrounded by spaces leaves both spaces behind.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        The text with all matches of the patterns removed.
    """
    for pattern in _CLEAN_TEXT_PATTERNS:
        text = pattern.sub('', text)
    return text

In [66]:
# Read the saved dataset and normalise the raw text column with clean_text.
data = pd.read_csv("data/dev_data.csv")
data['text'] = data['text'].apply(clean_text)

# Alphabetically ordered list of the distinct labels; shown as the cell output.
emotions = sorted(data['label'].unique().tolist())
emotions

['anger', 'fear', 'joy', 'love', 'sadness', 'surprise']

In [67]:
# 70/30 split, stratified on the label column; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    data['text'],
    data['label'],
    test_size=0.3,
    stratify=data['label'],
    random_state=0,
)

In [73]:
def preprocess(X):
    """Tokenise every text in X and drop stop words.

    Each text is lower-cased, split with ``word_tokenize`` and filtered
    against the module-level ``stop_words``. Progress is reported via tqdm.

    Parameters
    ----------
    X : iterable of str
        Texts to tokenise.

    Returns
    -------
    list of list of str
        One token list per input text, in input order. A text made up only
        of stop words yields an empty list.
    """
    # stop_words is a list, so testing each token against it is a linear
    # scan; a set built once per call gives the same result with O(1) lookups.
    stop_set = set(stop_words)
    return [
        [token for token in word_tokenize(doc.lower()) if token not in stop_set]
        for doc in tqdm(X)
    ]

In [74]:
# Tokenise both splits (train first, then test).
w2v_train, w2v_test = preprocess(X_train), preprocess(X_test)

# Fit the embeddings on the training tokens only; min_count=1 keeps every
# token that occurs at least once.
model = Word2Vec(sentences=w2v_train, window=5, min_count=1)

# Sanity check: nearest neighbours of a label-related word.
model.wv.most_similar('angry')

100%|██████████| 233053/233053 [00:25<00:00, 8981.65it/s]
100%|██████████| 99880/99880 [00:11<00:00, 8699.25it/s]


[('upset', 0.8806291222572327),
 ('furious', 0.8045730590820312),
 ('mad', 0.783480703830719),
 ('enraged', 0.7723531126976013),
 ('scared', 0.7678281664848328),
 ('frustrated', 0.7629511952400208),
 ('sad', 0.7609350681304932),
 ('annoyed', 0.7546423077583313),
 ('hateful', 0.7446823120117188),
 ('hurt', 0.7360586524009705)]

In [95]:
def sentence_to_vector(X):
    """Turn tokenised sentences into fixed-length vectors.

    Every sentence is represented by ``model.wv.get_mean_vector`` of its
    tokens, using the module-level Word2Vec ``model``. Sentences for which
    no mean can be computed are mapped to an all-zero vector of the same
    length.

    Parameters
    ----------
    X : iterable of list of str
        Token lists, e.g. the output of ``preprocess``.

    Returns
    -------
    list of numpy.ndarray
        One vector of length ``model.wv.vector_size`` per sentence, in
        input order.
    """
    dim = model.wv.vector_size
    X_w2v = []
    for tokens in tqdm(X):
        try:
            X_w2v.append(model.wv.get_mean_vector(tokens))
        except ValueError:
            # gensim raises ValueError when given no keys (e.g. a sentence
            # made up only of stop words). Catch just that instead of a bare
            # `except`, so genuine errors and Ctrl-C are not silently
            # converted into zero vectors.
            X_w2v.append(np.zeros(dim))
    return X_w2v

In [100]:
X_train_w2v = sentence_to_vector(w2v_train)
X_test_w2v = sentence_to_vector(w2v_test)

100%|██████████| 233053/233053 [00:10<00:00, 22443.75it/s]
100%|██████████| 99880/99880 [00:05<00:00, 19027.85it/s]


In [None]:
knn = BaggingClassifier(estimator= KNeighborsClassifier(n_neighbors= 10, metric= 'cosine', weights= 'distance'), 
                        n_estimators= 10, max_samples= 0.1)
knn_grid = GridSearchCV(knn, cv= 3, scoring= 'accuracy', param_grid= {'estimator__n_neighbors': 
                                                                      [1, 5, 10, 20, 50]}, verbose= 3)
%time knn_grid.fit(X_train_w2v[:50000], y_train.head(50000))
# print(knn_grid.best_params_)

# # %time y_pred = knn_grid.best_estimator_.predict(X_test_w2v)

# # print(f"KNN classification accuracy is: {accuracy_score(y_test, y_pred):0.2f}.\n")
# # print(classification_report(y_test, y_pred))

Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV 1/3] END ..........estimator__n_neighbors=1;, score=0.442 total time=   3.8s
[CV 2/3] END ..........estimator__n_neighbors=1;, score=0.438 total time=   4.1s
[CV 3/3] END ..........estimator__n_neighbors=1;, score=0.434 total time=   3.8s
[CV 1/3] END ..........estimator__n_neighbors=5;, score=0.487 total time=   7.4s
[CV 2/3] END ..........estimator__n_neighbors=5;, score=0.489 total time=   8.2s
[CV 3/3] END ..........estimator__n_neighbors=5;, score=0.489 total time=   7.2s
[CV 1/3] END .........estimator__n_neighbors=10;, score=0.491 total time=   7.4s
[CV 2/3] END .........estimator__n_neighbors=10;, score=0.496 total time=   7.8s
[CV 3/3] END .........estimator__n_neighbors=10;, score=0.494 total time=   7.5s
[CV 1/3] END .........estimator__n_neighbors=20;, score=0.495 total time=   7.5s
[CV 2/3] END .........estimator__n_neighbors=20;, score=0.497 total time=   8.2s
[CV 3/3] END .........estimator__n_neighbors=20;,

In [None]:
nb = SGDClassifier(loss= 'hinge')
nb.fit(X_train_w2v, y_train)

%time y_pred = nb.predict(X_test_w2v)

print(f"KNN classification accuracy is: {accuracy_score(y_test, y_pred):0.2f}.\n")
print(classification_report(y_test, y_pred))

CPU times: total: 46.9 ms
Wall time: 92.7 ms
KNN classification accuracy is: 0.54.

              precision    recall  f1-score   support

       anger       0.39      0.15      0.22     13889
        fear       0.64      0.09      0.15     11776
         joy       0.58      0.89      0.70     33625
        love       0.41      0.02      0.04      8144
     sadness       0.51      0.71      0.59     28916
    surprise       0.28      0.05      0.08      3530

    accuracy                           0.54     99880
   macro avg       0.47      0.32      0.30     99880
weighted avg       0.51      0.54      0.46     99880

