In [1]:
import pandas as pd
import numpy as np

from tqdm import tqdm

import re
import nltk
# nltk.download('stopwords')
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
stop_words = list(stopwords.words('english'))

import spacy
nlp = spacy.load("en_core_web_sm", disable=["tagger","parser", "ner"])

from collections import defaultdict

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import recall_score, accuracy_score, classification_report
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.inspection import permutation_importance
from sklearn.svm import LinearSVC, SVC

import matplotlib.pyplot as plt
import seaborn as sns

import gensim
from gensim.models import Word2Vec

## Import pre-processed data

TODO: write a function to import the pre-processed data for modelling. For now, the data is simply read from a previously saved CSV file.


In [2]:
# Clean text
def clean_text(text):
    """Strip markup, links, punctuation, digits and emoji from a text string.

    The order of the substitutions matters: HTML tags and URLs are removed
    first, while the ``<``, ``>``, ``:`` and ``/`` characters that delimit
    them are still present. Stripping punctuation first would turn
    ``<b>hi</b>`` into ``bhib`` and leave nothing for the tag pattern to match.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text. Whitespace is not collapsed, so a removed token can
        leave two adjacent spaces behind.
    """
    # remove html (must run before punctuation removal eats the angle brackets)
    reg_html = re.compile(r'<.*?>')
    text = reg_html.sub(r'', text)

    # remove url (must run before punctuation removal eats "://", "/" and ".")
    reg_url = re.compile(r'http\S+')
    text = reg_url.sub(r'', text)

    # remove punctuation: everything that is neither a word character nor whitespace
    reg_punc = re.compile(r'[^\w\s]')
    text = reg_punc.sub(r'', text)

    # remove numerical values
    reg_num = re.compile(r'[0-9]')
    text = reg_num.sub(r'', text)

    # remove special characters; after the punctuation pass the only member of
    # this class that can still be present is "_", which \w treats as a word character
    reg_spcl = re.compile('[@_!#$%^&*()<>?/\\|}{~:]')
    text = reg_spcl.sub(r'', text)

    # remove emoji
    # NOTE(review): the last range (U+24C2 to U+1F251) is very wide and also
    # covers non-emoji characters such as CJK ideographs - confirm this is intended.
    emoji_url = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    text = emoji_url.sub(r'', text)

    return text

In [3]:
# Read the saved dataset and run every entry of the text column through clean_text.
data = pd.read_csv("data/dev_data.csv")
data['text'] = data['text'].apply(clean_text)

# Distinct emotion labels in alphabetical order (displayed as the cell output).
emotions = sorted(data['label'].unique().tolist())
emotions

['anger', 'fear', 'joy', 'love', 'sadness', 'surprise']

In [5]:
# Hold out 30% of the rows for testing; stratifying on the label keeps the
# emotion proportions the same in both splits, and the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data['text'],
    data['label'],
    test_size=0.3,
    stratify=data['label'],
    random_state=0,
)

In [6]:
def preprocess(X):
    """Tokenise each document and drop English stop words.

    Parameters
    ----------
    X : iterable of str
        Documents to process; each one is lower-cased before tokenisation.

    Returns
    -------
    list of list of str
        One token list per document, in input order, with every token that
        appears in the notebook-level ``stop_words`` removed.
    """
    # `stop_words` is a list, so testing each token against it directly is a
    # linear scan; a set built once per call makes every lookup constant-time
    # without changing which tokens are kept.
    stop_set = set(stop_words)
    return [
        [token for token in word_tokenize(doc.lower()) if token not in stop_set]
        for doc in tqdm(X)
    ]

In [48]:
# Tokenise both splits and train the embedding model on the training split only.
# These definitions have to be live code: this cell and the cells below read
# `w2v_train`, `w2v_test` and `model`, so leaving them commented out makes the
# notebook fail with NameError on a fresh kernel.
w2v_train = preprocess(X_train)
w2v_test = preprocess(X_test)

model = Word2Vec(w2v_train, min_count = 1, window = 3, vector_size= 50)

# Quick qualitative check of the learned embedding space.
model.wv.most_similar('loving')

[('caring', 0.8846673965454102),
 ('loves', 0.8031463623046875),
 ('love', 0.7764047980308533),
 ('gracious', 0.7666694521903992),
 ('trusting', 0.7659983038902283),
 ('supportive', 0.7656123042106628),
 ('compassionate', 0.7655137777328491),
 ('gods', 0.7332736849784851),
 ('affectionate', 0.7303548455238342),
 ('affection', 0.7222259640693665)]

In [38]:
def sentence_to_vector(X):
    """Turn tokenised sentences into one vector each by averaging word embeddings.

    Uses the notebook-level ``model``. A sentence with no tokens is mapped to
    a zero vector of length ``model.vector_size``; every other sentence is
    passed to ``model.wv.get_mean_vector`` with ``pre_normalize=False``.

    Parameters
    ----------
    X : iterable of list of str
        Tokenised sentences.

    Returns
    -------
    list
        One vector per sentence, in input order.
    """
    sentence_vectors = []
    for tokens in tqdm(X):
        if len(tokens) == 0:
            # nothing to average for an empty sentence
            sentence_vectors.append(np.zeros(model.vector_size))
        else:
            sentence_vectors.append(model.wv.get_mean_vector(tokens, pre_normalize=False))
    return sentence_vectors

In [49]:
# Embed both tokenised splits (training first, then test) as one mean word
# vector per sentence.
X_train_w2v, X_test_w2v = (
    sentence_to_vector(tokens) for tokens in (w2v_train, w2v_test)
)

100%|██████████| 233053/233053 [00:08<00:00, 29109.27it/s]
100%|██████████| 99880/99880 [00:04<00:00, 21624.49it/s]


In [54]:
# Show the first five training labels.
print(y_train.head(5))

194726        joy
149286    sadness
108993        joy
152321      anger
251081       love
Name: label, dtype: object


In [60]:
# Pairwise cosine similarity between the first ten training sentence vectors,
# with each row and column labelled by that sentence's emotion.
sample_labels = y_train[:10]
sample_similarity = cosine_similarity(X_train_w2v[:10])
pd.DataFrame(sample_similarity, index=sample_labels, columns=sample_labels)

label,joy,sadness,joy,anger,love,anger,sadness,anger,sadness,love
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
joy,1.0,0.82935,0.822418,0.585457,0.704592,0.730838,0.793369,0.617241,0.739255,0.821148
sadness,0.82935,1.0,0.814826,0.638296,0.769566,0.861654,0.8677,0.784065,0.831201,0.861148
joy,0.822418,0.814826,1.0,0.550974,0.853964,0.743277,0.758577,0.781829,0.749143,0.831171
anger,0.585457,0.638296,0.550974,1.0,0.551553,0.786239,0.645865,0.554621,0.504831,0.767486
love,0.704592,0.769566,0.853964,0.551553,1.0,0.735415,0.725132,0.821895,0.7685,0.766302
anger,0.730838,0.861654,0.743277,0.786239,0.735415,1.0,0.793655,0.754247,0.742215,0.853328
sadness,0.793369,0.8677,0.758577,0.645865,0.725132,0.793655,1.0,0.628537,0.761067,0.858748
anger,0.617241,0.784065,0.781829,0.554621,0.821895,0.754247,0.628537,1.0,0.807576,0.727844
sadness,0.739255,0.831201,0.749143,0.504831,0.7685,0.742215,0.761067,0.807576,1.0,0.715168
love,0.821148,0.861148,0.831171,0.767486,0.766302,0.853328,0.858748,0.727844,0.715168,1.0


In [63]:
# Create a Word2Vec Transformer
class W2VEmbeddings(TransformerMixin, BaseEstimator):
    """Scikit-learn transformer mapping raw sentences to averaged Word2Vec vectors.

    Each sentence is tokenised with the notebook-level ``preprocess`` helper,
    every in-vocabulary token is looked up in the Word2Vec model, optionally
    scaled by its IDF weight, and the resulting vectors are averaged.

    Inheriting from ``BaseEstimator`` supplies ``get_params``/``set_params``,
    which ``clone``, ``Pipeline`` and ``GridSearchCV`` rely on.

    Parameters
    ----------
    w2v_model : gensim Word2Vec model or None
        Pre-trained model to use. When ``None``, ``fit`` trains a new model on
        the data it is given and stores it on ``self.w2v_model``.
    weights : {"idf", None}
        ``"idf"`` scales each word vector by an IDF weight learned in ``fit``;
        any other value weights every word by 1.

    Attributes
    ----------
    word2weight : defaultdict or None
        Word -> weight mapping populated by ``fit``; ``None`` before fitting.
    """

    def __init__(self, w2v_model= None, weights = None):
        self.w2v_model = w2v_model
        self.weights = weights
        self.word2weight = None

    def fit(self, X, y=None, **fit_params):
        """Learn the embedding model (if none was supplied) and the word weights.

        Parameters
        ----------
        X : iterable of str
            Raw sentences.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        W2VEmbeddings
            ``self``.
        """
        w2v_X = preprocess(X)

        if self.w2v_model is None:
            print("Fitting new Word2Vec model on training data.")
            w2v_model = Word2Vec(w2v_X, min_count = 1, window = 3, vector_size= 100)
            self.w2v_model = w2v_model
            print("Done!")

        if self.weights == "idf":
            # The documents are already token lists, so the analyzer is the identity.
            tfidf = TfidfVectorizer(analyzer= lambda x: x)
            tfidf.fit(w2v_X)
            # if a word was never seen - it must be at least as infrequent as any of the known words
            # so the default idf is the max of known idf's
            max_idf = max(tfidf.idf_)
            self.word2weight = defaultdict(
                lambda: max_idf,
                [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])
            print("Fit the IDF Model")
        else:
            self.word2weight = defaultdict(lambda: 1)
        return self

    def transform(self, X, y=None, **fit_params):
        """Convert raw sentences to averaged (optionally IDF-weighted) vectors.

        Parameters
        ----------
        X : iterable of str
            Raw sentences.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        list of numpy.ndarray
            One vector per sentence, in input order. A sentence with no
            in-vocabulary token is mapped to a zero vector.
        """
        wv = self.w2v_model.wv
        # Size the fallback from this transformer's own model rather than from a
        # notebook-global one, so every output vector has the same length.
        zero_vector = np.zeros(wv.vector_size)

        X_w2v = []
        for vv in preprocess(X):
            weighted = [wv[w] * self.word2weight[w] for w in vv if w in wv.key_to_index]
            X_w2v.append(np.mean(weighted or [zero_vector], axis= 0))
        return X_w2v


In [73]:
# Sanity check: run the transformer, given the already-trained `model` and no
# word weighting, on the first ten training sentences and tabulate their
# pairwise cosine similarities.
embedder = W2VEmbeddings(w2v_model=model, weights=None)
test_w2v = embedder.fit_transform(X_train[:10])

first_ten_labels = y_train[:10]
pd.DataFrame(cosine_similarity(test_w2v), index=first_ten_labels, columns=first_ten_labels)

None


100%|██████████| 10/10 [00:00<?, ?it/s]
100%|██████████| 10/10 [00:00<00:00, 1809.06it/s]


label,joy,sadness,joy,anger,love,anger,sadness,anger,sadness,love
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
joy,1.0,0.82935,0.822418,0.585457,0.704592,0.730838,0.793369,0.617241,0.739255,0.821148
sadness,0.82935,1.0,0.814826,0.638296,0.769566,0.861654,0.8677,0.784065,0.831201,0.861148
joy,0.822418,0.814826,1.0,0.550974,0.853964,0.743277,0.758577,0.781829,0.749143,0.831171
anger,0.585457,0.638296,0.550974,1.0,0.551553,0.786239,0.645865,0.554621,0.504831,0.767486
love,0.704592,0.769566,0.853964,0.551553,1.0,0.735415,0.725132,0.821895,0.7685,0.766302
anger,0.730838,0.861654,0.743277,0.786239,0.735415,1.0,0.793655,0.754247,0.742215,0.853328
sadness,0.793369,0.8677,0.758577,0.645865,0.725132,0.793655,1.0,0.628537,0.761067,0.858748
anger,0.617241,0.784065,0.781829,0.554621,0.821895,0.754247,0.628537,1.0,0.807576,0.727844
sadness,0.739255,0.831201,0.749143,0.504831,0.7685,0.742215,0.761067,0.807576,1.0,0.715168
love,0.821148,0.861148,0.831171,0.767486,0.766302,0.853328,0.858748,0.727844,0.715168,1.0


In [61]:
knn = KNeighborsClassifier(n_neighbors= 10, metric= 'cosine', weights= 'distance')
knn_grid = GridSearchCV(knn, cv= 3, scoring= 'accuracy', param_grid= {'n_neighbors':[1, 5, 10, 50, 100, 200]}, verbose= 1)
%time knn_grid.fit(X_train_w2v[:1000], y_train.head(1000))
print(knn_grid.best_params_)

%time y_pred = knn_grid.best_estimator_.predict(X_test_w2v)

print(f"KNN classification accuracy is: {accuracy_score(y_test, y_pred):0.2f}.\n")
print(classification_report(y_test, y_pred))

Fitting 3 folds for each of 6 candidates, totalling 18 fits
CPU times: total: 15.6 ms
Wall time: 200 ms
{'n_neighbors': 100}
CPU times: total: 5.59 s
Wall time: 7.19 s
KNN classification accuracy is: 0.49.

              precision    recall  f1-score   support

       anger       0.48      0.01      0.03     13889
        fear       0.59      0.02      0.04     11776
         joy       0.56      0.81      0.66     33625
        love       0.41      0.00      0.00      8144
     sadness       0.43      0.75      0.55     28916
    surprise       0.38      0.00      0.00      3530

    accuracy                           0.49     99880
   macro avg       0.48      0.27      0.21     99880
weighted avg       0.50      0.49      0.39     99880



In [62]:
svm = SGDClassifier(loss= 'hinge')
svm.fit(X_train_w2v, y_train)

%time y_pred = svm.predict(X_test_w2v)

print(f"KNN classification accuracy is: {accuracy_score(y_test, y_pred):0.2f}.\n")
print(classification_report(y_test, y_pred))

CPU times: total: 31.2 ms
Wall time: 125 ms
KNN classification accuracy is: 0.57.

              precision    recall  f1-score   support

       anger       0.47      0.13      0.20     13889
        fear       0.50      0.19      0.28     11776
         joy       0.65      0.86      0.74     33625
        love       0.53      0.06      0.10      8144
     sadness       0.51      0.79      0.62     28916
    surprise       0.21      0.12      0.15      3530

    accuracy                           0.57     99880
   macro avg       0.48      0.36      0.35     99880
weighted avg       0.54      0.57      0.50     99880



In [208]:
# Re-join the first ten tokenised training sentences into space-separated
# strings so they can be read at a glance.
X = list(map(" ".join, w2v_train[:10]))
X

['always want home place others feel welcomed loved comfortable',
 'know like work always underlining feeling ignored forgotten',
 'hate able make feel better',
 'id memory feelings insincere',
 'make mood feel horny',
 'feel angry depressed work steal glance boss feelings dissipate',
 'feel like hated person planet turning brendon',
 'im feeling slightly agitated today cant assed put better mood',
 'get stuff else end feeling lame sitting around house thumb butt',
 'feel today going passionate day one want show one feelings situation whether romantic sense value either personally socially']

In [299]:
# Learn IDF weights from the first ten tokenised training sentences. The
# documents are already token lists, so the analyzer is the identity function.
# (`defaultdict` comes from the import cell at the top of the notebook.)
tfidf = TfidfVectorizer(analyzer=lambda x: x)
tfidf.fit(w2v_train[:10])

# Word -> idf for every word in the fitted vocabulary; built once and reused
# for both the lookup table and the display below.
idf_by_word = {word: tfidf.idf_[i] for word, i in tfidf.vocabulary_.items()}

# if a word was never seen - it must be at least as infrequent
# as any of the known words - so the default idf is the max of
# known idf's
max_idf = max(tfidf.idf_)
word2weight = defaultdict(lambda: max_idf, idf_by_word)

pd.DataFrame.from_dict(idf_by_word, orient= 'index').sort_values(0)

AttributeError: 'list' object has no attribute 'lower'

In [224]:
# Look up a word that does not occur in the ten sentences printed earlier
# (presumably the same ones the IDF model was fitted on): the defaultdict
# falls back to max_idf and, as a side effect, stores the new key.
word2weight['amateur']


2.7047480922384253