In [177]:
import pandas as pd
import numpy as np

from tqdm import tqdm

import re
import nltk
# nltk.download('stopwords')
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
stop_words = list(stopwords.words('english'))

import spacy
nlp = spacy.load("en_core_web_sm", disable=["tagger","parser", "ner"])


from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import recall_score, accuracy_score, classification_report
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.inspection import permutation_importance
from sklearn.svm import LinearSVC, SVC

import matplotlib.pyplot as plt
import seaborn as sns

import gensim
from gensim.models import Word2Vec

## Import pre-processed data

Write a function to import pre-processed data for modelling. Currently, just reading from a previously saved csv file.


In [61]:
# Clean text
def clean_text(text):

    # remove punctuation
    reg_punc =re.compile(r'[^\w\s]')
    text = reg_punc.sub(r'', text)

    # remove html
    reg_html = re.compile(r'<.*?>')
    text = reg_html.sub(r'', text)

    # remove url
    reg_url = re.compile(r'http\S+')
    text = reg_url.sub(r'', text)

    # remove numerical values
    reg_num = re.compile(r'[0-9]')
    text = reg_num.sub(r'', text)

    # remove special characters
    reg_spcl = re.compile('[@_!#$%^&*()<>?/\\|}{~:]')
    text = reg_spcl.sub(r'', text)

    # remove emoji
    emoji_url = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    text = emoji_url.sub(r'', text)

    return text

In [66]:
data = pd.read_csv("data/dev_data.csv")
data['text'] = data['text'].apply(lambda x: clean_text(x))

emotions = data['label'].unique().tolist()
emotions.sort()
emotions

['anger', 'fear', 'joy', 'love', 'sadness', 'surprise']

In [67]:
X_train,X_test, y_train, y_test = train_test_split(data['text'], data['label'], random_state=0,
                                                   test_size= 0.3, stratify= data['label'])

In [73]:
def preprocess(X):
    w2v_data = []
    for tt in tqdm(X):
        w2v_data.append([ww for ww in word_tokenize(tt.lower()) if ww not in stop_words])
    return w2v_data

In [237]:
w2v_train = preprocess(X_train)
w2v_test = preprocess(X_test)

model = Word2Vec(w2v_train, min_count = 1, window = 3, vector_size= 50)
model.wv.most_similar('love')

100%|██████████| 233053/233053 [00:34<00:00, 6814.44it/s]
100%|██████████| 99880/99880 [00:13<00:00, 7233.16it/s]


[('loving', 0.7861563563346863),
 ('miss', 0.7359917759895325),
 ('loves', 0.7110063433647156),
 ('loved', 0.6903106570243835),
 ('passion', 0.6851310729980469),
 ('caring', 0.684219241142273),
 ('spirit', 0.6817525029182434),
 ('thank', 0.6717989444732666),
 ('joy', 0.6585392951965332),
 ('jesus', 0.6555297374725342)]

In [238]:
def sentence_to_vector(X):
    # convert sentence to vectors by taking a simple average of all word embeddings
    X_w2v = []
    for vv in tqdm(X):
        try:
            X_w2v.append(model.wv.get_mean_vector(vv))
        except:
            X_w2v.append(np.zeros(model.wv.get_mean_vector(['anger']).shape))
    return X_w2v

In [164]:
X_train_w2v = sentence_to_vector(w2v_train)
X_test_w2v = sentence_to_vector(w2v_test)

100%|██████████| 233053/233053 [00:10<00:00, 21995.88it/s]
100%|██████████| 99880/99880 [00:06<00:00, 16645.13it/s]


In [277]:
# Create a Word2Vec Transformer
class W2VEmbeddings(TransformerMixin):
    def __init__(self, w2v_model= None):
        self.w2v_model = w2v_model
        # self.word2weight = None
        # self.dim = len(word2vec.itervalues().next())

    def fit(self, X, y=None, **fit_params):
        w2v_X = preprocess(X)
        w2v_model = Word2Vec(w2v_X, min_count = 1, window = 3, vector_size= 50)
        self.w2v_model = w2v_model
        print(self.w2v_model.wv.most_similar('love'))
        return self
    
    def transform(self, X, y=None, **fit_params):
        X_w2v = []
        for vv in preprocess(X):
            print(vv)
            X_w2v.append(np.mean([self.w2v_model.wv[w] for w in vv if w in self.w2v_model.wv.key_to_index.keys()] 
                    or [np.zeros(model.vector_size)], axis= 0))
        return X_w2v


In [285]:
X_train.head(1)

194726    i always want our home to be a place where oth...
Name: text, dtype: object

In [287]:
# test_w2v = W2VEmbeddings().fit(X_train[:50000])
test_w2v.transform(['love is the best feeling'])

100%|██████████| 50000/50000 [00:08<00:00, 6224.13it/s]


[('friends', 0.9595770835876465), ('supporting', 0.9590832591056824), ('god', 0.9581246972084045), ('share', 0.9572946429252625), ('accepted', 0.9559061527252197), ('loved', 0.9543684720993042), ('caring', 0.9524101614952087), ('live', 0.9520652294158936), ('special', 0.9519128203392029), ('sincere', 0.9513068795204163)]


100%|██████████| 1/1 [00:00<?, ?it/s]

['love', 'best', 'feeling']





[array([-0.07077488,  0.21535026,  0.1976145 ,  0.32435048, -0.38505396,
        -0.93124455,  1.3812528 ,  1.052912  , -0.97332543, -0.85820675,
         0.03818871, -1.188398  ,  0.6946807 ,  0.16509019, -1.4486612 ,
         0.11223182,  1.0000077 , -0.22863756, -1.6671696 , -0.7130013 ,
         0.55658495,  0.78586847,  0.95190483, -0.75732344,  0.7915918 ,
         0.71529895, -0.9535775 ,  0.10080209, -0.968029  , -0.81822824,
        -0.22225536, -0.2586255 ,  0.51767635,  0.81950474, -1.1094187 ,
         0.9794965 ,  0.61075383, -0.13021822,  0.06925955, -0.50232595,
         0.45609066, -0.349906  , -0.27678287, -0.0663076 ,  1.2981023 ,
        -0.06319097, -0.7450393 , -0.18962996,  0.6082306 ,  0.70886546],
       dtype=float32)]

In [174]:
knn = KNeighborsClassifier(n_neighbors= 10, metric= 'cosine', weights= 'distance')
knn_grid = GridSearchCV(knn, cv= 3, scoring= 'accuracy', param_grid= {'n_neighbors':[1, 5, 10, 50, 100, 200]}, verbose= 1)
%time knn_grid.fit(X_train_w2v[:50000], y_train.head(50000))
print(knn_grid.best_params_)

%time y_pred = knn_grid.best_estimator_.predict(X_test_w2v)

print(f"KNN classification accuracy is: {accuracy_score(y_test, y_pred):0.2f}.\n")
print(classification_report(y_test, y_pred))

Fitting 3 folds for each of 6 candidates, totalling 18 fits
CPU times: total: 4.66 s
Wall time: 7.06 s
{'n_neighbors': 100}
CPU times: total: 26.5 s
Wall time: 32.5 s
KNN classification accuracy is: 0.53.

              precision    recall  f1-score   support

       anger       0.56      0.08      0.13     13889
        fear       0.60      0.10      0.17     11776
         joy       0.58      0.84      0.69     33625
        love       0.57      0.04      0.07      8144
     sadness       0.47      0.76      0.58     28916
    surprise       0.33      0.00      0.01      3530

    accuracy                           0.53     99880
   macro avg       0.52      0.30      0.27     99880
weighted avg       0.54      0.53      0.44     99880



In [171]:
svm = SGDClassifier(loss= 'hinge')
svm.fit(X_train_w2v, y_train)

%time y_pred = svm.predict(X_test_w2v)

print(f"KNN classification accuracy is: {accuracy_score(y_test, y_pred):0.2f}.\n")
print(classification_report(y_test, y_pred))

CPU times: total: 0 ns
Wall time: 50 ms
KNN classification accuracy is: 0.54.

              precision    recall  f1-score   support

       anger       0.55      0.10      0.17     13889
        fear       0.72      0.09      0.15     11776
         joy       0.59      0.89      0.71     33625
        love       0.26      0.07      0.11      8144
     sadness       0.53      0.72      0.61     28916
    surprise       0.14      0.12      0.13      3530

    accuracy                           0.54     99880
   macro avg       0.46      0.33      0.31     99880
weighted avg       0.54      0.54      0.47     99880



In [208]:
X = [" ".join(x) for x in w2v_train[:10]]
X

['always want home place others feel welcomed loved comfortable',
 'know like work always underlining feeling ignored forgotten',
 'hate able make feel better',
 'id memory feelings insincere',
 'make mood feel horny',
 'feel angry depressed work steal glance boss feelings dissipate',
 'feel like hated person planet turning brendon',
 'im feeling slightly agitated today cant assed put better mood',
 'get stuff else end feeling lame sitting around house thumb butt',
 'feel today going passionate day one want show one feelings situation whether romantic sense value either personally socially']

In [222]:
from collections import defaultdict

tfidf = TfidfVectorizer(analyzer=lambda x: x)
tfidf.fit(w2v_train[:10])
# if a word was never seen - it must be at least as infrequent
# as any of the known words - so the default idf is the max of 
# known idf's
max_idf = max(tfidf.idf_)
word2weight = defaultdict(lambda: max_idf, [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

pd.DataFrame.from_dict({word:tfidf.idf_[i] for word,i in tfidf.vocabulary_.items()}, orient= 'index').sort_values(0)

Unnamed: 0,0
feel,1.451985
feelings,2.011601
feeling,2.011601
always,2.299283
today,2.299283
...,...
personally,2.704748
planet,2.704748
turning,2.704748
memory,2.704748


In [224]:
word2weight['amateur']


2.7047480922384253