In [368]:
import os
import pickle
from glob import glob
from nlp import load_dataset
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook

In [369]:
dataset = load_dataset('csv', delimiter='\t',
                       data_files={'train': 'data/EmoEvalEs/train.tsv',
                                    'validation': 'data/EmoEvalEs/dev.tsv',})
test = load_dataset('csv', delimiter='\t', data_files={'test': 'data/EmoEvalEs/emoevales_test.tsv'})
test = pd.read_csv('data/EmoEvalEs/emoevales_test.tsv', sep='\t', quoting=csv.QUOTE_NONE)

Using custom data configuration default
Using custom data configuration default


In [370]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from scipy.sparse import hstack
ohe = OneHotEncoder()
feat1_train = ohe.fit_transform(np.array(dataset['train']['event']).reshape(-1,1))
feat1_dev = ohe.transform(np.array(dataset['validation']['event']).reshape(-1,1))
feat1_test = ohe.transform(np.array(test['event']).reshape(-1,1))

oe = OrdinalEncoder()
feat2_train = oe.fit_transform(np.array(dataset['train']['offensive']).reshape(-1,1))
feat2_dev = oe.transform(np.array(dataset['validation']['offensive']).reshape(-1,1))
feat2_test = oe.transform(np.array(test['offensive']).reshape(-1,1))

In [371]:
def load_feats(name, fold):
    path = 'feats/{}_{}.pck'.format(name, fold)
    with open(path, 'rb') as f:
        obj = pickle.load(f)
    return obj
meaningcloud_dev = load_feats('meaningcloud', 'dev')
meaningcloud_test = load_feats('meaningcloud', 'test')

In [372]:
def read_preds(name, fold):
    path = 'preds_{}/{}.pck'.format(fold, name)
    with open(path, 'rb') as f:
        obj = pickle.load(f)
    return obj

preds = {'dev': dict(), 'test': dict()}

preds['dev']['xlmroberta']= read_preds('xlmroberta', 'dev')\
    .set_index('id').loc[dataset['validation']['id']]['emotion'].values
preds['test']['xlmroberta']= pd.read_csv('preds_test/submission-roberta-final.tsv',
                                         header=None, sep='\t', names=['id', 'emotion'])\
    .set_index('id').loc[test['id']]['emotion'].values

for fold in ('dev', 'test'):
    for preds_dev in glob('preds_{}/*.pck'.format(fold)):
        name = os.path.splitext(os.path.basename(preds_dev))[0]
        if name in ('xlmroberta', 'xlmroberta-extrafeatures'):
            continue
        preds[fold][name] = read_preds(name, fold)
print('dev', list(preds['dev'].keys()))
print('test', list(preds['dev'].keys()))

dev ['xlmroberta', 'w2v', 'ngramfeats', 'tfidf', 'simon', 'ngram']
test ['xlmroberta', 'w2v', 'ngramfeats', 'tfidf', 'simon', 'ngram']


In [373]:
from sklearn.metrics import classification_report, accuracy_score, f1_score

metrics = dict()
for name in preds['dev'].keys():
    labels = dataset['validation']['emotion']
    class_rep = classification_report(labels, preds['dev'][name])
    metrics[name] = {
        'acc': accuracy_score(labels, preds['dev'][name]),
        'f1': f1_score(labels, preds['dev'][name], average='weighted')
    }
    print(name)
    print(class_rep)
    print()

  _warn_prf(average, modifier, msg_start, len(result))


xlmroberta
              precision    recall  f1-score   support

       anger       0.70      0.68      0.69        85
     disgust       0.00      0.00      0.00        16
        fear       1.00      0.44      0.62         9
         joy       0.70      0.66      0.68       181
      others       0.73      0.85      0.78       414
     sadness       0.80      0.80      0.80       104
    surprise       0.60      0.09      0.15        35

    accuracy                           0.73       844
   macro avg       0.65      0.50      0.53       844
weighted avg       0.71      0.73      0.71       844


w2v
              precision    recall  f1-score   support

       anger       0.60      0.44      0.50        85
     disgust       0.00      0.00      0.00        16
        fear       1.00      0.11      0.20         9
         joy       0.54      0.38      0.44       181
      others       0.64      0.81      0.72       414
     sadness       0.68      0.66      0.67       104
    surp

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [374]:
metrics = pd.DataFrame(metrics).T
metrics

Unnamed: 0,acc,f1
xlmroberta,0.731043,0.710999
w2v,0.61019,0.588116
ngramfeats,0.64218,0.61951
tfidf,0.569905,0.596302
simon,0.663507,0.624579
ngram,0.626777,0.601933


# Ensemble

In [408]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, Normalizer
from sklearn.svm import SVC
from scipy.stats import uniform, randint
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest

ohe = OneHotEncoder()
X_dev = pd.DataFrame(preds['dev']).values
X_dev = ohe.fit_transform(X_dev)
y_dev = dataset['validation']['emotion']
X_test = ohe.transform(pd.DataFrame(preds['test']).values)


final_preds = dict()
# lr
# lr = LogisticRegression(solver='liblinear', max_iter=200, random_state=42)
# distributions = dict(C=uniform(loc=0, scale=10), penalty=['l2', 'l1'])
# clf = RandomizedSearchCV(lr, distributions, random_state=42)
# clf.fit(X_dev, y_dev)
# final_preds['ensemble_lr'] = cross_val_predict(clf.best_estimator_, X_dev, y_dev)

# random forest
# rf = RandomForestClassifier()
# distributions = dict(n_estimators=randint(1, 500))
# clf = RandomizedSearchCV(rf, distributions, random_state=42)
# clf.fit(X_dev, y_dev)
# final_preds['ensemble_rf'] = cross_val_predict(clf.best_estimator_, X_dev, y_dev)

#SVM
# svm = SVC(random_state=42)
# distributions = dict(C=uniform(loc=0, scale=10))
# clf = RandomizedSearchCV(svm, distributions, random_state=42)
# clf.fit(X_dev, y_dev)
# final_preds['ensemble_svm'] = cross_val_predict(clf.best_estimator_, X_dev, y_dev)

# lr + feats
# X_dev_feats =hstack((X_dev, feat1_dev.todense(), feat2_dev))
# lr = LogisticRegression(solver='liblinear', max_iter=300, random_state=42,
#                         class_weight='balanced',)
# distributions = dict(C=uniform(loc=0, scale=10), penalty=['l2', 'l1'])
# clf = RandomizedSearchCV(lr, distributions, random_state=42)
# clf.fit(X_dev, y_dev)
# final_preds['ensemble_lr+feats'] = cross_val_predict(clf.best_estimator_, X_dev_feats, y_dev)


# svm + feats
# X_dev_feats =hstack((X_dev, feat1_dev.todense(), feat2_dev))
# svm =svm = SVC(random_state=42)
# distributions = dict(C=uniform(loc=0, scale=10))
# clf = RandomizedSearchCV(svm, distributions, random_state=42)
# clf.fit(X_dev, y_dev)
# final_preds['ensemble_svm+feats'] = cross_val_predict(clf.best_estimator_, X_dev_feats, y_dev)

# random forest
# X_dev_feats =hstack((X_dev, feat1_dev.todense(), feat2_dev))
# rf = RandomForestClassifier()
# distributions = dict(n_estimators=randint(1, 500))
# clf = RandomizedSearchCV(rf, distributions, random_state=42)
# clf.fit(X_dev_feats, y_dev)
# final_preds['ensemble_rf+feats'] = cross_val_predict(clf.best_estimator_, X_dev, y_dev)

# lr + feats + meaningcloud
X_dev_feats_mc =hstack((X_dev, feat1_dev.todense(), feat2_dev, meaningcloud_dev))

lr = LogisticRegression(solver='liblinear', max_iter=300, random_state=42,
                        class_weight='balanced',)
pipe = Pipeline([
    ('pprocess', Normalizer()),
    ('lr', lr),
])
distributions = dict(lr__C=uniform(loc=0, scale=10), lr__penalty=['l2', 'l1'],)
clf = RandomizedSearchCV(pipe, distributions, random_state=42)
clf.fit(X_dev_feats_mc, y_dev)
final_preds['ensemble_lr+feats+mc'] = cross_val_predict(clf.best_estimator_, X_dev_feats_mc, y_dev)

for name, p in final_preds.items():
    print(name)
    print(classification_report(y_dev, final_preds[name]))
    print()
    metrics.loc[name] = [
        accuracy_score(y_dev, final_preds[name]),
        f1_score(y_dev, final_preds[name], average='weighted'),
    ] 

ensemble_lr+feats+mc
              precision    recall  f1-score   support

       anger       0.68      0.62      0.65        85
     disgust       0.19      0.19      0.19        16
        fear       0.42      0.56      0.48         9
         joy       0.69      0.65      0.67       181
      others       0.75      0.82      0.79       414
     sadness       0.80      0.80      0.80       104
    surprise       0.42      0.14      0.21        35

    accuracy                           0.72       844
   macro avg       0.56      0.54      0.54       844
weighted avg       0.71      0.72      0.71       844




In [409]:
clf.best_estimator_

Pipeline(steps=[('pprocess', Normalizer()),
                ('lr',
                 LogisticRegression(C=0.5641157902710026,
                                    class_weight='balanced', max_iter=300,
                                    penalty='l1', random_state=42,
                                    solver='liblinear'))])

In [410]:
metrics

Unnamed: 0,acc,f1
xlmroberta,0.731043,0.710999
w2v,0.61019,0.588116
ngramfeats,0.64218,0.61951
tfidf,0.569905,0.596302
simon,0.663507,0.624579
ngram,0.626777,0.601933
ensemble_lr,0.71564,0.689328
ensemble_rf,0.693128,0.675578
ensemble_lr+feats,0.709716,0.707557
ensemble_rf+feats,0.680095,0.664358


In [378]:
print(metrics.to_markdown())

|                      |      acc |       f1 |
|:---------------------|---------:|---------:|
| xlmroberta           | 0.731043 | 0.710999 |
| w2v                  | 0.61019  | 0.588116 |
| ngramfeats           | 0.64218  | 0.61951  |
| tfidf                | 0.569905 | 0.596302 |
| simon                | 0.663507 | 0.624579 |
| ngram                | 0.626777 | 0.601933 |
| ensemble_lr          | 0.71564  | 0.689328 |
| ensemble_rf          | 0.693128 | 0.675578 |
| ensemble_lr+feats    | 0.709716 | 0.707557 |
| ensemble_rf+feats    | 0.680095 | 0.664358 |
| ensemble_lr+feats+mc | 0.7109   | 0.71138  |


# Export predictions

In [379]:
# i don't want to continue the execution
raise

RuntimeError: No active exception to reraise

In [380]:
X_test

<1656x34 sparse matrix of type '<class 'numpy.float64'>'
	with 9936 stored elements in Compressed Sparse Row format>

In [411]:
# final model
# lr + feats + meaningcloud
X_dev_feats_mc =hstack((X_dev, feat1_dev.todense(), feat2_dev, meaningcloud_dev))

lr = LogisticRegression(solver='liblinear', max_iter=300, random_state=42,
                        class_weight='balanced',)
pipe = Pipeline([
    ('pprocess', Normalizer()),
    ('lr', lr),
])
distributions = dict(lr__C=uniform(loc=0, scale=10), lr__penalty=['l2', 'l1'],)
clf = RandomizedSearchCV(pipe, distributions, random_state=42)
clf.fit(X_dev_feats_mc, y_dev)


X_test_feats_mc =hstack((X_test, feat1_test.todense(), feat2_test, meaningcloud_test))
selected_test_preds = clf.best_estimator_.predict(X_test_feats_mc)

In [412]:
pd.DataFrame(zip(test['id'], selected_test_preds.astype(str)))\
.to_csv('preds_test/test_preds_ensemble_lr-feats-mc.tsv', header=None, index=False, sep='\t')