In [87]:
from nlp import load_dataset
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook

In [2]:
dataset = load_dataset('csv', delimiter='\t',
                       data_files={'train': 'data/EmoEvalEs/train.tsv',
                                    'validation': 'data/EmoEvalEs/dev.tsv',})
test = load_dataset('csv', delimiter='\t', data_files={'test': 'data/EmoEvalEs/emoevales_test.tsv'})

Using custom data configuration default
Using custom data configuration default


# Dataset stats

In [3]:
print('TRAIN')
print(len(dataset['train']))
print()

print('DEV')
print(len(dataset['validation']))
print()

print('TEST')
print(len(test['test']))
print()

TRAIN
5723

DEV
844

TEST
1626



In [4]:
print('TRAIN')
print(pd.Series(dataset['train']['emotion']).value_counts(normalize=True))
print()

print('DEV')
print(pd.Series(dataset['validation']['emotion']).value_counts(normalize=True))
print()

TRAIN
others      0.489254
joy         0.214398
sadness     0.121090
anger       0.102918
surprise    0.041587
disgust     0.019395
fear        0.011358
dtype: float64

DEV
others      0.490521
joy         0.214455
sadness     0.123223
anger       0.100711
surprise    0.041469
disgust     0.018957
fear        0.010664
dtype: float64



# Simple pipeline

In [15]:
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegressionCV

In [11]:
from gsitk.preprocess import pprocess_twitter, Preprocessor

text_train = Preprocessor(pprocess_twitter).transform(dataset['train']['tweet'])
text_dev = Preprocessor(pprocess_twitter).transform(dataset['validation']['tweet'])
text_test = Preprocessor(pprocess_twitter).transform(test['test']['tweet'])

In [62]:
spanish_lex = pd.read_csv('data/SpanishSentimentLexicons/fullStrengthLexicon.txt', header=None, sep='\t')[0].values

In [38]:
# SIMON
import string
from collections import Counter
from itertools import chain
from nltk.corpus import stopwords
from gsitk.features import simon
from gensim.models import KeyedVectors

def generate_custom_lexicon(text):
    filter_words = set(stopwords.words('spanish')) | set(string.punctuation)

    counter = Counter(chain.from_iterable(text.str.split(' ').values))
    selection = sorted([(word, count) for word, count in counter.items()], key=lambda wc: wc[1], reverse=True)
    selection = [word for word, _ in selection if word not in filter_words]
    selection = [selection]
    return selection

all_texts = pd.Series(np.concatenate((text_train, text_dev, text_test), axis=0))
custom_lexicon = generate_custom_lexicon(all_texts)

# facebook fasttext embeddings
embbeddings = KeyedVectors.load_word2vec_format(
    '/home/jovyan/work/projects/data/WordEmbeddings/eng/crawl-300d-2M.vec', binary=False)

In [None]:
from itertools import product

simons = dict()
class_reports = dict()

n_lex_words = [200, 500, 700, 1000, 2000]
percentiles = [10, 25, 50, 100]

combinations = list(product(n_lex_words, percentiles))
for lex_i_name, lex_i in {'custom': custom_lexicon, 'emotion': spanish_lex}.items():
    for combination in tqdm_notebook(combinations):
        n_lexicon_words_i = combination[0]
        percentile_i = combination[1]

        _simon_model = simon.Simon(lexicon=lex_i,
                                   n_lexicon_words=n_lexicon_words_i, embedding=embbeddings)
        simon_model = simon.simon_pipeline(simon_transformer=_simon_model, percentile=percentile_i)

        X_simon_train = simon_model.fit_transform(pd.Series(text_train).str.split(' '), dataset['train']['emotion'])
        X_simon_dev = simon_model.transform(pd.Series(text_dev).str.split(' '))

        simon_pipe = Pipeline([
            ('lr', LogisticRegressionCV(cv=10, random_state=42, n_jobs=-1, solver='liblinear'))
        ])
        simon_pipe.fit(X_simon_train, dataset['train']['emotion'])
        simon_preds_dev = simon_pipe.predict(X_simon_dev)
        class_reports[lex_i_name+str(combination)] = classification_report(
            dataset['validation']['emotion'], simon_preds_dev)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for combination in tqdm_notebook(combinations):


  0%|          | 0/20 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  0%|          | 0/20 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [106]:
for comb, report in class_reports.items():
    print(comb)
    print(report)
    print()

custom(200, 10)
              precision    recall  f1-score   support

       anger       0.61      0.20      0.30        85
     disgust       0.00      0.00      0.00        16
        fear       0.83      0.56      0.67         9
         joy       0.75      0.07      0.12       181
      others       0.55      0.98      0.70       414
     sadness       0.84      0.41      0.55       104
    surprise       0.00      0.00      0.00        35

    accuracy                           0.57       844
   macro avg       0.51      0.32      0.33       844
weighted avg       0.60      0.57      0.48       844


custom(200, 25)
              precision    recall  f1-score   support

       anger       0.68      0.18      0.28        85
     disgust       0.00      0.00      0.00        16
        fear       0.80      0.44      0.57         9
         joy       0.69      0.25      0.37       181
      others       0.61      0.93      0.74       414
     sadness       0.81      0.44      0.57  

BESTION OPTION SEEMS TO BE:
- lexicon: custom
- number of lexicon words: 2000
- percentile: 50

In [71]:
preds_dev = dict()

In [72]:
# ngram_pipe = Pipeline([
#     ('ngrams', CountVectorizer(ngram_range=(1,2))),
#     ('lr', LogisticRegressionCV(cv=10, random_state=42, n_jobs=-1, solver='liblinear'))
# ])
# ngram_pipe.fit(text_train, dataset['train']['emotion'])
# preds_dev['ngram']  = pipe.predict(text_dev)




In [73]:
for preds_dev_name in preds_dev.keys():
    print(preds_dev_name)
    print(classification_report(dataset['validation']['emotion'], preds_dev[preds_dev_name]))
    print()

simon
              precision    recall  f1-score   support

       anger       0.00      0.00      0.00        85
     disgust       0.00      0.00      0.00        16
        fear       0.00      0.00      0.00         9
         joy       0.20      0.01      0.01       181
      others       0.49      0.99      0.66       414
     sadness       0.00      0.00      0.00       104
    surprise       0.00      0.00      0.00        35

    accuracy                           0.48       844
   macro avg       0.10      0.14      0.10       844
weighted avg       0.28      0.48      0.32       844




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [68]:
for preds_dev_name in preds_dev.keys():
    print(preds_dev_name)
    print(classification_report(dataset['validation']['emotion'], preds_dev[preds_dev_name]))
    print()

ngram
              precision    recall  f1-score   support

       anger       0.49      0.34      0.40        85
     disgust       0.00      0.00      0.00        16
        fear       1.00      0.33      0.50         9
         joy       0.54      0.46      0.50       181
      others       0.65      0.82      0.72       414
     sadness       0.73      0.64      0.68       104
    surprise       0.38      0.09      0.14        35

    accuracy                           0.62       844
   macro avg       0.54      0.38      0.42       844
weighted avg       0.60      0.62      0.60       844


simon
              precision    recall  f1-score   support

       anger       0.00      0.00      0.00        85
     disgust       0.00      0.00      0.00        16
        fear       0.00      0.00      0.00         9
         joy       0.20      0.01      0.01       181
      others       0.49      0.99      0.66       414
     sadness       0.20      0.01      0.02       104
    surpris

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
