In [1]:
%load_ext autoreload

In [2]:
%autoreload 2

In [62]:
import csv
from nlp import load_dataset
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook

In [67]:
# Train/dev splits, read from tab-separated files into one dataset dict.
split_files = {
    'train': 'data/EmoEvalEs/train.tsv',
    'validation': 'data/EmoEvalEs/dev.tsv',
}
dataset = load_dataset('csv', delimiter='\t', data_files=split_files)

# The test file is read twice: `test` goes through `load_dataset` (indexed by
# the split name 'test'), `_test` is a pandas DataFrame parsed with quoting
# disabled.
test_file = 'data/EmoEvalEs/emoevales_test.tsv'
test = load_dataset('csv', delimiter='\t', data_files={'test': test_file})
_test = pd.read_csv(test_file, sep='\t', quoting=csv.QUOTE_NONE)

Using custom data configuration default
Using custom data configuration default


In [69]:
# Preview the test set through `_test`, the pandas DataFrame: `test` is the
# object returned by `load_dataset`, which is indexed by split name and has
# no `.head()`.
_test.head()

Unnamed: 0,id,event,tweet,offensive
0,16b25dfb-e284-4a58-b62c-8186fc082eb6,GameOfThrones,"Señor de luz, ven a nosotros en nuestra oscuri...",NO
1,2a80f6bf-4750-4783-9bc8-fdb8ff2b94c4,SpainElection,Pues ya hemos votado tanto mi madre y yo #Elec...,NO
2,1f477a6f-3559-41ee-8ec5-2e77aee53190,WorldBookDay,"#DiaDelLibro 😇⭐❤️🙏🙌😍🌷 sentimientos, viajar con...",NO
3,838add70-748c-4635-8133-36ff0b05aeb0,WorldBookDay,"¡Feliz #DíaDelLibro📚! ”El libro es fuerza, es ...",NO
4,64e3dd59-ae5c-4b9b-bd29-987609eb95d8,SpainElection,"Pues qué queréis que os diga, este sarao polít...",NO


In [5]:
import pickle

def save_preds(obj, name, fold):
    """Pickle ``obj`` to ``preds_<fold>/<name>.pck``.

    Args:
        obj: Any picklable object (in this notebook, an array of predictions).
        name: File stem identifying the model that produced ``obj``.
        fold: Data split name; selects the ``preds_<fold>`` directory.

    The target directory is created when missing, so the call also works the
    first time the notebook is run in a clean working directory.
    """
    import os  # local import keeps this cell self-contained

    path = 'preds_{}/{}.pck'.format(fold, name)
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(obj, f)

# Base classifiers

In [6]:
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegressionCV

In [71]:
from gsitk.preprocess import pprocess_twitter, Preprocessor

# Run gsitk's Twitter preprocessor over the raw tweets of every split.
# The test tweets are taken from `_test`, the pandas DataFrame: `test` comes
# from `load_dataset` and is indexed by split name, so `test['tweet']` does
# not address the tweet column.
text_train = Preprocessor(pprocess_twitter).transform(dataset['train']['tweet'])
text_dev = Preprocessor(pprocess_twitter).transform(dataset['validation']['tweet'])
text_test = Preprocessor(pprocess_twitter).transform(_test['tweet'])

In [72]:
all_texts = pd.Series(np.concatenate((text_train, text_dev, text_test), axis=0))

In [75]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
# One-hot encode the `event` column; the categories are learned from the
# training split only and reused for dev and test.
# Test columns are read from `_test` (the pandas DataFrame); `test` is indexed
# by split name, so `test['event']` / `test['offensive']` would not resolve.
ohe = OneHotEncoder()
feat1_train = ohe.fit_transform(np.array(dataset['train']['event']).reshape(-1, 1))
feat1_dev = ohe.transform(np.array(dataset['validation']['event']).reshape(-1, 1))
feat1_test = ohe.transform(np.array(_test['event']).reshape(-1, 1))

# Ordinal-encode the `offensive` column, again fitted on the training split.
oe = OrdinalEncoder()
feat2_train = oe.fit_transform(np.array(dataset['train']['offensive']).reshape(-1, 1))
feat2_dev = oe.transform(np.array(dataset['validation']['offensive']).reshape(-1, 1))
feat2_test = oe.transform(np.array(_test['offensive']).reshape(-1, 1))

## SIMON custom

In [11]:
import string
from collections import Counter
from itertools import chain
from nltk.corpus import stopwords
from gsitk.features import simon
from gensim.models import KeyedVectors

def simon_pipeline():
    """Build the classifier pipeline used on precomputed feature matrices.

    Returns:
        A scikit-learn ``Pipeline`` with a single step named ``'lr'``: a
        ``LogisticRegressionCV`` configured with ``cv=10``,
        ``random_state=42``, ``n_jobs=-1`` and the ``liblinear`` solver.
    """
    classifier = LogisticRegressionCV(
        cv=10,
        random_state=42,
        n_jobs=-1,
        solver='liblinear',
    )
    return Pipeline([('lr', classifier)])

def generate_custom_lexicon(text):
    """Build a frequency-ordered word list from a Series of texts.

    Args:
        text: pandas Series of strings; each entry is tokenised by splitting
            on a single space character.

    Returns:
        A one-element list wrapping the list of distinct tokens, ordered by
        descending frequency, with Spanish stopwords and single punctuation
        characters removed.
    """
    excluded = set(string.punctuation).union(stopwords.words('spanish'))

    token_lists = text.str.split(' ').values
    frequencies = Counter(token for tokens in token_lists for token in tokens)

    # most_common() sorts by descending count; ties keep first-seen order.
    ranked = [token for token, _ in frequencies.most_common() if token not in excluded]
    return [ranked]

In [12]:
# Word list derived from the train, dev and test texts together.
print('Generating custom lexicon')
custom_lexicon = generate_custom_lexicon(all_texts)
print('Done')

# facebook fasttext embeddings, stored in word2vec text format
# NOTE(review): the path sits under an `eng/` directory while the tweets are
# Spanish; confirm this is the intended embedding model.
embeddings_path = '/home/jovyan/work/projects/data/WordEmbeddings/eng/crawl-300d-2M.vec'
print('Loading embeddings')
embbeddings = KeyedVectors.load_word2vec_format(embeddings_path, binary=False)
print('Done')

# First column of the tab-separated Spanish sentiment lexicon file.
lexicon_table = pd.read_csv(
    'data/SpanishSentimentLexicons/fullStrengthLexicon.txt',
    header=None,
    sep='\t',
)
spanish_lex = lexicon_table[0].values

Generating custom lexicon
Done
Loading embeddings
Done


NameError: name 'load_resources' is not defined

In [78]:
# SIMON feature extractor built from the custom lexicon and the embeddings.
_simon_model = simon.Simon(lexicon=custom_lexicon,
                           n_lexicon_words=2000,
                           embedding=embbeddings)
simon_model_custom = simon.simon_pipeline(simon_transformer=_simon_model, percentile=50)

# The SIMON pipeline is fitted on the training tokens and labels only, then
# applied unchanged to dev and test.
print('Training and predicting SIMON feats')
X_simon_train = simon_model_custom.fit_transform(pd.Series(text_train).str.split(' '), dataset['train']['emotion'])
X_simon_dev = simon_model_custom.transform(pd.Series(text_dev).str.split(' '))
X_simon_test = simon_model_custom.transform(pd.Series(text_test).str.split(' '))
print('Done')

# join with external feats
# `.toarray()` returns a plain ndarray. `.todense()` returns an np.matrix,
# which np.concatenate propagates to its result and which recent scikit-learn
# versions refuse as estimator input.
X_simon_train = np.concatenate((X_simon_train, feat1_train.toarray(), feat2_train), axis=1)
X_simon_dev = np.concatenate((X_simon_dev, feat1_dev.toarray(), feat2_dev), axis=1)
X_simon_test = np.concatenate((X_simon_test, feat1_test.toarray(), feat2_test), axis=1)


print('Training classifier and predicting')
simon_pipe = simon_pipeline()
simon_pipe.fit(X_simon_train, dataset['train']['emotion'])
simon_preds_dev = simon_pipe.predict(X_simon_dev)
simon_preds_test = simon_pipe.predict(X_simon_test)
print('Done')

Training and predicting SIMON feats
Done
Training classifier and predicting
Done


In [79]:
# Persist the SIMON predictions for the dev and test splits, in that order.
for _fold, _preds in (('dev', simon_preds_dev), ('test', simon_preds_test)):
    save_preds(_preds, 'simon', _fold)

In [None]:
# Unbind the three SIMON feature matrices in one statement.
del X_simon_train, X_simon_dev, X_simon_test

## word2vec features

In [80]:
from gsitk.features.word2vec import Word2VecFeatures

# Feature extractor backed by the same embeddings loaded for SIMON.
w2v_transformer = Word2VecFeatures(model=embbeddings)

# Log message fixed: this cell computes word2vec features, not SIMON ones.
print('Training and predicting word2vec feats')
X_w2v_train = w2v_transformer.fit_transform(pd.Series(text_train).str.split(' '), dataset['train']['emotion'])
X_w2v_dev = w2v_transformer.transform(pd.Series(text_dev).str.split(' '))
X_w2v_test = w2v_transformer.transform(pd.Series(text_test).str.split(' '))
print('Done')

# join with external feats
# `.toarray()` returns a plain ndarray. `.todense()` returns an np.matrix,
# which np.concatenate propagates to its result and which recent scikit-learn
# versions refuse as estimator input.
X_w2v_train = np.concatenate((X_w2v_train, feat1_train.toarray(), feat2_train), axis=1)
X_w2v_dev = np.concatenate((X_w2v_dev, feat1_dev.toarray(), feat2_dev), axis=1)
X_w2v_test = np.concatenate((X_w2v_test, feat1_test.toarray(), feat2_test), axis=1)


print('Training classifier and predicting')
w2v_pipe = simon_pipeline()
w2v_pipe.fit(X_w2v_train, dataset['train']['emotion'])
w2v_preds_dev = w2v_pipe.predict(X_w2v_dev)
w2v_preds_test = w2v_pipe.predict(X_w2v_test)
print('Done')

save_preds(w2v_preds_dev, 'w2v', 'dev')
save_preds(w2v_preds_test, 'w2v', 'test')

Training and predicting SIMON feats
Done
Training classifier and predicting
Done


## TF-IDF

In [81]:
# TF-IDF over 1- and 2-grams feeding a cross-validated logistic regression.
tfidf_steps = [
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2))),
    ('lr', LogisticRegressionCV(cv=10, random_state=42, n_jobs=-1, solver='liblinear')),
]
tfidf_pipe = Pipeline(tfidf_steps)
tfidf_pipe.fit(text_train, dataset['train']['emotion'])

# Predict dev first, then test.
tfidf_preds_dev, tfidf_preds_test = [
    tfidf_pipe.predict(texts) for texts in (text_dev, text_test)
]

In [82]:
# Persist the TF-IDF predictions for the dev and test splits, in that order.
for _fold, _preds in (('dev', tfidf_preds_dev), ('test', tfidf_preds_test)):
    save_preds(_preds, 'tfidf', _fold)

## ngrams

In [83]:
# Raw 1- and 2-gram counts feeding a cross-validated logistic regression.
ngram_vectorizer = CountVectorizer(ngram_range=(1, 2))
ngram_classifier = LogisticRegressionCV(cv=10, random_state=42, n_jobs=-1, solver='liblinear')
ngram_pipe = Pipeline([('ngram', ngram_vectorizer), ('lr', ngram_classifier)])
ngram_pipe.fit(text_train, dataset['train']['emotion'])

ngram_preds_dev = ngram_pipe.predict(text_dev)
ngram_preds_test = ngram_pipe.predict(text_test)

# Persist the predictions for the dev and test splits, in that order.
for _fold, _preds in (('dev', ngram_preds_dev), ('test', ngram_preds_test)):
    save_preds(_preds, 'ngram', _fold)

## ngrams + feats

In [21]:
from scipy.sparse import hstack

In [84]:
# 1- and 2-gram counts; the vocabulary is fitted on the training texts only.
cv = CountVectorizer(ngram_range=(1, 2))
ngrams_train = cv.fit_transform(text_train)
ngrams_dev = cv.transform(text_dev)
ngrams_test = cv.transform(text_test)

# Append the encoded external columns (feat1_*, feat2_*) to the n-gram counts.
X_train = hstack([ngrams_train, feat1_train, feat2_train])
X_dev = hstack([ngrams_dev, feat1_dev, feat2_dev])
X_test = hstack([ngrams_test, feat1_test, feat2_test])


# Classifier-only pipeline: the features are already vectorised above.
ngramfeats_classifier = LogisticRegressionCV(cv=10, random_state=42, n_jobs=-1, solver='liblinear')
ngram_pipe = Pipeline([('lr', ngramfeats_classifier)])
ngram_pipe.fit(X_train, dataset['train']['emotion'])
ngram_preds_dev = ngram_pipe.predict(X_dev)
ngram_preds_test = ngram_pipe.predict(X_test)

# Persist the predictions for the dev and test splits, in that order.
for _fold, _preds in (('dev', ngram_preds_dev), ('test', ngram_preds_test)):
    save_preds(_preds, 'ngramfeats', _fold)

## MeaningCloud

In [35]:
import requests

# From JupyterHub
# NOTE(review): HOST is an empty string here, so ENDPOINT has no scheme or
# host; set it to the service base URL before calling `analyze`.
HOST = ''

# ENDPOINT for sentiment analysis
ENDPOINT = HOST + '/sentiment-2.1'

# The API key is read from a local file rather than hardcoded. Surrounding
# whitespace is stripped because key files usually end with a newline, which
# would otherwise be sent to the API as part of the key.
with open('meaningcloud.key') as f:
    KEY = f.read().strip()
    
def analyze(txt, lang='en', model=None, timeout=30):
    """POST one text to the sentiment endpoint and return the decoded JSON.

    Args:
        txt: Text to analyse.
        lang: Language code sent as the ``lang`` form field.
        model: Model name sent as the ``model`` form field; defaults to
            ``'general_' + lang`` when falsy.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        The response body parsed as JSON.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so
            failures surface here rather than as a decoding error or a
            missing key further down.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    model = model or 'general_' + lang
    res = requests.post(
        ENDPOINT,
        data={'key': KEY, 'txt': txt, 'lang': lang, 'model': model},
        timeout=timeout,
    )
    res.raise_for_status()
    return res.json()

In [36]:
from tqdm.notebook import tqdm  # replacement for the deprecated `tqdm.tqdm_notebook`

# Response fields kept per tweet, in the column order used downstream.
MEANINGCLOUD_FIELDS = ('score_tag', 'agreement', 'subjectivity', 'confidence', 'irony')


def meaningcloud_feats(tweets, lang='es'):
    """Query the sentiment API once per tweet and collect the kept fields.

    Args:
        tweets: Iterable of tweet texts.
        lang: Language code forwarded to ``analyze``.

    Returns:
        A list with one row per tweet; each row lists the values of
        ``MEANINGCLOUD_FIELDS`` taken from the API response.
    """
    rows = []
    for instance in tqdm(tweets):
        resp = analyze(instance, lang=lang)
        rows.append([resp[field] for field in MEANINGCLOUD_FIELDS])
    return rows


meaningcloud_train = meaningcloud_feats(dataset['train']['tweet'])
meaningcloud_dev = meaningcloud_feats(dataset['validation']['tweet'])
# The test tweets here come from the 'test' split of the `load_dataset` object.
meaningcloud_test = meaningcloud_feats(test['test']['tweet'])
    

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for instance in tqdm_notebook(dataset['train']['tweet']):


  0%|          | 0/5723 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for instance in tqdm_notebook(dataset['validation']['tweet']):


  0%|          | 0/844 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for instance in tqdm_notebook(test['test']['tweet']):


  0%|          | 0/1626 [00:00<?, ?it/s]

In [86]:
from tqdm.notebook import tqdm  # replacement for the deprecated `tqdm.tqdm_notebook`

# Recompute the test rows from `_test`, the pandas DataFrame parsed with
# quoting disabled. `test` comes from `load_dataset` and is indexed by split
# name, so `test['tweet']` does not address the tweet column.
meaningcloud_test = []
for instance in tqdm(_test['tweet']):
    resp = analyze(instance, lang='es')
    meaningcloud_test.append([resp['score_tag'], resp['agreement'], resp['subjectivity'],
            resp['confidence'], resp['irony']])
    

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for instance in tqdm_notebook(test['tweet']):


  0%|          | 0/1656 [00:00<?, ?it/s]

In [87]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

# Column layout of each MeaningCloud row:
#   0 = score_tag, 1 = agreement, 2 = subjectivity, 3 = confidence, 4 = irony
mc_train = pd.DataFrame(meaningcloud_train)
mc_dev = pd.DataFrame(meaningcloud_dev)
mc_test = pd.DataFrame(meaningcloud_test)

ohe = OneHotEncoder()
oe = OrdinalEncoder()

# Both encoders are fitted on the training split ONLY. Re-fitting the ordinal
# encoder on dev/test could map the same category to different integers in
# different splits, so dev and test use `transform`.
# `.toarray()` yields a plain ndarray where `.todense()` yields an np.matrix.
sent_train = ohe.fit_transform(mc_train[0].values.reshape(-1, 1))
others_train = oe.fit_transform(mc_train[[1, 2, 4]].values)
# Confidence is divided by 100 after the integer cast.
confidence_train = mc_train[3].values.astype(int).reshape(-1, 1) / 100
X_meaningcloud_train = np.concatenate((sent_train.toarray(), others_train, confidence_train), axis=1)

sent_dev = ohe.transform(mc_dev[0].values.reshape(-1, 1))
others_dev = oe.transform(mc_dev[[1, 2, 4]].values)
confidence_dev = mc_dev[3].values.astype(int).reshape(-1, 1) / 100
X_meaningcloud_dev = np.concatenate((sent_dev.toarray(), others_dev, confidence_dev), axis=1)


sent_test = ohe.transform(mc_test[0].values.reshape(-1, 1))
others_test = oe.transform(mc_test[[1, 2, 4]].values)
confidence_test = mc_test[3].values.astype(int).reshape(-1, 1) / 100
X_meaningcloud_test = np.concatenate((sent_test.toarray(), others_test, confidence_test), axis=1)

In [88]:
def save_feats(obj, name, fold):
    """Pickle ``obj`` to ``feats/<name>_<fold>.pck``.

    Args:
        obj: Any picklable object (in this notebook, a feature matrix).
        name: Identifier of the feature set; first part of the file name.
        fold: Data split name; second part of the file name.

    The ``feats`` directory is created when missing, so the call also works
    the first time the notebook is run in a clean working directory.
    """
    import os  # local import keeps this cell self-contained

    path = 'feats/{}_{}.pck'.format(name, fold)
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(obj, f)
        
# Persist the MeaningCloud feature matrices: train, then dev, then test.
for _fold, _feats in (
    ('train', X_meaningcloud_train),
    ('dev', X_meaningcloud_dev),
    ('test', X_meaningcloud_test),
):
    save_feats(_feats, 'meaningcloud', _fold)