In [None]:
# Enable the IPython autoreload extension in mode 2 so that edited modules
# (e.g. the project's `lib` package) are re-imported before each cell runs.
%load_ext autoreload
%autoreload 2

In [None]:
# `display` and `HTML` are part of the public IPython.display API; importing
# them from IPython.core.display has been deprecated since IPython 7.14.
from IPython.display import display, HTML
import pandas as pd

# Widen the notebook's `.container` element to 95% of the window.
display(HTML("<style>.container { width: 95% !important; }</style>"))
# Do not truncate long cell contents when rendering DataFrames.
pd.set_option('display.max_colwidth', None)

In [None]:
import sys
import os
import re
import numpy as np
import pandas as pd

# from pathlib import Path
from tqdm.notebook import tqdm

In [None]:
from lib import util, embeds, fitter, masker, features, sentiment, translate

In [None]:
# FIXME: add lines for git clone SemEval2022 and Stitch data

In [None]:
# Relative location of the SubTaskA data directory; handed to
# util.load_csv_dataframes in the next code cell.
datapath = '/'.join(['SemEval_2022_Task2-idiomaticity', 'SubTaskA', 'Data'])

Load all the CSV files into dataframes.

In [None]:
# Mapping from CSV file name (e.g. 'dev.csv') to the loaded frame; the keys
# are used below to pick out the individual splits.
frames = util.load_csv_dataframes(datapath)

In [None]:
# Show which CSV files were loaded.
frames.keys()

In [None]:
# Pull the individual splits out of `frames` by file name:
#   zdf      - zero-shot training data
#   odf      - one-shot training data
#   ddf      - dev data
#   ddf_gold - dev gold labels
#   edf      - eval data
zdf, odf, ddf, ddf_gold, edf = (
    frames[csv_name]
    for csv_name in (
        'train_zero_shot.csv',
        'train_one_shot.csv',
        'dev.csv',
        'dev_gold.csv',
        'eval.csv',
    )
)

In [None]:
# Embeddings of the zero-shot training frame with the default model of
# embeds.get_embeddings: once as-is, once with append=['MWE'].
# NOTE(review): neither result is referenced again in the visible cells.
z_emb = embeds.get_embeddings(zdf)
z_emb_i = embeds.get_embeddings(zdf, append=['MWE'])

In [None]:
# Model name passed as `modelname` to embeds.get_embeddings in the cells below.
multilingual_model = 'distiluse-base-multilingual-cased-v1'

### Sentence transformers embeddings

Get sentence-transformers embeddings with the best method (appending MWE to the text, ignoring context).

Calling this the "best" method is not entirely accurate: the original paper applies the "idiomatic principle" to encode the MWE, that is, it treats the MWE as a single token when tokenizing.

In [None]:
# Training (zero-shot) embeddings from the multilingual model, with
# append=['MWE'].
z_emb_multi = embeds.get_embeddings(zdf, modelname=multilingual_model, append=['MWE'])

In [None]:
# Dev embeddings built with the same model and options as the training ones.
d_emb_multi = embeds.get_embeddings(ddf, modelname=multilingual_model, append=['MWE'])

Do a fitting for the embeddings with Logistic Regression.

In [None]:
# Fit on the zero-shot embeddings/labels and evaluate against the dev
# embeddings and dev gold labels.
z_score, z_probs, z_results = fitter.get_fit_results(
    z_emb_multi,
    zdf['Label'],
    d_emb_multi,
    ddf_gold['Label'],
)

In [None]:
# Score returned by the embedding-only fit above.
z_score

In [None]:
# Combine the dev frame, the fit results and the gold frame via
# fitter.add_results (resulting columns are defined in lib.fitter).
dres = fitter.add_results(ddf, z_results, ddf_gold)

In [None]:
# Inspect the combined dev results.
dres

In [None]:
# Aggregated counts derived from `dres`; the next cell filters this frame on
# its 'Pct correct' column and sorts it by 'Language' and 'MWE'.
dres_counts = util.get_counts(dres)

Show the MWEs that the model gets wrong more than half of the time. Are there any patterns?

In [None]:
# Rows whose 'Pct correct' is below 0.5, ordered by language and MWE.
(
    dres_counts[dres_counts['Pct correct'].lt(0.5)]
    .sort_values(by=['Language', 'MWE'])
)

### Mask filling

Get several features based on mask-filling pipeline.

Rationale: It should be more difficult to get mask filling to work when the MWE is idiomatic.

There are three ways to do mask filling for the MWE:
- replace the whole expression: banana republic -\> \<mask\>
- replace the first term: \<mask\> republic
- replace the second term: banana \<mask\>

The mask filling generates several features:
- Hassub: whether a top-5 term is found in the MWE (exactly)
- Top score: the confidence score of the top term
- Short/FS/SS: Number of "Short" terms (less than three characters) in whole mask vs first term replacement vs second term replacement, respectively

Additionally, the top terms are recorded into Top score columns.

The Top score is only recorded for an "acceptable" term (at least three characters and no non-word characters).

In [None]:
# Mask-filling features for the zero-shot training frame.
zdf_masked = masker.get_masked_features(zdf)

In [None]:
# Non-null 'DataID' counts per Language / Label / Hassub combination.
(
    zdf_masked
    .groupby(['Language', 'Label', 'Hassub'])['DataID']
    .count()
)

In [None]:
# Mask-filling features for the dev frame.
ddf_masked = masker.get_masked_features(ddf)

In [None]:
# Browse a window (slice 535:600) of the dev rows where 'Hassub' is False.
ddf_masked[
    ddf_masked['Hassub'].eq(False)
][535:600]

In [None]:
# Non-null 'ID' counts per Language / Hassub combination in the dev frame.
(
    ddf_masked
    .groupby(['Language', 'Hassub'])['ID']
    .count()
)

In [None]:
# Sample sentences used in the cells below to exercise masker.replacer2 and
# masker.replace_mask_token with an MWE whose spelling differs from the text.
str_prob = (
    'Além de ter sido um fracasso de bilheteria e crítica, o filme acabou '
    'marcado pelos seus efeitos especiais, principalmente ao antropomorfizar '
    'os gatos, que, bem, ficam um pouco bisonhos.'
)
str_prob_2 = (
    'Professor livre docente da Unesp, Fortaleza é presidente da Sociedade '
    'Paulista de Infectologia e membro do Comitê de Contingência da COVID-19, '
    'do Governo do Estado de São Paulo.'
)
str_prob_3 = (
    'Com a segurança da imunização em massa e os números traduzindo sua '
    'eficácia, fica mais fácil para o americano médio sentir-se confiante em '
    'marcar sua próxima viagem, gerando um circulo virtuoso para o setor nos '
    'próximos meses.'
)


In [None]:
# The MWE is spelled 'círculo' (accented) while the sentence has 'circulo'.
masker.replacer2(str_prob_3, 'círculo virtuoso', '<mask>', ' ')

In [None]:
# The MWE is singular ('efeito especial') while the sentence has the plural
# 'efeitos especiais'.
masker.replacer2(str_prob, 'efeito especial', 'efeito <mask>', ' ')

In [None]:
# The MWE is hyphenated ('livre-docente') while the sentence has
# 'livre docente'.
masker.replace_mask_token(str_prob_2, 'livre-docente', 'livre-<mask>')

In [None]:
# Same singular/plural mismatch, masking only the second term; the extra
# fourth argument is '<mask>' (its role is defined in lib.masker).
masker.replace_mask_token(str_prob, 'efeito especial', 'efeito <mask>', '<mask>')

In [None]:
# Same singular/plural mismatch, masking the whole expression.
masker.replace_mask_token(str_prob, 'efeito especial', '<mask>')

### Boolean features

Get features: Caps and Quotes.

Rationale:
- MWEs in Caps (Banana Republic vs banana republic) are more likely to be a proper noun (PN)
- Quoted MWEs are more likely to be idiomatic

In [None]:
# Add the boolean features on top of the mask-filling features (training);
# the next cell groups on the resulting 'Caps' column.
zdf_masked_feats = features.get_features(zdf_masked)

In [None]:
# Non-null 'DataID' counts per Language / Label / Caps combination.
(
    zdf_masked_feats
    .groupby(['Language', 'Label', 'Caps'])['DataID']
    .count()
)

In [None]:
# Add the boolean features on top of the mask-filling features (dev).
ddf_masked_feats = features.get_features(ddf_masked)

In [None]:
# dir(sentiment)

### Sentiment classifier

Rationale: idiomatic expressions are more likely to be affective (positive or negative).

Neutral sentiment probability is used as a proxy for literality.

In [None]:
# Load the sentiment classifier together with its tokenizer and config; all
# three are passed to every sentiment.* call below.
(
    sentiment_classifier,
    sentiment_tokenizer,
    sentiment_config,
) = sentiment.get_classifier_tokenizer()

In [None]:
# Spot check: classify the 'Target' text of the first dev row.
sentiment.get_sentiment(
    ddf_masked_feats['Target'].values[0],
    sentiment_classifier,
    sentiment_tokenizer,
    sentiment_config,
)

In [None]:
# Sentiment features for the whole dev frame.
ddf_masked_feats_sent = sentiment.get_df_sentiments(
    ddf_masked_feats,
    sentiment_classifier,
    sentiment_tokenizer,
    sentiment_config,
)

In [None]:
# Sentiment features for the whole zero-shot training frame.
zdf_masked_feats_sent = sentiment.get_df_sentiments(
    zdf_masked_feats,
    sentiment_classifier,
    sentiment_tokenizer,
    sentiment_config,
)

In [None]:
# Per-column mean of the features for training rows labelled '0'.
# numeric_only=True is required: the frame also holds text columns (e.g.
# 'Language', 'Target', string 'Label'), on which a plain .mean() raises
# TypeError in pandas >= 2.0.
zdf_masked_feats_sent[zdf_masked_feats_sent['Label'] == '0'].mean(numeric_only=True)

In [None]:
# Per-column mean of the features for training rows labelled '1'.
# numeric_only=True is required: the frame also holds text columns (e.g.
# 'Language', 'Target', string 'Label'), on which a plain .mean() raises
# TypeError in pandas >= 2.0.
zdf_masked_feats_sent[zdf_masked_feats_sent['Label'] == '1'].mean(numeric_only=True)

In [None]:
# Two translation models and their two tokenizers, consumed by
# translate.backtranslate below.
# NOTE(review): which language direction each of the pair covers is decided
# in lib.translate - confirm there.
btmodel1, btmodel2, bttoken1, bttoken2 = translate.get_marian_models()

### Backtranslation

Translate text from English to Portuguese and back (and vice versa if the source language is Portuguese).

Rationale: the expression is more likely to be idiomatic if it is not found in the backtranslation.

In [None]:
# Backtranslate the training frame with batch_len=10.
zdf_bt = translate.backtranslate(
    zdf_masked_feats_sent,
    btmodel1,
    btmodel2,
    bttoken1,
    bttoken2,
    batch_len=10,
)

In [None]:
# Backtranslate the dev frame with batch_len=10.
ddf_bt = translate.backtranslate(
    ddf_masked_feats_sent,
    btmodel1,
    btmodel2,
    bttoken1,
    bttoken2,
    batch_len=10,
)

In [None]:
# zdf_bt.sort_values(by="BT", key=lambda x: x.str.len())

In [None]:
# Derive the 'Trans' column (grouped on in the cells below) for the
# backtranslated training frame.
zdf_bt2 = translate.record_trans(zdf_bt)

In [None]:
# Derive the 'Trans' column (grouped on in the cells below) for the
# backtranslated dev frame.
ddf_bt2 = translate.record_trans(ddf_bt)

In [None]:
# Non-null 'DataID' counts per Language / Label / Trans combination.
(
    zdf_bt2
    .groupby(['Language', 'Label', 'Trans'])['DataID']
    .count()
)

In [None]:
# Non-null 'ID' counts per Language / Trans combination in the dev frame.
(
    ddf_bt2
    .groupby(['Language', 'Trans'])['ID']
    .count()
)

In [None]:
# zdf_bt3 = translate.record_trans(zdf_bt)
# zdf_bt4 = translate.record_trans(zdf_bt)
# zdf_bt_3 = translate.record_trans(zdf_bt[zdf_bt['MWE'] == 'círculo virtuoso'])
# zdf_bt3.groupby(['Language','Label','Trans'])['DataID'].count()
# zdf_bt3[zdf_bt3['Trans'] != zdf_bt2['Trans']]
# zdf_bt4[zdf_bt3['Trans'] != zdf_bt4['Trans']]
# zdf_bt_3[(zdf_bt_3['Trans'] == False) & (zdf_bt_3['MWE'] == 'círculo virtuoso')][['Target', 'Label', 'MWE', 'BT', 'Trans']]

In [None]:
# Per-column mean of all features for training rows labelled '0'.
# numeric_only=True is required: the frame also holds text columns (e.g.
# 'Language', 'Target', string 'Label'), on which a plain .mean() raises
# TypeError in pandas >= 2.0.
zdf_bt2[zdf_bt2['Label'] == '0'].mean(numeric_only=True)

In [None]:
# Per-column mean of all features for training rows labelled '1'.
# numeric_only=True is required: the frame also holds text columns (e.g.
# 'Language', 'Target', string 'Label'), on which a plain .mean() raises
# TypeError in pandas >= 2.0.
zdf_bt2[zdf_bt2['Label'] == '1'].mean(numeric_only=True)

In [None]:
# Refactoring
# FIXME: Combined classifier:
# - get_trainable: ok
# - multi_results
# - check_feats: ok
# FIXME: Previous/next/target code; get_cosine_diff

In [None]:
# New features
# FIXME: "idiomatic" feature vs "literal" feature (Trans vs Caps/Hassub)
# FIXME: Save results to disk
# FIXME: run feature extractors externally
# Translation: handle en vs non-en

In [None]:
# Training feature set handed to the fitter below, built from the fully
# featurised training frame.
zdf_t = fitter.get_trainable(zdf_bt2)

In [None]:
# Dev feature set handed to the fitter below, built from the fully
# featurised dev frame.
ddf_t = fitter.get_trainable(ddf_bt2)

In [None]:
# Fit on the hand-crafted training features and evaluate on the dev features
# against the dev gold labels.
ddf_feat_score, ddf_feat_probs, ddf_feat_results = fitter.get_fit_results(
    zdf_t,
    zdf['Label'],
    ddf_t,
    ddf_gold['Label'],
)

In [None]:
# Feature check with the fitter's default method; the result has a 'Score'
# column that the next cell sorts on.
ff = fitter.check_feats(
    zdf_t,
    zdf['Label'],
    ddf_t,
    ddf_gold['Label'],
)

In [None]:
# Lowest score first (ascending is the sort_values default).
ff.sort_values(by='Score')

In [None]:
# ff[ff['Sentiment']].sort_values(by=['Score'], ascending=True)[-20:]

In [None]:
# Same feature check with method="rf".
# NOTE(review): presumably a random-forest fit - confirm in lib.fitter.
ff_rf = fitter.check_feats(
    zdf_t,
    zdf['Label'],
    ddf_t,
    ddf_gold['Label'],
    method="rf",
)

In [None]:
# Lowest score first.
ff_rf.sort_values(by='Score', ascending=True)