In [None]:
!pip install transformers datasets sacremoses accelerate -q

In [None]:
%cd <path_to_running_code>

In [None]:
# Hyperparameters shared by every fine-tuning run in this notebook.
train_batch_size = 32  # passed to TrainingArguments as per_device_train_batch_size
lr = 1e-05  # passed to TrainingArguments as learning_rate
max_seq_len = 256  # passed to CFDataset as max_token_len (padding/truncation length)
seed = 42  # used by set_seed, TrainingArguments and DataFrame.sample

In [None]:
## Root directories of the three corpora; fill these in before running.
## The file paths below are built as f'{root}/...'.
amcd_path = ''
semeval_path = ''
trcd_path = ''

# dataset name -> split name ('train' / 'eval' / 'test') -> file path.
# The dataset names are the keys used throughout the notebook
# (data_field_dict, the train/test dataset filters).
data_path_dict = {
  'amz_en': {
      'train': f'{amcd_path}/data/EN-ext_train.tsv',
      'eval': f'{amcd_path}/data/EN-ext_valid.tsv',
      'test': f'{amcd_path}/data/EN-ext_test.tsv'
  },
  'amz_de': {
      'train': f'{amcd_path}/data/DE_train.tsv',
      'eval': f'{amcd_path}/data/DE_valid.tsv',
      'test': f'{amcd_path}/data/DE_test.tsv'
  },
  'amz_jp': {
      'train': f'{amcd_path}/data/JP_train.tsv',
      'eval': f'{amcd_path}/data/JP_valid.tsv',
      'test': f'{amcd_path}/data/JP_test.tsv'
  },
    'semeval': {
      'train': f'{semeval_path}/Subtask-1/subtask1_train_train.csv', # 90% of original train
      'eval': f'{semeval_path}/Subtask-1/subtask1_train_eval.csv', # presumably the remaining 10% of the original train set (file is named train_eval) - TODO confirm
      'test': f'{semeval_path}/Subtask-1/subtask1_test.csv'
  },
    'tr': {
      'train': f'{trcd_path}/train.csv',
      'eval': f'{trcd_path}/valid.csv',
      'test': f'{trcd_path}/test.csv'
  }
}

In [None]:
import pandas as pd

# dataset name -> names of the raw columns holding the sentence ('text') and
# the gold label ('label'). normalize_df uses this mapping to select those
# columns and rename them to the common names 'text' and 'label'.
data_field_dict = {
  'amz_en': {
    'text': 'sentence',
    'label': 'is_counterfactual'
  },
    'amz_de': {
    'text': 'sentence',
    'label': 'is_counterfactual'
  },
    'amz_jp': {
    'text': 'sentence',
    'label': 'is_counterfactual'
  },
  'semeval': {
    'text': 'sentence',
    'label': 'gold_label'
  },
  'tr': {
    'text': 'sentence',
    'label': 'label'
  }
}

# Turkish clue-word tag -> regular expression for its surface forms.
# mask_cw_tr looks up each row's 'CW' value in this mapping and substitutes
# every match of the resulting pattern with the mask token.
# Every alternative is guarded by the look-behind (?<=\w), i.e. it must be
# preceded by a word character; the leading (?i) makes the match
# case-insensitive.
# NOTE(review): the patterns are not anchored to the end of a word, so they
# can also match inside a word stem - confirm this is intended.
tr_cw2re_dict = {'-mAlIydI': '(?i)(?<=\\w)malıydı|(?<=\\w)meliydi',
 '-sA': '(?i)(?<=\\w)sa|(?<=\\w)se',
 '-sAlArDI': '(?i)(?<=\\w)salardı|(?<=\\w)selerdi',
 '-AydI': '(?i)(?<=\\w)aydı|(?<=\\w)eydi|(?<=\\w)saydı|(?<=\\w)seydi',
 '-AymIş': '(?i)(?<=\\w)aymış|(?<=\\w)eymiş|(?<=\\w)saymış|(?<=\\w)seymiş',
 '-ArdI': '(?i)(?<=\\w)ardı|(?<=\\w)erdi|(?<=\\w)ırdı|(?<=\\w)irdi|(?<=\\w)urdu|(?<=\\w)ürdü',
 '-AcAkDI': '(?i)(?<=\\w)acaktı|(?<=\\w)ecekti',
 '-AmAz-DI': '(?i)(?<=\\w)amazdı|(?<=\\w)emezdi',
 '-mAz-DI': '(?i)(?<=\\w)mazdı|(?<=\\w)mezdi',
 '-Abil-Ar-DI': '(?i)(?<=\\w)abilirdi|(?<=\\w)ebilirdi'}

def get_csv_tsv_data(data_path):
  """Read a delimited text file into a DataFrame.

  The file is parsed as tab-separated when its *file name* contains '.tsv'
  (case-insensitive); otherwise pandas' default comma separator is used.
  Only the last path component is inspected, so a directory whose name
  happens to contain '.tsv' no longer turns a CSV file into a TSV read.

  Args:
    data_path: path of the file to load (str or os.PathLike).

  Returns:
    pandas.DataFrame with the file contents.
  """
  import os  # local import: this cell runs before the notebook imports os

  file_name = os.path.basename(str(data_path)).lower()
  if '.tsv' in file_name:
    return pd.read_csv(data_path, sep='\t')
  return pd.read_csv(data_path)

In [None]:
import os
def get_clue_words(lang):
  """Load the counterfactual clue words of one language.

  Reads `<amcd_path>/clue_words/counterfactual_clue_words_<lang>.txt`, one
  clue word per line. The path is assembled with os.path.join so it resolves
  correctly whether or not `amcd_path` ends with a path separator.

  Args:
    lang: language code used in the file name (callers in this notebook pass
      'en' or the suffix of the dataset name).

  Returns:
    List of clue words with surrounding whitespace removed. Blank lines are
    dropped: mask_cw uses each entry as a regex, and an empty pattern would
    match at every position of a sentence.
  """
  clue_path = os.path.join(amcd_path, 'clue_words', f'counterfactual_clue_words_{lang}.txt')
  # NOTE(review): assumes the clue-word files are UTF-8 encoded - confirm.
  with open(clue_path, encoding='utf-8') as f:
      stripped_lines = (line.strip() for line in f)
      return [clue_word for clue_word in stripped_lines if clue_word]

In [None]:
import re

def mask_cw(df, clue_words, text_field, mask_token):
  """Replace every clue-word match in `df[text_field]` with `mask_token`.

  Each clue word is used as a regular expression and the substitutions are
  applied in the order given, each one on the output of the previous.

  Args:
    df: DataFrame holding the sentences; modified in place.
    clue_words: iterable of regex pattern strings.
    text_field: name of the column with the sentences to mask.
    mask_token: replacement string (interpreted as an `re.sub` template).

  Returns:
    The same `df`, with the masked sentences stored in a new/overwritten
    'masked_text' column.
  """
  # Compile once instead of once per sentence. Empty patterns are skipped:
  # they match at every position and would scatter the mask token between
  # all characters of the sentence.
  patterns = [re.compile(fr'{clue_word}') for clue_word in clue_words if clue_word]
  masked_sentences = []
  for text in df[text_field].values:
    for pattern in patterns:
      text = pattern.sub(mask_token, text)
    masked_sentences.append(text)
  df['masked_text'] = masked_sentences
  return df

def mask_cw_tr(df, clue_words, mask_token, text_field='text'):
  """Mask the Turkish clue word of each row with `mask_token`.

  For every row, the value of its 'CW' column is looked up in `clue_words`
  to obtain a regex; all matches of that regex in the row's sentence are
  replaced. Rows whose 'CW' value has no (non-empty) pattern are copied
  unchanged. Previously a missing entry was formatted into the literal
  pattern 'None', which masked any occurrence of "None" in the sentence.

  Args:
    df: DataFrame with a 'CW' column and a sentence column; modified in place.
    clue_words: mapping from 'CW' value to regex pattern string.
    mask_token: replacement string (interpreted as an `re.sub` template).
    text_field: name of the sentence column. Defaults to 'text', the column
      this function has always read.

  Returns:
    The same `df`, with the result stored in the 'masked_text' column.
  """
  def helper_mask_cw_tr(row, mapping):
    pattern = mapping.get(row['CW'])
    if not pattern:
      # unknown / missing clue-word tag: nothing to mask
      return row[text_field]
    return re.sub(pattern, mask_token, row[text_field])
  df['masked_text'] = df.apply(lambda x: helper_mask_cw_tr(x, clue_words), axis=1)
  return df

In [None]:
def normalize_df(df, field_dict, masking=False):
  """Project a raw dataset frame onto the common 'text' / 'label' schema.

  Args:
    df: raw DataFrame of one dataset.
    field_dict: mapping with the raw column names under the keys 'text'
      and 'label'.
    masking: when truthy, the 'masked_text' column is kept as well and
      placed first.

  Returns:
    A new DataFrame containing only the selected columns, with the raw text
    and label columns renamed to 'text' and 'label'.
  """
  source_text, source_label = field_dict['text'], field_dict['label']
  leading = ['masked_text'] if masking else []
  wanted = leading + [source_text, source_label]
  renames = {source_text: 'text', source_label: 'label'}
  return df[wanted].rename(columns=renames)
def get_train_df(mapping, masking_strategy, mask_token, test_df_len=None):
  """Build the training frame from the selected datasets.

  Args:
    mapping: dict of dataset name -> flag; the train split of every dataset
      whose flag is truthy is loaded and concatenated.
    masking_strategy: when truthy, clue words are masked (mask_cw_tr for
      'tr', mask_cw with the language's clue-word list otherwise) and the
      'masked_text' column is kept.
    mask_token: replacement string handed to the masking functions.
    test_df_len: optional number of rows to draw (without replacement,
      seeded with `seed`). It is capped at the number of available rows:
      asking `DataFrame.sample` for more rows than exist raises ValueError,
      which happened whenever the reference dataset was larger than the
      selected training data.

  Returns:
    DataFrame with columns 'text' and 'label' (plus 'masked_text' first when
    masking is on).

  Raises:
    ValueError: if no dataset is selected in `mapping`.
  """
  dfs = []
  for name, mask in mapping.items():
    if not mask:
      continue
    df = get_csv_tsv_data(data_path_dict[name]['train'])
    if masking_strategy:
      if name == 'tr':
        # NOTE(review): mask_cw_tr reads a 'text' column while
        # data_field_dict['tr'] names 'sentence' - confirm the TR file has both.
        df = mask_cw_tr(df, tr_cw2re_dict, mask_token)
      else:
        # SemEval is English; Amazon dataset names end in their language code
        lang = 'en' if name == 'semeval' else name.split('_')[-1]
        df = mask_cw(df, get_clue_words(lang), data_field_dict[name]['text'], mask_token)
    dfs.append(normalize_df(df, data_field_dict[name], masking_strategy))
  if not dfs:
    raise ValueError('No training dataset selected: at least one entry of `mapping` must be truthy')
  train_df = pd.concat(dfs)
  if test_df_len:
    return train_df.sample(min(test_df_len, len(train_df)), random_state=seed)
  return train_df

def get_eval_df(mapping, test_df_len=None):
  """Build the validation frame from the selected datasets.

  Args:
    mapping: dict of dataset name -> flag; the eval split of every dataset
      whose flag is truthy is loaded and concatenated.
    test_df_len: optional number of rows to draw (without replacement,
      seeded with `seed`). It is capped at the number of available rows,
      because `DataFrame.sample` raises ValueError when asked for more rows
      than exist.

  Returns:
    DataFrame with the columns 'text' and 'label'.

  Raises:
    ValueError: if no dataset is selected in `mapping`.
  """
  dfs = [
      normalize_df(get_csv_tsv_data(data_path_dict[name]['eval']), data_field_dict[name])
      for name, mask in mapping.items()
      if mask
  ]
  if not dfs:
    raise ValueError('No evaluation dataset selected: at least one entry of `mapping` must be truthy')
  eval_df = pd.concat(dfs)
  if test_df_len:
    return eval_df.sample(min(test_df_len, len(eval_df)), random_state=seed)
  return eval_df

def get_test_df(name):
  """Load the test split of dataset `name` in the common schema.

  Args:
    name: dataset key of data_path_dict / data_field_dict.

  Returns:
    DataFrame with the columns 'text' and 'label'.
  """
  raw_test_df = get_csv_tsv_data(data_path_dict[name]['test'])
  return normalize_df(raw_test_df, data_field_dict[name])

In [None]:
from torch.utils.data import Dataset
import torch
from typing import List

class CFDataset(Dataset):
    """Torch dataset that tokenizes one DataFrame row per item.

    Args:
        data: frame holding the sentences and their labels.
        tokenizer: callable tokenizer (e.g. a Hugging Face tokenizer) that
            accepts the keyword arguments used in `__getitem__` and returns a
            mapping with 'input_ids' and 'attention_mask' tensors.
        labels: names of the label column(s) read from each row.
        max_token_len: length every sequence is padded / truncated to.
        text_field: name of the column holding the sentence.
    """

    def __init__(
            self,
            data: pd.DataFrame,
            tokenizer,
            labels: List[str],
            max_token_len: int = 256,
            text_field: str = 'text',
    ):
        self.tokenizer = tokenizer
        self.data = data
        self.max_token_len = max_token_len
        self.labels = labels
        self.text_field = text_field

    def __len__(self) -> int:
        """Number of rows in the underlying frame."""
        return len(self.data)

    def __getitem__(self, index: int) -> dict:
        """Return the tokenized sentence and label tensor of row `index`.

        The row is addressed by position (`iloc`), so duplicated index labels
        in `data` are harmless.

        Returns:
            dict with the raw 'text', 1-D 'input_ids' and 'attention_mask'
            tensors, and 'labels' as a long tensor.
        """
        data_row = self.data.iloc[index]

        text = data_row[self.text_field]
        labels = data_row[self.labels]
        if isinstance(labels, pd.Series):
            # Hand torch a plain list: building a tensor straight from a
            # Series makes torch index it with integers, which relies on the
            # positional fallback of Series.__getitem__ that pandas deprecated.
            labels = labels.tolist()

        # Calling the tokenizer is the supported replacement for the
        # deprecated `encode_plus`; the arguments are unchanged.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_token_len,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return dict(
            text=text,
            # drop the batch dimension added by return_tensors='pt'
            input_ids=encoding["input_ids"].flatten(),
            attention_mask=encoding["attention_mask"].flatten(),
            labels=torch.tensor(labels, dtype=torch.long)
        )

In [None]:
import itertools
# Names of the corpora that can be combined into a training set.
options = ['amz_en', 'amz_de', 'amz_jp', 'semeval', 'tr']
# Sizes of the dataset combinations to train on. Only single-dataset runs are
# enabled; add 2, 3, 4 or 5 to also train on mixtures of that many datasets.
combination_sizes = [1]
# One list of dataset names per training configuration.
train_datasets = [
    list(subset)
    for size in combination_sizes
    for subset in itertools.combinations(options, size)
]

In [None]:
""" USED MODELS

bert-base-multilingual-uncased
xlm-roberta-base
dbmdz/bert-base-turkish-cased
"""

import json
import numpy as np
from datasets import load_metric
from sklearn.metrics import recall_score
from transformers import EarlyStoppingCallback
from transformers import Trainer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import matthews_corrcoef, accuracy_score, f1_score
import seaborn as sn
import matplotlib.pyplot as plt
import gc
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, AutoConfig, set_seed

# Seed the random number generators through transformers' set_seed.
set_seed(seed)

# Both values are interpolated into the output directory of every run.
_EXPERIMENT_NAME = 'demo'
_MODEL_SUFFIX_SUFFIX = ''

EARLY_STOP_PATIENCE = 1  # early_stopping_patience of EarlyStoppingCallback
EPOCH_N = 50  # num_train_epochs upper bound; early stopping may end sooner

# Experiment grid: every combination of the values below is run.
_MASKS = [False, True]  # train without / with clue-word masking
_MODEL_PATHS = [
    'bert-base-multilingual-uncased',
    'xlm-roberta-base',
    'dbmdz/bert-base-turkish-cased'
    ]
_TRAIN_DATASETS = train_datasets  # lists of dataset names built in the previous cell

# Datasets whose test split each configuration is evaluated on.
test_data_filter = [
    'amz_en',
    'amz_de',
    'amz_jp',
    'semeval',
    'tr',
]
# When False, a run whose result directory already exists is skipped.
_OVERWRITE = True


## Experiment grid: one fine-tuning + evaluation run for every
## (train datasets, masking, model, test dataset) combination.
i = 0
# loop-invariant, so it is computed once instead of on every iteration
total = len(_MASKS) * len(_MODEL_PATHS) * len(_TRAIN_DATASETS) * len(test_data_filter)


def compute_metrics(eval_pred):
    """Binary F1 of the argmax predictions, reported during evaluation.

    Uses the already imported sklearn `f1_score` instead of re-loading the
    `datasets` F1 metric on every run.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # CFDataset yields one label per row as a length-1 vector -> flatten
    return {'f1': f1_score(np.ravel(labels), predictions)}


for TRAIN_DATASETS in _TRAIN_DATASETS:
  for MASK in _MASKS:
    for MODEL_PATH in _MODEL_PATHS:
      # The body below belongs to this innermost loop; in the original it was
      # not indented under it, which made the whole cell an IndentationError.
      for test_data_name in test_data_filter:

        i += 1
        model_name = MODEL_PATH.split('/')[-1]
        train_datasets_name = '-'.join(TRAIN_DATASETS)
        model_suffix = f'mask={MASK}_train={train_datasets_name}{_MODEL_SUFFIX_SUFFIX}'

        run_name = f"{model_name} {model_suffix}"

        run_dir = f"./thesis/{_EXPERIMENT_NAME}/best_trHP/model={model_name}_{model_suffix}"
        result_path = os.path.join(run_dir, f'results/{test_data_name}')

        # Skip / overwrite is decided per test dataset (the original always
        # looked at 'results/tr', whatever dataset was being tested).
        if os.path.exists(result_path):
          if _OVERWRITE:
            print(f'\n----->>>>> OVERWRITING: {run_name}\n')
          else:
            print(f'\n----->>>>> SKIPPING: {run_name}\n')
            continue
        print(f'--> RUNNING (test: {test_data_name}) : {run_name}')


        tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
        model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)

        train_data_filter = {
            'amz_en': False,
            'amz_de': False,
            'amz_jp': False,
            'semeval': False,
            'tr': False
        }
        # switch on the datasets of this training configuration
        for name in TRAIN_DATASETS:
          train_data_filter[name] = True

        # size of the test dataset's own train split, used as the target
        # size when sampling the actual training data
        test_train_data_len = len(get_train_df({test_data_name: True}, False, tokenizer.pad_token))
        # actual training data, sampled to that size; the pad token serves
        # as the mask token
        train_df = get_train_df(train_data_filter, MASK, tokenizer.pad_token, test_train_data_len)

        if MASK:
          # Show one masked example as a sanity check. A literal (non-regex)
          # search avoids escaping the brackets of the pad token; the guard
          # avoids sampling from an empty frame when nothing was masked.
          masked_rows = train_df[train_df.masked_text.str.contains(tokenizer.pad_token, regex=False, na=False)]
          if len(masked_rows) > 0:
            s = masked_rows.sample(1)
            sample_text = s.text.values[0]
            sample_masked_text = s.masked_text.values[0]
            print(f'\n\n\n--- before masking: {sample_text}')
            print(f'--- after masking: {sample_masked_text}\n\n\n')

            print(f'\n--- before masking: {tokenizer.encode(sample_text)}')
            print(f'--- after masking: {tokenizer.encode(sample_masked_text)}\n\n\n')
          else:
            print('\n--- no training sentence contains the mask token\n')

        test_eval_df_len = len(get_eval_df({test_data_name: True}))
        eval_df = get_eval_df(train_data_filter, test_eval_df_len)

        print(f'train dataset {len(train_df)} for {train_datasets_name}')
        print(f'validation dataset {len(eval_df)} for {train_datasets_name}')

        text_field = 'text'
        label_field = 'label'

        ## create datasets (only the training set reads the masked sentences)
        train_dataset = CFDataset(
            train_df,
            tokenizer,
            max_token_len=max_seq_len,
            labels=[label_field],
            text_field=text_field if not MASK else 'masked_text',
        )

        eval_dataset = CFDataset(
            eval_df,
            tokenizer,
            max_token_len=max_seq_len,
            labels=[label_field],
            text_field=text_field,
        )

        ## trainer
        training_args = TrainingArguments(
            output_dir=run_dir,
            learning_rate=lr,
            per_device_train_batch_size=train_batch_size,
            seed=seed,
            logging_dir='./logs',
            logging_steps=10,
            num_train_epochs=EPOCH_N,
            evaluation_strategy="epoch",
            save_total_limit = 1,
            load_best_model_at_end=True, # use eval_loss
            metric_for_best_model='eval_loss',
            save_strategy = "epoch",
            report_to='none'
        )

        early_callback = EarlyStoppingCallback(early_stopping_patience=EARLY_STOP_PATIENCE)
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            compute_metrics=compute_metrics,
            callbacks=[early_callback],
          )
        print(f"\n -----> RUNNING {i}th/{total} run\n")

        ## train
        trainer.train()


        ## test
        print(TRAIN_DATASETS, test_data_name)
        ## test datasets
        test_df = get_test_df(test_data_name)
        print(f'test dataset {len(test_df)} for {test_data_name}')
        test_dataset = CFDataset(
            test_df,
            tokenizer,
            max_token_len=max_seq_len,
            labels=[label_field],
            text_field=text_field,
        )

        ## predict
        result = trainer.predict(test_dataset)

        ## get_result
        predictions = np.argmax(result[0], axis=1)
        gold_labels = test_dataset.data[label_field].values
        test_df = test_dataset.data
        test_df['prediction']  = predictions

        ## create result path
        os.makedirs(result_path, exist_ok=True)
        test_df.to_csv(os.path.join(result_path, 'predictions.csv'), index=False)

        ## clf report
        report = classification_report(y_true=gold_labels, y_pred=predictions)
        with open(os.path.join(result_path, 'classification_report.txt'), 'w') as f:
            f.write(report)

        ## metrics
        metric_dict = {
        'f1_macro': f1_score(gold_labels, predictions, average='macro'),
        'mcc': matthews_corrcoef(gold_labels, predictions),
        'acc': accuracy_score(gold_labels, predictions),
        'f1_default': f1_score(gold_labels, predictions),
        'f1_weighted': f1_score(gold_labels, predictions, average='weighted'),
        }
        print(metric_dict)

        with open(os.path.join(result_path, 'metrics.json'), 'w') as fp:
          json.dump(metric_dict, fp)

        ## confusion matrix
        cm = confusion_matrix(y_true=gold_labels, y_pred=predictions)
        df_cm = pd.DataFrame(
            cm,
            index = ['Not CF', 'CF'],
            columns = ['Not CF', 'CF']
        )

        plt.figure(figsize = (10,7))
        sn.heatmap(df_cm, annot=True, cmap='Blues', fmt='g')
        plt.xlabel('Predicted')
        plt.ylabel('Actual')
        plt.savefig(os.path.join(result_path, 'confusion_matrix.png'))
        plt.close()

        # collect garbage first so the cache release can return that memory
        gc.collect()
        torch.cuda.empty_cache()