In [None]:
%%capture
# Install the notebook's dependencies; %%capture above silences the installer output.
# NOTE(review): none of the pip installs is version-pinned, while the helper script
# below is fetched from one fixed transformers commit -- confirm a compatible
# transformers release and pin it.
!pip install transformers
!pip install constants
!pip install --upgrade torch
# Upstream multiple-choice helper at a fixed commit.
# NOTE(review): a later cell says a locally modified copy (accepting `examples=`) is
# required -- verify which file actually ends up being imported.
!wget https://raw.githubusercontent.com/huggingface/transformers/09a2f40684f77e62d0fd8485fe9d2d610390453f/examples/multiple-choice/utils_multiple_choice.py

In [None]:
import glob
import json
import os
import pickle
from typing import Dict

import numpy as np
import pandas as pd
import tensorflow as tf
import torch
import tqdm
from constants import *
from google.colab import auth, drive
from transformers import (
    AutoConfig,
    AutoModelForMultipleChoice,
    AutoTokenizer,
    EvalPrediction,
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    TFAutoModelForMultipleChoice,
    TFTrainer,
    TFTrainingArguments,
    set_seed,
)

import utils_multiple_choice
from utils_multiple_choice import MultipleChoiceDataset, RaceProcessor, Split, TFMultipleChoiceDataset, processors

In [None]:
# Mount Google Drive into the Colab filesystem.
# `drive` is already imported in the notebook's import cell, so it is not re-imported here.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
def simple_accuracy(preds, labels):
    """Return the fraction of predictions that equal their labels.

    Args:
        preds: Predicted class indices (NumPy array or any array-like).
        labels: Gold class indices, aligned element-wise with ``preds``.

    Returns:
        The mean of the element-wise equality mask.
    """
    # Coerce to arrays first: comparing two plain lists with ``==`` yields a
    # single bool, which has no ``.mean()``.
    return (np.asarray(preds) == np.asarray(labels)).mean()

def compute_metrics(p: EvalPrediction) -> Dict[str, float]:
    """Compute accuracy for a batch of evaluation predictions.

    Args:
        p: Evaluation result exposing ``predictions`` (per-example scores,
            arg-maxed over axis 1) and ``label_ids`` (gold class indices).

    Returns:
        A one-entry dict ``{"acc": <accuracy>}``.
    """
    # The predicted class is the index of the highest score in each row.
    preds = np.argmax(p.predictions, axis=1)
    return {"acc": simple_accuracy(preds, p.label_ids)}

In [None]:
# --- Column / key names ---
SCORE_A, SCORE_B, SCORE_C, SCORE_D = 'score A', 'score B', 'score C', 'score D'
LIST_SCORES = [SCORE_A, SCORE_B, SCORE_C, SCORE_D]
SCORES = 'scores'
SCORE_LABEL = 'score_label'
SCORES_WRONG = 'score_wrong'
SCORE_VAR = 'score variance'
CORRECT = 'correct'
LABEL = 'label'
PREDICTION = 'prediction'
LEVEL = 'level'

# --- Run configuration ---
MODEL_NAME = 'xlnet-base-cased'
MAX_SEQ_LENGTH = 512
RANDOM_SEED = 3  # seeds used across runs: 0, 42, 1, 2, 3
DATA_DIR = "data/"

In [None]:
# Look up the RACE processor and derive the label set from it; an unknown task
# key is reported as a ValueError.
try:
    processor = processors['race']()
    label_list = processor.get_labels()
    num_labels = len(label_list)
except KeyError:
    not_found_message = "Task not found: %s" % ('race')
    raise ValueError(not_found_message)

# Model configuration sized to the number of answer labels, then the matching tokenizer.
config = AutoConfig.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
    finetuning_task='race',
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [None]:
# Load every raw training document (one JSON object per *txt file) for both
# difficulty levels into `lines`, tagging each with the path it came from.
lines = []
for level in ['high', 'middle']:
    # Paths are built with explicit "/" separators on purpose: the stored path is
    # later split on "/" to recover the level and document id.
    input_dir = os.path.join(DATA_DIR, "train/" + level)
    # glob returns matches in arbitrary, filesystem-dependent order; sort so the
    # example order (and therefore a seeded training run) is reproducible.
    files = sorted(glob.glob(input_dir + "/*txt"))
    for file in tqdm.tqdm(files, desc="read files"):
        with open(file, "r", encoding="utf-8") as fin:
            data_raw = json.load(fin)
        data_raw["race_id"] = file
        lines.append(data_raw)

In [None]:
# Turn every loaded document into one InputExample per question.
set_type = 'train'
examples = []
for data_raw in lines:
    race_id = "%s-%s" % (set_type, data_raw["race_id"])
    article = data_raw["article"]
    for i, answer in enumerate(data_raw["answers"]):
        # Answer letter -> zero-based index, kept as a string ("A" -> "0").
        label = str(ord(answer) - ord("A"))
        question = data_raw["questions"][i]
        choices = data_raw["options"][i]

        example = utils_multiple_choice.InputExample(
            example_id=race_id,
            question=question,
            # The same article is repeated once per answer option: not efficient but convenient.
            contexts=[article] * 4,
            endings=[choices[k] for k in range(4)],
            label=label,
        )
        examples.append(example)
examples[0]

In [None]:
# Make sure the *modified* utils_multiple_choice.py is the one on disk: it must accept
# `examples=examples`; otherwise the training set is cut short instead of being loaded
# in full.
import importlib
importlib.reload(utils_multiple_choice)
# reload() re-executes the module and creates fresh class objects; names imported
# earlier still point at the old ones. Re-bind every name used after this cell.
# If `Split` is an Enum (as in the upstream script), a pre-reload member would not
# compare equal to the reloaded module's members, so any `mode == Split.dev` check
# inside the reloaded module would be false for a stale `Split.dev`.
from utils_multiple_choice import MultipleChoiceDataset, Split

In [None]:
# Build the training dataset from the in-memory `examples` list.
# overwrite_cache=True presumably forces features to be recomputed -- confirm in
# utils_multiple_choice.
train_dataset = MultipleChoiceDataset(
    data_dir=DATA_DIR,
    tokenizer=tokenizer,
    task='race',
    max_seq_length=MAX_SEQ_LENGTH,
    overwrite_cache=True,
    mode=Split.train,
    examples=examples,
)

In [None]:
# Rebinds `train_dataset` without `examples=` and with overwrite_cache=False.
# NOTE(review): this replaces the dataset built in the previous cell; it looks like the
# shortcut for re-runs that reuse cached features -- confirm only one of the two is needed.
train_dataset = MultipleChoiceDataset(
    data_dir=DATA_DIR,
    tokenizer=tokenizer,
    task='race',
    max_seq_length=MAX_SEQ_LENGTH,
    overwrite_cache=False,
    mode=Split.train,
)

In [None]:
# Development split, used for evaluation during and after training.
eval_dataset = MultipleChoiceDataset(
    data_dir=DATA_DIR,
    tokenizer=tokenizer,
    task='race',
    max_seq_length=MAX_SEQ_LENGTH,
    overwrite_cache=False,
    mode=Split.dev,
)

In [None]:
# Seed all random number generators in one call via transformers' `set_seed`
# (already imported above), not only torch's generator.
set_seed(RANDOM_SEED)

In [None]:
# Instantiate AutoModelForMultipleChoice from the MODEL_NAME checkpoint using `config`.
model = AutoModelForMultipleChoice.from_pretrained(MODEL_NAME, config=config)

In [None]:
def init_training_args(
    adam_epsilon=1e-8,
    learning_rate=5e-5,
    num_train_epochs=3.0,
    weight_decay=0,
    max_steps=-1,
    output_dir='drive/My Drive/Colab Data/race_results',          # output directory
    logging_dir='drive/My Drive/Colab Data/race_logs',            # directory for storing logs
    seed=None,
):
    """Build the TrainingArguments used for fine-tuning.

    Training and evaluation are both enabled, evaluation also runs during
    training, checkpoints are written every 5000 steps (at most 5 kept), and
    the per-device batch size is 4 for both training and evaluation.

    Args:
        adam_epsilon: Epsilon passed through to TrainingArguments.
        learning_rate: Learning rate passed through to TrainingArguments.
        num_train_epochs: Number of training epochs.
        weight_decay: Weight decay passed through to TrainingArguments.
        max_steps: Step limit passed through to TrainingArguments.
        output_dir: Directory for checkpoints and the final model.
        logging_dir: Directory for training logs.
        seed: Random seed handed to TrainingArguments. ``None`` (the default)
            uses the notebook-level ``RANDOM_SEED``.

    Returns:
        A configured ``TrainingArguments`` instance.
    """
    # Without an explicit seed TrainingArguments keeps its own default, and the
    # Trainer re-seeds from that value -- so runs meant to differ by RANDOM_SEED
    # would not actually use it during training.
    if seed is None:
        seed = RANDOM_SEED
    return TrainingArguments(
        do_train=True,
        do_eval=True,
        evaluate_during_training=True,
        output_dir=output_dir,
        logging_dir=logging_dir,
        save_steps=5000,
        save_total_limit=5,
        per_device_train_batch_size=4,  # batch size per device during training
        per_device_eval_batch_size=4,   # batch size for evaluation
        adam_epsilon=adam_epsilon,
        learning_rate=learning_rate,
        num_train_epochs=num_train_epochs,
        max_steps=max_steps,
        weight_decay=weight_decay,
        seed=seed,
    )

In [None]:
# Per-model fine-tuning hyperparameters, keyed by the exact MODEL_NAME string so that
# `configs[MODEL_NAME]` works for every supported checkpoint.
configs = {
    'xlnet-base-cased': {
        'adam_epsilon': 1e-6,
        'learning_rate': 2e-5,
        'num_train_epochs': 2.0,
        'weight_decay': 0.01
    },
    'bert-base-cased': {
        'adam_epsilon': 1e-6,
        'learning_rate': 2e-5,
        'num_train_epochs': 2.0,
        'weight_decay': 0.05
    },
    # Key was previously misspelled 'distilber-base-cased', which could never match
    # the checkpoint name and made the lookup raise KeyError.
    'distilbert-base-cased': {
        'adam_epsilon': 1e-8,
        'learning_rate': 5e-5,
        'num_train_epochs': 3,
        'weight_decay': 0.01
    }
}

In [None]:
# Hyperparameters for the selected model; log/output directories are tagged with the
# model name and seed so different runs do not overwrite each other.
hparams = configs[MODEL_NAME]
logging_dir = os.path.join(DATA_DIR, 'race_logs_{}_seed_{}'.format(MODEL_NAME, RANDOM_SEED))
output_dir = os.path.join(DATA_DIR, 'race_results_{}_seed_{}'.format(MODEL_NAME, RANDOM_SEED))

training_args = init_training_args(
    adam_epsilon=hparams['adam_epsilon'],
    learning_rate=hparams['learning_rate'],
    num_train_epochs=hparams['num_train_epochs'],
    weight_decay=hparams['weight_decay'],
    logging_dir=logging_dir,
    output_dir=output_dir,
)

In [None]:
# Wire the model, training arguments, train/eval datasets and the accuracy metric
# (compute_metrics) into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune, then persist the model and the tokenizer.
trainer.train()
# save_model() is called without a path -- presumably it writes to
# training_args.output_dir, next to the tokenizer files saved below; confirm.
trainer.save_model()
tokenizer.save_pretrained(training_args.output_dir)

In [None]:
# Evaluate on the eval_dataset given to the Trainer; results recorded for earlier
# runs are kept in the comments below, one group per seed.
trainer.evaluate()

# Previous (for XLNet)
#   {'eval_loss': 1.0538021816397252, 'eval_acc': 0.6355637405361162, 'step': 0}
#   {'eval_acc': 0.6355637405361162, 'eval_loss': 1.0538021816397252}

# seed = 0
# {'eval_loss': 1.039906019572468, 'eval_acc': 0.6402701043585022, 'epoch': 2.0, 'total_flos': 233886300450963456, 'step': 43934}
# {'epoch': 2.0,
#  'eval_acc': 0.6402701043585022,
#  'eval_loss': 1.039906019572468,
#  'total_flos': 233886300450963456}

# seed = 42
# 
# {'eval_loss': 1.048840732447283, 'eval_acc': 0.6351544915080827, 'epoch': 2.0, 'total_flos': 233886300450963456, 'step': 43934}
# {'epoch': 2.0,
#  'eval_acc': 0.6351544915080827,
#  'eval_loss': 1.048840732447283,
#  'total_flos': 233886300450963456}

# seed = 1
# 
# {'eval_loss': 1.0707706642666928, 'eval_acc': 0.6159197871905054, 'epoch': 2.0, 'total_flos': 233886300450963456, 'step': 43934}
# {'epoch': 2.0,
#  'eval_acc': 0.6159197871905054,
#  'eval_loss': 1.0707706642666928,
#  'total_flos': 233886300450963456}

# seed = 2
# 
# {'eval_loss': 1.0715846247221732, 'eval_acc': 0.6044608144055658, 'epoch': 2.0, 'total_flos': 233886300450963456, 'step': 43934}
# {'epoch': 2.0,
#  'eval_acc': 0.6044608144055658,
#  'eval_loss': 1.0715846247221732,
#  'total_flos': 233886300450963456}

# seed = 3
# 
# {'eval_loss': 1.049155939741198, 'eval_acc': 0.6396562308164518, 'epoch': 2.0, 'total_flos': 233886300450963456, 'step': 43934}

# {'epoch': 2.0,
#  'eval_acc': 0.6396562308164518,
#  'eval_loss': 1.049155939741198,
#  'total_flos': 233886300450963456}

In [None]:
# Test split. overwrite_cache=True presumably forces features to be recomputed --
# confirm in utils_multiple_choice.
test_dataset = MultipleChoiceDataset(
    data_dir=DATA_DIR,
    tokenizer=tokenizer,
    task='race',
    max_seq_length=MAX_SEQ_LENGTH,
    overwrite_cache=True,
    mode=Split.test,
)

In [None]:
def transform_pandas(trainer, dataset):
    """Run ``trainer.predict`` on ``dataset`` and tabulate one row per example.

    As a side effect, prints the mean over examples of the variance of each
    example's scores.

    Args:
        trainer: Object exposing ``predict(dataset)``; the result must have a
            ``predictions`` sequence holding four scores per example.
        dataset: Sized, indexable collection whose items expose ``label`` and
            ``example_id``. ``example_id`` is split on "/" and its last two
            components become the ``level`` and ``document_id`` columns.

    Returns:
        ``(df, pred)``: the per-example DataFrame and the raw prediction
        object returned by ``trainer.predict``.
    """
    columns = ['idx', 'level', 'document_id', 'label', 'prediction', 'score A', 'score B', 'score C', 'score D', 'score variance']
    pred = trainer.predict(dataset)

    per_example_variance = [np.var(scores) for scores in pred.predictions]
    # An empty prediction set prints nan, as np.mean([]) would, without the warning.
    print(np.mean(per_example_variance) if per_example_variance else float("nan"))

    # Collect plain rows and build the frame once; appending to a DataFrame inside
    # the loop copies the whole frame every iteration, and DataFrame.append no
    # longer exists in pandas 2.x.
    rows = []
    for idx in range(len(dataset)):
        item = dataset[idx]
        scores = pred.predictions[idx]
        id_parts = item.example_id.split("/")
        rows.append(
            [
                idx,
                id_parts[-2],
                id_parts[-1],
                item.label,
                int(np.argmax(scores)),
                scores[0],
                scores[1],
                scores[2],
                scores[3],
                # NOTE(review): the column is named 'score variance' but holds the
                # standard deviation; kept as-is so existing outputs stay
                # comparable -- confirm which statistic is intended.
                np.std(scores),
            ]
        )
    df = pd.DataFrame(rows, columns=columns)
    return df, pred

In [None]:
# Predict on the training split and persist both the table (CSV) and the raw
# prediction object (pickle).
df_train, pred_train = transform_pandas(trainer, train_dataset)
df_train.to_csv(os.path.join(DATA_DIR, 'output_{}_seed_{}_train.csv'.format(MODEL_NAME, RANDOM_SEED)), index=False)
# Context manager so the pickle file is flushed and closed deterministically.
with open(os.path.join(DATA_DIR, 'output_{}_seed_{}_pred_train.p'.format(MODEL_NAME, RANDOM_SEED)), 'wb') as fout:
    pickle.dump(pred_train, fout)
# Preview goes last: only a cell's final expression is rendered.
df_train.sample(2)

In [None]:
# Predict on the test split and persist both the table (CSV) and the raw
# prediction object (pickle).
df_test, pred_test = transform_pandas(trainer, test_dataset)
df_test.to_csv(os.path.join(DATA_DIR, 'output_{}_seed_{}_test.csv'.format(MODEL_NAME, RANDOM_SEED)), index=False)
# File name now uses 'seed_{}' like the train/eval exports (it was 'seed{}', without
# the underscore); the context manager closes the file deterministically.
with open(os.path.join(DATA_DIR, 'output_{}_seed_{}_pred_test.p'.format(MODEL_NAME, RANDOM_SEED)), 'wb') as fout:
    pickle.dump(pred_test, fout)
# Preview goes last: only a cell's final expression is rendered.
df_test.head()

In [None]:
# Predict on the dev split and persist both the table (CSV) and the raw
# prediction object (pickle).
df_eval, pred_eval = transform_pandas(trainer, eval_dataset)
df_eval.to_csv(os.path.join(DATA_DIR, 'output_{}_seed_{}_eval.csv'.format(MODEL_NAME, RANDOM_SEED)), index=False)
# Context manager so the pickle file is flushed and closed deterministically.
with open(os.path.join(DATA_DIR, 'output_{}_seed_{}_pred_eval.p'.format(MODEL_NAME, RANDOM_SEED)), 'wb') as fout:
    pickle.dump(pred_eval, fout)
# Preview goes last: only a cell's final expression is rendered.
df_eval.head()