# ARQMath

- https://www.cs.rit.edu/~dprl/ARQMath/
- https://www.cs.rit.edu/~dprl/ARQMath/Task1-answers.html
- https://httpd.test.gipp.com/qa-pair.csv
- https://github.com/deepset-ai/FARM/blob/master/examples/passage_ranking.py

### Requirements

```
pandas
tqdm
farm==0.4.2
transformers
```


In [19]:
import os
import random
import json
import pickle
import logging
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn
from sklearn.model_selection import KFold, StratifiedKFold

In [3]:
!nvidia-smi

/bin/sh: nvidia-smi: command not found


In [4]:
os.environ["CUDA_VISIBLE_DEVICES"] = "0" # check with nvidia-smi

In [5]:
import logging
from pathlib import Path

from farm.data_handler.data_silo import DataSilo
from farm.data_handler.processor import RegressionProcessor, TextPairClassificationProcessor
from farm.experiment import initialize_optimizer
from farm.infer import Inferencer
from farm.modeling.adaptive_model import AdaptiveModel
from farm.modeling.language_model import LanguageModel
from farm.modeling.prediction_head import RegressionHead, TextClassificationHead
from farm.modeling.tokenization import Tokenizer
from farm.train import Trainer
from farm.utils import set_all_seeds, MLFlowLogger, initialize_device_settings, reformat_msmarco_train, reformat_msmarco_dev, write_msmarco_results
from farm.evaluation.msmarco_passage_farm import msmarco_evaluation

In [6]:
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO)

In [7]:
import torch
torch.cuda.is_available()

False

In [8]:
from experiments.environment import get_env
env = get_env()

/Volumes/data/repo/acl-anthology/environments
Environment detected: local_mac (in default.yml)


In [21]:
# Convert input to FARM TSV format: [text, text_b, label]
df = pd.read_csv(os.path.join(env['datasets_dir'], 'arqmath', 'qa-pair.csv'))

label_col = 'label'
df[label_col] = df['rel'].astype(int)
df = df.rename(columns=dict(q='text', a='text_b')).drop(columns=['qID', 'aID', 'rel'])

df.head()

Unnamed: 0,text,text_b,label
0,What are gradients and how would I use them?\n...,"The ∇ (pronounced ""del"") is an operator, more ...",1
1,How would you describe calculus in simple term...,There came a time in mathematics when people e...,1
2,How would you describe calculus in simple term...,One of the greatest achievements of human civi...,0
3,How would you describe calculus in simple term...,Calculus is basically a way of calculating rat...,0
4,How would you describe calculus in simple term...,Calculus is a field which deals with two seemi...,0


In [42]:
df = df.sample(n=500)

In [43]:
n_splits = 4
kf = StratifiedKFold(n_splits=n_splits, random_state=0, shuffle=True)

# Stratified K-Folds cross-validator
for k, (train_index, test_index) in enumerate(kf.split(df.index.tolist(), df[label_col].values.tolist()), 1):
    split_train_df = df.iloc[train_index]
    split_test_df = df.iloc[test_index]
    break

In [44]:
train_df = split_train_df
test_df = split_test_df

print(f'Train: {len(train_df)}; Test: {len(test_df)}')

Train: 374; Test: 126


In [45]:
# Write to disk
train_df.to_csv(os.path.join(env['datasets_dir'], 'arqmath', 'train.tsv'), sep='\t', index=False)
test_df.to_csv(os.path.join(env['datasets_dir'], 'arqmath', 'test.tsv'), sep='\t', index=False)

In [46]:
##########################
########## Settings
##########################
set_all_seeds(seed=42)
device, n_gpu = initialize_device_settings(use_cuda=torch.cuda.is_available())
n_epochs = 2
batch_size = 64
evaluate_every = 500
lang_model = "bert-base-cased"
lang_model_path = os.path.join(env['bert_dir'], lang_model)
label_list = ["0", "1"]

data_dir = Path(os.path.join(env['datasets_dir'], 'arqmath'))
#train_filename = "train.tsv"
train_filename = 'train.tsv'

#dev_filename = "dev_200k.tsv"
dev_filename = None #'qa-pair.csv'
test_filename = 'test.tsv'

# The source data can be found here https://github.com/microsoft/MSMARCO-Passage-Ranking
"""
generate_data = False
data_dir = Path("../data/msmarco_passage")
predictions_raw_filename = "predictions_raw.txt"
predictions_filename = "predictions.txt"
train_source_filename = "triples.train.1m.tsv"
qrels_filename = "qrels.dev.tsv"
queries_filename = "queries.dev.tsv"
passages_filename = "collection.tsv"
top1000_filename = "top1000.dev"

# 0. Preprocess and save MSMarco data in a format that can be ingested by FARM models. Only needs to be done once!
# The final format is a tsv file with 3 columns (text, text_b and label)
if generate_data:
    reformat_msmarco_train(data_dir / train_source_filename,
                           data_dir / train_filename)
    reformat_msmarco_dev(data_dir / queries_filename,
                         data_dir / passages_filename,
                         data_dir / qrels_filename,
                         data_dir / top1000_filename,
                         data_dir / dev_filename)

"""

04/11/2020 17:13:33 - INFO - farm.utils -   device: cpu n_gpu: 0, distributed training: False, automatic mixed precision training: None


'\ngenerate_data = False\ndata_dir = Path("../data/msmarco_passage")\npredictions_raw_filename = "predictions_raw.txt"\npredictions_filename = "predictions.txt"\ntrain_source_filename = "triples.train.1m.tsv"\nqrels_filename = "qrels.dev.tsv"\nqueries_filename = "queries.dev.tsv"\npassages_filename = "collection.tsv"\ntop1000_filename = "top1000.dev"\n\n# 0. Preprocess and save MSMarco data in a format that can be ingested by FARM models. Only needs to be done once!\n# The final format is a tsv file with 3 columns (text, text_b and label)\nif generate_data:\n    reformat_msmarco_train(data_dir / train_source_filename,\n                           data_dir / train_filename)\n    reformat_msmarco_dev(data_dir / queries_filename,\n                         data_dir / passages_filename,\n                         data_dir / qrels_filename,\n                         data_dir / top1000_filename,\n                         data_dir / dev_filename)\n\n'

In [50]:
# 1.Create a tokenizer
tokenizer = Tokenizer.load(
    pretrained_model_name_or_path=lang_model_path,
    do_lower_case=False)

# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
#    Evaluation during training will be performed on a slice of the train set
#    We will be using the msmarco dev set as our final evaluation set
processor = TextPairClassificationProcessor(tokenizer=tokenizer,
                                            label_list=label_list,
                                            train_filename=train_filename,
                                            test_filename=test_filename,
                                            #dev_split=0.001,
                                            dev_split=0.0,
                                            max_seq_len=128,
                                            data_dir=data_dir,
                                            delimiter="\t")
processor.add_task(name='text_classification', 
                   metric='seq_f1', 
                   label_list=label_list, 
                   label_column_name=label_col,
                   task_type='classification')


04/11/2020 17:14:26 - INFO - farm.modeling.tokenization -   Loading tokenizer of type 'BertTokenizer'
04/11/2020 17:14:26 - INFO - transformers.tokenization_utils -   Model name '/Volumes/data/repo/data/bert/bert-base-cased' not found in model shortcut name list (bert-base-uncased, bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, bert-base-multilingual-cased, bert-base-chinese, bert-base-german-cased, bert-large-uncased-whole-word-masking, bert-large-cased-whole-word-masking, bert-large-uncased-whole-word-masking-finetuned-squad, bert-large-cased-whole-word-masking-finetuned-squad, bert-base-cased-finetuned-mrpc, bert-base-german-dbmdz-cased, bert-base-german-dbmdz-uncased, bert-base-finnish-cased-v1, bert-base-finnish-uncased-v1, bert-base-dutch-cased). Assuming '/Volumes/data/repo/data/bert/bert-base-cased' is a path, a model identifier, or url to a directory containing tokenizer files.
04/11/2020 17:14:26 - INFO - transformers.tokenization_utils

In [51]:
# 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
data_silo = DataSilo(
    processor=processor,
    batch_size=batch_size)

04/11/2020 17:14:26 - INFO - farm.data_handler.data_silo -   
Loading data into the data silo ... 
              ______
               |o  |   !
   __          |:`_|---'-.
  |__|______.-/ _ \-----.|       
 (o)(o)------'\ _ /     ( )      
 
04/11/2020 17:14:26 - INFO - farm.data_handler.data_silo -   Loading train set from: /Volumes/data/repo/data/arqmath/train.tsv 
04/11/2020 17:14:26 - INFO - farm.data_handler.data_silo -   Got ya 7 parallel workers to convert 374 dictionaries to pytorch datasets (chunksize = 11)...
04/11/2020 17:14:26 - INFO - farm.data_handler.data_silo -    0    0    0    0    0    0    0 
04/11/2020 17:14:26 - INFO - farm.data_handler.data_silo -   /|\  /w\  /w\  /w\  /w\  /w\  /w\
04/11/2020 17:14:26 - INFO - farm.data_handler.data_silo -   /'\  /'\  /'\  /'\  /'\  / \  /'\
04/11/2020 17:14:26 - INFO - farm.data_handler.data_silo -               
Preprocessing Dataset /Volumes/data/repo/data/arqmath/train.tsv:   0%|          | 0/374 [00:00<?, ? Dicts/s]04/11/20

Preprocessing Dataset /Volumes/data/repo/data/arqmath/train.tsv: 100%|██████████| 374/374 [00:01<00:00, 370.46 Dicts/s]
04/11/2020 17:14:27 - INFO - farm.data_handler.data_silo -   No dev set is being loaded
04/11/2020 17:14:27 - INFO - farm.data_handler.data_silo -   Loading test set from: /Volumes/data/repo/data/arqmath/test.tsv
04/11/2020 17:14:27 - INFO - farm.data_handler.data_silo -   Got ya 7 parallel workers to convert 126 dictionaries to pytorch datasets (chunksize = 4)...
04/11/2020 17:14:27 - INFO - farm.data_handler.data_silo -    0    0    0    0    0    0    0 
04/11/2020 17:14:27 - INFO - farm.data_handler.data_silo -   /|\  /w\  /|\  /|\  /|\  /|\  /w\
04/11/2020 17:14:27 - INFO - farm.data_handler.data_silo -   /'\  / \  /'\  /'\  /'\  /'\  / \
04/11/2020 17:14:27 - INFO - farm.data_handler.data_silo -               
Preprocessing Dataset /Volumes/data/repo/data/arqmath/test.tsv:   0%|          | 0/126 [00:00<?, ? Dicts/s]04/11/2020 17:14:28 - INFO - farm.data_handler.

Preprocessing Dataset /Volumes/data/repo/data/arqmath/test.tsv: 100%|██████████| 126/126 [00:00<00:00, 264.56 Dicts/s]
04/11/2020 17:14:28 - INFO - farm.data_handler.data_silo -   Examples in train: 374
04/11/2020 17:14:28 - INFO - farm.data_handler.data_silo -   Examples in dev  : 0
04/11/2020 17:14:28 - INFO - farm.data_handler.data_silo -   Examples in test : 126
04/11/2020 17:14:28 - INFO - farm.data_handler.data_silo -   
04/11/2020 17:14:28 - INFO - farm.data_handler.data_silo -   Longest sequence length observed after clipping:     128
04/11/2020 17:14:28 - INFO - farm.data_handler.data_silo -   Average sequence length after clipping: 127.27540106951872
04/11/2020 17:14:28 - INFO - farm.data_handler.data_silo -   Proportion clipped:      0.9705882352941176
04/11/2020 17:14:28 - INFO - farm.data_handler.data_silo -   [Farmer's Tip] 97.1% of your samples got cut down to 128 tokens. Consider increasing max_seq_len. This will lead to higher memory consumption but is likely to improv

In [52]:
# 4. Create an AdaptiveModel
# a) which consists of a pretrained language model as a basis
language_model = LanguageModel.load(lang_model_path)
# b) and a prediction head on top that is suited for our task
prediction_head = TextClassificationHead(num_labels=len(label_list),
                                         class_weights=data_silo.calculate_class_weights(
                                             task_name="text_classification"),
                                         )

model = AdaptiveModel(
    language_model=language_model,
    prediction_heads=[prediction_head],
    embeds_dropout_prob=0.1,
    lm_output_types=["per_sequence_continuous"],
    device=device)

# 5. Create an optimizer
model, optimizer, lr_schedule = initialize_optimizer(
    model=model,
    learning_rate=1e-5,
    device=device,
    n_batches=len(data_silo.loaders["train"]),
    n_epochs=n_epochs)

04/11/2020 17:14:39 - INFO - transformers.modeling_utils -   loading weights file /Volumes/data/repo/data/bert/bert-base-cased/pytorch_model.bin
	 We guess it's an *ENGLISH* model ... 
	 If not: Init the language model by supplying the 'language' param.
04/11/2020 17:14:41 - INFO - farm.modeling.prediction_head -   Prediction head initialized with size [768, 2]
04/11/2020 17:14:41 - INFO - farm.modeling.prediction_head -   Using class weights for task 'text_classification': [0.7230769230769231, 1.6206896551724137]
04/11/2020 17:14:41 - INFO - farm.modeling.optimization -   Loading optimizer `TransformersAdamW`: '{'correct_bias': False, 'weight_decay': 0.01, 'lr': 1e-05}'
04/11/2020 17:14:41 - INFO - farm.modeling.optimization -   Using scheduler 'get_linear_schedule_with_warmup'
04/11/2020 17:14:41 - INFO - farm.modeling.optimization -   Loading schedule `get_linear_schedule_with_warmup`: '{'num_warmup_steps': 1.2000000000000002, 'num_training_steps': 12}'


In [53]:
# 6. Feed everything to the Trainer, which keeps care of growing our model into powerful plant and evaluates it from time to time
trainer = Trainer(
    model=model,
    optimizer=optimizer,
    data_silo=data_silo,
    epochs=n_epochs,
    n_gpu=n_gpu,
    lr_schedule=lr_schedule,
    evaluate_every=evaluate_every,
    device=device)

In [54]:
# 7. Let it grow
trainer.train()

04/11/2020 17:15:19 - INFO - farm.train -   
 

          &&& &&  & &&             _____                   _             
      && &\/&\|& ()|/ @, &&       / ____|                 (_)            
      &\/(/&/&||/& /_/)_&/_&     | |  __ _ __ _____      ___ _ __   __ _ 
   &() &\/&|()|/&\/ '%" & ()     | | |_ | '__/ _ \ \ /\ / / | '_ \ / _` |
  &_\_&&_\ |& |&&/&__%_/_& &&    | |__| | | | (_) \ V  V /| | | | | (_| |
&&   && & &| &| /& & % ()& /&&    \_____|_|  \___/ \_/\_/ |_|_| |_|\__, |
 ()&_---()&\&\|&&-&&--%---()~                                       __/ |
     &&     \|||                                                   |___/
             |||
             |||
             |||
       , -=-~  .-^- _
              `

Train epoch 0/2 (Cur. train loss: 0.7168):  83%|████████▎ | 5/6 [02:11<00:25, 25.67s/it]

KeyboardInterrupt: 

In [None]:
# 8. Hooray! You have a model. Store it:
save_dir = Path("./output/arqmath/1")

model.save(save_dir)
processor.save(save_dir)

# 9. Load it & harvest your fruits (Inference)
#    Add your own text adapted to the dataset you provide
model = Inferencer.load(save_dir, gpu=True, max_seq_len=128, batch_size=128)
result = model.inference_from_file(data_dir / dev_filename)

write_msmarco_results(result, save_dir / predictions_raw_filename)

msmarco_evaluation(preds_file=save_dir / predictions_raw_filename,
                   dev_file=data_dir / dev_filename,
                   qrels_file=data_dir / qrels_filename,
                   output_file=save_dir / predictions_filename)