In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import lr_scheduler

from sklearn import model_selection
from sklearn import metrics
from datasets import Dataset
import transformers as tn

from transformers import (AutoTokenizer, PreTrainedTokenizerFast,
                          AutoModelForQuestionAnswering, TrainingArguments,
                          Trainer, default_data_collator, DataCollatorWithPadding,)

from tqdm.autonotebook import tqdm

import gc
import collections

In [2]:
test = pd.read_csv('../chaii-hindi-and-tamil-question-answering/test.csv')
train = pd.read_csv('../chaii-hindi-and-tamil-question-answering/train.csv')

In [3]:
train.head()

Unnamed: 0,id,context,question,answer_text,answer_start,language
0,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,53,tamil
1,d9841668c,காளிதாசன் (தேவநாகரி: कालिदास) சமஸ்கிருத இலக்கி...,காளிதாசன் எங்கு பிறந்தார்?,காசுமீரில்,2358,tamil
2,29d154b56,சர் அலெக்ஸாண்டர் ஃபிளெமிங் (Sir Alexander Flem...,பென்சிலின் கண்டுபிடித்தவர் யார்?,சர் அலெக்ஸாண்டர் ஃபிளெமிங்,0,tamil
3,41660850a,"குழந்தையின் அழுகையை நிறுத்தவும், தூங்க வைக்கவ...",தமிழ்நாட்டில் குழந்தைகளை தூங்க வைக்க பாடும் பா...,தாலாட்டு,68,tamil
4,b29c82c22,சூரியக் குடும்பம் \nசூரியக் குடும்பம் (Solar S...,பூமியின் அருகில் உள்ள விண்மீன் எது?,சூரியனும்,585,tamil


In [4]:
test.head()

Unnamed: 0,id,context,question,language
0,22bff3dec,"ज्वाला गुट्टा (जन्म: 7 सितंबर 1983; वर्धा, महा...",ज्वाला गुट्टा की माँ का नाम क्या है,hindi
1,282758170,गूगल मानचित्र (Google Maps) (पूर्व में गूगल लो...,गूगल मैप्स कब लॉन्च किया गया था?,hindi
2,d60987e0e,गुस्ताव रॉबर्ट किरचॉफ़ (१२ मार्च १८२४ - १७ अक्...,गुस्ताव किरचॉफ का जन्म कब हुआ था?,hindi
3,f99c770dc,அலுமினியம் (ஆங்கிலம்: அலுமினியம்; வட அமெரிக்க ...,அலுமினியத்தின் அணு எண் என்ன?,tamil
4,40dec1964,"கூட்டுறவு இயக்க வரலாறு, இங்கிலாந்து நாட்டில் ...",இந்தியாவில் பசுமை புரட்சியின் தந்தை என்று கருத...,tamil


In [5]:
#add answer ends
def add_answer_ends(examples):
    answer_ends = []
    for index, row in examples.iterrows():
        answer_ends.append(row['answer_start'] + len(row['answer_text']))
    
    examples.insert(loc = 5, column = "answer_end", value = answer_ends)
    
    return examples

In [6]:
train = add_answer_ends(train)

In [7]:
def prepare_train_features(examples, tokenizer):

    examples["question"] = [q.lstrip() for q in examples["question"]]
    
    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        return_token_type_ids=True,
        padding="max_length",
    )

    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized_examples.pop("offset_mapping")

    features = []
    for i, encoding in enumerate(tokenized_examples.encodings):
        feature = {}
        feature['ids'] = encoding.ids
        feature['attention_mask'] = encoding.attention_mask
        feature['offset'] = encoding.offsets
        feature['token'] = encoding.tokens
        #feature['token_type_ids'] = encoding.type_ids
        class_index = encoding.ids.index(tokenizer.cls_token_id)
        
        for j in range (len(encoding.sequence_ids)):
            if encoding.sequence_ids[j] != None and encoding.sequence_ids[j] == 1:
                context_start_idx = j
                context_start_offset = encoding.offsets[j][0]
                break

        feature['start_position'] = feature['end_position'] = class_index
        for j in range(context_start_idx, len(encoding.offsets)):
            offset = encoding.offsets[j]
            if offset[0] <= examples['answer_start'] and examples['answer_start'] < offset[1]:
                feature['start_position'] = j
            if offset[0] < examples['answer_end'] and examples['answer_end'] <= offset[1]:
                feature['end_position'] = j
                break
        if feature['start_position'] == class_index or feature['end_position'] == class_index:
            feature['start_position'] = feature['end_position'] = class_index
        features.append(feature)            
    return features


In [8]:
def prepare_validation_features(examples):

    examples["question"] = [q.lstrip() for q in examples["question"]]

    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

    tokenized_examples["example_id"] = []
    
    for i in range(len(tokenized_examples["input_ids"])):
        sequence_ids = tokenized_examples.sequence_ids(i)
        context_index = 1 if pad_on_right else 0
        sample_index = sample_mapping[i]
        tokenized_examples["example_id"].append(examples["id"][sample_index])
        tokenized_examples["offset_mapping"][i] = [
            (o if sequence_ids[k] == context_index else None)
            for k, o in enumerate(tokenized_examples["offset_mapping"][i])
        ]

    return tokenized_examples

In [9]:
tokenizer = AutoTokenizer.from_pretrained("deepset/xlm-roberta-large-squad2")

#since we have long texts of context, we take chunks at a time
#starting at 128 chunks per text
doc_stride = 128
#max length of sequence of tokens in each chunk
max_length = 256

#number of training examples used in each iteration
batch_size = 32
num_folds = 5
num_epochs = 2

pad_on_right = tokenizer.padding_side == "right"

In [18]:
train_features = []
for _, row in train.iterrows():
    train_features += prepare_train_features(row, tokenizer)

TypeError: when tokenizing batches of text, `text_pair` must be a list or tuple with the same length as `text`.

In [19]:
train_dataset = Dataset.from_pandas(train_features)

tokenized_train_ds = train_dataset.map(
    prepare_train_features, 
    batched=True, 
    remove_columns=train_dataset.column_names
)

AttributeError: 'list' object has no attribute 'columns'

In [10]:
test_dataset = Dataset.from_pandas(test)

tokenized_test_ds = test_dataset.map(
    prepare_validation_features, 
    batched=True, 
    remove_columns=test_dataset.column_names
)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [11]:
args = TrainingArguments(
    f"chaii-qa",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=3e-5,
    warmup_ratio=0.1,
    gradient_accumulation_steps=10,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=1,
    weight_decay=0.01,
)

model = AutoModelForQuestionAnswering.from_pretrained("deepset/xlm-roberta-large-squad2")

data_collator = default_data_collator

trainer = Trainer(
    model,
    args,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [12]:
trainer.train()
trainer.save_model(output_dir)

ValueError: Trainer: training requires a train_dataset.

In [None]:
test_feats_small = tokenized_test_ds.map(lambda example: example, remove_columns=['example_id', 'offset_mapping'])
test_feats_small

In [None]:
test_predictions = trainer.predict(test_feats_small)