In [None]:
import os
import gc
import sys
import math
import time
import tqdm
import random
import numpy as np
import pandas as pd
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

from tqdm.auto import tqdm
from functools import partial
from sklearn.model_selection import StratifiedKFold

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from datasets import Dataset
from accelerate import Accelerator
from transformers import (AutoTokenizer, AutoModelForQuestionAnswering,AutoModel,
                          AutoConfig,AdamW,get_linear_schedule_with_warmup,
                          get_cosine_schedule_with_warmup)


## Конфигурации модели

In [None]:
# Hyper-parameters and paths shared by the cells below.
config = dict(
    # Pretrained checkpoint used for both the encoder weights and the tokenizer.
    model_path='../input/xlm-roberta-squad2/deepset/xlm-roberta-large-squad2',

    # Tokenisation: window length and overlap between consecutive windows.
    max_length=384,
    doc_stride=128,
    # Not read by any cell visible in this notebook.
    max_answer_length=30,

    # Optimiser: learning rate and weight decay passed to AdamW.
    lr=1e-5,
    wd=1e-2,

    # Training schedule and data loading.
    epochs=1,
    nfolds=5,
    batch_size=4,
    num_workers=4,
    seed=1000,
)

# One output directory per fold; the training loop saves weights and tokenizer files there.
for i in range(config['nfolds']):
    os.makedirs(f'model{i}', exist_ok=True)
    
def seed_everything(seed=42):
    """Seed every random number generator used by the notebook.

    Seeds Python's `random`, NumPy's global generator and PyTorch (CPU and all
    CUDA devices), and configures cuDNN for deterministic behaviour.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    # PYTHONHASHSEED is only read at interpreter start-up, so this affects
    # child processes started afterwards, not the running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may pick different kernels from run to run, which
    # defeats the deterministic flag above; keep it off.
    torch.backends.cudnn.benchmark = False

# Seed all RNGs once, before any data sampling or model initialisation.
seed_everything(seed=config['seed'])

## Загрузка данных

In [None]:
# Competition train / test files.
train_data = pd.read_csv('../input/chaii-hindi-and-tamil-question-answering/train.csv')
test_data = pd.read_csv('../input/chaii-hindi-and-tamil-question-answering/test.csv')

# Extra QA data (files named mlqa_hindi.csv and xquad.csv) appended to the training set.
external_data1 = pd.read_csv('../input/mlqa-hindi-processed/mlqa_hindi.csv')
external_data2 = pd.read_csv('../input/mlqa-hindi-processed/xquad.csv')
train_data = pd.concat([train_data,external_data1,external_data2]).reset_index(drop=True)

sample = pd.read_csv('../input/chaii-hindi-and-tamil-question-answering/sample_submission.csv')

# Keep every Tamil row plus a random subsample of 368 Hindi rows.
# NOTE(review): 368 equals the Tamil row count in the saved output below —
# presumably chosen to balance the two languages; confirm.
# An explicit random_state makes the subsample identical every time this cell
# is re-run, instead of depending on how far the global NumPy RNG has advanced.
train_data = pd.concat([
    train_data.query('language== "tamil"'),
    train_data.query('language == "hindi"').sample(n=368, random_state=config['seed']),
]).reset_index(drop=True)

# Assign each row to a validation fold, stratified by language.
train_data['Fold'] = -1
kfold = StratifiedKFold(n_splits=config['nfolds'],shuffle=True,random_state=config['seed'])
for k , (train_idx,valid_idx) in enumerate(kfold.split(X=train_data,y=train_data['language'])):
    # The index was reset above, so positional indices from the splitter are valid labels.
    train_data.loc[valid_idx,'Fold'] = k

assert (train_data['Fold'] >= 0).all(), "every row must be assigned to a validation fold"

def convert_answers(r):
    """Pack one (answer_start, answer_text) pair into a SQuAD-style answers dict.

    Args:
        r: A two-element row, read by position: element 0 is the answer start
            offset and element 1 the answer text. May be a pandas Series (as
            produced by `DataFrame.apply(axis=1)`) or any plain sequence.

    Returns:
        ``{'answer_start': [start], 'text': [text]}``.
    """
    if hasattr(r, 'iloc'):
        # Integer keys on a label-indexed Series are deprecated as positions;
        # .iloc is the explicit positional accessor.
        start, text = r.iloc[0], r.iloc[1]
    else:
        start, text = r[0], r[1]
    return {'answer_start': [start], 'text': [text]}

# Build the per-row answers dict that prepare_train_features reads from examples["answers"].
train_data['answers'] = train_data[['answer_start', 'answer_text']].apply(convert_answers, axis=1)

In [None]:
# Sanity check: frame shape and rows per language.
# NOTE(review): the saved output below lists 7361 hindi rows, while the loading
# cell subsamples hindi to 368 rows — the output looks stale; re-run to confirm.
train_data.shape,train_data.language.value_counts()

((7729, 8),
 hindi    7361
 tamil     368
 Name: language, dtype: int64)

In [None]:
# Preview the first rows, including the Fold and answers columns added above.
train_data.head()

Unnamed: 0,id,context,question,answer_text,answer_start,language,Fold,answers
0,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,53,tamil,4,"{'answer_start': [53], 'text': ['206']}"
1,d9841668c,காளிதாசன் (தேவநாகரி: कालिदास) சமஸ்கிருத இலக்கி...,காளிதாசன் எங்கு பிறந்தார்?,காசுமீரில்,2358,tamil,0,"{'answer_start': [2358], 'text': ['காசுமீரில்']}"
2,29d154b56,சர் அலெக்ஸாண்டர் ஃபிளெமிங் (Sir Alexander Flem...,பென்சிலின் கண்டுபிடித்தவர் யார்?,சர் அலெக்ஸாண்டர் ஃபிளெமிங்,0,tamil,1,"{'answer_start': [0], 'text': ['சர் அலெக்ஸாண்ட..."
3,41660850a,"குழந்தையின் அழுகையை நிறுத்தவும், தூங்க வைக்கவ...",தமிழ்நாட்டில் குழந்தைகளை தூங்க வைக்க பாடும் பா...,தாலாட்டு,68,tamil,1,"{'answer_start': [68], 'text': ['தாலாட்டு']}"
4,b29c82c22,சூரியக் குடும்பம் \nசூரியக் குடும்பம் (Solar S...,பூமியின் அருகில் உள்ள விண்மீன் எது?,சூரியனும்,585,tamil,0,"{'answer_start': [585], 'text': ['சூரியனும்']}"


## Предобработка данных

In [None]:
def prepare_train_features(examples, tokenizer, pad_on_right, max_length, doc_stride):
    """Tokenize a batch of QA examples into fixed-length features with span labels.

    Long contexts are split into overlapping windows, so one example may yield
    several features. Each feature gets a `start_positions` / `end_positions`
    token index; features that do not fully contain the answer (or examples
    without an answer) are labelled with the CLS token index.

    Args:
        examples: Batch mapping with "question", "context" and "answers"
            columns; each answers entry holds "answer_start" and "text" lists.
            The "question" column is replaced in place by its left-stripped version.
        tokenizer: Callable tokenizer returning an encoding that supports
            `pop`, item access and `sequence_ids(i)`, and exposing `cls_token_id`.
        pad_on_right: True when the question is the first sequence and the
            context the second; False for the reverse order.
        max_length: Length every feature is truncated / padded to.
        doc_stride: Token overlap between consecutive windows of one context.

    Returns:
        The tokenizer output with "overflow_to_sample_mapping" and
        "offset_mapping" removed and "start_positions" / "end_positions" added.
    """
    examples['question'] = [q.lstrip() for q in examples['question']]

    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length")

    # Feature index -> index of the example it was cut from.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # Per-token (start_char, end_char) offsets into the original text.
    offset_mapping = tokenized_examples.pop("offset_mapping")

    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    # Sequence id that marks context tokens in `sequence_ids`.
    context_id = 1 if pad_on_right else 0

    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        sequence_ids = tokenized_examples.sequence_ids(i)

        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]

        if len(answers["answer_start"]) == 0:
            # No answer: point both labels at CLS.
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
            continue

        start_char = answers["answer_start"][0]
        end_char = start_char + len(answers["text"][0])

        # First and last token of the context inside this feature.
        context_start = 0
        while sequence_ids[context_start] != context_id:
            context_start += 1

        context_end = len(input_ids) - 1
        while sequence_ids[context_end] != context_id:
            context_end -= 1

        if not (offsets[context_start][0] <= start_char and offsets[context_end][1] >= end_char):
            # The answer is not fully inside this window.
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
            continue

        # Both walks stay inside [context_start, context_end]. Tokens outside
        # the context do not carry context offsets, so walking over them would
        # push the label onto a non-context token when the answer touches the
        # edge of the window.
        token_start_index = context_start
        while token_start_index <= context_end and offsets[token_start_index][0] <= start_char:
            token_start_index += 1
        tokenized_examples["start_positions"].append(token_start_index - 1)

        token_end_index = context_end
        while token_end_index >= context_start and offsets[token_end_index][1] >= end_char:
            token_end_index -= 1
        tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples

## Дообучение модели

In [None]:
class Model(nn.Module):
    """Transformer encoder with a linear head that predicts answer-span logits.

    The encoder is loaded with `AutoModel`, its pooler is replaced by an
    identity, and a single linear layer maps every token's hidden state to two
    values: a start logit and an end logit.
    """
    def __init__(self,model_name):
        """Load the encoder and its configuration from `model_name`.

        Args:
            model_name: Name or path accepted by `AutoConfig` / `AutoModel`.
        """
        super(Model,self).__init__()
        self.config = AutoConfig.from_pretrained(model_name)
        # Pass the model configuration loaded above; the bare name `config`
        # would resolve to the notebook-level hyper-parameter dict.
        self.roberta = AutoModel.from_pretrained(model_name,config=self.config)
        # The pooled output is never used; only token-level hidden states are.
        self.roberta.pooler = nn.Identity()
        self.linear = nn.Linear(self.config.hidden_size,2)
        
    def loss_fn(self,start_logits,end_logits,start_positions,end_positions):
        """Weighted cross-entropy over start and end positions.

        Args:
            start_logits, end_logits: Logits indexed as (batch, position).
            start_positions, end_positions: Target token indices; a trailing
                singleton dimension is squeezed away.

        Returns:
            ``0.75 * start_loss + 0.25 * end_loss``.
        """
        if len(start_positions.size()) > 1:
            start_positions = start_positions.squeeze(-1)
        if len(end_positions.size()) > 1:
            end_positions = end_positions.squeeze(-1)

        # Targets beyond the sequence are clamped to `ignored_index`, which the
        # loss then skips.
        ignored_index = start_logits.size(1)
        start_positions = start_positions.clamp(0, ignored_index)
        end_positions = end_positions.clamp(0, ignored_index)
        loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index)
        start_loss = loss_fct(start_logits, start_positions)
        end_loss = loss_fct(end_logits, end_positions)
        total_loss = 0.75 * start_loss + 0.25 * end_loss 
        return total_loss
    
    def forward(self,**xb):
        """Compute span logits and, when labels are given, the training loss.

        Args:
            **xb: Must contain `input_ids` and `attention_mask`. May contain
                `start_positions` and `end_positions`; when either is missing
                or None no loss is computed.

        Returns:
            ``((start_logits, end_logits), loss)`` where `loss` is None without labels.
        """
        x = self.roberta(input_ids=xb['input_ids'],attention_mask=xb['attention_mask'])[0]
        x = self.linear(x)
        
        start_logits,end_logits = x.split(1,dim=-1)
        start_logits,end_logits = start_logits.squeeze(-1).contiguous(),end_logits.squeeze(-1).contiguous()
        # .get() keeps label-free inference batches from raising KeyError.
        start_positions,end_positions = xb.get('start_positions'),xb.get('end_positions')
        
        loss = None
        if start_positions is not None and end_positions is not None:
            loss = self.loss_fn(start_logits, end_logits, start_positions, end_positions)
            
        return (start_logits,end_logits),loss

In [None]:
class ChaiiDataset(torch.utils.data.Dataset):
    """Map-style PyTorch dataset that serves tokenized QA features as tensors.

    The base class is spelled out in full because the bare name `Dataset` is
    rebound by the later `from datasets import Dataset` import.
    """
    def __init__(self, data):
        """Wrap `data`, an indexable collection of feature rows.

        Each row must provide "input_ids" and "attention_mask"; the label
        columns "start_positions" / "end_positions" are optional.
        """
        self.data = data

    def __len__(self):
        """Number of features."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return feature `idx` as a dict of `torch.long` tensors."""
        # Fetch the row once instead of re-reading it for every field.
        row = self.data[idx]
        item = {"input_ids": torch.tensor(row["input_ids"], dtype=torch.long),
                "attention_mask": torch.tensor(row["attention_mask"], dtype=torch.long)}
        # Labels exist for training features only; skip them when absent.
        for key in ("start_positions", "end_positions"):
            if key in row:
                item[key] = torch.tensor(row[key], dtype=torch.long)
        return item

## Запуск обучения

In [None]:
def run(fold):
    """Fine-tune the QA model with `fold` held out and return its best validation loss.

    Reads the notebook-level `train_data` and `config`. Rows whose `Fold`
    differs from `fold` are used for training, the rest for validation. The
    weights with the lowest validation loss are written to
    ``./model{fold}/model{fold}.bin`` together with the tokenizer files.

    Args:
        fold: Index of the validation fold.

    Returns:
        The lowest validation loss observed across epochs.
    """
    # ANSI colour escapes used in the log lines below: green, magenta, reset.
    g_, m_, sr_ = '\033[32m', '\033[35m', '\033[0m'

    def evaluate(model,valid_loader):
        """Return the mean per-batch loss of `model` over `valid_loader`."""
        model.eval()
        valid_loss = 0
        with torch.no_grad():
            for inputs in tqdm(valid_loader):
                # Give every tensor an explicit (batch, -1) layout; labels become (batch, 1).
                inputs = {key:val.reshape(val.shape[0],-1) for key,val in inputs.items()}
                outputs = model(**inputs)
                valid_loss += outputs[1].item()

        return valid_loss / len(valid_loader)

    def train_and_evaluate_loop(train_loader,valid_loader,model,optimizer,
                                epoch,fold,best_loss,lr_scheduler=None):
        """Train for one epoch, validate, and checkpoint when the loss improves.

        Returns the updated best validation loss.
        """
        train_loss = 0
        model.train()
        for inputs in tqdm(train_loader):
            optimizer.zero_grad()
            inputs = {key:val.reshape(val.shape[0],-1) for key,val in inputs.items()}
            outputs = model(**inputs)
            loss = outputs[1]
            # Let Accelerate drive the backward pass so scaling / distributed
            # handling stays correct for whatever setup it was configured with.
            accelerator.backward(loss)
            optimizer.step()

            train_loss += loss.item()

            if lr_scheduler is not None:
                lr_scheduler.step()

        train_loss /= len(train_loader)
        valid_loss = evaluate(model,valid_loader)

        # Always report the epoch, not only when the loss improved.
        print(f"Epoch:{epoch} |Train Loss:{train_loss}|Valid Loss:{valid_loss}")
        if valid_loss <= best_loss:
            print(f"{g_}Loss Decreased from {best_loss} to {valid_loss}{sr_}")

            best_loss = valid_loss
            # Save the unwrapped module so the checkpoint keys match a plain Model.
            torch.save(accelerator.unwrap_model(model).state_dict(),f'./model{fold}/model{fold}.bin')
            tokenizer.save_pretrained(f'./model{fold}')

        return best_loss

    def build_loader(frame,shuffle,drop_last):
        """Tokenize `frame` into labelled features and wrap them in a DataLoader."""
        dataset = Dataset.from_pandas(frame)
        features = dataset.map(
                    partial(
                        prepare_train_features,
                        tokenizer=tokenizer,
                        pad_on_right=pad_on_right,
                        max_length=config['max_length'],
                        doc_stride=config['doc_stride']
                    ),
                    batched=True,
                    # Drop this dataset's own raw columns, keeping only model inputs.
                    remove_columns=dataset.column_names)
        return DataLoader(ChaiiDataset(features),
                          batch_size = config["batch_size"],
                          num_workers = config['num_workers'],
                          shuffle=shuffle,
                          pin_memory=True,
                          drop_last=drop_last)

    accelerator = Accelerator()
    print(f"{accelerator.device} is used")

    x_train,x_valid = train_data.query(f"Fold != {fold}"),train_data.query(f"Fold == {fold}")

    model = Model(config['model_path'])
    tokenizer = AutoTokenizer.from_pretrained(config['model_path'])
    pad_on_right = tokenizer.padding_side == 'right'

    train_dl = build_loader(x_train,shuffle=True,drop_last=True)
    valid_dl = build_loader(x_valid,shuffle=False,drop_last=False)

    optimizer = AdamW(model.parameters(),lr=config['lr'],weight_decay=config['wd'])
    # Cosine decay over every optimisation step, without warm-up.
    lr_scheduler = get_cosine_schedule_with_warmup(optimizer,
                                                   num_warmup_steps=0,
                                                   num_training_steps= config['epochs'] * len(train_dl))

    model,train_dl,valid_dl,optimizer,lr_scheduler = accelerator.prepare(model,train_dl,valid_dl,optimizer,lr_scheduler)

    print(f"Fold: {fold}")
    best_loss = 9999
    start_time = time.time()
    for epoch in range(config["epochs"]):
        print(f"Epoch Started:{epoch}")
        best_loss = train_and_evaluate_loop(train_dl,valid_dl,model,optimizer,epoch,fold,best_loss,lr_scheduler)

        end_time = time.time()
        print(f"{m_}Time taken by epoch {epoch} is {end_time-start_time:.2f}s{sr_}")
        start_time = end_time

    return best_loss

In [None]:
# Train one model per fold, in order; collects each fold's best validation loss.
best_loss_per_fold = [run(f) for f in range(config['nfolds'])]

cuda is used


Some weights of the model checkpoint at ../input/xlm-roberta-squad2/deepset/xlm-roberta-large-squad2 were not used when initializing XLMRobertaModel: ['qa_outputs.bias', 'qa_outputs.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaModel were not initialized from the model checkpoint at ../input/xlm-roberta-squad2/deepset/xlm-roberta-large-squad2 and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))


Fold: 0
Epoch Started:0


HBox(children=(FloatProgress(value=0.0, max=4587.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1180.0), HTML(value='')))


Epoch:0 |Train Loss:0.6577909169391483|Valid Loss:0.5328498437515656
[32mLoss Decreased from 9999 to 0.5328498437515656[0m
[35mTime taken by epoch 0 is 3067.15s[0m
cuda is used


Some weights of the model checkpoint at ../input/xlm-roberta-squad2/deepset/xlm-roberta-large-squad2 were not used when initializing XLMRobertaModel: ['qa_outputs.bias', 'qa_outputs.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaModel were not initialized from the model checkpoint at ../input/xlm-roberta-squad2/deepset/xlm-roberta-large-squad2 and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))


Fold: 1
Epoch Started:0


HBox(children=(FloatProgress(value=0.0, max=4574.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1194.0), HTML(value='')))


Epoch:0 |Train Loss:0.6679888667178521|Valid Loss:0.5453699062046223
[32mLoss Decreased from 9999 to 0.5453699062046223[0m
[35mTime taken by epoch 0 is 3061.06s[0m
cuda is used


Some weights of the model checkpoint at ../input/xlm-roberta-squad2/deepset/xlm-roberta-large-squad2 were not used when initializing XLMRobertaModel: ['qa_outputs.bias', 'qa_outputs.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaModel were not initialized from the model checkpoint at ../input/xlm-roberta-squad2/deepset/xlm-roberta-large-squad2 and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))


Fold: 2
Epoch Started:0


HBox(children=(FloatProgress(value=0.0, max=4647.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1120.0), HTML(value='')))


Epoch:0 |Train Loss:0.6552185215104067|Valid Loss:0.5701640377656596
[32mLoss Decreased from 9999 to 0.5701640377656596[0m
[35mTime taken by epoch 0 is 3096.70s[0m
cuda is used


Some weights of the model checkpoint at ../input/xlm-roberta-squad2/deepset/xlm-roberta-large-squad2 were not used when initializing XLMRobertaModel: ['qa_outputs.bias', 'qa_outputs.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaModel were not initialized from the model checkpoint at ../input/xlm-roberta-squad2/deepset/xlm-roberta-large-squad2 and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))


Fold: 3
Epoch Started:0


HBox(children=(FloatProgress(value=0.0, max=4608.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1160.0), HTML(value='')))


Epoch:0 |Train Loss:0.6655369259998325|Valid Loss:0.5339459364226435
[32mLoss Decreased from 9999 to 0.5339459364226435[0m
[35mTime taken by epoch 0 is 3077.57s[0m
cuda is used


Some weights of the model checkpoint at ../input/xlm-roberta-squad2/deepset/xlm-roberta-large-squad2 were not used when initializing XLMRobertaModel: ['qa_outputs.bias', 'qa_outputs.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaModel were not initialized from the model checkpoint at ../input/xlm-roberta-squad2/deepset/xlm-roberta-large-squad2 and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))


Fold: 4
Epoch Started:0


HBox(children=(FloatProgress(value=0.0, max=4651.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1116.0), HTML(value='')))


Epoch:0 |Train Loss:0.650711741827575|Valid Loss:0.5566816597807526
[32mLoss Decreased from 9999 to 0.5566816597807526[0m
[35mTime taken by epoch 0 is 3097.21s[0m


In [None]:
# Per-fold best validation losses, followed by their mean (the cross-validation loss).
print(best_loss_per_fold)
print(np.mean(best_loss_per_fold))

[0.5328498437515656, 0.5453699062046223, 0.5701640377656596, 0.5339459364226435, 0.5566816597807526]
0.5478022767850488


## Получение предсказаний

In [None]:
def get_prediction(df,model_paths,device='cuda'):
    """Predict answers for `df` by averaging span logits over all fold checkpoints.

    NOTE(review): `prepare_validation_features` and `postprocess_qa_predictions`
    are not defined in any visible cell of this notebook; they must be in scope
    before this function is called.

    Args:
        df: DataFrame of examples to predict on.
        model_paths: Iterable of ``(checkpoint_pattern, model_name)`` pairs.
            `checkpoint_pattern` is formatted with the fold index to locate each
            fold's weights; `model_name` selects the architecture and tokenizer.
        device: Device used for inference.

    Returns:
        Whatever `postprocess_qa_predictions` returns for the averaged logits.

    Raises:
        ValueError: If `model_paths` is empty.
    """
    model_paths = list(model_paths)
    if not model_paths:
        raise ValueError("model_paths must contain at least one (checkpoint_pattern, model_name) pair")

    start_logits = list()
    end_logits = list()

    # NOTE(review): logits of all entries are averaged position by position,
    # which is only meaningful if every entry tokenizes the data identically —
    # confirm before mixing different tokenizers.
    for path,model_name in model_paths:
        model = Model(model_name)
        model.to(device)
        model.eval()
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        pad_on_right = tokenizer.padding_side == 'right'

        # Tokenization does not depend on the fold, so do it once per model.
        test_dataset = Dataset.from_pandas(df)
        test_features = test_dataset.map(
                        partial(
                            prepare_validation_features, 
                            tokenizer=tokenizer,
                            pad_on_right=pad_on_right, 
                            max_length=config['max_length'],
                            doc_stride=config['doc_stride']
                        ),
                        batched=True,
                        remove_columns=test_dataset.column_names)

        # Stack the model inputs directly: inference features are not expected
        # to carry label columns. Assumes every feature has the same length,
        # which default DataLoader batching requires as well — TODO confirm
        # against prepare_validation_features.
        input_ids = torch.tensor(list(test_features['input_ids']), dtype=torch.long)
        attention_mask = torch.tensor(list(test_features['attention_mask']), dtype=torch.long)
        test_dl = DataLoader(torch.utils.data.TensorDataset(input_ids, attention_mask),
                            batch_size = config["batch_size"],
                            num_workers = config['num_workers'],
                            shuffle=False,
                            pin_memory=True,
                            drop_last=False)

        for f in range(config['nfolds']):
            model.load_state_dict(torch.load(path.format(f),map_location=device))
            model.eval()

            start_logit = list()
            end_logit = list()
            with torch.no_grad():
                for batch_ids, batch_mask in test_dl:
                    # Model.forward returns ((start_logits, end_logits), loss);
                    # explicit None labels request logits only.
                    (start, end), _ = model(input_ids=batch_ids.to(device),
                                            attention_mask=batch_mask.to(device),
                                            start_positions=None,
                                            end_positions=None)
                    start_logit.extend(start.detach().cpu().numpy().tolist())
                    end_logit.extend(end.detach().cpu().numpy().tolist())

            start_logits.append(start_logit)
            end_logits.append(end_logit)

    torch.cuda.empty_cache()
    # Average over every (model, fold) checkpoint.
    start_logits, end_logits = np.mean(start_logits,axis=0), np.mean(end_logits,axis=0)

    fin_preds = postprocess_qa_predictions(test_dataset, tokenizer, test_features, (start_logits, end_logits))
    return fin_preds

In [None]:
# (checkpoint path pattern, base model path) pairs; "{0}" in the pattern is
# replaced by the fold index inside get_prediction.
model_paths = [
    ('../input/chaii-pytorch-xlmroberta-large/model{0}/model{0}.bin','../input/xlm-roberta-squad2/deepset/xlm-roberta-large-squad2'),
]

In [None]:
# Run the fold ensemble over the test set.
predictions = get_prediction(test_data,model_paths)

Some weights of the model checkpoint at ../input/xlm-roberta-squad2/deepset/xlm-roberta-large-squad2 were not used when initializing XLMRobertaModel: ['qa_outputs.weight', 'qa_outputs.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaModel were not initialized from the model checkpoint at ../input/xlm-roberta-squad2/deepset/xlm-roberta-large-squad2 and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=67.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=67.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=67.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=67.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=67.0), HTML(value='')))


Post-processing 5 example predictions split into 67 features.


In [None]:
# Look up each test id in `predictions` (presumably keyed by example id — verify
# against postprocess_qa_predictions).
test_data['PredictionString'] = test_data['id'].map(predictions)

In [None]:
# Punctuation stripped from the beginning / end of a predicted answer.
bad_starts = [".", ",", "(", ")", "-", "–",  ",", ";"]
bad_endings = ["...", "-", "(", ")", "–", ",", ";"]

# Abbreviations that get their trailing "." restored when the context
# contains the dotted form.
tamil_ad = "கி.பி"
tamil_bc = "கி.மு"
tamil_km = "கி.மீ"
hindi_ad = "ई"
hindi_bc = "ई.पू"


def clean_prediction(pred, context):
    """Strip stray punctuation from one predicted answer.

    Args:
        pred: Predicted answer text.
        context: Context passage the answer was extracted from.

    Returns:
        `pred` with leading `bad_starts` and trailing `bad_endings` removed,
        and a "." appended when the result ends with one of the abbreviations
        above and that dotted form occurs in `context`. An empty prediction is
        returned unchanged.
    """
    if pred == "":
        return pred

    # Every entry in bad_starts is a single character, so drop one at a time.
    while pred.startswith(tuple(bad_starts)):
        pred = pred[1:]

    while pred.endswith(tuple(bad_endings)):
        # "..." is the only multi-character ending.
        pred = pred[:-3] if pred.endswith("...") else pred[:-1]

    abbreviations = (tamil_ad, tamil_bc, tamil_km, hindi_ad, hindi_bc)
    if pred.endswith(abbreviations) and pred+"." in context:
        pred = pred+"."

    return pred


cleaned_preds = [
    clean_prediction(pred, context)
    for pred, context in test_data[["PredictionString", "context"]].to_numpy()
]

test_data["PredictionString"] = cleaned_preds

In [None]:
# Write the submission file with one cleaned prediction per test id.
test_data[['id', 'PredictionString']].to_csv('submission.csv', index=False)