In [1]:
!pip install -U --no-build-isolation --no-deps ../input/transformers-master/ -qq

In [2]:
import collections
import gc
import itertools
import pathlib

import numpy as np
import pandas as pd
import tensorflow as tf
import transformers

from chaii_config import *
from chaii_utils import *
from chaii_models import *

In [3]:
# Load the test split. INPUT_DIR is provided by one of the star-imported chaii_* modules.
test_df = pd.read_csv(INPUT_DIR / "chaii-hindi-and-tamil-question-answering/test.csv")
# Preview the first rows only instead of rendering the whole frame.
test_df.head()

Unnamed: 0,id,context,question,language
0,22bff3dec,"ज्वाला गुट्टा (जन्म: 7 सितंबर 1983; वर्धा, महा...",ज्वाला गुट्टा की माँ का नाम क्या है,hindi
1,282758170,गूगल मानचित्र (Google Maps) (पूर्व में गूगल लो...,गूगल मैप्स कब लॉन्च किया गया था?,hindi
2,d60987e0e,गुस्ताव रॉबर्ट किरचॉफ़ (१२ मार्च १८२४ - १७ अक्...,गुस्ताव किरचॉफ का जन्म कब हुआ था?,hindi
3,f99c770dc,அலுமினியம் (ஆங்கிலம்: அலுமினியம்; வட அமெரிக்க ...,அலுமினியத்தின் அணு எண் என்ன?,tamil
4,40dec1964,"கூட்டுறவு இயக்க வரலாறு, இங்கிலாந்து நாட்டில் ...",இந்தியாவில் பசுமை புரட்சியின் தந்தை என்று கருத...,tamil


In [4]:
# Per-model outputs collected by this cell:
#   raw_predictions[model_name][fold]  -> whatever model.predict(X_test) returns for that fold
#   tokenized_features[model_name]     -> features produced by prepare_validation_features
raw_predictions = {}
tokenized_features = {}
for model_config in [XLM_ROBERTA_CONFIG, MURIL_CONFIG]:#, REMBERT_CONFIG]:
    model_name = model_config["model_name"]
    model_type = model_config["model_type"]
    print(f"== Making predictions for model {model_name} ==")

    # Load the tokenizer class matching the model family.
    tokenizer_path = f"/kaggle/usr/lib/chaii_models/{model_name}-tokenizer"
    if model_type == "bert":
        tokenizer = transformers.BertTokenizerFast.from_pretrained(tokenizer_path)
    elif model_type == "xlm_roberta":
        tokenizer = transformers.XLMRobertaTokenizerFast.from_pretrained(tokenizer_path)
    elif model_type == "rembert":
        tokenizer = transformers.RemBertTokenizerFast.from_pretrained(tokenizer_path)
    else:
        # Fail with a clear message instead of a NameError further down.
        raise ValueError(f"Unsupported model_type: {model_type!r}")

    # Create features (the helpers come from the star-imported chaii_* modules).
    test_features = prepare_validation_features(test_df, tokenizer, model_config)
    X_test = create_model_input(test_features, is_train=False)
    tokenized_features[model_name] = test_features
    # The tokenizer is not needed once the features exist; release it before loading models.
    del tokenizer
    gc.collect()

    # Make predictions, one saved model per fold.
    raw_predictions_model = {}
    for fold in range(N_FOLDS):
        print(f"Fold {fold+1}/{N_FOLDS}")
        if model_name == "xlm-roberta-large":
            model_path = INPUT_DIR / f"xlm-roberta-large-squad2-f{fold}/xlm-roberta-f{fold}"
            model = tf.keras.models.load_model(model_path)
        elif model_name == "muril-large-cased":
            model_path = INPUT_DIR / f"{model_name}-f{fold}/{model_name}-f{fold}"
            model = tf.keras.models.load_model(model_path)
        elif model_name == "rembert":
            # The architecture is loaded once from a shared path, then per-fold weights are applied.
            model_path = f"/kaggle/usr/lib/chaii_models/{model_name}-model"
            weights_path = INPUT_DIR / f"{model_name}-weights-f{fold}/{model_name}-weights-f{fold}"
            # NOTE(review): 'TFRemBertMainLayer' is mapped to TFRemBertForQuestionAnswering here;
            # confirm this is the intended custom object before re-enabling REMBERT_CONFIG.
            model = tf.keras.models.load_model(model_path, custom_objects={'TFRemBertMainLayer': transformers.TFRemBertForQuestionAnswering})
            model.load_weights(weights_path)
        else:
            # Fail with a clear message instead of a NameError on `model.predict`.
            raise ValueError(f"No saved-model location known for model_name: {model_name!r}")
        raw_predictions_model[fold] = model.predict(X_test)
        # Free the model before loading the next fold.
        del model
        gc.collect()
    raw_predictions[model_name] = raw_predictions_model

== Making predictions for model xlm-roberta-large ==
Fold 1/5
Fold 2/5
Fold 3/5
Fold 4/5
Fold 5/5
== Making predictions for model muril-large-cased ==
Fold 1/5
Fold 2/5
Fold 3/5
Fold 4/5
Fold 5/5


In [5]:
# Average the start/end outputs of all folds for each model.
raw_preds = {}

for model_name, fold_preds in raw_predictions.items():
    # Copy fold 0 before accumulating: the in-place `+=` / `/=` below would otherwise
    # overwrite the fold-0 arrays still referenced from raw_predictions.
    start_logits = fold_preds[0][0].copy()
    end_logits = fold_preds[0][1].copy()
    for f in range(1, N_FOLDS):
        start_logits += fold_preds[f][0]
        end_logits += fold_preds[f][1]
    start_logits /= N_FOLDS
    end_logits /= N_FOLDS
    raw_preds[model_name] = [start_logits, end_logits]

# The per-fold outputs are no longer needed once averaged.
del raw_predictions, raw_predictions_model
gc.collect()

20

In [6]:
# Debug inspection: display the fold-averaged [start_logits, end_logits] arrays of each model.
raw_preds

{'xlm-roberta-large': [array([[ 2.73534  , -7.989061 , -7.4293213, ..., -6.5310655, -7.915436 ,
          -9.432802 ],
         [ 5.638371 , -7.211671 , -7.719264 , ..., -5.976297 , -8.313581 ,
          -9.472136 ],
         [ 5.667095 , -7.2166014, -8.010153 , ..., -9.786278 , -9.786278 ,
          -9.786278 ],
         ...,
         [ 6.1071744, -7.7549467, -7.116937 , ..., -6.6521273, -9.648048 ,
          -9.053851 ],
         [ 6.285781 , -7.6651063, -6.63576  , ..., -4.6240473, -9.275125 ,
          -9.361959 ],
         [ 5.8782253, -8.234912 , -7.5576897, ..., -9.451858 , -9.451858 ,
          -9.451858 ]], dtype=float32),
  array([[ 3.9319954, -6.267231 , -8.383825 , ..., -4.3164663, -5.9400883,
          -7.5676546],
         [ 7.035922 , -5.720683 , -8.732808 , ..., -6.381985 , -6.20816  ,
          -7.9365225],
         [ 7.101883 , -5.776944 , -8.952247 , ..., -8.749241 , -8.749241 ,
          -8.749241 ],
         ...,
         [ 7.488763 , -7.49389  , -8.425776 , ..., -

In [7]:
def postprocess_predictions(examples_df, tokenized_features, raw_predictions, tokenizer, verbose=True,
                            n_best_size=None, max_answer_length=None):
    """Turn per-feature start/end scores into candidate answer strings per example.

    Parameters
    ----------
    examples_df : pandas.DataFrame
        Must contain the columns ``"id"`` and ``"context"``.
    tokenized_features : mapping
        Must provide ``"example_id"`` (one id per feature) and ``"offset_mapping"``
        (per feature, one entry per token: ``None`` or a ``(start_char, end_char)`` pair).
    raw_predictions : pair
        ``(all_start_scores, all_end_scores)``, each indexable by feature index.
    tokenizer
        Unused; kept so existing callers keep working.
    verbose : bool
        When true, print a summary line and wrap the example loop in ``tqdm``.
    n_best_size : int, optional
        Number of top start and top end positions combined per feature.
        Defaults to the module-level ``N_BEST_SIZE``.
    max_answer_length : int, optional
        Maximum answer span length in tokens. Defaults to the module-level
        ``MAX_ANSWER_LENGTH``.

    Returns
    -------
    collections.OrderedDict
        Maps each example id to a list of ``{"score", "text"}`` dicts. The list is
        unsorted; an example with no valid span gets ``[{"score": 0.0, "text": ""}]``.
    """
    if n_best_size is None:
        n_best_size = N_BEST_SIZE
    if max_answer_length is None:
        max_answer_length = MAX_ANSWER_LENGTH

    examples = examples_df.copy().reset_index(drop=True)
    all_start_probs, all_end_probs = raw_predictions

    # Map each example (by positional index) to the indices of its features.
    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
    features_per_example = collections.defaultdict(list)
    for i, example_id in enumerate(tokenized_features["example_id"]):
        features_per_example[example_id_to_index[example_id]].append(i)
    predictions = collections.OrderedDict()

    example_rows = examples.iterrows()
    if verbose:
        print(f"Post-processing {examples.shape[0]} example predictions split into {len(tokenized_features['example_id'])} features.")
        example_rows = tqdm(example_rows, total=examples.shape[0])

    for example_index, example in example_rows:
        feature_indices = features_per_example[example_index]
        valid_answers = []
        context = example["context"]

        # Loop over all features associated with the example.
        for feature_index in feature_indices:
            start_probs = all_start_probs[feature_index]
            end_probs = all_end_probs[feature_index]
            offset_mapping = tokenized_features["offset_mapping"][feature_index]

            # Indices of the n_best_size highest start / end scores, best first.
            start_indices = np.argsort(start_probs)[-1: -n_best_size-1: -1].tolist()
            end_indices = np.argsort(end_probs)[-1: -n_best_size-1: -1].tolist()
            for start_index in start_indices:
                for end_index in end_indices:
                    # Skip indices that are out of bounds or whose offsets are None.
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or offset_mapping[end_index] is None
                    ):
                        continue
                    # Skip spans with negative length or longer than max_answer_length tokens.
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    valid_answers.append(
                        {
                            "score": start_probs[start_index] + end_probs[end_index],
                            "text": context[start_char: end_char].strip(),
                        }
                    )

        # Always return at least one (empty) candidate per example.
        if len(valid_answers) == 0:
            valid_answers.append({"score": 0.0, "text": ""})

        predictions[example["id"]] = valid_answers

    return predictions

In [8]:
def clean_predictions(model_predictions, test_data, n=20):
    """Keep the top-``n`` candidates per example and trim stray punctuation from them.

    Parameters
    ----------
    model_predictions : mapping
        Example id -> list of ``{"text", "score"}`` dicts.
    test_data : pandas.DataFrame
        Must contain the columns ``"id"`` and ``"context"``.
    n : int
        Number of highest-scoring candidates kept per example.

    Returns
    -------
    collections.OrderedDict
        Example id -> list of ``{"text", "score"}`` dicts, best score first.
        Scores are unchanged. Empty-text candidates are passed through as-is.

    Raises
    ------
    IndexError
        If an example id from ``model_predictions`` is not present in ``test_data``.
    """
    # str.startswith / str.endswith accept tuples of alternatives.
    bad_starts = (".", ",", "(", ")", "-", "–", ";")
    bad_endings = ("...", "-", "(", ")", "–", ",", ";")

    # Suffixes after which a trailing period is restored when the context contains it.
    tamil_ad = "கி.பி"
    tamil_bc = "கி.மு"
    tamil_km = "கி.மீ"
    hindi_ad = "ई"
    hindi_bc = "ई.पू"
    dotted_suffixes = (tamil_ad, tamil_bc, tamil_km, hindi_ad, hindi_bc)

    cleaned_predictions = collections.OrderedDict()

    for example_id in model_predictions:
        # Keep only the top n predictions.
        top_preds = sorted(model_predictions[example_id], key=lambda x: x["score"], reverse=True)[:n]
        # Select the context *string* of this example. Taking the whole row (an array of
        # all columns) would turn the `in` test below into an element-equality check
        # instead of a substring search.
        context = test_data.loc[test_data["id"] == example_id, "context"].values[0]
        cleaned_preds = []
        for prediction in top_preds:
            pred = prediction["text"]
            if pred == "":
                cleaned_preds.append(prediction)
                continue
            # Every entry of bad_starts is a single character.
            while pred.startswith(bad_starts):
                pred = pred[1:]
            # "..." is the only multi-character ending.
            while pred.endswith(bad_endings):
                pred = pred[:-3] if pred.endswith("...") else pred[:-1]

            if pred.endswith(dotted_suffixes) and pred + "." in context:
                pred = pred + "."

            cleaned_preds.append({"text": pred, "score": prediction["score"]})

        cleaned_predictions[example_id] = cleaned_preds

    return cleaned_predictions

In [9]:
# Number of highest-scoring candidate answers kept per example and per model.
n = 20
predictions = {}

# For every model: decode the averaged scores into candidate spans, then clean the spans.
for model_name in raw_preds:
    preds = postprocess_predictions(
        examples_df=test_df,
        tokenized_features=tokenized_features[model_name],
        raw_predictions=raw_preds[model_name],
        tokenizer=None,
        verbose=True,
    )
    predictions[model_name] = clean_predictions(
        model_predictions=preds,
        test_data=test_df,
        n=n,
    )

# Scores and tokenized features are not used past this point.
del raw_preds, tokenized_features
gc.collect()

Post-processing 5 example predictions split into 67 features.


  0%|          | 0/5 [00:00<?, ?it/s]

Post-processing 5 example predictions split into 52 features.


  0%|          | 0/5 [00:00<?, ?it/s]

0

In [10]:
# Rank-sum voting across models.
# Within each model, the candidates of an example are ranked by score: the lowest of the
# kept candidates gets 1 point, the best gets the most. When the same text appears more
# than once, its highest rank wins because later dict entries overwrite earlier ones.
best_preds = {}
for model_name, model_preds in predictions.items():
    best_preds[model_name] = {}
    for id_, preds in model_preds.items():
        ranked = sorted(preds, key=lambda x: x["score"])[-n:]
        best_preds[model_name][id_] = {pred["text"]: rank for rank, pred in enumerate(ranked, start=1)}
del predictions
gc.collect()

ensembled_preds = {}
for id_ in test_df["id"]:
    pred_votes = collections.defaultdict(int)
    # Accumulate points model by model, visiting each model's candidates best-first.
    # Insertion order is therefore deterministic, and `max` (which returns the first
    # maximum) breaks ties in favour of the earliest model's higher-ranked candidate
    # rather than depending on set iteration order.
    for model_name in best_preds:
        model_ranks = best_preds[model_name][id_]
        for text, rank in sorted(model_ranks.items(), key=lambda kv: kv[1], reverse=True):
            pred_votes[text] += rank
    # Fall back to an empty answer if no model produced any candidate.
    ensembled_preds[id_] = max(pred_votes.items(), key=lambda kv: kv[1], default=("", 0))[0]

del best_preds
gc.collect()

ensembled_preds

{'22bff3dec': 'येलन',
 '282758170': '28 नवम्बर 2007',
 'd60987e0e': '१२ मार्च १८२४',
 'f99c770dc': '13',
 '40dec1964': 'சுவாமிநாதன்'}

In [11]:
# Build the submission table: the test ids plus the ensembled answer for each id.
submission = test_df[["id"]].copy()
# Direct dict lookup, so an id missing from ensembled_preds raises KeyError.
submission["PredictionString"] = submission["id"].map(ensembled_preds.__getitem__)
submission

Unnamed: 0,id,PredictionString
0,22bff3dec,येलन
1,282758170,28 नवम्बर 2007
2,d60987e0e,१२ मार्च १८२४
3,f99c770dc,13
4,40dec1964,சுவாமிநாதன்


In [12]:
# Write the id / PredictionString table to submission.csv, omitting the DataFrame index.
submission.to_csv("submission.csv", index=False)