In [1]:
# Importing libraries
import json
import pandas as pd
import os
import numpy as np
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
import os
from configuration import Configuration
from configuration import CONSTANTS as C
# Importing the T5 modules from huggingface/transformers
from transformers import T5Tokenizer, T5ForConditionalGeneration
from nltk.translate.bleu_score import sentence_bleu,SmoothingFunction
from rich.table import Column, Table
from rich import box
from rich.console import Console
from tensorboardX import SummaryWriter
import time
from torch import cuda
import glob

In [2]:


class YourDataSetClass(Dataset):
    """
    Creating a custom dataset for reading the dataset and 
    loading it into the dataloader to pass it to the neural network for finetuning the model
    """    
    def __init__(self, dataframe, tokenizer, source_len, target_len,answer_len, source_text, target_text,answer_text):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = target_len
        self.ans_len = answer_len
        self.target_text = self.data[target_text]
        self.source_text = self.data[source_text]
        self.answer_text = self.data[answer_text]

    def __len__(self):
        return len(self.target_text)

    def __getitem__(self, index):
        source_text = str(self.source_text[index])
        target_text = str(self.target_text[index])
        answer_text = str(self.answer_text[index])
        #cleaning data so as to ensure data is in string type
        source_text = ' '.join(source_text.split())
        target_text = ' '.join(target_text.split())
        answer_text = ' '.join(answer_text.split())
        source = self.tokenizer.batch_encode_plus([source_text], max_length= self.source_len, pad_to_max_length=True, truncation=True, padding="max_length", return_tensors='pt')
        target = self.tokenizer.batch_encode_plus([target_text], max_length= self.summ_len, pad_to_max_length=True, truncation=True, padding="max_length", return_tensors='pt')
        answer = self.tokenizer.batch_encode_plus([answer_text], max_length= self.ans_len, pad_to_max_length=True, truncation=True, padding="max_length", return_tensors='pt')
        source_ids = source['input_ids'].squeeze()
        source_mask = source['attention_mask'].squeeze()
        target_ids = target['input_ids'].squeeze()
        target_mask = target['attention_mask'].squeeze()
        answer_ids = answer['input_ids'].squeeze()
        answer_mask = answer['attention_mask'].squeeze()
        return {
            'source_ids': source_ids.to(dtype=torch.long), 
            'source_mask': source_mask.to(dtype=torch.long), 
            'target_ids': target_ids.to(dtype=torch.long),
            'target_ids_y': target_ids.to(dtype=torch.long),
            'answer_ids': answer_ids.to(dtype=torch.long),
            'answer_mask': answer_mask.to(dtype=torch.long)
        }


def create_model_dir(experiment_main_dir, experiment_id, model_summary):
    """
    Create a new model directory.
    :param experiment_main_dir: Where all experiments are stored.
    :param experiment_id: The ID of this experiment.
    :param model_summary: A summary string of the model.
    :return: A directory where we can store model logs. Raises an exception if the model directory already exists.
    """
    model_name = "{}-{}".format(experiment_id, model_summary)
    model_dir = os.path.join(experiment_main_dir, model_name)
    if os.path.exists(model_dir):
        raise ValueError("Model directory already exists {}".format(model_dir))
    os.makedirs(model_dir)
    return model_dir

def train(epoch, tokenizer, model, device, loader, optimizer,writer,global_step,records,model_dir):

    """
    Function to be called for training with the parameters passed from main function

    """
    model.train()
    c=0
    for _,data in enumerate(loader, 0):
        print("mem",torch.cuda.memory_allocated(device=C.DEVICE))
        c=c+1
        y = data['target_ids'].to(device, dtype = torch.long)
        y_ids = y[:, :-1].contiguous()
        lm_labels = y[:, 1:].clone().detach()
        lm_labels[y[:, 1:] == tokenizer.pad_token_id] = -100
        ids = data['source_ids'].to(device, dtype = torch.long)
        mask = data['source_mask'].to(device, dtype = torch.long)
        
        ans_str = data['answer_ids'].to(device, dtype = torch.long)
        ans_mask = data['answer_mask'].to(device, dtype = torch.long)
        
        outputs = model(input_ids = ids, attention_mask = mask, decoder_input_ids=y_ids,
                        labels=lm_labels,answer_str=ans_str,answer_mask=ans_mask,tokenizer=tokenizer,c=c)
        loss = outputs[0]
        
        #print("preds",outputs["pred_ids"])
        #preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in outputs["pred_ids"]]
        #print(preds)
        #print("ans",outputs["ans_ids"])
        #an = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in outputs["ans_ids"]]
        #print(an)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        writer.add_scalar("loss", loss, global_step)
        
        
        ### measure bleu
        if c%10==0:
            model.eval()
            predictions = []
            actuals = []
            num_dist=[]
            ##outputs = model.generate(input_ids=input_ids, num_beams=5, num_return_sequences=3)
            generated_ids = model.generate(
              input_ids = ids,
              attention_mask = mask, 
              max_length=150, 
              num_beams=3,
              repetition_penalty=2.5, 
              length_penalty=1.0, 
              early_stopping=True,
            num_return_sequences=3,
              )
            print(generated_ids.shape)
            print(generated_ids)
            preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in generated_ids]
            target = [tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=False)for t in y]
            print(preds)
            print(target)
            predictions.extend(preds)
            for tt in target:
                print(tt)
                actuals.extend([tt,tt,tt])
                num_dist.extend([1,2,3])
                print("actualslen",len(actuals))
                print(actuals)
            print(len(actuals))
            print(len(predictions))
            print(num_dist)
            temp_df = pd.DataFrame({'Generated Text':predictions,'Actual Text':actuals,'Num distractor':num_dist})
            print(temp_df.head())
            val=records.rename(columns={'distractor':'Actual Text'})

            gen_dist=val.merge(temp_df,on=['Actual Text']).loc[:,['text','Generated Text','Num distractor']]

            distractors=val.groupby(['text']).agg({ 'Actual Text': lambda x: list(x.str.split())}).reset_index()

            dist_compare=distractors.merge(gen_dist,on=['text'])
            dist_compare['Generated Text']=dist_compare['Generated Text'].str.split()
            dist_compare=dist_compare.assign(bleu1=dist_compare.apply(lambda x:sentence_bleu(x['Actual Text'],x['Generated Text'],weights=(1, 0, 0, 0),smoothing_function=SmoothingFunction().method1),axis=1))
            dist_compare=dist_compare.assign(bleu2=dist_compare.apply(lambda x:sentence_bleu(x['Actual Text'],x['Generated Text'],weights=(0, 1, 0, 0),smoothing_function=SmoothingFunction().method1),axis=1))
            dist_compare=dist_compare.assign(bleu3=dist_compare.apply(lambda x:sentence_bleu(x['Actual Text'],x['Generated Text'],weights=(0, 0, 1, 0),smoothing_function=SmoothingFunction().method1),axis=1))
            dist_compare=dist_compare.assign(bleu4=dist_compare.apply(lambda x:sentence_bleu(x['Actual Text'],x['Generated Text'],weights=(0, 0, 0, 1),smoothing_function=SmoothingFunction().method1),axis=1))
            
            for i in range(1,4):
                bleu_1=dist_compare.loc[dist_compare['Num distractor']==i].bleu1.mean()
                bleu_2=dist_compare.loc[dist_compare['Num distractor']==i].bleu2.mean()
                bleu_3=dist_compare.loc[dist_compare['Num distractor']==i].bleu3.mean()
                bleu_4=dist_compare.loc[dist_compare['Num distractor']==i].bleu4.mean()
                writer.add_scalar('bleu/distractor_{}/bleu_1'.format(i), bleu_1, global_step)
                writer.add_scalar('bleu/distractor_{}/bleu_2'.format(i), bleu_2, global_step)
                writer.add_scalar('bleu/distractor_{}/bleu_3'.format(i), bleu_3, global_step)
                writer.add_scalar('bleu/distractor_{}/bleu_4'.format(i), bleu_4, global_step)
            
            
            bleu_1=dist_compare.bleu1.mean()
            bleu_2=dist_compare.bleu2.mean()
            bleu_3=dist_compare.bleu3.mean()
            bleu_4=dist_compare.bleu4.mean()
            writer.add_scalar("bleu/distractor_gen/bleu_1", bleu_1, global_step)
            writer.add_scalar("bleu/distractor_gen/bleu_2", bleu_2, global_step)
            writer.add_scalar("bleu/distractor_gen/bleu_3", bleu_3, global_step)
            writer.add_scalar("bleu/distractor_gen/bleu_4", bleu_4, global_step)
            
            if c%1000==0:
                path = os.path.join(model_dir, "model_files")
                model.save_pretrained(path)
                tokenizer.save_pretrained(path)

            model.train()

        
        
        global_step += 1
    return global_step


def validate(epoch, tokenizer, model, device, loader,writer):

    """
    Function to evaluate model for predictions

    """
    global_step = 0
    model.eval()
    predictions = []
    actuals = []
    num_dist=[]
    with torch.no_grad():
        for _, data in enumerate(loader, 0):
            y = data['target_ids'].to(device, dtype = torch.long)
            ids = data['source_ids'].to(device, dtype = torch.long)
            mask = data['source_mask'].to(device, dtype = torch.long)
            generated_ids = model.generate(
              input_ids = ids,
              attention_mask = mask, 
              max_length=150, 
              num_beams=3,
              repetition_penalty=2.5, 
              length_penalty=1.0, 
              early_stopping=True,
                num_return_sequences=3,
              )
            preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in generated_ids]
            target = [tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=False)for t in y]
            predictions.extend(preds)
            for tt in target:
                actuals.extend([tt,tt,tt])
                num_dist.extend([1,2,3])

        temp_df = pd.DataFrame({'Generated Text':predictions,'Actual Text':actuals,'Num distractor':num_dist})
        val=records.rename(columns={'distractor':'Actual Text'})

        gen_dist=val.merge(temp_df,on=['Actual Text']).loc[:,['text','Generated Text','Num distractor']]

        distractors=val.groupby(['text']).agg({ 'Actual Text': lambda x: list(x.str.split())}).reset_index()

        dist_compare=distractors.merge(gen_dist,on=['text'])
        dist_compare['Generated Text']=dist_compare['Generated Text'].str.split()
        dist_compare=dist_compare.assign(bleu1=dist_compare.apply(lambda x:sentence_bleu(x['Actual Text'],x['Generated Text'],weights=(1, 0, 0, 0),smoothing_function=SmoothingFunction().method1),axis=1))
        dist_compare=dist_compare.assign(bleu2=dist_compare.apply(lambda x:sentence_bleu(x['Actual Text'],x['Generated Text'],weights=(0, 1, 0, 0),smoothing_function=SmoothingFunction().method1),axis=1))
        dist_compare=dist_compare.assign(bleu3=dist_compare.apply(lambda x:sentence_bleu(x['Actual Text'],x['Generated Text'],weights=(0, 0, 1, 0),smoothing_function=SmoothingFunction().method1),axis=1))
        dist_compare=dist_compare.assign(bleu4=dist_compare.apply(lambda x:sentence_bleu(x['Actual Text'],x['Generated Text'],weights=(0, 0, 0, 1),smoothing_function=SmoothingFunction().method1),axis=1))

        for i in range(1,4):
            bleu_1=dist_compare.loc[dist_compare['Num distractor']==i].bleu1.mean()
            bleu_2=dist_compare.loc[dist_compare['Num distractor']==i].bleu2.mean()
            bleu_3=dist_compare.loc[dist_compare['Num distractor']==i].bleu3.mean()
            bleu_4=dist_compare.loc[dist_compare['Num distractor']==i].bleu4.mean()
            writer.add_scalar('val/bleu/distractor_{}/bleu_1'.format(i), bleu_1, global_step)
            writer.add_scalar('val/bleu/distractor_{}/bleu_2'.format(i), bleu_2, global_step)
            writer.add_scalar('val/bleu/distractor_{}/bleu_3'.format(i), bleu_3, global_step)
            writer.add_scalar('val/bleu/distractor_{}/bleu_4'.format(i), bleu_4, global_step)


        bleu_1=dist_compare.bleu1.mean()
        bleu_2=dist_compare.bleu2.mean()
        bleu_3=dist_compare.bleu3.mean()
        bleu_4=dist_compare.bleu4.mean()
        writer.add_scalar("val/bleu/distractor_gen/bleu_1", bleu_1, global_step)
        writer.add_scalar("val/bleu/distractor_gen/bleu_2", bleu_2, global_step)
        writer.add_scalar("val/bleu/distractor_gen/bleu_3", bleu_3, global_step)
        writer.add_scalar("val/bleu/distractor_gen/bleu_4", bleu_4, global_step)
    
            
    return dist_compare



def main(config):
    model_params={
        "MODEL":"t5-small",             # model_type: t5-base/t5-large
        "TRAIN_BATCH_SIZE":2,          # training batch size
        "VALID_BATCH_SIZE":2,          # validation batch size
        "TRAIN_EPOCHS":2,              # number of training epochs
        "VAL_EPOCHS":1,                # number of validation epochs
        "LEARNING_RATE":1e-4,          # learning rate
        "MAX_SOURCE_TEXT_LENGTH":300,  # max length of source text
        "MAX_TARGET_TEXT_LENGTH":301,   # max length of target text
        "MAX_ANSWER_LENGTH":300,   # max length of answer text
        "SEED": 42                     # set seed for reproducibility 

    }


    source_text='text'
    target_text='distractor'
    answer_text='answer_text'
    model_params=model_params

    with open(os.path.join(C.DATA_DIR, "distractor/race_train_original.json"), 'r') as content_file:
        content = content_file.read()
    content=content.replace('\n',',')
    content='['+content[:-1]+']'
    records = json.loads(content)
    records=pd.DataFrame(records)
    
    # Set random seeds and deterministic pytorch for reproducibility
    torch.manual_seed(model_params["SEED"]) # pytorch random seed
    np.random.seed(model_params["SEED"]) # numpy random seed
    torch.backends.cudnn.deterministic = True


    # tokenzier for encoding the text
    tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"])

    # Defining the model. We are using t5-base model and added a Language model layer on top for generation of Summary. 
    # Further this model is sent to device (GPU/TPU) for using the hardware.
    model = T5ForConditionalGeneration.from_pretrained(model_params["MODEL"])
    model = model.to(C.DEVICE)
    
    ## format the input
    records=records.assign(question=records.question.str.join(' '))
    records=records.assign(distractor=records.distractor.str.join(' '))
    records=records.assign(article=records.article.str.join(' '))
    records=records.assign(answer_text=records.answer_text.str.join(' '))
    records=records.loc[:,['article','question','answer_text','distractor']]
    records=records.assign(text="dist q: "+records.question+" a: "+records.answer_text+" p: "+records.article)
    records=records.loc[:,['text','distractor','answer_text']]

    with open(os.path.join(C.DATA_DIR, "distractor/race_dev_original.json"), 'r') as content_file:
        content = content_file.read()
    content=content.replace('\n',',')
    content='['+content[:-1]+']'
    records_test = json.loads(content)
    records_test=pd.DataFrame(records_test)

    ## format the input
    records_test=records_test.assign(question=records_test.question.str.join(' '))
    records_test=records_test.assign(distractor=records_test.distractor.str.join(' '))
    records_test=records_test.assign(article=records_test.article.str.join(' '))
    records_test=records_test.assign(answer_text=records_test.answer_text.str.join(' '))
    records_test=records_test.loc[:,['article','question','answer_text','distractor']]
    records_test=records_test.assign(text="dist q: "+records_test.question+" a: "+records_test.answer_text+" p: "+records_test.article)
    records_test=records_test.loc[:,['text','distractor','answer_text']]

    # Creation of Dataset and Dataloader
    # Defining the train size. So 80% of the data will be used for training and the rest for validation. 
    val_dataset=records_test
    train_dataset = records


    # Creating the Training and Validation dataset for further creation of Dataloader
    training_set = YourDataSetClass(train_dataset, tokenizer, model_params["MAX_SOURCE_TEXT_LENGTH"], model_params["MAX_TARGET_TEXT_LENGTH"],model_params["MAX_ANSWER_LENGTH"], source_text, target_text,answer_text)
    val_set = YourDataSetClass(val_dataset, tokenizer, model_params["MAX_SOURCE_TEXT_LENGTH"], model_params["MAX_TARGET_TEXT_LENGTH"],model_params["MAX_ANSWER_LENGTH"], source_text, target_text,answer_text)



    # Defining the parameters for creation of dataloaders
    train_params = {
      'batch_size': model_params["TRAIN_BATCH_SIZE"],
      'shuffle': True,
      'num_workers': 0
      }


    val_params = {
      'batch_size': model_params["VALID_BATCH_SIZE"],
      'shuffle': False,
      'num_workers': 0
      }


    # Creation of Dataloaders for testing and validation. This will be used down for training and validation stage for the model.
    training_loader = DataLoader(training_set, **train_params)
    val_loader = DataLoader(val_set, **val_params)


    # Defining the optimizer that will be used to tune the weights of the network in the training session. 
    optimizer = torch.optim.Adam(params =  model.parameters(), lr=model_params["LEARNING_RATE"])
    
    # Create Tensorboard logger.
    experiment_id = int(time.time())
    experiment_name = "name"
    model_dir = create_model_dir(os.path.join(C.DATA_DIR, "experiments/"), experiment_id, experiment_name)
        
    global_step = 0
    writer = SummaryWriter(os.path.join(model_dir, 'logs'))
    
    for epoch in range(model_params["TRAIN_EPOCHS"]):
        global_step=train(epoch, tokenizer, model, C.DEVICE, training_loader, optimizer,writer,global_step,records,model_dir)

    #Saving the model after training
    path = os.path.join(model_dir, "model_files")
    model.save_pretrained(path)
    tokenizer.save_pretrained(path)


    # evaluating test dataset
    for epoch in range(model_params["VAL_EPOCHS"]):
        predictions, actuals = validate(epoch, tokenizer, model, C.DEVICE, val_loader,writer)
        final_df = pd.DataFrame({'Generated Text':predictions,'Actual Text':actuals})
        final_df.to_csv(os.path.join(model_dir, 'predictions.csv'),index=False)




In [3]:
def validate(epoch, tokenizer, model, device, loader,writer):

    """
    Function to evaluate model for predictions

    """
    global_step = 0
    model.eval()
    predictions = []
    actuals = []
    num_dist=[]
    with torch.no_grad():
        for _, data in enumerate(loader, 0):
            y = data['target_ids'].to(device, dtype = torch.long)
            ids = data['source_ids'].to(device, dtype = torch.long)
            mask = data['source_mask'].to(device, dtype = torch.long)
            generated_ids = model.generate(
              input_ids = ids,
              attention_mask = mask, 
              max_length=150, 
              num_beams=3,
              repetition_penalty=2.5, 
              length_penalty=1.0, 
              early_stopping=True,
                num_return_sequences=3,
              )
            preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in generated_ids]
            target = [tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=False)for t in y]
            predictions.extend(preds)
            for tt in target:
                actuals.extend([tt,tt,tt])
                num_dist.extend([1,2,3])

        temp_df = pd.DataFrame({'Generated Text':predictions,'Actual Text':actuals,'Num distractor':num_dist})
        val=records.rename(columns={'distractor':'Actual Text'})

        gen_dist=val.merge(temp_df,on=['Actual Text']).loc[:,['text','Generated Text','Num distractor']]

        distractors=val.groupby(['text']).agg({ 'Actual Text': lambda x: list(x.str.split())}).reset_index()

        dist_compare=distractors.merge(gen_dist,on=['text'])
        dist_compare['Generated Text']=dist_compare['Generated Text'].str.split()
        dist_compare=dist_compare.assign(bleu1=dist_compare.apply(lambda x:sentence_bleu(x['Actual Text'],x['Generated Text'],weights=(1, 0, 0, 0),smoothing_function=SmoothingFunction().method1),axis=1))
        dist_compare=dist_compare.assign(bleu2=dist_compare.apply(lambda x:sentence_bleu(x['Actual Text'],x['Generated Text'],weights=(0, 1, 0, 0),smoothing_function=SmoothingFunction().method1),axis=1))
        dist_compare=dist_compare.assign(bleu3=dist_compare.apply(lambda x:sentence_bleu(x['Actual Text'],x['Generated Text'],weights=(0, 0, 1, 0),smoothing_function=SmoothingFunction().method1),axis=1))
        dist_compare=dist_compare.assign(bleu4=dist_compare.apply(lambda x:sentence_bleu(x['Actual Text'],x['Generated Text'],weights=(0, 0, 0, 1),smoothing_function=SmoothingFunction().method1),axis=1))

        for i in range(1,4):
            bleu_1=dist_compare.loc[dist_compare['Num distractor']==i].bleu1.mean()
            bleu_2=dist_compare.loc[dist_compare['Num distractor']==i].bleu2.mean()
            bleu_3=dist_compare.loc[dist_compare['Num distractor']==i].bleu3.mean()
            bleu_4=dist_compare.loc[dist_compare['Num distractor']==i].bleu4.mean()
            writer.add_scalar('val/bleu/distractor_{}/bleu_1'.format(i), bleu_1, global_step)
            writer.add_scalar('val/bleu/distractor_{}/bleu_2'.format(i), bleu_2, global_step)
            writer.add_scalar('val/bleu/distractor_{}/bleu_3'.format(i), bleu_3, global_step)
            writer.add_scalar('val/bleu/distractor_{}/bleu_4'.format(i), bleu_4, global_step)


        bleu_1=dist_compare.bleu1.mean()
        bleu_2=dist_compare.bleu2.mean()
        bleu_3=dist_compare.bleu3.mean()
        bleu_4=dist_compare.bleu4.mean()
        writer.add_scalar("val/bleu/distractor_gen/bleu_1", bleu_1, global_step)
        writer.add_scalar("val/bleu/distractor_gen/bleu_2", bleu_2, global_step)
        writer.add_scalar("val/bleu/distractor_gen/bleu_3", bleu_3, global_step)
        writer.add_scalar("val/bleu/distractor_gen/bleu_4", bleu_4, global_step)
            
    return predictions, actuals



In [4]:

model_params={
    "MODEL":"t5-small",             # model_type: t5-base/t5-large
    "TRAIN_BATCH_SIZE":2,          # training batch size
    "VALID_BATCH_SIZE":2,          # validation batch size
    "TRAIN_EPOCHS":2,              # number of training epochs
    "VAL_EPOCHS":1,                # number of validation epochs
    "LEARNING_RATE":1e-4,          # learning rate
    "MAX_SOURCE_TEXT_LENGTH":900,  # max length of source text
    "MAX_TARGET_TEXT_LENGTH":901,   # max length of target text
    "MAX_ANSWER_LENGTH":900,   # max length of answer text
    "SEED": 42                     # set seed for reproducibility 

}


source_text='text'
target_text='distractor'
answer_text='answer_text'
model_params=model_params

with open(os.path.join(C.DATA_DIR, "distractor/race_train_original.json"), 'r') as content_file:
    content = content_file.read()
content=content.replace('\n',',')
content='['+content[:-1]+']'
records = json.loads(content)
records=pd.DataFrame(records)

# Set random seeds and deterministic pytorch for reproducibility
torch.manual_seed(model_params["SEED"]) # pytorch random seed
np.random.seed(model_params["SEED"]) # numpy random seed
torch.backends.cudnn.deterministic = True


# tokenzier for encoding the text
tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"])

# Defining the model. We are using t5-base model and added a Language model layer on top for generation of Summary. 
# Further this model is sent to device (GPU/TPU) for using the hardware.
model = T5ForConditionalGeneration.from_pretrained(model_params["MODEL"])
model = model.to(C.DEVICE)

## format the input
records=records.assign(question=records.question.str.join(' '))
records=records.assign(distractor=records.distractor.str.join(' '))
records=records.assign(article=records.article.str.join(' '))
records=records.assign(answer_text=records.answer_text.str.join(' '))
records=records.loc[:,['article','question','answer_text','distractor']]
records=records.assign(text="dist q: "+records.question+" a: "+records.answer_text+" p: "+records.article)
records=records.loc[:,['text','distractor','answer_text']]

with open(os.path.join(C.DATA_DIR, "distractor/race_dev_original.json"), 'r') as content_file:
    content = content_file.read()
content=content.replace('\n',',')
content='['+content[:-1]+']'
records_test = json.loads(content)
records_test=pd.DataFrame(records_test)

## format the input
records_test=records_test.assign(question=records_test.question.str.join(' '))
records_test=records_test.assign(distractor=records_test.distractor.str.join(' '))
records_test=records_test.assign(article=records_test.article.str.join(' '))
records_test=records_test.assign(answer_text=records_test.answer_text.str.join(' '))
records_test=records_test.loc[:,['article','question','answer_text','distractor']]
records_test=records_test.assign(text="dist q: "+records_test.question+" a: "+records_test.answer_text+" p: "+records_test.article)
records_test=records_test.loc[:,['text','distractor','answer_text']]

# Creation of Dataset and Dataloader
# Defining the train size. So 80% of the data will be used for training and the rest for validation. 
val_dataset=records_test
train_dataset = records


# Creating the Training and Validation dataset for further creation of Dataloader
training_set = YourDataSetClass(train_dataset, tokenizer, model_params["MAX_SOURCE_TEXT_LENGTH"], model_params["MAX_TARGET_TEXT_LENGTH"],model_params["MAX_ANSWER_LENGTH"], source_text, target_text,answer_text)
val_set = YourDataSetClass(val_dataset, tokenizer, model_params["MAX_SOURCE_TEXT_LENGTH"], model_params["MAX_TARGET_TEXT_LENGTH"],model_params["MAX_ANSWER_LENGTH"], source_text, target_text,answer_text)



# Defining the parameters for creation of dataloaders
train_params = {
  'batch_size': model_params["TRAIN_BATCH_SIZE"],
  'shuffle': True,
  'num_workers': 0
  }


val_params = {
  'batch_size': model_params["VALID_BATCH_SIZE"],
  'shuffle': False,
  'num_workers': 0
  }


# Creation of Dataloaders for testing and validation. This will be used down for training and validation stage for the model.
training_loader = DataLoader(training_set, **train_params)
val_loader = DataLoader(val_set, **val_params)


# Defining the optimizer that will be used to tune the weights of the network in the training session. 
optimizer = torch.optim.Adam(params =  model.parameters(), lr=model_params["LEARNING_RATE"])

In [4]:
def get_model_dir(experiment_dir, model_id):
    """Return the directory in `experiment_dir` that contains the given `model_id` string."""
    model_dir = glob.glob(os.path.join(experiment_dir, str(model_id) + "-*"), recursive=False)
    return None if len(model_dir) == 0 else model_dir[0]

def get_model_config(model_id):
    model_id = model_id
    model_dir = get_model_dir(os.path.join(C.DATA_DIR, "experiments/"), model_id)
    model_config = 0#Configuration.from_json(os.path.join(model_dir, 'config.json'))
    return model_config, model_dir

def load_model(model_id):
    model_config, model_dir = get_model_config(model_id)
    path = os.path.join(model_dir, "model_files")
    tokenizer = T5Tokenizer.from_pretrained(path)

    model = T5ForConditionalGeneration.from_pretrained(path)

    model.to(C.DEVICE)

    return model,tokenizer, model_config, model_dir

In [28]:
model,tokenizer, model_config, model_dir = load_model(1625154455)


In [8]:
writer = SummaryWriter(os.path.join(model_dir, 'logs'))

In [None]:
for epoch in range(model_params["VAL_EPOCHS"]):
    final_df = validate(epoch, tokenizer, model, C.DEVICE, val_loader,writer)
    final_df.to_csv(os.path.join(model_dir, 'predictions.csv'),index=False)

In [29]:
final_df=pd.read_csv(os.path.join(model_dir, 'predictions.csv'))

FileNotFoundError: [Errno 2] No such file or directory: '/cluster/home/fgonzalez/nlp/data/experiments/1625154455-name/predictions.csv'

In [11]:
final_df.mean()

Num distractor    1.999952
bleu1             0.229085
bleu2             0.076467
bleu3             0.041808
bleu4             0.030668
dtype: float64

In [None]:
1625497725    lambda 0.3 normal cosine
1625497726    lambda 0.4 normal cosine
1625497756    lambda 0.2 normal cosine
1625665307    lambda 0.1 normal cosine

In [24]:
## 1625497726 0.4
final_df.groupby(['Num distractor']).mean()

Unnamed: 0_level_0,bleu1,bleu2,bleu3,bleu4
Num distractor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.253613,0.090932,0.051882,0.038624
2,0.246943,0.08284,0.046607,0.034424
3,0.195244,0.063403,0.03349,0.025785


In [13]:
## 1625497725 0.3
final_df.groupby(['Num distractor']).mean()

Unnamed: 0_level_0,bleu1,bleu2,bleu3,bleu4
Num distractor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.254536,0.090609,0.051279,0.037286
2,0.248582,0.081285,0.043693,0.032288
3,0.18408,0.057492,0.030443,0.022425


In [27]:
## 1625497756 0.2
final_df.groupby(['Num distractor']).mean()

Unnamed: 0_level_0,bleu1,bleu2,bleu3,bleu4
Num distractor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.23726,0.083794,0.047737,0.034073
2,0.242868,0.080669,0.044823,0.03327
3,0.200969,0.065399,0.03527,0.026071


In [21]:
# 1625665307
final_df.groupby(['Num distractor']).mean()

Unnamed: 0_level_0,bleu1,bleu2,bleu3,bleu4
Num distractor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.23035,0.082875,0.048152,0.0345
2,0.251306,0.086232,0.047511,0.03511
3,0.212373,0.07087,0.038128,0.028427


In [None]:
## 1625154455


In [14]:
records[records.text=='dist q: McCulloch and his group used_in their research . a: 5 dogs and 169 people p: Dogs have long been used to find explosives and drugs . Now , a new study shows that man \'s best friend can also help to find lung and breast cancer , researchers report in integrative Cancer Therapies . The findings show that trained ordinary household dogs can detect early -- stage lung and breast cancers by smelling the breath samples of patients . Researchers have found that cancer cells send out molecules different from those of healthy ones , and that might be sensed by smell by the highly sensitive dog \'s nose . For the study , five dogs were trained by a professional instructor to respond differently to breath samples of healthy and cancer patients."The dogs learned to sit or lie down in front of cancer patient samples and to ignore control samples through the method of food reward , " McCulloch explained . After a period of training , researchers tested the animals\'ability to distinguish cancer patients from controls . The animals were given breath samples from 55 patients with lung cancer,3 1 with breast cancer and 83 healthy controls who were not included in the original training period . McCulloch \'s group found that the dogs were able to correctly distinguish the breath samples of cancer patients from those of the control subjects in about 90 percent of the cases . The dogs were also able to detecting early - stage lung and breast cancers . " These results show that there is hope for early detection,"McCulloch said . The re - searches are planning to conduct further studies on the breath composition of cancer patients to possibly design an electronic device that can do the dogs\'job."A dog \'s nose is so powerful it can detect odors 10 000 to 100 000 times better than a human nose can . I hope people will be interested in doing this research,"McCulloch added,"It shows that there is definitely something out there . "']

Unnamed: 0,text,distractor


In [14]:
temp_df = final_df

val=records_test_fil.rename(columns={'distractor':'Actual Text'})

gen_dist=val.merge(temp_df,on=['Actual Text']).loc[:,['text','Generated Text']].drop_duplicates()

distractors=val.groupby(['text']).agg({ 'Actual Text': lambda x: list(x.str.split())}).reset_index()

dist_compare=distractors.merge(gen_dist,on=['text'])

In [15]:
dist_compare['Generated Text']=dist_compare['Generated Text'].str.split()

In [22]:
b1=dist_compare.apply(lambda x:sentence_bleu(x['Actual Text'],x['Generated Text'],weights=(1, 0, 0, 0),smoothing_function=SmoothingFunction().method1),axis=1)
b2=dist_compare.apply(lambda x:sentence_bleu(x['Actual Text'],x['Generated Text'],weights=(0, 1, 0, 0),smoothing_function=SmoothingFunction().method1),axis=1)
b3=dist_compare.apply(lambda x:sentence_bleu(x['Actual Text'],x['Generated Text'],weights=(0, 0, 1, 0),smoothing_function=SmoothingFunction().method1),axis=1)
b4=dist_compare.apply(lambda x:sentence_bleu(x['Actual Text'],x['Generated Text'],weights=(0, 0, 0, 1),smoothing_function=SmoothingFunction().method1),axis=1)
dist_compare=dist_compare.assign(bleu1=b1)
dist_compare=dist_compare.assign(bleu2=b2)
dist_compare=dist_compare.assign(bleu3=b3)
dist_compare=dist_compare.assign(bleu4=b4)
#dist_compare=dist_compare.assign(bleu=dist_compare.apply(lambda x:sentence_bleu(x['Actual Text'],x['Generated Text'],weights=(0, 0, 1, 0)),axis=1))
bleu_1=dist_compare.bleu1.mean()
bleu_2=dist_compare.bleu2.mean()
bleu_3=dist_compare.bleu3.mean()
bleu_4=dist_compare.bleu4.mean()

In [21]:
bleu_3

0.07887480099137983

In [69]:
dist_compare.sort_values('bleu')

Unnamed: 0,text,Actual Text,Generated Text,bleu
0,"dist q: "" ... Old is suddenly in "" ( Line 1 , ...","[[America, has, suddenly, become, a, nation, o...","["", _, ""]",0.0
3715,dist q: The author takes Albert Einstein as an...,"[[tell, the, fact, that, Einstein, was, well, ...","[a, messy, shop, front]",0.0
3716,dist q: The author takes Albert Einstein as an...,"[[tell, the, fact, that, Einstein, was, well, ...","[a, messy, shop, front]",0.0
3738,dist q: The author used wolves as an example t...,"[[explain, the, cruel, side, of, group, -, liv...","[wolves, as, a, model, for, developing, and, m...",0.0
3739,dist q: The author used wolves as an example t...,"[[explain, the, cruel, side, of, group, -, liv...","[wolves, as, a, model, for, developing, and, m...",0.0
...,...,...,...,...
1981,dist q: If you are interested in history you w...,"[[Rocky, Mountain, National, Park], [Black, Ca...","[Rocky, Mountain, National, Park]",1.0
1982,dist q: If you are interested in history you w...,"[[Rocky, Mountain, National, Park], [Black, Ca...","[Rocky, Mountain, National, Park]",1.0
93,dist q: A girl who likes yoga will go to a: Pe...,"[[Camp, Jano, India], [Bay, Language, Academy]]","[Camp, Jano, India]",1.0
5075,dist q: The time of the recognition span can b...,"[[one, 's, purpose, in, reading], [lighting, a...","[lighting, and, tiredness]",1.0


In [100]:
dist_compare[dist_compare.bleu>0.5].sort_values('bleu')

Unnamed: 0,text,Actual Text,Generated Text,bleu
3236,dist q: What would be the best title for the p...,"[[Music, of, the, Mission, District], [The, Sp...","[The, Mission, District]",0.513417
2416,dist q: The research program is chiefly design...,"[[high, school, advisers, from, Houston], [col...","[high, school, students]",0.513417
2869,dist q: We can judge from the Deloitte study t...,"[[the, French, are, less, willing, to, buy, ec...","[French, holiday, shoppers, are, choosing, mor...",0.526316
778,dist q: From the passage we can see that a: 99...,"[[in, Europe, 94, billion, cubic, meters, of, ...","[94, billion, cubic, meters, of, methane, per,...",0.530714
445,"dist q: As a result of the ticketing mistake ,...","[[will, not, enjoy, the, synchronized, swimmin...","[the, men's, 100, m, final]",0.536256
...,...,...,...,...
2813,dist q: We can find the introduction to a prod...,"[[Part, 1], [Part, 2], [Part, 3]]","[Part, 3]",1.000000
3069,dist q: What did Paul have for dinner ? a: The...,"[[The, soup, and, the, duck], [The, duck, ,, t...","[The, duck, and, the, soup]",1.000000
3096,dist q: What does Sue like ? a: Playing games ...,"[[Swimming, and, reading]]","[Swimming, and, reading]",1.000000
1368,dist q: Lisa in this passage is the name of a:...,"[[a, male, lion], [a, pride]]","[a, male, lion]",1.000000


In [101]:
dist_compare.iloc[2869]

text              dist q: We can judge from the Deloitte study t...
Actual Text       [[the, French, are, less, willing, to, buy, ec...
Generated Text    [French, holiday, shoppers, are, choosing, mor...
bleu                                                       0.526316
Name: 2869, dtype: object

In [102]:
dist_compare.iloc[2869]['text']

'dist q: We can judge from the Deloitte study that a: over a quarter of the French give second - hand Christmas gifts p: A used book or nearly - new kitchen gadget    may not be at the top of every Christmas wish list , but hard economic times coupled with a new green awareness are changing attitudes about gift - giving in France . French holiday shoppers are choosing larger numbers for " green " gifting this Christmas , studies show . About 30 percent of French consumers will give second - hand items as gifts to stretch out their tight budgets but also to do their little bit for recycling , according to a study by international consulting firm Deloitte . The survey of Christmas consumer behaviors in 18 European countries found the French were more than twice as likely as other Europeans to give second - hand items . Websites promoting re - gifting and green gifting are popular in France , with many reporting a rise in business . " Concerns about the ecology and the economy have come t

In [103]:
dist_compare.iloc[2869]['Actual Text']

[['the',
  'French',
  'are',
  'less',
  'willing',
  'to',
  'buy',
  'eco',
  '-',
  'friendly',
  'gifts',
  'than',
  'other',
  'Europeans'],
 ['80',
  '%',
  'of',
  'French',
  'people',
  'are',
  'happy',
  'to',
  'receive',
  'second',
  '-',
  'hand',
  'gifts'],
 ['less',
  'than',
  '10',
  '%',
  'of',
  'European',
  'consumers',
  'are',
  'likely',
  'to',
  'give',
  'second',
  '-',
  'hand',
  'gifts']]

In [104]:
dist_compare.iloc[2869]['Generated Text']

['French',
 'holiday',
 'shoppers',
 'are',
 'choosing',
 'more',
 'than',
 'a',
 'quarter',
 'of',
 'the',
 'French',
 'consumers',
 'will',
 'give',
 'second',
 '-',
 'hand',
 'items']

In [48]:
scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
scores = scorer.score('The quick brown fox jumps over the lazy dog',
                      'The quick brown dog jumps on the log.')

In [49]:
scores

{'rouge1': Score(precision=0.75, recall=0.6666666666666666, fmeasure=0.7058823529411765),
 'rougeL': Score(precision=0.625, recall=0.5555555555555556, fmeasure=0.5882352941176471)}

In [99]:
dist_compare.iloc[3751].text

"dist q: You should especially protect yourself from sun burnt while visiting a: American Watersports p: American Watersports     Tuesday - Saturday Located on the beach of the Sea Gardens Beach Resort , there are fun things to rent for the whole family . They offer rentals for kayaks , jet skis , and even parasail ! Enjoy the water up - close or from a bird 's eye view ! No matter what activity you 're enjoying , be sure to protect yourself and your family from the sun 's powerful rays and apply plenty of sun block ! 15thStreet Boat Company Monday - Saturday 15thStreet Boat Company offers rental boats of all kinds . They 're sure to have what you are looking for , whether it 's a small boat for a quick and simple outing or an extravagant boat with comfortable seats with a stereo and high tech navigation . You can rent a boat for half a day or a couple of days , or even weeks at a time . If you want it , they 've got it . It 's fun for everyone ! Coconut 's Watersports      9am-5pm Mon

In [96]:
dist_compare.sort_values('bleu')

Unnamed: 0,text,Actual Text,Generated Text,bleu
1701,dist q: The Dixie PIT program was introduced i...,"[raise money for school affairs, supply teache...",.,0.0
3554,dist q: Which of the following sentences from ...,"[In her calm , motherly voice she said,""By the...",'',0.0
37,"dist q: A "" property "" in Australia is a a: fa...",[school],farm,0.0
1026,dist q: If you want to be a member of Summer S...,[visit Black Rock Forest first],8455344517,0.0
3155,dist q: What is the best title for the text ? ...,[Put the glass down],"""",0.0
...,...,...,...,...
152,"dist q: According to the author , which of the...","[Western women, Chinese men]",Western women,1.0
3751,dist q: You should especially protect yourself...,"[Jet Ski Tours of Miami, 15 \n thStreet Boat C...",Jet Ski Tours of Miami,1.0
2218,dist q: The old woman wants to see a: Xiao Min...,[Xiao Ming],Xiao Ming,1.0
3147,dist q: What is the best title for the passage...,"[Pop music, Classical music, Folk music]",Classical music,1.0


In [23]:
dist_compare.iloc[675,:]['Actual Text']

['your eyes are poor',
 'your left eye is not open',
 'you move it close to your eye']

In [24]:
dist_compare.iloc[675,:]['text']

"dist q: You fail to see the letter L in the experiment because a: its image falls on the blind spot p: It seems to be strange to you there is a blind spot on the eyes , Here is an interesting experiment that can make something disappear , when one eye is open . Make a card about the size of a postcard and write two English letters L and R on it , L on the left and R on the right . First , hold the card about 80 cm away and you see both the letters . Then close your right eye and look at the letter R only with your left eye . And now , as you move the card slowly towards you , you'll find the letter L disappearing . But if you move the card nearer to your face , the letter will be seen again . Now do the same experiment with your left eye closed , you'll find the letter R disappearing . Why does the letter disappear ? It is because there is a blind spot on the eye . When the image of the letter falls on the blind spot , it wo n't be seen . That is why either of the letters disappears .

In [25]:

dist_compare=val.merge(gen_dist,on=['text'])
aa=dist_compare.apply(lambda x:sentence_bleu(x['Actual Text'],x['Generated Text'],weights=(0, 1, 0, 0)),axis=1)
dist_compare=dist_compare.assign(bleu=aa)
#dist_compare=dist_compare.assign(bleu=dist_compare.apply(lambda x:sentence_bleu(x['Actual Text'],x['Generated Text'],weights=(0, 0, 1, 0)),axis=1))
bleu_3=dist_compare.bleu.mean()

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [26]:
bleu_3

2.206983827137286e-308

## scores

In [23]:
bleu_1

0.21606667354214598

In [24]:
bleu_2

0.07887480099137983

In [25]:
bleu_3

0.04384669033424478

In [26]:
bleu_4

0.033407103702946084

In [None]:
if __name__ == '__main__':
    main(Configuration.parse_cmd())