In [2]:
#!pip install transformers

In [3]:
import torch
import random
from datetime import datetime
import pandas as pd 
import os
import pickle
import json
import numpy as np
from tqdm import tqdm
from functools import partial
from transformers import AutoModelForQuestionAnswering
from transformers import AutoTokenizer
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import transformers
from timeit import default_timer as timer
from tqdm import tqdm
from transformers import get_linear_schedule_with_warmup
from transformers.optimization import AdamW

## Load Functions

In [4]:
def extract_most_probable(start_scores: torch.Tensor, end_scores: torch.Tensor):
    """
    Given a batch of start and end of span scores (logits), returns the
    most probable (start, end) indexes for each sample in the batch
    such that end >= start.

    Args:
        start_scores (torch.Tensor): start position scores of shape (B, T)
            of for each token position.
        end_scores (torch.Tensor): end position scores of shape (B, T)
            of for each token position.
    Returns:
        start_end_indexes (Tuple[torch.Tensor, torch.Tensor]): tuple of 
            tensors containing start and end indexes respectively for each
            sample.
    """

    # extract shapes
    batch_dim, timestep_dim = start_scores.shape
    # compute marginal distributions for start and end
    start_probs = torch.nn.functional.softmax(start_scores, dim=1)
    end_probs = torch.nn.functional.softmax(end_scores, dim=1)
    # compute start_end joint distribution
    joint_dist_start_end = start_probs[:, :, None] @ end_probs[:, None, :]
    constrained_joint_dist = torch.triu(joint_dist_start_end)
    # compute the actual indexes
    flattened_distr_argmax = constrained_joint_dist.view(batch_dim, -1).argmax(1).view(-1, 1)
    start_end_idxs = torch.cat((flattened_distr_argmax // timestep_dim, flattened_distr_argmax % timestep_dim), dim=1).cpu().detach()
    return (start_end_idxs[:, 0], start_end_idxs[:, 1])

In [5]:
def splitDataFrame(data,ratio=0.7,seed=42,limit=None):
    ar = data.conversation_id.unique()
    if limit:
        ar = ar[:limit]
    random.shuffle(ar)
    train_count = int(ar.shape[0]* 0.7)
    train_ar = ar[:train_count]
    val_ar = ar[train_count:]
    train_df = data[data.conversation_id.isin(train_ar)].reset_index(drop=True)
    val_df = data[data.conversation_id.isin(val_ar)].reset_index(drop=True)
    return train_df,val_df

In [6]:
def bert_padder_collate_fn(sample_list):
    # NOTE: the tokenizer in dataloader already pads inputs to have same length of 384
    input_ids_padded = [sample["input_ids"] for sample in sample_list]
    attention_mask_padded = [sample["attention_mask"] for sample in sample_list]
    out = [sample["out_span"] for sample in sample_list]
    # Convert inputs to Torch tensors
    input_ids_padded = torch.tensor(input_ids_padded, dtype=torch.long)
    attention_mask_padded = torch.tensor(attention_mask_padded, dtype=torch.long)
    # Tensor adds an extra dimension, so remove it
    input_ids_padded = input_ids_padded[:, 0, :]
    attention_mask_padded = attention_mask_padded[:, 0, :]
    return {"input_ids": input_ids_padded,
            "attention_mask": attention_mask_padded,
            "y_gt":torch.stack(out)}

In [7]:
def bert_tokenizer_fn(question, paragraph, tokenizer, max_length=384, doc_stride=128):
    pad_on_right = tokenizer.padding_side == "right"
    # Process the sample
    tokenized_input_pair = tokenizer(
        question,
        paragraph,
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )
    return tokenized_input_pair

## Load Classes

In [8]:
class ParametricBertModelQA(torch.nn.Module):
    def __init__(self,model_name):
        super(ParametricBertModelQA,self).__init__()
        self.model = AutoModelForQuestionAnswering.from_pretrained(model_name)
    
    def prepare_input_fn(self,inputs, device):
        model_input = {}
        model_input["input_ids"] = inputs["input_ids"].to(device)
        model_input["attention_mask"] = inputs["attention_mask"].to(device)
        return model_input
    
    def forward(self,inputs):
        curr_device = self.model.device
        input_dict = self.prepare_input_fn(inputs,curr_device)
        output = self.model(**input_dict)
        return (output.start_logits,output.end_logits)

In [9]:
class CustomQADatasetBERT(torch.utils.data.Dataset):
    """Custom text dataset for Huggingface BERT models."""

    def __init__(self, tokenizer_fn, df):
        super(CustomQADatasetBERT, self).__init__()
        self.input_list = df[["paragraph", "question_text", "question_id"]]
        self.output_list = df[["tokenizer_answer_start", "tokenizer_answer_end"]]
        self.tokenizer_fn = tokenizer_fn

    def __len__(self):
        return len(self.input_list)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        paragraph_text = self.input_list.iloc[idx]["paragraph"]
        question_id = self.input_list.iloc[idx]["question_id"]
        question_text = self.input_list.iloc[idx]["question_text"]
        tokenizer_answer_start = self.output_list.iloc[idx]["tokenizer_answer_start"]
        tokenizer_answer_end = self.output_list.iloc[idx]["tokenizer_answer_end"]

        tokenized_input_pair = self.tokenizer_fn(question_text, paragraph_text)
        
        input_ids = tokenized_input_pair["input_ids"]
        attention_mask = tokenized_input_pair["attention_mask"]

        out_span = torch.tensor([tokenizer_answer_start, tokenizer_answer_end])
        
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "out_span": out_span,
            "question":question_text
        }

In [12]:
class ModelUtils:
    def __init__(self,model,create=False):
        cur_drive = 'drive/MyDrive/'
        if not create:
            self.model_path = os.path.join(cur_drive,model)
        else:
            self.dir_path = os.path.join(cur_drive,model)
            self.model_path = self._create_dir()
         
    def save_params(self,params_dict):
        with open(os.path.join(self.model_path,'params_dict.pkl'),'wb') as fp:
            pickle.dump(params_dict, fp)
    
    def _create_dir(self):
        #check if dir exist and create if not
        addPath = datetime.now().strftime("%Y_%m_%d_%H%M%S")
        if not os.path.exists(self.dir_path):
            os.makedirs(self.dir_path)
        model_path = os.path.join(self.dir_path,addPath)
        os.makedirs(model_path)
        return model_path
    
    def _saveResult(self,resJson):
        with open(os.path.join(self.model_path,'training_stats.json'),'w') as fp:
            import json
            fp.write(json.dumps(resJson))

    def save_checkpoints(self,state,isBest=False):
        """
        This func saves below details -
        * trained model 
        * hyperparameter
        """
        filename = 'best_ckps.pt' if isBest else 'last_ckps.pt'
        torch.save(state,os.path.join(self.model_path,filename))
        
    def save_best_model(self,model):
        checkpoint = torch.load(f'{self.model_path}/best_ckps.pt')
        model.load_state_dict(checkpoint['state_dict'])
        model.model.save_pretrained(f'{self.model_path}/')
        print(f'----- model is saved at {self.model_path} ------')

In [45]:
class QAModel:
    def __init__(self,train_args=None,load_args=None):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        if load_args:
            self.utilsObj = ModelUtils(load_args['model_ckp'])
            with open(os.path.join(self.utilsObj.model_path,'params_dict.pkl'),'rb') as fp:
                self.params_dict = pickle.load(fp)
            self.model,self.optimizer = self.loadModel(load_model=True)
            self.tokenizer_dict = self.loadTokenizer(load_model=True)
            if load_args['ckp_status']:
                ckp_path = os.path.join(self.utilsObj.model_path,f'{load_args["ckp_status"]}_ckps.pt')
                checkpoint = torch.load(ckp_path)
                self.model.load_state_dict(checkpoint['state_dict'])
                self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
                self.epoch_start = checkpoint['epoch']
                self.val_min_loss = checkpoint['validation_loss']
        else:
            self.params_dict = train_args['params_dict']
            self.utilsObj = ModelUtils(train_args['folder_name'],create=True)
            self.tokenizer_dict = self.loadTokenizer()
            self.model,self.optimizer = self.loadModel()
            self.val_min_loss = np.Inf
            self.epoch_start = -1
            
    def index_of_first(self,lst, pred):
        for i, v in enumerate(lst):
            if pred(v):
                return i
        return None

    def split_paragraph_if_needed(self,paragraph, question, answer_span):
        """
        Attempts to tokenize a paragraph and question together, if too long
        because of tokenizer's max length, then will split the paragraph into
        multiple slices.

        Returns a list of paragraph slices with answer span, such that:
            - a paragraph slice with no answer will have answer mapped to (CLS, CLS)
            - a paragraph slice with answer will be mapped to the index of answer.
        """
        tokenized_input_pair = self.tokenizer_dict['tokenizer_process'](question, paragraph)
        # outputs
        paragraph_splits = []
        answer_spans = []
        # get answer end char idx
        ans_start = answer_span[0]
        ans_end = answer_span[1]
        """
        1) Find index of context segments in the tokenized example
        2) Within the context segments (start from context_segment_idx), 
           find the token corresponding to span of answer: start and end.
        """
        for offset_idx, offset in enumerate(tokenized_input_pair.offset_mapping):
            # get sequence ids
            sequence_ids = tokenized_input_pair.sequence_ids(offset_idx)
            # find start index of context segment
            context_segment_idx = sequence_ids.index(1)
            span_start_offset_idx = self.index_of_first(
                tokenized_input_pair.offset_mapping[offset_idx][context_segment_idx:], 
                lambda span: span[0] <= ans_start <= span[1]
            )
            span_end_offset_idx = self.index_of_first(
                tokenized_input_pair.offset_mapping[offset_idx][context_segment_idx:], 
                lambda span: span[0] <= ans_end <= span[1]
            )
            # Decode split into a string
            decoded_split = self.tokenizer_dict['tokenizer'].decode(tokenized_input_pair.input_ids[offset_idx][context_segment_idx:], skip_special_tokens=True)
            # 
            paragraph_splits.append(decoded_split)
            if span_start_offset_idx is not None and span_end_offset_idx is not None:
                # If answer span is fully in current slice
                # add segment idx offset
                span_start_offset_idx += context_segment_idx
                span_end_offset_idx += context_segment_idx + 1 # the plus 1 is needed for correct slicing
                answer_spans.append((span_start_offset_idx, span_end_offset_idx))
            elif span_start_offset_idx is None and span_end_offset_idx is None:
                # If span not in this slice, but in another slice
                # map answer to (CLS, CLS)
                cls_idx = tokenized_input_pair.input_ids[offset_idx].index(self.tokenizer_dict['tokenizer'].cls_token_id)
                # NOTE(Alex): although I think it's always 0
                answer_spans.append((cls_idx, cls_idx))
            else:
                # span spans along multiple slices -> throw the sample away 
                # (should be only like 4 samples across the whole dataset)
                # Discard sample
                pass

        return (paragraph_splits, answer_spans)   

    def preprocessData(self,data):
        dataframe_list = []
        for conv_id in tqdm(data.conversation_id.unique()):
#            try:
            conversation = data[data.conversation_id == conv_id].context.unique()[0]
            question = data[data.conversation_id == conv_id].question.tolist()
            answer = data[data.conversation_id == conv_id].answers.tolist()
            ## for one conversation_id - 
            for idx in range(len(question)):
                ##for each question inside conversation
                ques_text = question[idx].strip()
                answer_text = eval(answer[idx])['text'].strip()
                answer_start = eval(answer[idx])['answer_start']
                answer_end = answer_start + len(answer_text)
                par_splits, split_answer_spans = self.split_paragraph_if_needed(
                                    conversation, 
                                    ques_text, 
                                    (answer_start, answer_end)
                                )
                #pair_overflows = len(par_splits) > 1
                for split_idx, (split_text, split_ans_span) in enumerate(zip(par_splits, split_answer_spans)):
                    #print(answer_text)
                    dataframe_list.append({
                        'conversation_id':conv_id,
                        'paragraph':split_text,
                        'question_id':idx,
                        'question_text':ques_text,
                        'answer_text':answer_text,
                        'answer_start':answer_start,
                        'tokenizer_answer_start':split_ans_span[0],
                        'tokenizer_answer_end':split_ans_span[1],
                    })
                #print(f'proccesed conv_id - {conv_id}')
#             except:
#                 print(f'conversation_id - {conv_id}')
        return pd.DataFrame(dataframe_list)
    
    def get_params_for_optimizer(self,model, no_decay, weight_decay=0.0001):
        param_optimizer = list(model.named_parameters())
        optimizer_parameters = [
            {
                'params': [
                    p for n, p in param_optimizer if not any(
                        nd in n for nd in no_decay
                    )
                ], 
                'weight_decay': weight_decay
            },
            {
                'params': [
                    p for n, p in param_optimizer if any(
                        nd in n for nd in no_decay
                    )
                ],
                'weight_decay': 0.0
            },
        ]
        return optimizer_parameters
    
    
    def loadModel(self,load_model=False):
        if load_model:
            model = ParametricBertModelQA(self.utilsObj.model_path)
        else:
            model = ParametricBertModelQA(self.params_dict['model_url'])
            model.model.save_pretrained(f'{self.utilsObj.model_path}/')
        
        if self.device=='cuda':
            model = model.cuda()
        
        # Define parameters on which to apply L2 decay
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        if self.params_dict["train_params"]["weight_decay"] > 0.0:
            model_params_optimizer = self.get_params_for_optimizer(model, no_decay, weight_decay=self.params_dict["train_params"]["weight_decay"])
        else:
            model_params_optimizer = model.parameters()

        # Define optimizer
        optimizer = AdamW(
            model_params_optimizer, 
            lr=params_dict["train_params"]["initial_lr"], 
            correct_bias=False
        )
        return model,optimizer
        
    def loadTokenizer(self,load_model=False):
        if load_model:
            tokenizer = AutoTokenizer.from_pretrained(self.utilsObj.model_path)
        else:
            tokenizer = AutoTokenizer.from_pretrained(self.params_dict["tokenizer_url"])
        tokenizer_fn_preprocess = partial(bert_tokenizer_fn, tokenizer=tokenizer, max_length=self.params_dict["tokenizer_max_length"]-3)
        tokenizer_fn_train = partial(bert_tokenizer_fn, tokenizer=tokenizer, max_length=self.params_dict["tokenizer_max_length"])
        tokenizer.save_pretrained(f'{self.utilsObj.model_path}/')
        tokenizer_dict = {"tokenizer":tokenizer,
                          "tokenizer_process":tokenizer_fn_preprocess,
                          "tokenizer_train":tokenizer_fn_train}
        return tokenizer_dict
    
    def train_step(self,scaler, loss_function, dataloader, scheduler=None, device="cpu", show_progress=False):
        acc_loss = 0
        acc_start_accuracy = 0
        acc_end_accuracy = 0
        count = 0

        time_start = timer()

        self.model.train()
        wrapped_dataloader = tqdm(dataloader) if show_progress else dataloader
        for batch in wrapped_dataloader:
            # NOTE: we'll pass directly the batch dict to the model for inputs.
            answer_spans_start = batch["y_gt"][:, 0]
            answer_spans_end = batch["y_gt"][:, 1]
            # Clear gradients
            self.model.zero_grad()
            # Place to right device
            answer_spans_start = answer_spans_start.to(device)
            answer_spans_end = answer_spans_end.to(device)
            # Use Automatic Mixed Precision if enabled
            with torch.cuda.amp.autocast(enabled=scaler.is_enabled()):
                # Run forward pass
                pred_answer_start_scores, pred_answer_end_scores = self.model(batch)
                pred_answer_start_scores = pred_answer_start_scores.to(device)
                pred_answer_end_scores = pred_answer_end_scores.to(device)

                # Compute the CrossEntropyLoss
                loss = (loss_function(pred_answer_start_scores, answer_spans_start) + loss_function(pred_answer_end_scores, answer_spans_end))/2.0
            scaler.scale(loss).backward()
            # Optimizer step (via scaler)
            scaler.step(self.optimizer)
            scaler.update()
            # Update LR scheduler
            if scheduler is not None:
                scheduler.step()
            # --- Compute metrics ---
            # Get span indexes
            pred_span_start_idxs, pred_span_end_idxs = extract_most_probable(pred_answer_start_scores, pred_answer_end_scores)
            gt_start_idxs = answer_spans_start.cpu().detach()
            gt_end_idxs = answer_spans_end.cpu().detach()
            # two accs
            start_accuracy = torch.sum(gt_start_idxs == pred_span_start_idxs) / len(pred_span_start_idxs)
            end_accuracy = torch.sum(gt_end_idxs == pred_span_end_idxs) / len(pred_span_end_idxs)
            # Gather stats
            acc_loss += loss.item()
            acc_start_accuracy += start_accuracy.item()
            acc_end_accuracy += end_accuracy.item()
            count += 1
        time_end = timer()
        return {
            "loss": acc_loss / count, 
            "accuracy_start": acc_start_accuracy / count, 
            "accuracy_end": acc_end_accuracy / count,
            "time": time_end - time_start
        }
    
    @torch.no_grad()
    def validation_step(self,scaler, loss_function, dataloader, device="cpu", show_progress=False):
        acc_loss = 0
        acc_start_accuracy = 0
        acc_end_accuracy = 0
        count = 0

        time_start = timer()
        wrapped_dataloader = tqdm(dataloader) if show_progress else dataloader

        self.model.eval()
        for batch in wrapped_dataloader:
            answer_spans_start = batch["y_gt"][:, 0]
            answer_spans_end = batch["y_gt"][:, 1]
            # Place to right device
            answer_spans_start = answer_spans_start.to(device)
            answer_spans_end = answer_spans_end.to(device)
            # Use Automatic Mixed Precision if enabled
            with torch.cuda.amp.autocast(enabled=scaler.is_enabled()):
                # Run forward pass
                pred_answer_start_scores, pred_answer_end_scores = self.model(batch)
                pred_answer_start_scores = pred_answer_start_scores.to(device)
                pred_answer_end_scores = pred_answer_end_scores.to(device)
                # Compute the CrossEntropyLoss
                loss = (loss_function(pred_answer_start_scores, answer_spans_start) + loss_function(pred_answer_end_scores, answer_spans_end))/2.0
            # --- Compute metrics ---
            # Get span indexes
            pred_span_start_idxs, pred_span_end_idxs = extract_most_probable(pred_answer_start_scores, pred_answer_end_scores)
            gt_start_idxs = answer_spans_start.cpu().detach()
            gt_end_idxs = answer_spans_end.cpu().detach()
            # two accs
            start_accuracy = torch.sum(gt_start_idxs == pred_span_start_idxs) / len(pred_span_start_idxs)
            end_accuracy = torch.sum(gt_end_idxs == pred_span_end_idxs) / len(pred_span_end_idxs)
            # Gather stats
            acc_loss += loss.item()
            acc_start_accuracy += start_accuracy.item()
            acc_end_accuracy += end_accuracy.item()
            count += 1
        time_end = timer()
        return {
            "loss": acc_loss / count, 
            "accuracy_start": acc_start_accuracy / count, 
            "accuracy_end": acc_end_accuracy / count,
            "time": time_end - time_start
        }        
    
    def train_model(self,train_process_df,val_process_df,epochs=None,batch_size=None,use_amp = True):

        if epochs:
            self.params_dict["train_params"]["epochs"] = epochs
        if batch_size:
            self.params_dict['train_params']['batch_size_train'] = batch_size
            self.params_dict['train_params']['batch_size_val'] = batch_size
        # Estimate the number of train steps for LR scheduler
        num_train_steps = int(
            (len(train_process_df) / self.params_dict["train_params"]["batch_size_train"]) * self.params_dict["train_params"]["epochs"]
        )

        num_warmup_steps = int(num_train_steps * 0.1) # 10% of warmup steps

        # LR scheduler
        scheduler = get_linear_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=num_warmup_steps,
            num_training_steps=num_train_steps
        )
        loss_function = nn.CrossEntropyLoss()
        if self.device=='cuda':
            loss_function = loss_function.cuda()
        scaler = torch.cuda.amp.GradScaler(enabled=use_amp)
        history = {
            "train_loss": [], "train_acc_start": [], "train_acc_end": [],
            "val_loss": [], "val_acc_start": [], "val_acc_end": []
        }
        #creating dataloader
        train_dataset = CustomQADatasetBERT(self.tokenizer_dict['tokenizer_train'],train_process_df)
        val_dataset = CustomQADatasetBERT(self.tokenizer_dict['tokenizer_train'],val_process_df)
        
        train_data_loader = torch.utils.data.DataLoader(train_dataset, 
                                                collate_fn = bert_padder_collate_fn, 
                                                batch_size=self.params_dict['train_params']['batch_size_train'],
                                                shuffle=True)
        val_data_loader = torch.utils.data.DataLoader(val_dataset, 
                                              collate_fn = bert_padder_collate_fn, 
                                              batch_size=self.params_dict['train_params']['batch_size_val'], 
                                              shuffle=True)
        
        loop_start = timer()
        self.utilsObj.save_params(self.params_dict)
        for epoch in range(self.epoch_start+1,self.params_dict["train_params"]["epochs"]):
            train_dict = self.train_step(scaler, loss_function, train_data_loader,scheduler=scheduler,device=self.device,show_progress=True)
            val_dict = self.validation_step(scaler, loss_function, val_data_loader,device=self.device)
            cur_lr = self.optimizer.param_groups[0]['lr']
            checkpoints = {
              'epoch':epoch,
              'state_dict':self.model.state_dict(),
              'optimizer_state_dict': self.optimizer.state_dict(),
              'validation_loss':val_dict['loss'],
              'training_loss':train_dict['loss']
            }
            self.utilsObj.save_checkpoints(checkpoints)
            if val_dict["loss"] < self.val_min_loss:
                print(f'validation loss decreased {self.val_min_loss:.3f} --> {val_dict["loss"]:.3f}')
                self.val_min_loss = val_dict["loss"]
                self.utilsObj.save_checkpoints(checkpoints,True)
            print(f'Epoch: {epoch}, '
                  f'lr: {cur_lr}, '
                  f'Train loss: {train_dict["loss"]:.4f}, '
                  f'Train acc start: {train_dict["accuracy_start"]:.4f}, '
                  f'Train acc end: {train_dict["accuracy_end"]:.4f}, '
                  f'Val loss: {val_dict["loss"]:.4f}, '
                  f'Val acc start: {val_dict["accuracy_start"]:.4f}, '
                  f'Val acc end: {val_dict["accuracy_end"]:.4f}, '
                  f'Time: {train_dict["time"]:.4f}')
            history["train_loss"].append(train_dict["loss"]);history["train_acc_start"].append(train_dict["accuracy_start"]);history["train_acc_end"].append(train_dict["accuracy_end"]);
            history["val_loss"].append(val_dict["loss"]);history["val_acc_start"].append(val_dict["accuracy_start"]);history["val_acc_end"].append(val_dict["accuracy_end"]);
        loop_end = timer()
        print(f"Elapsed time: {(loop_end - loop_start):.4f}")
        self.utilsObj._saveResult(history)
        self.utilsObj.save_best_model(self.model)

## Execute Function

In [17]:
data = pd.read_excel('drive/MyDrive/PII_Training/squad.xlsx')

In [18]:
data.head()

Unnamed: 0,conversation_id,context,question,answers
0,2571,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce start becoming popular?,"{'text': 'in the late 1990s', 'answer_start': ..."
1,2571,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What areas did Beyonce compete in when she was...,"{'text': 'singing and dancing', 'answer_start'..."
2,2571,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce leave Destiny's Child and bec...,"{'text': '2003', 'answer_start': 526}"
3,2571,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In what city and state did Beyonce grow up?,"{'text': 'Houston, Texas', 'answer_start': 166}"
4,2571,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In which decade did Beyonce become famous?,"{'text': 'late 1990s', 'answer_start': 276}"


In [19]:
train_df,val_df = splitDataFrame(data,limit=20)

In [20]:
train_df.shape,val_df.shape

((175, 4), (76, 4))

In [49]:
params_dict = {
        "model_url": "distilbert-base-cased-distilled-squad",
        "tokenizer_url": "distilbert-base-cased-distilled-squad",
        "tokenizer_max_length": 384,
        "train_params": {
            "epochs": 2,
            "initial_lr": 0.00003,
            "batch_size_train": 8,
            "batch_size_val": 8,
            "weight_decay": 0.01
        } }

## Train Model

In [22]:
train_args = {'params_dict':params_dict,
       'folder_name':'distilbert'}

In [34]:
modelObj = QAModel(train_args=train_args)



In [24]:
train_process_df = modelObj.preprocessData(train_df)
val_process_df = modelObj.preprocessData(val_df)

100%|██████████| 14/14 [00:00<00:00, 20.88it/s]
100%|██████████| 6/6 [00:00<00:00, 13.96it/s]


In [35]:
modelObj.train_model(train_process_df,val_process_df)

100%|██████████| 24/24 [00:06<00:00,  3.69it/s]


validation loss decreased inf --> 1.711
Epoch: 0, lr: 1.6046511627906977e-05, Train loss: 2.3701, Train acc start: 0.6438, Train acc end: 0.3000, Val loss: 1.7114, Val acc start: 0.7250, Val acc end: 0.4500, Time: 6.5086


100%|██████████| 24/24 [00:03<00:00,  7.88it/s]


validation loss decreased 1.711 --> 1.446
Epoch: 1, lr: 0.0, Train loss: 1.2061, Train acc start: 0.7156, Train acc end: 0.5719, Val loss: 1.4459, Val acc start: 0.7125, Val acc end: 0.5000, Time: 3.0527
Elapsed time: 23.7825
----- model is saved at drive/MyDrive/distilbert/2023_02_22_054405 ------


## load checkpoints

In [40]:
load_args = {'model_ckp':'distilbert/2023_02_22_054405',
'ckp_status':'last'}

In [46]:
modelObjckp = QAModel(load_args=load_args)



In [48]:
modelObjckp.train_model(train_process_df,val_process_df,epochs=5)

100%|██████████| 24/24 [00:03<00:00,  7.83it/s]


validation loss decreased 1.446 --> 1.404
Epoch: 2, lr: 2.635514018691589e-05, Train loss: 0.6235, Train acc start: 0.8042, Train acc end: 0.7604, Val loss: 1.4040, Val acc start: 0.7375, Val acc end: 0.5375, Time: 3.0719


100%|██████████| 24/24 [00:03<00:00,  7.67it/s]


Epoch: 3, lr: 1.9626168224299065e-05, Train loss: 0.3828, Train acc start: 0.9010, Train acc end: 0.8958, Val loss: 1.5869, Val acc start: 0.7625, Val acc end: 0.6125, Time: 3.1392


100%|██████████| 24/24 [00:03<00:00,  7.71it/s]


Epoch: 4, lr: 1.2897196261682243e-05, Train loss: 0.2630, Train acc start: 0.9375, Train acc end: 0.9115, Val loss: 1.6107, Val acc start: 0.7250, Val acc end: 0.5875, Time: 3.1240
Elapsed time: 25.4227
----- model is saved at drive/MyDrive/distilbert/2023_02_22_054405 ------


## Model Predict

In [63]:
from transformers import pipeline

In [64]:
context = data.context.iloc[0]
ques = data.question.iloc[0]
answer = data.answers.iloc[0]

In [68]:
qa_model = pipeline("question-answering", model='drive/MyDrive/distilbert/2023_02_22_054405/')

In [69]:
qa_model(question=ques,context=context)

{'score': 0.6709728837013245,
 'start': 281,
 'end': 294,
 'answer': '1990s as lead'}