In [None]:
!pip install transformers

In [None]:
import torch
from torch import nn
from torch.nn import CrossEntropyLoss, MSELoss
import pandas as pd
import transformers
from transformers import AlbertTokenizerFast
from transformers import AlbertModel, AlbertConfig, AlbertPreTrainedModel
from tqdm import tqdm

In [None]:
# Load the raw training table, then discard the "Unnamed: 0" column
# (presumably an index column written out when the CSV was saved).
df = pd.read_csv("train_data.csv")
df = df.drop(columns=["Unnamed: 0"])
df.head()

Unnamed: 0,Theme,Paragraph,Question,Answer_possible,Answer_text,Answer_start
0,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce leave Destiny's Child and bec...,True,['2003'],[526]
1,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What album made her a worldwide known artist?,True,['Dangerously in Love'],[505]
2,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,Who managed the Destiny's Child group?,True,['Mathew Knowles'],[360]
3,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyoncé rise to fame?,True,['late 1990s'],[276]
4,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What role did Beyoncé have in Destiny's Child?,True,['lead singer'],[290]


In [None]:
# preprocessing in text
def _clean_answer_text(raw):
  """Strip list brackets and surrounding quotes from a stringified answer (e.g. "['2003']"), then drop backslashes."""
  unwrapped = raw.lstrip("[").rstrip("]").strip("'").strip('"')
  return unwrapped.replace("\\", "")

def _clean_answer_start(raw):
  """Strip the list brackets from a stringified start offset (e.g. "[526]")."""
  return raw.lstrip('[').rstrip(']')

df['Answer_text'] = df['Answer_text'].apply(_clean_answer_text)
df['Answer_start'] = df['Answer_start'].apply(_clean_answer_start)

# One row needs a manual fix. Column 4 is Answer_text and column 1 is Paragraph.
# The answer text is searched for the literal "ufeff" (presumably what is left of an
# escaped "\ufeff" once backslashes were removed above), while the paragraph is
# searched for the actual U+FEFF character.
_BOM_ROW = 37668
df.iloc[_BOM_ROW, 4] = df.iloc[_BOM_ROW, 4].replace("ufeff", "")
df.iloc[_BOM_ROW, 1] = df.iloc[_BOM_ROW, 1].replace("\ufeff", "")
df.head()

Unnamed: 0,Theme,Paragraph,Question,Answer_possible,Answer_text,Answer_start
0,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce leave Destiny's Child and bec...,True,2003,526
1,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What album made her a worldwide known artist?,True,Dangerously in Love,505
2,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,Who managed the Destiny's Child group?,True,Mathew Knowles,360
3,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyoncé rise to fame?,True,late 1990s,276
4,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What role did Beyoncé have in Destiny's Child?,True,lead singer,290


In [None]:
def add_end_idx(df):
  """Add an ``Answer_end`` column and realign ``Answer_start`` offsets that are off by up to two characters.

  Columns are addressed by position: 1 = context paragraph, 3 = answer-possible flag,
  4 = answer text, 5 = answer start offset.

  For every answerable row the answer text is compared with the context slice at the
  recorded start offset and at shifts of -1, +1, -2 and +2 characters (in that order);
  the first shift that matches fixes the start/end pair. If no shift matches, the
  mismatch is printed and the recorded offsets are kept as they are, so the row still
  gets an end index and the new column always has one value per row.
  Rows that are not answerable get start = end = 0.

  Args:
    df: DataFrame with the column layout described above. It is modified in place.

  Returns:
    The same DataFrame, with column 5 rewritten as integers and a new ``Answer_end`` column.
  """
  # Pull each column out once instead of doing a scalar .iloc lookup per cell.
  contexts = df.iloc[:, 1].tolist()
  answerable_flags = df.iloc[:, 3].tolist()
  gold_texts = df.iloc[:, 4].tolist()
  raw_starts = df.iloc[:, 5].tolist()

  # Order matters: exact position first, then the nearest shifts.
  shifts = (0, -1, 1, -2, 2)

  start_idx_list = []
  end_idx_list = []
  for context, flag, gold_text, raw_start in zip(contexts, answerable_flags, gold_texts, raw_starts):
    if not (flag == True):
      start_idx_list.append(0)
      end_idx_list.append(0)
      continue

    start_idx = int(raw_start)
    end_idx = start_idx + len(gold_text)

    # sometimes squad answers are off by a character or two so we fix this
    for shift in shifts:
      lo, hi = start_idx + shift, end_idx + shift
      # A negative start would wrap around to the end of the string, so never accept it.
      if lo >= 0 and context[lo:hi] == gold_text:
        start_idx_list.append(lo)
        end_idx_list.append(hi)
        break
    else:
      # No nearby alignment found: report it and keep the recorded offsets so the
      # lists stay the same length as the DataFrame.
      print(context[start_idx:end_idx], gold_text)
      start_idx_list.append(start_idx)
      end_idx_list.append(end_idx)

  # Replace the start column wholesale (object dtype keeps plain Python ints).
  df[df.columns[5]] = pd.Series(start_idx_list, index=df.index, dtype=object)
  df['Answer_end'] = end_idx_list
  return df

# Run the offset repair on a copy so that `df` itself is not modified by add_end_idx.
df_preprocessed = add_end_idx(df.copy())
df_preprocessed.head()

Unnamed: 0,Theme,Paragraph,Question,Answer_possible,Answer_text,Answer_start,Answer_end
0,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce leave Destiny's Child and bec...,True,2003,526,530
1,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What album made her a worldwide known artist?,True,Dangerously in Love,505,524
2,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,Who managed the Destiny's Child group?,True,Mathew Knowles,360,374
3,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyoncé rise to fame?,True,late 1990s,276,286
4,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What role did Beyoncé have in Destiny's Child?,True,lead singer,290,301


### Utils

In [None]:
def normalize_text(s):
  """Lowercase the text, drop ASCII punctuation, blank out the articles a/an/the and collapse whitespace."""
  import string, re

  punctuation = set(string.punctuation)
  article_pattern = re.compile(r"\b(a|an|the)\b", re.UNICODE)

  # Same pipeline order as before: lower -> punctuation -> articles -> whitespace.
  lowered = s.lower()
  without_punc = "".join(filter(lambda ch: ch not in punctuation, lowered))
  without_articles = article_pattern.sub(" ", without_punc)
  return " ".join(without_articles.split())

def exact_match(prediction, truth):
    """Return True when prediction and truth are equal after normalize_text is applied to both."""
    normalized_prediction = normalize_text(prediction)
    normalized_truth = normalize_text(truth)
    return normalized_prediction == normalized_truth

def compute_f1(prediction, truth):
  """Token-level F1 between a predicted answer and the reference answer.

  Both strings are passed through normalize_text and split on whitespace. The overlap
  is counted as a multiset intersection, so a token that occurs twice in both answers
  contributes two matches. (Counting distinct tokens only would under-score answers
  with repeated words, e.g. identical answers "cat cat" would get 0.5 instead of 1.0.)

  Args:
    prediction: predicted answer text.
    truth: reference answer text.

  Returns:
    1 or 0 when either side normalizes to no tokens (1 only if both are empty),
    0 when there is no overlap, otherwise the F1 score rounded to two decimals.
  """
  from collections import Counter

  pred_tokens = normalize_text(prediction).split()
  truth_tokens = normalize_text(truth).split()

  # if either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
  if len(pred_tokens) == 0 or len(truth_tokens) == 0:
    return int(pred_tokens == truth_tokens)

  # Counter & Counter keeps, per token, the smaller of the two occurrence counts.
  overlap = Counter(pred_tokens) & Counter(truth_tokens)
  num_common = sum(overlap.values())

  # if there are no common tokens then f1 = 0
  if num_common == 0:
    return 0

  prec = num_common / len(pred_tokens)
  rec = num_common / len(truth_tokens)

  return round(2 * (prec * rec) / (prec + rec), 2)



def compute_f1_batch(outputs, batch, tokenizer):
  """Mean token-level F1 over one batch, rounded to three decimals.

  `outputs[1]` and `outputs[2]` are arg-maxed along dim 1 to get the predicted start and
  end token indices (the model in this notebook returns them as start/end logits after
  the loss). The reference span comes from `batch['start_positions']` and
  `batch['end_positions']`. Each span is the half-open slice `input_ids[start:end]`,
  decoded back to text with the tokenizer, and scored with compute_f1.
  """
  def decode_span(token_ids, start, end):
    # ids -> tokens -> string for the half-open token range [start, end)
    span_tokens = tokenizer.convert_ids_to_tokens(token_ids[start:end])
    return tokenizer.convert_tokens_to_string(span_tokens)

  predicted_starts = outputs[1].argmax(dim=1).tolist()
  predicted_ends = outputs[2].argmax(dim=1).tolist()

  all_input_ids = batch['input_ids'].tolist()
  gold_starts = batch['start_positions'].tolist()
  gold_ends = batch['end_positions'].tolist()

  truths = [decode_span(ids, lo, hi) for ids, lo, hi in zip(all_input_ids, gold_starts, gold_ends)]
  predictions = [decode_span(ids, lo, hi) for ids, lo, hi in zip(all_input_ids, predicted_starts, predicted_ends)]

  f1_total = sum(compute_f1(pred, truth) for pred, truth in zip(predictions, truths))
  return round(f1_total / len(truths), 3)



### Data Loader

In [None]:
class CustomDatset(torch.utils.data.Dataset):
  """Map-style dataset yielding one tokenized (paragraph, question) pair with answer token positions.

  Args:
    df: DataFrame with 'Paragraph', 'Question', 'Answer_text', 'Answer_start' and
      'Answer_end' columns. Rows are addressed by position (0 .. len(df) - 1), so the
      frame does not need a freshly reset index.
    tokenizer: callable tokenizer whose result offers `char_to_token`, `update` and
      `items`, and which exposes `model_max_length`.
    max_length: length every example is padded / truncated to (default 512).
  """
  def __init__(self, df, tokenizer, max_length=512):
    self.data = df
    self.tokenizer = tokenizer
    self.max_length = max_length

  def __len__(self):
    return len(self.data)

  def get_answers(self, idx):
    """Return the answer text and its character start/end offsets for the row at position idx."""
    row = self.data.iloc[idx]
    return {'text': row['Answer_text'], 'answer_start': row['Answer_start'], 'answer_end': row['Answer_end']}

  def add_token_positions(self, encodings, answers):
    """Convert the character offsets in `answers` to token positions and store them in `encodings`.

    A position that cannot be resolved is set to `self.tokenizer.model_max_length`,
    which serves as the "answer not inside the encoded input" marker.
    """
    # Use the tokenizer this dataset was built with, not whatever global happens to exist.
    unresolved = self.tokenizer.model_max_length

    # if start position is None, the answer passage has been truncated
    start_positions = encodings.char_to_token(answers['answer_start'])
    if start_positions is None:
      start_positions = unresolved

    # The end offset may fall on a character that maps to no token (e.g. a space),
    # so try the offset itself, then one character after, then one before.
    end_positions = None
    for offset in (0, 1, -1):
      end_positions = encodings.char_to_token(answers['answer_end'] + offset)
      if end_positions is not None:
        break
    if end_positions is None:
      end_positions = unresolved

    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})
    return encodings

  def __getitem__(self, idx):
    """Tokenize the row at position idx and return every encoding field as a tensor."""
    row = self.data.iloc[idx]
    encodings = self.tokenizer(row['Paragraph'], row['Question'],
                               max_length=self.max_length, truncation=True, padding='max_length')
    answers = self.get_answers(idx)
    encodings = self.add_token_positions(encodings, answers)
    return {key: torch.tensor(val) for key, val in encodings.items()}

### Model

In [None]:
class AlbertForQuestionAnswering(AlbertPreTrainedModel):
    """ALBERT encoder with a linear span-prediction head.

    The head maps every token's hidden state to `config.num_labels` values, which are
    split into a start logit and an end logit per token.
    """
    def __init__(self, config):
        super(AlbertForQuestionAnswering, self).__init__(config)
        self.num_labels = config.num_labels

        self.albert = AlbertModel(config)
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
                inputs_embeds=None, start_positions=None, end_positions=None):
        """Run the encoder and the span head.

        Returns a tuple: (loss,) start_logits, end_logits, followed by whatever the
        encoder returned from index 2 onward. The loss is only present when both
        `start_positions` and `end_positions` are given; it is the mean of the start
        and end cross-entropy losses.
        """
        outputs = self.albert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds
        )

        sequence_output = outputs[0]

        # One start logit and one end logit per token.
        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        outputs = (start_logits, end_logits,) + outputs[2:]
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split add a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            # Out-of-place clamp: the caller's label tensors must not be modified.
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            # Positions clamped to `ignored_index` (the sequence length) are skipped by the loss.
            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2
            outputs = (total_loss,) + outputs

        return outputs  # (loss), start_logits, end_logits, (hidden_states), (attentions)

### Split data into train and validation sets

Theme-based split: the 'Hunting' theme is held out for validation and all other themes are used for training.

In [None]:
# Training rows: every theme except 'Hunting'. reset_index() keeps the old row labels
# in an 'index' column and renumbers the rows from 0.
is_training_theme = df_preprocessed['Theme'] != 'Hunting'
train_df = df_preprocessed[is_training_theme].reset_index()
len(train_df)

74680

In [None]:
# Validation rows: the 'Hunting' theme only. reset_index() keeps the old row labels
# in an 'index' column and renumbers the rows from 0.
is_validation_theme = df_preprocessed['Theme'] == 'Hunting'
val_df = df_preprocessed[is_validation_theme].reset_index()
len(val_df)

375

### Model Training

In [None]:
# Checkpoint name shared by the config, the tokenizer and the model weights.
albert_model = 'albert-base-v1'
config = AlbertConfig.from_pretrained(albert_model)
tokenizer = AlbertTokenizerFast.from_pretrained(albert_model)
# from_pretrained is a classmethod: call it on the class directly instead of first
# building a randomly initialised model that is thrown away immediately.
model = AlbertForQuestionAnswering.from_pretrained(albert_model, config=config)

Downloading:   0%|          | 0.00/684 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/760k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of the model checkpoint at albert-base-v1 were not used when initializing AlbertForQuestionAnswering: ['predictions.decoder.bias', 'predictions.bias', 'predictions.LayerNorm.weight', 'predictions.decoder.weight', 'predictions.dense.bias', 'predictions.dense.weight', 'predictions.LayerNorm.bias']
- This IS expected if you are initializing AlbertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForQuestionAnswering were not initialized from the model checkpoint at albert-base-v1 and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN t

In [None]:
# Pick the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'You are working on {device}')

You are working on cuda


In [None]:
# Fine-tuning run: N_EPOCHS passes over the training set, each followed by a checkpoint
# save and a validation pass. Relies on `model`, `tokenizer`, `device`, `train_df`,
# `val_df`, `CustomDatset` and `compute_f1_batch` from earlier cells.
N_EPOCHS = 5

train_data = CustomDatset(train_df, tokenizer)
val_data = CustomDatset(val_df, tokenizer)
# Only the training loader is shuffled.
train_dataloader = torch.utils.data.DataLoader(train_data, batch_size=8, shuffle=True)
val_dataloader = torch.utils.data.DataLoader(val_data, batch_size=4)

# The optimizer holds references to the parameters of the `model` object that exists
# at this point.
optim = transformers.AdamW(model.parameters(), lr=2e-5, weight_decay = 0.01, no_deprecation_warning=True)

# load model if exist, else comment the line
# NOTE(review): if the line below is enabled, `model` is rebound AFTER the optimizer was
# created, so the optimizer would keep updating the previous object's parameters --
# move the optimizer construction below the load in that case.
# model = torch.load("qa_albertEncoder_2.pth", map_location=device)

model.to(device)


for epoch in range(N_EPOCHS):
  # ---- training pass ----
  model.train()
  loop = tqdm(train_dataloader, leave=True)
  
  # Per-batch loss and F1, averaged in the end-of-epoch summary.
  train_loss = []
  train_f1 = []
  for batch in loop:
    # NOTE(review): only input_ids and attention_mask are forwarded; any token_type_ids
    # in the batch are not passed to the model -- confirm this is intended.
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    start_positions = batch['start_positions'].to(device)
    end_positions = batch['end_positions'].to(device)
    outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
    # With positions supplied, the loss is the first element of the output tuple.
    loss = outputs[0]
    optim.zero_grad()
    loss.backward()
    optim.step()

    # F1 calculation
    f1 = compute_f1_batch(outputs, batch, tokenizer)

    train_loss.append(loss.item())
    train_f1.append(f1)

    loop.set_description(f'Epoch {epoch+1} Training')
    loop.set_postfix(loss=loss.item(), F1=f1)


  # saving the model 
  # torch.save on the module object pickles the whole model, not just its state_dict.
  model_path = f"qa_albertEncoder_{epoch+1}.pth"
  torch.save(model, model_path)

  # validation
  model.eval()
  loop = tqdm(val_dataloader, leave=True)
  val_f1 = []
  val_loss = []
  for batch in loop:
    # No gradients are tracked during validation.
    with torch.no_grad():
      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      start_positions = batch['start_positions'].to(device)
      end_positions = batch['end_positions'].to(device)
      outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
      loss = outputs[0]
      
      # F1 calculation
      f1 = compute_f1_batch(outputs, batch, tokenizer)

      val_loss.append(loss.item())
      val_f1.append(f1)

      loop.set_description(f'Epoch {epoch+1} Validation')
      loop.set_postfix(loss=loss.item(), F1=f1)

  # Epoch summary: unweighted means of the per-batch values collected above.
  print(f'\nEnd of epoch {epoch+1}|Training Loss: {sum(train_loss)/len(train_loss):.3f} F1 Score: {sum(train_f1)/len(train_f1):.3f}\
  |Validation Loss: {sum(val_loss)/len(val_loss):.3f} F1 Score: {sum(val_f1)/len(val_f1):.3f}')

NameError: name 'CustomDatset' is not defined

In [None]:
# Resume fine-tuning from a saved checkpoint: N_EPOCHS further passes over the training
# set, each followed by a checkpoint save and a validation pass. Relies on `tokenizer`,
# `device`, `train_df`, `val_df`, `CustomDatset` and `compute_f1_batch` from earlier cells.
N_EPOCHS = 1
# Number of epochs already completed by the checkpoint that is being resumed.
START_EPOCH = 4

train_data = CustomDatset(train_df, tokenizer)
val_data = CustomDatset(val_df, tokenizer)
# Only the training loader is shuffled.
train_dataloader = torch.utils.data.DataLoader(train_data, batch_size=8, shuffle=True)
val_dataloader = torch.utils.data.DataLoader(val_data, batch_size=4)

# load model if exist, else comment the line
# NOTE(review): the checkpoint is a pickled module object; torch.load executes pickle
# code, so only load checkpoint files you created yourself.
model = torch.load(f"qa_albertEncoder_{START_EPOCH}.pth", map_location=device)

model.to(device)

# Build the optimizer only AFTER the checkpoint has been loaded. It captures the
# parameters of the `model` object that exists at construction time; creating it before
# `model` is rebound would leave it updating the previous object's weights while the
# loaded model never changes.
optim = transformers.AdamW(model.parameters(), lr=2e-5, weight_decay = 0.01, no_deprecation_warning=True)


for epoch in range(START_EPOCH, N_EPOCHS + START_EPOCH):
  # ---- training pass ----
  model.train()
  loop = tqdm(train_dataloader, leave=True)

  # Per-batch loss and F1, averaged in the end-of-epoch summary.
  train_loss = []
  train_f1 = []
  for batch in loop:
    # NOTE(review): only input_ids and attention_mask are forwarded; any token_type_ids
    # in the batch are not passed to the model -- confirm this is intended.
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    start_positions = batch['start_positions'].to(device)
    end_positions = batch['end_positions'].to(device)
    outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
    # With positions supplied, the loss is the first element of the output tuple.
    loss = outputs[0]
    optim.zero_grad()
    loss.backward()
    optim.step()

    # F1 calculation
    f1 = compute_f1_batch(outputs, batch, tokenizer)

    train_loss.append(loss.item())
    train_f1.append(f1)

    loop.set_description(f'Epoch {epoch+1} Training')
    loop.set_postfix(loss=loss.item(), F1=f1)


  # saving the model
  # torch.save on the module object pickles the whole model, not just its state_dict.
  model_path = f"qa_albertEncoder_{epoch+1}.pth"
  torch.save(model, model_path)

  # validation
  model.eval()
  loop = tqdm(val_dataloader, leave=True)
  val_f1 = []
  val_loss = []
  for batch in loop:
    # No gradients are tracked during validation.
    with torch.no_grad():
      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      start_positions = batch['start_positions'].to(device)
      end_positions = batch['end_positions'].to(device)
      outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
      loss = outputs[0]

      # F1 calculation
      f1 = compute_f1_batch(outputs, batch, tokenizer)

      val_loss.append(loss.item())
      val_f1.append(f1)

      loop.set_description(f'Epoch {epoch+1} Validation')
      loop.set_postfix(loss=loss.item(), F1=f1)

  # Epoch summary: unweighted means of the per-batch values collected above.
  print(f'\nEnd of epoch {epoch+1}|Training Loss: {sum(train_loss)/len(train_loss):.3f} F1 Score: {sum(train_f1)/len(train_f1):.3f}\
  |Validation Loss: {sum(val_loss)/len(val_loss):.3f} F1 Score: {sum(val_f1)/len(val_f1):.3f}')

Epoch 5 Training: 100%|████████████████████████████████████████| 9335/9335 [59:52<00:00,  2.60it/s, F1=0.675, loss=1.35]
Epoch 5 Validation: 100%|██████████████████████████████████████████| 94/94 [00:06<00:00, 14.21it/s, F1=0.667, loss=0.83]


End of epoch 5|Training Loss: 0.481 F1 Score: 0.873  |Validation Loss: 0.835 F1 Score: 0.799





In [None]:
# A plain Python variable is not visible to the tokenizers library; the setting has to
# be placed in the process environment, as a string.
# NOTE(review): for this to take effect it presumably has to run before the tokenizer
# is first used -- consider moving this cell to the top of the notebook.
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Kept so that any existing reference to this notebook variable still resolves.
TOKENIZERS_PARALLELISM = False