In [1]:
from transformers import AutoTokenizer

In [3]:
# Directory of the base checkpoint; both the tokenizer and the QA model are
# loaded from this path via from_pretrained().
MODEL_PATH = "../models/distilbert-base-uncased"

In [2]:
# Module-level tokenizer; preprocess_function and the Trainer both read this name.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

In [4]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

# Load the base checkpoint into a question-answering architecture. Per the
# warning printed below this cell, qa_outputs.weight / qa_outputs.bias are newly
# initialized, so the model must be fine-tuned before it is used for prediction.
model = AutoModelForQuestionAnswering.from_pretrained(MODEL_PATH)

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at ../models/distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
class Options:
    """Plain container of tunable settings, stored as class-level attributes."""

    # Names and paths
    model_name = "distilbert-base-uncased"
    model_path = "/kaggle/working"
    model_save_name = "model.pt"

    # Batching (batch_size and num_workers are read by make_loaders)
    batch_size = 64
    num_workers = 2
    max_length = 140

    # Optimisation (learning_rate is read when the Adam optimizer is built)
    learning_rate = 3e-5
    epochs = 10
    patience = 2
    dropout = 0.5

    # Remaining settings
    num_labels = 2
    n_folds = 5


# Shared settings instance used by the cells that follow.
options = Options()

In [9]:
import torch
import json
import torch.nn as nn

class TweetDataset(torch.utils.data.Dataset):
    """Map-style dataset over a JSON Lines file (one JSON document per line).

    The constructor scans the file once and records the byte offset of every
    non-blank line, so ``__getitem__`` can seek directly to the requested record
    instead of re-reading the file from the top on every access.  The file is
    reopened for each access; no file handle is kept on the instance.

    Attributes:
        jsonl_file: Path of the JSON Lines file, as given to the constructor.
        len: Number of records (non-blank lines) found in the file.
    """

    def __init__(self, jsonl_file):
        """Index the byte offsets of all non-blank lines in ``jsonl_file``."""
        self.jsonl_file = jsonl_file
        self._offsets = []
        position = 0
        # Binary mode so that offsets are exact byte positions usable with seek().
        with open(jsonl_file, 'rb') as f:
            for raw_line in f:
                if raw_line.strip():
                    self._offsets.append(position)
                position += len(raw_line)
        self.len = len(self._offsets)

    def __getitem__(self, idx):
        """Return the parsed JSON document stored at record ``idx``.

        Negative indices count from the end, as with a list.

        Raises:
            IndexError: If ``idx`` is outside ``[-len, len)``.
        """
        if idx < 0:
            idx += self.len
        if not 0 <= idx < self.len:
            raise IndexError(f"index out of range for dataset of size {self.len}")
        with open(self.jsonl_file, 'rb') as f:
            f.seek(self._offsets[idx])
            return json.loads(f.readline().decode('utf-8'))

    def __len__(self):
        """Return the number of records in the file."""
        return self.len

In [None]:
def make_loaders(jsonl_file, options):
    """Build an unshuffled DataLoader over a JSON Lines file.

    Args:
        jsonl_file: Path handed to ``TweetDataset``.
        options: Object providing ``batch_size`` and ``num_workers``.

    Returns:
        A ``torch.utils.data.DataLoader`` wrapping a ``TweetDataset``.
    """
    loader_kwargs = {
        "batch_size": options.batch_size,
        "shuffle": False,
        "num_workers": options.num_workers,
    }
    return torch.utils.data.DataLoader(TweetDataset(jsonl_file), **loader_kwargs)

In [18]:
def append_train_data(item, mode='a', path='test.jsonl'):
    """Write ``item`` as one JSON Lines record (a JSON document plus newline).

    Args:
        item: JSON-serializable object to store.
        mode: File mode passed to ``open``; ``'a'`` appends, ``'w'`` starts the
            file over with this single record.
        path: Destination file. Defaults to ``'test.jsonl'``, the file the rest
            of the notebook reads.
    """
    # The trailing newline is what separates records: without it consecutive
    # appends end up on the same physical line and line-based readers see one
    # malformed record. Leaving the ``with`` block flushes and closes the file.
    with open(path, mode, encoding='utf-8') as f:
        f.write(json.dumps(item) + '\n')

In [None]:
# mode='w' truncates test.jsonl, so running this cell resets the file to this
# single record.
append_train_data({
   'answers': {'answer_start': [515], 'text': ['Saint Bernadette Soubirous']},
   'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
   'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?',
}, mode='w')

In [32]:
# Uses the default mode='a', so every re-run of this cell appends another copy
# of the same record to test.jsonl.
append_train_data({
   'answers': {'answer_start': [515], 'text': ['Saint Bernadette Soubirous']},
   'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
   'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?',
})

In [49]:
from datasets import load_dataset
# Both calls read the same ./test.jsonl. The 'test' split added on the last line
# is therefore a second copy of the training records, not held-out data.
my_dataset = load_dataset('json', data_files='./test.jsonl')
ds2 = load_dataset('json', data_files='./test.jsonl')
my_dataset['test'] = ds2['train']

In [50]:
my_dataset

DatasetDict({
    train: Dataset({
        features: ['answers', 'context', 'question'],
        num_rows: 11
    })
    test: Dataset({
        features: ['answers', 'context', 'question'],
        num_rows: 11
    })
})

In [13]:
def preprocess_function(examples):
    """Tokenize a batch of question/context pairs and label the answer token span.

    Relies on the module-level ``tokenizer``. Each question is stripped and
    encoded together with its context; only the context may be truncated
    (``truncation="only_second"``) and every example is padded to 384 tokens.

    Args:
        examples: Batch mapping with parallel lists under ``"question"``,
            ``"context"`` and ``"answers"``. Each answers entry is a dict with
            ``"answer_start"`` (character offsets into the context) and
            ``"text"`` lists; only the first element of each list is used.

    Returns:
        The tokenizer output (with ``offset_mapping`` removed) extended with
        ``"start_positions"`` and ``"end_positions"``: one token index per
        example, or ``0`` for both when the example has no answer or the answer
        is not completely inside the (possibly truncated) context.
    """
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]

        # Examples without an answer carry empty lists; label them (0, 0)
        # instead of failing on the [0] lookups below.
        if len(answer["answer_start"]) == 0 or len(answer["text"]) == 0:
            start_positions.append(0)
            end_positions.append(0)
            continue

        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the first and last token of the context (sequence id 1)
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while idx < len(sequence_ids) and sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0).
        # Both ends must be covered: an answer whose tail was truncated away
        # would otherwise get an end position one past the last context token.
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Last context token that starts at or before the answer's first character
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            # First context token (scanning backwards) that ends at or after the answer's end
            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [51]:
# Run preprocess_function in batched mode and drop the original columns
# (answers/context/question), leaving only what preprocess_function returns.
tokenized_squad = my_dataset.map(preprocess_function, batched=True, remove_columns=my_dataset["train"].column_names)

In [35]:
tokenized_squad

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 11
    })
})

In [15]:
from transformers import DefaultDataCollator
# preprocess_function already pads every example to max_length, so the plain
# default collator is used here.
data_collator = DefaultDataCollator()

In [16]:
# Hyperparameters for the Trainer run. These are set independently of the
# Options class: learning rate, batch size and epoch count here differ from
# options.learning_rate / options.batch_size / options.epochs.
# evaluation_strategy="epoch" corresponds to the one Validation Loss row per
# epoch shown in the training output.
training_args = TrainingArguments(
   output_dir="./results",
   evaluation_strategy="epoch",
   learning_rate=2e-5,
   per_device_train_batch_size=16,
   per_device_eval_batch_size=16,
   num_train_epochs=3,
   weight_decay=0.01,
   push_to_hub=False,
)

In [52]:
# Wire the model, arguments, tokenized splits, tokenizer and collator together.
# Note: the "test" split used for evaluation is built from the same
# ./test.jsonl file as "train", so the reported validation loss is not measured
# on held-out data.
trainer = Trainer(
   model=model,
   args=training_args,
   train_dataset=tokenized_squad["train"],
   eval_dataset=tokenized_squad["test"],
   tokenizer=tokenizer,
   data_collator=data_collator,
)

[codecarbon INFO @ 22:11:30] [setup] RAM Tracking...
[codecarbon INFO @ 22:11:30] [setup] GPU Tracking...
[codecarbon INFO @ 22:11:30] Tracking Nvidia GPU via pynvml
[codecarbon INFO @ 22:11:30] [setup] CPU Tracking...
[codecarbon INFO @ 22:11:32] CPU Model on constant consumption mode: 12th Gen Intel(R) Core(TM) i7-12700K
[codecarbon INFO @ 22:11:32] >>> Tracker's metadata:
[codecarbon INFO @ 22:11:32]   Platform system: Linux-5.15.133.1-microsoft-standard-WSL2-x86_64-with-glibc2.35
[codecarbon INFO @ 22:11:32]   Python version: 3.10.12
[codecarbon INFO @ 22:11:32]   CodeCarbon version: 2.2.3
[codecarbon INFO @ 22:11:32]   Available RAM : 15.619 GB
[codecarbon INFO @ 22:11:32]   CPU count: 20
[codecarbon INFO @ 22:11:32]   CPU model: 12th Gen Intel(R) Core(TM) i7-12700K
[codecarbon INFO @ 22:11:32]   GPU count: 1
[codecarbon INFO @ 22:11:32]   GPU model: 1 x NVIDIA GeForce RTX 3060


In [53]:
# Run the training loop; the returned TrainOutput is shown as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,5.605518
2,No log,5.38102
3,No log,5.259762


[codecarbon INFO @ 22:12:07] Energy consumed for RAM : 0.000003 kWh. RAM Power : 5.857310771942139 W
[codecarbon INFO @ 22:12:07] Energy consumed for all GPUs : 0.000082 kWh. Total GPU Power : 141.364 W
[codecarbon INFO @ 22:12:07] Energy consumed for all CPUs : 0.000025 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 22:12:07] 0.000110 kWh of electricity used since the beginning.
  df = pd.concat([df, pd.DataFrame.from_records([dict(data.values)])])


TrainOutput(global_step=3, training_loss=5.673928578694661, metrics={'train_runtime': 2.0782, 'train_samples_per_second': 15.879, 'train_steps_per_second': 1.444, 'total_flos': 3233664225792.0, 'train_loss': 5.673928578694661, 'epoch': 3.0})

In [54]:
# Save to ./results/distilbert; the inference cells below reload the pipeline,
# tokenizer and model from this directory.
trainer.save_model("./results/distilbert")

In [55]:
# Sample question/context pair used by the inference cells below.
question = "How many programming languages does BLOOM support?"
context = "BLOOM has 176 billion parameters and can generate text in 46 languages natural languages and 13 programming languages."

In [70]:
from transformers import pipeline
# End-to-end check through the question-answering pipeline, loading the
# checkpoint saved under ./results/distilbert.
question_answerer = pipeline("question-answering", model="./results/distilbert")
result = question_answerer(question=question, context=context)
# Bare expression so the result dict is displayed as the cell output.
result

{'score': 0.004142954014241695,
 'start': 10,
 'end': 95,
 'answer': '176 billion parameters and can generate text in 46 languages natural languages and 13'}

In [63]:
from transformers import AutoTokenizer

# Rebinds the module-level `tokenizer` to the one stored with the saved
# checkpoint, then encodes the sample pair as PyTorch tensors.
tokenizer = AutoTokenizer.from_pretrained("./results/distilbert")
inputs = tokenizer(question, context, return_tensors="pt")


In [64]:
import torch
from transformers import AutoModelForQuestionAnswering

# Rebinds the module-level `model` to the checkpoint saved under
# ./results/distilbert.
model = AutoModelForQuestionAnswering.from_pretrained("./results/distilbert")
# Inference only, so gradient tracking is disabled for the forward pass.
with torch.no_grad():
    outputs = model(**inputs)

In [71]:
# Pick the highest-scoring start token and end token independently.
answer_start_index = outputs.start_logits.argmax()
answer_end_index = outputs.end_logits.argmax()
print(f"{answer_start_index=}")
print(f"{answer_end_index=}")

# Nothing forces start <= end here: when the end index is smaller than the
# start index this slice is empty and the decoded answer is '' (as in the
# output shown below this cell).
predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]

answer = tokenizer.decode(predict_answer_tokens)
answer

12


''

In [10]:
# Manual optimizer / LR scheduler / loss setup driven by `options`.
# NOTE(review): none of these three objects is passed to the Trainer in the
# cells visible here — confirm whether this cell is still needed.
optimizer = torch.optim.Adam(model.parameters(), lr=options.learning_rate)
# Halve the learning rate when the monitored value (mode="min") stops improving
# for `patience` scheduler steps.
lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 
                                                         mode="min", 
                                                         factor=0.5, 
                                                         patience=2)
criterion = nn.CrossEntropyLoss()