In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from google.colab import files
files.upload()

!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!pip install kaggle
!chmod 600 /root/.kaggle/kaggle.json
!kaggle datasets download -d ananthu017/squad-csv-format
!unzip squad-csv-format


Saving kaggle.json to kaggle.json
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Downloading squad-csv-format.zip to /content
 57% 5.00M/8.75M [00:00<00:00, 12.3MB/s]
100% 8.75M/8.75M [00:00<00:00, 19.8MB/s]
Archive:  squad-csv-format.zip
  inflating: SQuAD_csv.csv           


In [2]:
import pandas as pd 

df = pd.read_csv('/content/SQuAD_csv.csv')
df


Unnamed: 0.1,Unnamed: 0,context,question,id,answer_start,text
0,0,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce start becoming popular?,56be85543aeaaa14008c9063,269,in the late 1990s
1,1,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What areas did Beyonce compete in when she was...,56be85543aeaaa14008c9065,207,singing and dancing
2,2,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce leave Destiny's Child and bec...,56be85543aeaaa14008c9066,526,2003
3,3,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In what city and state did Beyonce grow up?,56bf6b0f3aeaaa14008c9601,166,"Houston, Texas"
4,4,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In which decade did Beyonce become famous?,56bf6b0f3aeaaa14008c9602,276,late 1990s
...,...,...,...,...,...,...
86816,271,"Kathmandu Metropolitan City (KMC), in order to...",In what US state did Kathmandu first establish...,5735d259012e2f140011a09d,229,Oregon
86817,272,"Kathmandu Metropolitan City (KMC), in order to...",What was Yangon previously known as?,5735d259012e2f140011a09e,414,Rangoon
86818,273,"Kathmandu Metropolitan City (KMC), in order to...",With what Belorussian city does Kathmandu have...,5735d259012e2f140011a09f,476,Minsk
86819,274,"Kathmandu Metropolitan City (KMC), in order to...",In what year did Kathmandu create its initial ...,5735d259012e2f140011a0a0,199,1975


In [3]:
import torch
import torch.nn as nn

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda', index=0)

In [4]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")

In [53]:
from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
    def __init__(self, 
                 tokenizer,
                 df,
                 context_column = "context",
                 question_column = "question",
                 answer_column = "text"):
        self.tokenizer = tokenizer 
        self.df = df 
        self.context_column = context_column
        self.question_column = question_column
        self.answer_column = answer_column

    def __getitem__(self, index):
        context = self.df[self.context_column].iloc[index]
        question = self.df[self.question_column].iloc[index]
        question = context + question
        answer = self.df[ self.answer_column].iloc[index]

        question_tensor = torch.LongTensor(self.tokenizer.encode(str(question))).to(device)
        answer_tensor = torch.LongTensor(self.tokenizer.encode(str(answer))).to(device)
        return {
            "answer" : answer_tensor,
            "question" : question_tensor
        }

    def __len__(self):
        return len(self.df)

In [54]:
from torch.utils.data import random_split

dataset = TextDataset(tokenizer, df)

train_size = int(0.1 * len(dataset))
test_size = len(dataset) - train_size

train_dataset, val_dataset = random_split(dataset, [train_size, test_size])

train_dataloader = DataLoader(train_dataset, batch_size=10, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=10, shuffle=True)

In [55]:
from transformers import GPT2LMHeadModel

model = GPT2LMHeadModel.from_pretrained('gpt2').to(device)


In [56]:
model(train_dataset[0]['question'].unsqueeze(0))[0].shape

torch.Size([1, 174, 50257])

In [57]:
optimizer = torch.optim.Adam(model.parameters(), lr = 5e-2)
criterion = nn.CrossEntropyLoss()

In [58]:
from tqdm.auto import tqdm

num_epochs = 3

for epoch in tqdm(range(num_epochs)):
    sum_loss, cnt_loss = 0, 0
    pbar = tqdm(train_dataset, desc=f"Epoch {epoch + 1}. Train Loss: {0}")
    for batch in pbar:
        optimizer.zero_grad() 
        
        input = torch.cat([batch['question'], batch['answer']], 0)
        output_model = model(input)[0]
        interesting_output_model = output_model[(len(batch['question']) - 1):]
        interesting_output_model = interesting_output_model[:(len(interesting_output_model) - 1)]
        loss = criterion(interesting_output_model, batch['answer'])
        sum_loss += loss.item()
        cnt_loss += 1
        loss.backward()

        optimizer.step()
        pbar.set_description(f"Epoch {epoch + 1}. Train Loss: {loss.item()}")

  0%|          | 0/3 [00:00<?, ?it/s]

Epoch 1. Train Loss: 0:   0%|          | 0/8682 [00:00<?, ?it/s]

Epoch 2. Train Loss: 0:   0%|          | 0/8682 [00:00<?, ?it/s]

Epoch 3. Train Loss: 0:   0%|          | 0/8682 [00:00<?, ?it/s]

In [59]:
ind = 0
query = df['context'].iloc[ind] + df['question'].iloc[ind]
target = df['text'].iloc[ind]
inputs = tokenizer(query, return_tensors="pt")
inputs['input_ids'] = inputs['input_ids'].to(device)
inputs['attention_mask'] = inputs['attention_mask'].to(device)

generation_output = model.generate(**inputs, max_length=30)
outputs = tokenizer.decode(generation_output.cpu().flatten().numpy(), skip_special_tokens=True)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Input length of input_ids is 182, but ``max_length`` is set to 30. This can lead to unexpected behavior. You should consider increasing ``config.max_length`` or ``max_length``.


In [60]:
print(">>>>>>>>>>>>>>>>>>>", query)
print("<<<<<<<<<<<<<<<<<<<", outputs[len(query):])
print("===================", target)

>>>>>>>>>>>>>>>>>>> Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".When did Beyonce start becoming popular?
<<<<<<<<<<<<<<<<<<< June


In [61]:
ind = 1
query = df['context'].iloc[ind] + df['question'].iloc[ind]
target = df['text'].iloc[ind]
inputs = tokenizer(query, return_tensors="pt")
inputs['input_ids'] = inputs['input_ids'].to(device)
inputs['attention_mask'] = inputs['attention_mask'].to(device)

generation_output = model.generate(**inputs, max_length=30)
outputs = tokenizer.decode(generation_output.cpu().flatten().numpy(), skip_special_tokens=True)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Input length of input_ids is 187, but ``max_length`` is set to 30. This can lead to unexpected behavior. You should consider increasing ``config.max_length`` or ``max_length``.


In [62]:
print(">>>>>>>>>>>>>>>>>>>", query)
print("<<<<<<<<<<<<<<<<<<<", outputs[len(query):])
print("===================", target)

>>>>>>>>>>>>>>>>>>> Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".What areas did Beyonce compete in when she was growing up?
<<<<<<<<<<<<<<<<<<< June


In [63]:
ind = 2
query = df['context'].iloc[ind] + df['question'].iloc[ind]
target = df['text'].iloc[ind]
inputs = tokenizer(query, return_tensors="pt")
inputs['input_ids'] = inputs['input_ids'].to(device)
inputs['attention_mask'] = inputs['attention_mask'].to(device)

generation_output = model.generate(**inputs, max_length=30)
outputs = tokenizer.decode(generation_output.cpu().flatten().numpy(), skip_special_tokens=True)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Input length of input_ids is 188, but ``max_length`` is set to 30. This can lead to unexpected behavior. You should consider increasing ``config.max_length`` or ``max_length``.


In [64]:
print(">>>>>>>>>>>>>>>>>>>", query)
print("<<<<<<<<<<<<<<<<<<<", outputs[len(query):])
print("===================", target)

>>>>>>>>>>>>>>>>>>> Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".When did Beyonce leave Destiny's Child and become a solo singer?
<<<<<<<<<<<<<<<<<<< June
