In [None]:
!pip install transformers

In [None]:
# Mount Google Drive at /content/drive. train() writes its checkpoints under
# /content/drive/MyDrive/models by default, so the mount has to succeed first.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Location of the labelled data: the tab `sheet_name` of the Google Sheet
# `sheet_id`. TextDataset downloads this tab as CSV.
sheet_name = 'Sheet1'
sheet_id = "1kBPSqge0PG6TlHsk46MDq1XD-s2UNeceRxEGqGOTOgo"

# Definitions

## Dataset & DataLoader

In [None]:
import os
from urllib.parse import quote

import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer

class TextDataset(Dataset):
    """Question/answer classification examples read from a Google Sheet.

    The sheet tab is downloaded as CSV and must provide the columns
    ``question``, ``answers`` and ``topic``. Each example is the string
    ``"<question>|<answers>"`` together with the integer id of its topic.
    """

    # Topic name exactly as it appears in the sheet -> class id.
    TOPIC_MAPPING = {
        "1 The particulate nature of matter": 0,
        "2 Experimental techniques" : 1,
        "3 Atoms, elements and compounds": 2,
        "4 Stoichiometry": 3,
        "5 Electricity and chemistry": 4,
        "6 Chemical energetics": 5,
        "7 Chemical reactions": 6,
        "8 Acids, bases and salts": 7,
        "9 The Periodic Table": 8,
        "10 Metals": 9,
        "11 Air and water": 10,
        "12 Sulfur": 11,
        "13 Carbonates": 12,
        "14 Organic chemistry": 13,
    }

    # Columns the sheet has to contain for the dataset to be built.
    REQUIRED_COLUMNS = ('question', 'answers', 'topic')

    def __init__(self, sheet_id, sheet_name, max_length=128):
        """Download the sheet and prepare texts, labels and the tokenizer.

        Args:
            sheet_id: Id of the Google Sheets document.
            sheet_name: Name of the tab to read.
            max_length: Token length every example is padded/truncated to.

        Raises:
            KeyError: If a required column is missing, or a row's topic
                (after stripping surrounding whitespace) is not a key of
                ``TOPIC_MAPPING``.
        """
        # 1. read pandas dataframe from url
        # dtype=str keeps every cell as text, so cells that look numeric can
        # still be concatenated below; empty cells become ''.
        df = pd.read_csv(self._csv_url(sheet_id, sheet_name), dtype=str).fillna('')

        missing = [col for col in self.REQUIRED_COLUMNS if col not in df.columns]
        if missing:
            raise KeyError(
                f"sheet {sheet_name!r} is missing required column(s): {missing}"
            )

        # 2. make the qa column
        df['qa'] = df['question'] + '|' + df['answers']

        self.text_list = df['qa'].values.tolist()
        self.label = self._encode_topics(df['topic'])
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.max_length = max_length

    @staticmethod
    def _csv_url(sheet_id, sheet_name):
        """Build the gviz CSV export URL, percent-encoding the tab name."""
        # Tab names may contain spaces, '&', '#', ... which would otherwise
        # corrupt the query string.
        encoded_name = quote(str(sheet_name), safe='')
        return f"https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet={encoded_name}"

    @classmethod
    def _encode_topics(cls, topics):
        """Map topic names to class ids, reporting every unknown name at once."""
        cleaned = [str(topic).strip() for topic in topics]
        unknown = sorted(set(cleaned) - set(cls.TOPIC_MAPPING))
        if unknown:
            raise KeyError(
                f"unknown topic(s) {unknown}; expected one of {list(cls.TOPIC_MAPPING)}"
            )
        return [cls.TOPIC_MAPPING[topic] for topic in cleaned]

    def __len__(self):
        """Number of examples (rows read from the sheet)."""
        return len(self.text_list)

    def __getitem__(self, idx):
        """Return ``(label, encoded_inputs)`` for the example at ``idx``.

        ``encoded_inputs`` is the tokenizer output as a dict of tensors with
        the leading dimension squeezed away, so that a DataLoader can stack
        examples into a batch.
        """
        encoded_inputs = self.tokenizer(
            self.text_list[idx],
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
        )
        # return_tensors='pt' adds a leading batch dimension of size 1; drop it.
        encoded_inputs = {k: v.squeeze(0) for k, v in encoded_inputs.items()}

        return self.label[idx], encoded_inputs
        
        

### Quick Test

In [None]:
# Build the dataset with the default max_length to smoke-test the sheet download.
dataset = TextDataset(sheet_id=sheet_id, sheet_name=sheet_name)

In [None]:
# Tiny shuffled batches are enough for a shape check.
train_loader = DataLoader(dataset=dataset, shuffle=True, batch_size=2)

In [None]:
# Each batch is (labels, inputs); show the shape of the batched input_ids.
next(iter(train_loader))[1]['input_ids'].size()

torch.Size([2, 128])

## Model

In [None]:
from torch import nn
from transformers import BertModel


class MyModel(nn.Module):
    """BERT encoder followed by a single linear classification layer.

    The classifier reads the encoder's final hidden state at sequence
    position 0 and maps it to ``out_dim`` logits.
    """

    def __init__(self, out_dim, pretrained_name='bert-base-uncased'):
        """
        Args:
            out_dim: Number of output logits (one per class).
            pretrained_name: Hugging Face checkpoint the encoder is loaded
                from. Defaults to the checkpoint this notebook was written for.
        """
        super().__init__()
        # The attribute names `bert` and `layer1` become the key prefixes of
        # state_dict(), so they must stay stable for saved .pth files to load.
        self.bert = BertModel.from_pretrained(pretrained_name)

        # Size the head from the encoder's config instead of hard-coding 768,
        # so a checkpoint with a different hidden size works too.
        self.layer1 = nn.Linear(self.bert.config.hidden_size, out_dim)

    def forward(self, input_ids, attention_mask):
        """Return logits of shape [batch size, out_dim]."""
        output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Hidden state of the first token only -> [batch size, num hidden dim]
        sentence_emb = output['last_hidden_state'][:, 0, :]

        return self.layer1(sentence_emb)



## Training & Evaluation

In [None]:
from tqdm.notebook import tqdm
import torch
import numpy as np

def train(model, dataloader, test_loader, loss, lr, num_epochs, save_dir='/content/drive/MyDrive/models'):
    """Fine-tune ``model`` with Adam, evaluating and checkpointing after every epoch.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns logits. Batches are moved to the device its parameters
            live on.
        dataloader: Yields ``(labels, inputs)`` training batches, where
            ``inputs`` is a dict holding ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        test_loader: Batches in the same format, handed to ``accuracy``
            after each epoch.
        loss: Callable ``loss(logits, labels)`` returning a scalar tensor.
        lr: Learning rate for Adam.
        num_epochs: Number of passes over ``dataloader``.
        save_dir: Directory that receives one ``{epoch}.pth`` state dict per
            epoch. It is created when it does not exist.

    Returns:
        List with the value returned by ``accuracy`` after each epoch.
    """
    # Follow the model's device instead of assuming CUDA is present.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    # Create the checkpoint directory up front so a missing folder cannot
    # throw away a finished epoch when torch.save runs.
    os.makedirs(save_dir, exist_ok=True)

    # pytorch training loop
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    history = []
    for epoch in range(num_epochs):
        # Hugging Face models loaded with from_pretrained start in eval mode;
        # switch to training mode explicitly so dropout is active while
        # fine-tuning (and again after each evaluation below).
        model.train()
        for labels, inputs in tqdm(dataloader):
            inputs = {k: v.to(device) for k, v in inputs.items()}
            labels = labels.to(device)
            logits = model(inputs['input_ids'], inputs['attention_mask'])
            batch_loss = loss(logits, labels)

            # do gradient descent
            optimizer.zero_grad()  # clear gradients left from the last step
            batch_loss.backward()  # gradients for the current batch
            optimizer.step()       # update the weights using the gradients

        # Evaluate deterministically and without building autograd graphs.
        model.eval()
        with torch.no_grad():
            acc = accuracy(model, test_loader)
        history.append(acc)
        torch.save(model.state_dict(), f'{save_dir}/{epoch}.pth')

    return history


def accuracy(model, test_loader):
    """Return the fraction of examples in ``test_loader`` classified correctly.

    The model is evaluated in eval mode with gradients disabled; the
    training/eval flag of every sub-module is restored afterwards, so calling
    this in the middle of training does not change the model's mode.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns per-class scores of shape [batch size, num classes].
        test_loader: Iterable of ``(labels, inputs)`` batches, where
            ``inputs`` is a dict holding ``'input_ids'`` and
            ``'attention_mask'`` tensors.

    Returns:
        Number of correct predictions divided by the number of examples.

    Raises:
        ValueError: If ``test_loader`` yields no batches.
    """
    # Follow the model's device instead of assuming CUDA is present.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    # Remember each sub-module's mode so it can be restored exactly.
    saved_modes = [(module, module.training) for module in model.modules()]
    model.eval()

    pred = []
    label = []
    try:
        with torch.no_grad():
            for y, x in test_loader:
                x = {k: v.to(device) for k, v in x.items()}
                h = model(x['input_ids'], x['attention_mask'])

                # Reduce to class ids per batch so the full logit matrix of
                # the whole test set is never held in memory.
                pred.append(np.argmax(h.detach().cpu().numpy(), 1))
                label.append(y.detach().cpu().numpy())
    finally:
        for module, was_training in saved_modes:
            module.training = was_training

    if not label:
        raise ValueError('test_loader yielded no batches; accuracy is undefined')

    pred = np.concatenate(pred, 0)
    label = np.concatenate(label, 0)

    return (pred == label).sum() / label.shape[0]

# Do Training!

In [None]:
# hyper parameters
lr = 1e-5
num_epochs = 3
out_dim = 14
batch_size = 32
max_len = 128

In [None]:
# Prefer the GPU, but place the model on the CPU when CUDA is unavailable.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = MyModel(out_dim).to(device)

# Use the max_len hyperparameter instead of repeating the literal 128.
train_dataset = TextDataset(sheet_id, sheet_name, max_len)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# NOTE(review): the "test" data is the very same sheet as the training data,
# so the accuracies in `history` are training accuracies, not held-out ones.
# Point this at a separate sheet/tab (or split the dataset) for a real
# evaluation. Until then, reuse the dataset object rather than downloading
# the sheet and the tokenizer a second time.
test_dataset = train_dataset
test_loader = DataLoader(test_dataset, batch_size=batch_size)
loss = nn.CrossEntropyLoss()


history = train(model, train_loader, test_loader, loss, lr, num_epochs)

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

## Load Saved Model

In [None]:
model = MyModel(out_dim)

# 0.pth is the checkpoint written after the first epoch (epoch index 0).
# The state dict was saved from a model living on CUDA; map_location='cpu'
# lets it load into this CPU-resident model even on a runtime without a GPU.
state_dict = torch.load('/content/drive/MyDrive/models/0.pth', map_location='cpu')
model.load_state_dict(state_dict)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<All keys matched successfully>