In [None]:
# Install the Hugging Face `transformers` package; the tokenizer and model classes imported further down come from it.
!pip install transformers

In [None]:
# Mount Google Drive at /content/drive. train() writes its checkpoints under
# /content/drive/MyDrive/models by default, so this must run before training.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Identifiers of the Google Sheet and of the tab inside it.
sheet_name = 'Sheet1'
sheet_id = "1kBPSqge0PG6TlHsk46MDq1XD-s2UNeceRxEGqGOTOgo"

# Fill both identifiers into the gviz URL that requests the tab with `tqx=out:csv`.
data_path = "https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet={sheet_name}".format(
    sheet_id=sheet_id,
    sheet_name=sheet_name,
)
print(data_path)

In [None]:
# Topic names used as classification labels.
# TextDataset maps each name to its position in this list (first entry -> class 0),
# so the order fixes the class indices and every string must match the values of
# the data's 'topic' column exactly. evaluate() also uses the list as the
# target names of the classification report.
topics = ["1 The particulate nature of matter",
          "2 Experimental techniques",
          "3 Atoms, elements and compounds",
          "4 Stoichiometry",
          "5 Electricity and chemistry",
          "6 Chemical energetics",
          "7 Chemical reactions",
          "8 Acids, bases and salts",
          "9 The Periodic Table",
          "10 Metals",
          "11 Air and water",
          "12 Sulfur",
          "13 Carbonates",
          "14 Organic chemistry"]

# Definitions

## Dataset (consumed by DataLoader)

In [None]:
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, RobertaTokenizer
import pandas as pd

class TextDataset(Dataset):
    """Dataset of (label, tokenized text) pairs built from a DataFrame.

    Each row's 'topic' value is converted to an integer class index given by
    its position in the module-level `topics` list. Each 'text' value is
    tokenized lazily, when the item is requested.
    """

    def __init__(self, df, max_length=128, use_roberta=False):
        """Store texts and labels and load the tokenizer.

        Args:
            df: DataFrame with a 'text' column and a 'topic' column whose
                values all appear in `topics`.
            max_length: length every encoded sequence is padded/truncated to.
            use_roberta: load the 'roberta-base' tokenizer instead of
                'bert-base-uncased'.

        Raises:
            KeyError: if any 'topic' value is not present in `topics`.
        """
        topic_mapping = {top: i for i, top in enumerate(topics)}
        topic_values = df['topic'].tolist()

        # Fail fast (before the tokenizer is loaded) and name every offending
        # value instead of surfacing a bare KeyError for the first one only.
        unknown = sorted({str(t) for t in topic_values if t not in topic_mapping})
        if unknown:
            raise KeyError(f"Unknown topic label(s) in df['topic']: {unknown}")

        self.text_list = df['text'].values.tolist()
        self.label = [topic_mapping[t] for t in topic_values]
        if use_roberta:
            self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        else:
            self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.text_list)

    def __getitem__(self, idx):
        """Return (label, encoded_inputs) for the text at position `idx`.

        The text is padded/truncated to `self.max_length` tokens. The tokenizer
        returns tensors with a leading dimension of size 1, which is squeezed
        away so that a DataLoader can stack items into a batch.
        """
        text = self.text_list[idx]
        encoded_inputs = self.tokenizer(text, padding='max_length', truncation=True, max_length=self.max_length, return_tensors='pt')
        encoded_inputs = {k: v.squeeze(0) for k, v in encoded_inputs.items()}

        label = self.label[idx]
        return label, encoded_inputs
        
        

## Model

In [None]:
from torch import nn
from transformers import BertModel, RobertaModel


class MyModel(nn.Module):
    """Pretrained BERT or RoBERTa encoder followed by one linear classification layer."""

    def __init__(self, out_dim, use_roberta=False):
        """Load the pretrained encoder and create the output layer.

        Args:
            out_dim: number of output units of the linear layer.
            use_roberta: load 'roberta-base' instead of 'bert-base-uncased'.
        """
        super().__init__()
        # The attribute is called `bert` for both backbones.
        self.bert = (
            RobertaModel.from_pretrained('roberta-base')
            if use_roberta
            else BertModel.from_pretrained('bert-base-uncased')
        )
        # Maps the 768-dimensional sentence vector to the output units.
        self.layer1 = nn.Linear(768, out_dim)

    def forward(self, input_ids, attention_mask):
        """Return the linear layer's output for the first-token hidden state of each sequence."""
        encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Hidden state at sequence position 0 for every item: [batch size, num hidden dim]
        first_token_state = encoder_out['last_hidden_state'][:, 0, :]
        logits = self.layer1(first_token_state)
        return logits



## Training & Evaluation

In [None]:
from tqdm.notebook import tqdm
import torch
import numpy as np
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report

def train(model, dataloader, test_loader, loss, lr, num_epochs, save_dir='/content/drive/MyDrive/models', subject='chemistry'):
    """Train `model` with Adam and checkpoint the weights with the best test accuracy.

    After every epoch the model is scored with `evaluate(model, test_loader)`.
    Whenever the accuracy beats the best value seen so far, the state dict is
    written to `{save_dir}/{subject}_best.pth`.

    Args:
        model: module called as model(input_ids, attention_mask).
        dataloader: yields (labels, encoded_inputs) training batches, where
            encoded_inputs is a dict with 'input_ids' and 'attention_mask'.
        test_loader: loader handed to evaluate() after each epoch.
        loss: callable loss(model_output, labels) returning a scalar tensor.
        lr: learning rate for Adam.
        num_epochs: number of passes over `dataloader`.
        save_dir: directory for the checkpoint; created if it does not exist.
        subject: prefix of the checkpoint file name.

    Returns:
        dict with keys 'accuracy' and 'auc', each a list with one entry per epoch.
    """
    import os  # local import: this cell must not rely on imports from later cells

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # Use the device the model already lives on instead of assuming 'cuda',
    # so the loop also runs on a CPU-only runtime.
    device = next(model.parameters()).device

    # torch.save does not create missing directories; do it up front so a bad
    # path fails before any training time is spent.
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    best_score = 0
    history = {'accuracy': [], 'auc': []}
    for epoch in range(num_epochs):
        pbar = tqdm(dataloader)
        model.train()
        for mini_batch in pbar:
            y, x = mini_batch
            x = {k: v.to(device) for k, v in x.items()}
            y = y.to(device)
            h = model(x['input_ids'], x['attention_mask'])
            j = loss(h, y)

            # do gradient descent
            optimizer.zero_grad()  # clear gradients left over from the last step
            j.backward()   # calculate gradient from current batch outputs
            optimizer.step()  # update the weights using the gradients

        # Score in eval mode; the model stays in eval mode after the final epoch.
        model.eval()
        acc, auc = evaluate(model, test_loader)
        history['accuracy'].append(acc)
        history['auc'].append(auc)

        if acc > best_score:
            best_score = acc
            torch.save(model.state_dict(), f'{save_dir}/{subject}_best.pth')

    return history

@torch.no_grad()
def infer(model, loader, threshold=None):
    """Run `model` over `loader` and collect softmax probabilities and labels.

    The model is switched to eval mode for the duration of the call and its
    previous train/eval mode is restored afterwards.

    Args:
        model: module called as model(input_ids, attention_mask) returning
            one score per class for each sample.
        loader: iterable of (labels, encoded_inputs) batches, where
            encoded_inputs is a dict of tensors containing 'input_ids' and
            'attention_mask'.
        threshold: if given, keep only the samples whose highest class
            probability is >= threshold.

    Returns:
        (probs, label): NumPy arrays concatenated over all batches along axis 0.
    """
    # Follow the model's own device; fall back to 'cuda' (the previous
    # hard-coded behaviour) only for a model without parameters.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cuda')

    was_training = model.training
    model.eval()

    probs = []
    label = []
    try:
        for y, x in loader:
            x = {k: v.to(device) for k, v in x.items()}
            # Move the probabilities to the CPU right away so that the mask
            # built from them is on the same device as the labels it filters.
            h = torch.softmax(model(x['input_ids'], x['attention_mask']), -1).cpu()
            y = y.cpu()

            if threshold is not None:
                keep = h.max(-1)[0] >= threshold
                h = h[keep]
                y = y[keep]

            probs.append(h.numpy())
            label.append(y.numpy())
    finally:
        model.train(was_training)

    probs = np.concatenate(probs, 0)
    label = np.concatenate(label, 0)

    return probs, label
    
@torch.no_grad()
def evaluate(model, test_loader, threshold=None):
    """Score `model` on `test_loader`.

    Without a threshold: prints a per-topic classification report and returns
    (accuracy, one-vs-rest ROC AUC).
    With a threshold: returns (accuracy, probs, pred, label) for the samples
    that infer() kept; no AUC is computed and no report is printed.
    """
    class_probs, true_labels = infer(model, test_loader, threshold=threshold)
    predicted = np.argmax(class_probs, axis=1)
    acc = accuracy_score(true_labels, predicted)

    if threshold is None:
        auc_score = roc_auc_score(true_labels, class_probs, multi_class='ovr')
        print(classification_report(true_labels, predicted, target_names=topics))
        return acc, auc_score

    return acc, class_probs, predicted, true_labels

# Do Training!

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
import os

# Load pmt_train.csv and drop every row that has a missing value.
# (The DataFrame method is `dropna`; `drop_na` does not exist and raised AttributeError.)
pmt_df = pd.read_csv('pmt_train.csv').dropna()

# If all_data.csv is present, append its rows that have both a text and a topic.
if os.path.isfile('all_data.csv'):
    manual_df = pd.read_csv('all_data.csv')
    manual_df = manual_df[~manual_df['text'].isna() & ~manual_df['topic'].isna()]

    df = pd.concat([pmt_df, manual_df])
else:
    df = pmt_df

# Split positions rather than index labels: after concat the index may contain
# duplicates, so rows are selected with iloc below.
idx = np.arange(len(df))
# Stratify on topic so both splits keep the topic proportions; a fixed seed
# makes the 90/10 split reproducible across kernel restarts.
train_idx, test_idx = train_test_split(idx, test_size=0.1, stratify=df['topic'].values, random_state=42)

train_df = df.iloc[train_idx]
test_df = df.iloc[test_idx]

print(len(train_df), len(test_df))
print()
train_df['topic'].value_counts()

In [None]:
# hyper parameters
lr = 1e-5
num_epochs = 10
batch_size = 32
max_len = 128
use_roberta = False

# Subject Related
out_dim = 14
subject='chemistry'

In [None]:
# Silence all Python warnings from here on.
import warnings
warnings.filterwarnings("ignore")

# Build the classifier and place it on the GPU (this cell requires a CUDA runtime).
model = MyModel(out_dim, use_roberta=use_roberta).to('cuda')

# Training batches are reshuffled every epoch.
train_dataset = TextDataset(train_df, max_len, use_roberta=use_roberta)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# The test loader keeps the dataset order (no shuffling).
test_dataset = TextDataset(test_df, max_len, use_roberta=use_roberta)
test_loader = DataLoader(test_dataset, batch_size=batch_size)  
loss = nn.CrossEntropyLoss()


# train() evaluates on test_loader after every epoch and returns the per-epoch
# accuracy/AUC history; the best checkpoint is written under its default save_dir.
history = train(model, train_loader, test_loader, loss, lr, num_epochs, subject=subject)

In [None]:
# Per-epoch test accuracy and AUC collected by train().
print(history)

In [None]:
# With a threshold, evaluate() scores only the samples whose highest softmax
# probability is >= 0.9 and returns (accuracy, probs, pred, label) for them.
acc, probs, pred, label = evaluate(model, test_loader, threshold=0.9)

In [None]:
# Accuracy on the kept samples, the classes present among kept predictions and
# labels, and how many samples were kept versus the full test-set size.
acc, np.unique(pred), np.unique(label), len(pred), len(test_dataset)

## Load Saved Model

In [None]:
# Rebuild the same architecture that was trained; the backbone flag must match
# the one used for training or the checkpoint's keys will not fit the model.
model = MyModel(out_dim, use_roberta=use_roberta)

# map_location='cpu' lets the checkpoint load on a runtime without a GPU.
model.load_state_dict(
    torch.load(f'/content/drive/MyDrive/models/{subject}_best.pth', map_location='cpu')
)

# Use the GPU when one is available and switch to eval mode for inference.
model = model.to('cuda' if torch.cuda.is_available() else 'cpu').eval()