<a href="https://colab.research.google.com/github/cronus6w6/AI-CUP-2020/blob/master/train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 準備

## 安裝及引入

In [None]:
# Notebook shell magic: install the `transformers` package that the import cell star-imports from.
!pip install transformers

In [None]:
from transformers import *
import torch
from torch import nn
import shutil
import pandas as pd
import numpy as np
from tqdm import tqdm
import random

## 參數調整

In [None]:
# Path of the labelled training CSV.
train_data_path = "trainset.csv"

# Label names; every per-label list below follows this order.
LABELS = [
  "THEORETICAL",
  "ENGINEERING",
  "EMPIRICAL",
  "OTHERS",
]

# Optimisation settings: number of passes over the training data and the
# learning rate handed to the optimizer.
Epochs = 5
lr = 0.00001

# Per-label decision thresholds compared against the sigmoid outputs.
thrld = [0.35, 0.3, 0.25, 0.35]

# Per-label weights for positive targets, passed as `pos_weight` to the
# training loss.
positive_weights = [1.0, 1.0, 1.75, 7.5]

# Classification-head settings: dropout probability and hidden layer width.
dropout = 0.2
hidden_unit = 64

# Fixed RNG seed; swap in `random.randint(0, 100000)` to draw a fresh one.
seed = 29231

## 初始化設定

In [None]:
# Seed torch's CPU generator and the current CUDA device's generator with the
# configured seed.
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

# Use the first GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device("cuda:0")
else:
  device = torch.device("cpu")

# 資料處理

宣告資料集類別

In [None]:
# Shared SciBERT tokenizer used by `_DataSet` to turn text into token ids.
tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
class _DataSet(torch.utils.data.Dataset):
  """In-memory dataset of tokenized titles and abstracts.

  Every row of the input frame is tokenized once at construction time into
  fixed-length (512) id sequences, and all resulting tensors are created
  directly on the module-level `device`.

  When the frame has a "Classifications" column, each item is a
  `(features, labels)` tuple where `labels` is a float multi-hot vector in
  `LABELS` order; otherwise each item is just the `features` dict.
  """

  def __init__(self, inp_data: pd.DataFrame):
    """Tokenize `inp_data`.

    Args:
      inp_data: frame with "Title" and "Abstract" columns and, optionally, a
        "Classifications" column holding space-separated label names.
    """
    self.return_labels = "Classifications" in inp_data.columns
    title_indices = []
    abstract_indices = []
    labels = []
    for _, row in tqdm(inp_data.iterrows(), total=len(inp_data)):
      # Truncate both fields so every sequence is exactly 512 ids long; a
      # longer, untruncated sequence would make the id lists ragged and
      # break the tensor construction below.
      title_indices.append(
        tokenizer.encode(row.Title, max_length=512, padding="max_length", truncation=True))
      abstract_indices.append(
        tokenizer.encode(row.Abstract, max_length=512, padding="max_length", truncation=True))
      if self.return_labels:
        row_labels = set(row.Classifications.split(" "))
        labels.append([1 if name in row_labels else 0 for name in LABELS])

    self.title_indices = torch.tensor(title_indices, dtype=torch.long, device=device)
    self.abstract_indices = torch.tensor(abstract_indices, dtype=torch.long, device=device)
    # Each field is encoded on its own, so all segment (token type) ids are 0.
    self.title_segments = torch.zeros(self.title_indices.size(), dtype=torch.long, device=device)
    self.abstract_segments = torch.zeros(self.abstract_indices.size(), dtype=torch.long, device=device)
    if self.return_labels:
      # Build the label tensor once, after the loop, rather than once per row.
      self.labels = torch.tensor(labels, dtype=torch.float32, device=device)

  def __getitem__(self, index):
    """Return the features at `index`, paired with the labels when available."""
    features = {
      "title_indices": self.title_indices[index],
      "title_segments": self.title_segments[index],
      "abstract_indices": self.abstract_indices[index],
      "abstract_segments": self.abstract_segments[index]
    }
    if self.return_labels:
      return features, self.labels[index]
    return features

  def __len__(self):
    """Return the number of rows in the dataset."""
    return len(self.title_indices)

切詞

In [None]:
# Load the labelled data from `train_data_path`.
trainset = pd.read_csv(train_data_path)
# Replace the literal "$$$" marker inside abstracts (presumably a sentence
# separator -- confirm against the data) with a space. `regex=False` makes the
# match literal on every pandas version: "$" is a regex metacharacter, and the
# old escaped pattern only matched when str.replace interpreted it as a regex.
trainset["Abstract"] = trainset["Abstract"].str.replace("$$$", " ", regex=False)
# The first 6300 rows are used for training, the remainder for validation.
train_data = _DataSet(trainset.iloc[:6300])
val_data = _DataSet(trainset.iloc[6300:])

製作batch loader

In [None]:
# Mini-batch size shared by the training and validation loaders.
batch_size = 5
# Reshuffle the training samples every epoch so batches are not presented in
# the same file order each time; the order is still reproducible because the
# shuffle draws from torch's seeded default generator.
train_data_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True)
# Validation order does not affect the aggregated metrics, so keep it fixed.
val_data_loader = torch.utils.data.DataLoader(val_data, batch_size=batch_size)

# 模型製作

In [None]:
class MultiClassificationModel(nn.Module):
  """Multi-label classifier built on a shared text encoder.

  The title and the abstract are encoded separately by the same encoder. The
  hidden state at sequence position 0 of each (presumably the [CLS] token for
  a BERT-style tokenizer) is concatenated and fed to a small feed-forward head
  that emits 4 logits, one per label.
  """

  def __init__(self, encoder):
    """Wrap `encoder` with the classification head.

    Args:
      encoder: module called as `encoder(input_ids, attention_mask=...,
        token_type_ids=...)` whose first output is the sequence of hidden
        states. When it exposes a `config`, its `hidden_size` and
        `pad_token_id` are used; otherwise 768 and 0 are assumed.
    """
    super().__init__()
    self.encoder = encoder
    config = getattr(encoder, "config", None)
    # Read the encoder width from its config instead of hard-coding 768.
    encoder_width = getattr(config, "hidden_size", 768)
    # Assumes the tokenizer pads with the same id as the encoder config's
    # `pad_token_id` -- TODO confirm if a different tokenizer is used.
    pad_token_id = getattr(config, "pad_token_id", None)
    self.pad_token_id = 0 if pad_token_id is None else pad_token_id
    self.classifier = nn.Sequential(
        nn.Linear(encoder_width * 2, hidden_unit),
        nn.GELU(),
        nn.Dropout(dropout),
        nn.Linear(hidden_unit, 4)
    )

  def _encode(self, indices, segments):
    """Return the encoder's position-0 hidden state for each sequence."""
    # Inputs are padded to a fixed length, so mask the padding positions;
    # without a mask the encoder would attend to the pad tokens as well.
    attention_mask = (indices != self.pad_token_id).long()
    outputs = self.encoder(indices, attention_mask=attention_mask, token_type_ids=segments)
    return outputs[0][:, 0, :]

  def forward(self, title_indices, title_segments, abstract_indices, abstract_segments):
    """Return the 4 raw (pre-sigmoid) logits for each sample in the batch."""
    title_embs = self._encode(title_indices, title_segments)
    abstract_embs = self._encode(abstract_indices, abstract_segments)

    embs = torch.cat([title_embs, abstract_embs], 1)
    return self.classifier(embs)

In [None]:
# Load the pretrained SciBERT encoder and wrap it with the classification head.
scibert = AutoModel.from_pretrained("allenai/scibert_scivocab_uncased")
model = MultiClassificationModel(encoder=scibert)
# Move every parameter to the selected device (kept as the final expression
# of the cell, exactly like the original call).
model.to(device)

# 訓練

F1 Score 計算函數

In [None]:
def micro_f1_score(pred, label):
    """Compute the micro-averaged F1 of a batch plus per-class counts.

    Args:
        pred: binary predictions (0/1), shape (..., num_classes).
        label: binary ground truth (0/1), same shape as `pred`.

    Returns:
        A tuple `(f1, TP, FP, FN)`. `f1` is a 0-dim tensor holding the micro
        F1 over every sample and class in the batch (0 when there are no
        true positives, false positives or false negatives at all). `TP`,
        `FP` and `FN` are float tensors of shape (num_classes,) holding the
        per-class counts summed over the whole batch.
    """
    # Treat every leading dimension as "samples" so each row is one sample.
    num_classes = pred.shape[-1]
    pred = pred.reshape(-1, num_classes).float()
    label = label.reshape(-1, num_classes).float()

    # Sum over the batch dimension; indexing with [0] would only count the
    # first sample of the batch.
    TP = torch.mul(pred, label).sum(dim=0)
    FP = (torch.mul(pred, label - 1) != 0).float().sum(dim=0)
    FN = (torch.mul(pred - 1, label) != 0).float().sum(dim=0)

    # 2PR / (P + R) simplifies to 2TP / (2TP + FP + FN), which is guarded
    # here so an empty batch of positives yields 0 instead of NaN.
    denom = 2 * TP.sum() + FP.sum() + FN.sum()
    f1 = torch.where(
        denom > 0,
        2 * TP.sum() / denom.clamp(min=1),
        torch.zeros_like(denom),
    )

    return f1, TP, FP, FN

In [None]:
# Per-label weights for positive targets, as a float32 tensor on the device.
pos_weight = torch.tensor(positive_weights, dtype=torch.float32).to(device)

# Number of training batches across all epochs; the learning rate warms up
# linearly over the first half of them.
total_step = Epochs * len(train_data_loader)
warmup_step = total_step // 2

optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False)
scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=warmup_step,
  num_training_steps=total_step,
)

# Best validation micro-F1 seen so far; -1 means "nothing evaluated yet".
best_f1 = -1

In [None]:
def _precision_recall_f1(tp, fp, fn):
  """Return (precision, recall, F1) from raw counts, using 0.0 for undefined ratios."""
  precision = tp / (tp + fp) if tp + fp > 0 else 0.0
  recall = tp / (tp + fn) if tp + fn > 0 else 0.0
  f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0.0
  return precision, recall, f1


def _new_metric():
  """Return zeroed per-label TP/FP/FN accumulators on `device` and a per-batch F1 list."""
  return {
    "TP": torch.zeros(4, device=device),
    "FP": torch.zeros(4, device=device),
    "FN": torch.zeros(4, device=device),
    "F1": [],
  }


def _update_metric(metric, logits, labels):
  """Threshold sigmoid(logits) with `thrld_ten` and add the batch counts to `metric`."""
  with torch.no_grad():
    pred = (torch.sigmoid(logits) > thrld_ten.expand(labels.size())).float()
    f1, tp, fp, fn = micro_f1_score(pred, labels)
  metric["F1"].append(f1)
  metric["TP"] += tp.float()
  metric["FP"] += fp.float()
  metric["FN"] += fn.float()


def _report(split, loss, metric):
  """Print the average loss, overall micro-F1 and per-label F1; return the micro-F1."""
  _, _, overall_f1 = _precision_recall_f1(
    metric["TP"].sum().item(), metric["FP"].sum().item(), metric["FN"].sum().item())
  print("{} Loss:{}\tmicro_f1:{}".format(split, loss, overall_f1))
  print("micro_f1s:", end="")
  for tp, fp, fn in zip(metric["TP"].tolist(), metric["FP"].tolist(), metric["FN"].tolist()):
    print("{}".format(_precision_recall_f1(tp, fp, fn)[2]), end="\t")
  print("")
  return overall_f1


# The decision thresholds never change during training, so build the tensor once.
thrld = np.array(thrld)
thrld_ten = torch.from_numpy(thrld).float().to(device)

for epoch in range(Epochs):
  print(f"Epoch {epoch}:")

  # Train
  model = model.train()
  criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
  total_loss = 0.0
  train_metric = _new_metric()

  for features, labels in tqdm(train_data_loader, total=len(train_data_loader), desc="Training..."):
    optimizer.zero_grad()

    result = model(**features)
    loss = criterion(result, labels)
    loss.backward()
    # Exactly one optimizer/scheduler step per training batch, so the
    # schedule length matches `total_step`.
    optimizer.step()
    scheduler.step()

    total_loss += loss.item()
    _update_metric(train_metric, result.detach(), labels)

  avg_loss = total_loss / max(len(train_data_loader), 1)
  train_micro_f1 = _report("Train", avg_loss, train_metric)

  # Evaluation
  model = model.eval()
  # The validation loss is intentionally unweighted (no pos_weight).
  criterion = nn.BCEWithLogitsLoss()
  dev_loss = 0.0
  dev_metric = _new_metric()

  # No optimizer or scheduler calls here: evaluation must neither update the
  # weights nor advance the learning-rate schedule.
  with torch.no_grad():
    for features, labels in tqdm(val_data_loader, total=len(val_data_loader), desc="Evaluating..."):
      result = model(**features)
      dev_loss += criterion(result, labels).item()
      _update_metric(dev_metric, result, labels)

  # Average over the validation batches, not the training batches.
  avg_loss = dev_loss / max(len(val_data_loader), 1)
  dev_micro_f1 = _report("Dev", avg_loss, dev_metric)

  if dev_micro_f1 > best_f1:
    best_f1 = dev_micro_f1
print("Best F1: {}".format(best_f1))

In [None]:
# Persist only the model weights (state_dict); the seed is embedded in the
# output file name.
torch.save(model.state_dict(), f"model_state_{seed}")