In [1]:
# all the necessary imports
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torch import optim
import torch
from google.colab import drive

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# set the seed
manual_seed = 572
torch.manual_seed(manual_seed)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

In [3]:
# Using bert-base-uncased

import transformers

MAX_LEN = 512
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2
EPOCHS = 10
BERT_PATH = 'bert-base-uncased'
MODEL_PATH = 'model.bin'
TRAINING_FILE = '/content/drive/MyDrive/data/yelp_review/train.tsv'
VAL_FILE = '/content/drive/MyDrive/data/yelp_review/val.tsv'
TEST_FILE = '/content/drive/MyDrive/data/yelp_review/test.tsv'


# initializing bert tokenizer
TOKENIZER = transformers.BertTokenizer.from_pretrained(
    BERT_PATH,
    do_lower_case=True
  )

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
# Defining BERT Model

class BERTBaseUncased(nn.Module):
    def __init__(self):
        super(BERTBaseUncased, self).__init__()
        self.bert = transformers.BertModel.from_pretrained(BERT_PATH)
        self.bert_drop = nn.Dropout(0.3)
        self.out = nn.Linear(768, 5)

    def forward(self, ids, mask, token_type_ids):
        _, o2 = self.bert(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids,
            return_dict=False
        )
        bo = self.bert_drop(o2)
        output = self.out(bo)
        return output

In [5]:
# Defining dataloader

class BERTDataset:
    def __init__(self, content, rating):
        self.content = content
        self.rating = rating
        self.tokenizer = TOKENIZER
        self.max_len = MAX_LEN

    def __len__(self):
        return len(self.content)

    def __getitem__(self, item):
        content = str(self.content[item])
        content = ' '.join(content.split())

        inputs = self.tokenizer.encode_plus(
            content,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs['token_type_ids']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(
                token_type_ids, dtype=torch.long
                ),
            'ratings': torch.tensor(self.rating[item], dtype=torch.float)
        }

In [6]:
# Defining loss fn and training fn

from tqdm import tqdm

def loss_fn(outputs, targets):
    return nn.CrossEntropyLoss()(outputs, targets)


def train_fn(data_loader, model, optimizer, device, scheduler):
    model.train()

    for bi, d in tqdm(enumerate(data_loader), total=len(data_loader)):
        ids = d['ids']
        token_type_ids = d['token_type_ids']
        mask = d['mask']
        ratings = d['ratings']

        ids = ids.to(device, dtype=torch.long)
        token_type_ids = token_type_ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        ratings = ratings.to(device, dtype=torch.long)

        optimizer.zero_grad()
        outputs = model(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids
        )

        loss = loss_fn(outputs, ratings)
        loss.backward()
        optimizer.step()
        scheduler.step()


def eval_fn(data_loader, model, device):
    model.eval()
    fin_ratings = []
    fin_outputs = []

    with torch.no_grad():
        for bi, d in tqdm(enumerate(data_loader), total=len(data_loader)):
            ids = d['ids']
            token_type_ids = d['token_type_ids']
            mask = d['mask']
            ratings = d['ratings']

            ids = ids.to(device, dtype=torch.long)
            token_type_ids = token_type_ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            ratings = ratings.to(device, dtype=torch.long)

            outputs = model(
                ids=ids,
                mask=mask,
                token_type_ids=token_type_ids
            )

            outputs = torch.argmax(torch.softmax(outputs, dim=1), dim=1)
            fin_ratings.extend(ratings.cpu().detach().numpy().tolist())
            fin_outputs.extend(outputs.cpu().detach().numpy().tolist())

    return fin_outputs, fin_ratings


In [7]:
import torch
from copy import deepcopy
import pandas as pd
import numpy as np

from sklearn import model_selection
from sklearn import metrics
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
from sklearn.preprocessing import LabelEncoder

def run():
    best_model = None
    df_train = pd.read_csv(TRAINING_FILE, delimiter='\t').fillna('none')
    df_valid = pd.read_csv(VAL_FILE, delimiter='\t').fillna('none')

    le = LabelEncoder()
    df_train.rating = le.fit_transform(df_train.rating)
    df_valid.rating = le.transform(df_valid.rating)

    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    train_dataset = BERTDataset(
        content=df_train.content.values,
        rating=df_train.rating.values
    )

    valid_dataset = BERTDataset(
        content=df_valid.content.values,
        rating=df_valid.rating.values
    )

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=TRAIN_BATCH_SIZE,
        num_workers=4
    )

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=VALID_BATCH_SIZE,
        num_workers=1
    )

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    model = BERTBaseUncased()
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'layerNorm.weight']
    optimizer_parameters = [
        {'params': [
            p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
            ],
         'weight_decay': 0.001
        },
        {'params': [
            p for n, p in param_optimizer if any(nd in n for nd in no_decay)
            ],
         'weight_decay': 0.0
        }
    ]

    num_train_steps = int(
        len(df_train) / TRAIN_BATCH_SIZE * EPOCHS
    )
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_train_steps
    )

    best_f1score = 0
    for epoch in range(EPOCHS):
        train_fn(train_data_loader, model, optimizer, device, scheduler)
        outputs, targets = eval_fn(valid_data_loader, model, device)

        accuracy = metrics.accuracy_score(targets, outputs)
        f1score = metrics.f1_score(targets, outputs, average='macro')

        print(f'Accuracy Score = {accuracy}')
        print(f'F1 (macro avg) Score = {f1score}')

        if f1score > best_f1score:
            torch.save(model.state_dict(), MODEL_PATH)
            best_f1score = f1score
            best_model = deepcopy(model)

    return best_model

In [None]:
run()

100%|██████████| 7000/7000 [46:54<00:00,  2.49it/s]
100%|██████████| 1750/1750 [02:08<00:00, 13.67it/s]


Accuracy Score = 0.6405714285714286
F1 (macro avg) Score = 0.6260333945818849


  6%|▌         | 427/7000 [02:52<44:44,  2.45it/s]

You can adap these two functions for your model.

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

def train(loader):
    total_loss = 0.0
    # iterate throught the data loader
    num_sample = 0
    for batch in loader:
        # load the current batch
        batch_input = batch.review
        batch_output = batch.label

        batch_input = batch_input.to(device)
        batch_output = batch_output.to(device)
        # forward propagation
        # pass the data through the model
        model_outputs = model(batch_input)
        # compute the loss
        cur_loss = criterion(model_outputs, batch_output)
        total_loss += cur_loss.cpu().item()

        # backward propagation (compute the gradients and update the model)
        # clear the buffer
        optimizer.zero_grad()
        # compute the gradients
        cur_loss.backward()
        # update the weights
        optimizer.step()

        num_sample += batch_output.shape[0]

    return total_loss/num_sample

# evaluation logic based on classification accuracy
def evaluate(loader):
    all_pred=[]
    all_label = []
    with torch.no_grad(): # impacts the autograd engine and deactivate it. reduces memory usage and speeds up computation
        for batch in loader:
             # load the current batch
            batch_input = batch.review
            batch_output = batch.label

            batch_input = batch_input.to(device)
            # forward propagation
            # pass the data through the model
            model_outputs = model(batch_input)
            # identify the predicted class for each example in the batch
            probabilities, predicted = torch.max(model_outputs.cpu().data, 1)
            # put all the true labels and predictions to two lists
            all_pred.extend(predicted)
            all_label.extend(batch_output.cpu())

    accuracy = accuracy_score(all_label, all_pred)
    f1score = f1_score(all_label, all_pred, average='macro')
    return accuracy,f1score

In [None]:
# funtion for save prediction
def out_prediction(first_name, last_name, prediction_list):
    """
    out_prediction takes three input varibles: first_name, last_name, prediction_list
    <first_name>, string, your first name, e.g., Tom
    <last_name>, string, your last name, e.g., Smith
    <prediction_list>, list of string which includes all your predications of TEST samples
                        e.g., ['1star','5star','3star']

    Generate a file is named with <yourfirstname>_<yourlastname>_PRED.txt in current directory
    """
    output_file = open("{}_{}_PRED.txt".format(first_name,last_name),'w')
    for item in prediction_list:
        output_file.write(item+"\n")
    output_file.close()

# Please write code to develop you system. More details are in `Lab4.ipynb`.

Mounted at /content/drive
