In [None]:
# Needed imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from typing import Tuple, List
from functools import partial

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, RandomSampler
from torch.nn.utils.rnn import pad_sequence
from torch.cuda.amp import GradScaler, autocast

! pip install transformers
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup, BertPreTrainedModel

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, confusion_matrix
from tqdm import tqdm
import itertools
import time
import seaborn as sns
import sys
import os

In [None]:
# Define dataset class
class QuoteDataset(Dataset):
    """Dataset of ``(token ids, label vector)`` pairs built from quotation rows.

    Every row of ``dataframe`` must provide a ``"quotation"`` text column and
    the label columns listed in ``class_columns``.

    Args:
        tokenizer: object exposing ``encode(text, add_special_tokens=True)``
            and ``pad_token_id``.
        dataframe: source rows.
        class_columns: names of the label columns, in output order.
        q_length: maximum number of token ids kept per quotation.
        lazy: if True, rows are converted on access in ``__getitem__``;
            otherwise all rows are converted up front and kept in memory.
    """

    def __init__(self, tokenizer: BertTokenizer, dataframe: pd.DataFrame, class_columns: list, q_length: int = 300, lazy: bool = False):
        self.tokenizer = tokenizer
        self.pad_idx = tokenizer.pad_token_id
        self.class_columns = class_columns
        self.q_length = q_length
        self.lazy = lazy
        if not self.lazy:
            self.X = []
            self.Y = []
            # iterrows() has no length, so pass total= to get a real progress bar.
            for _, row in tqdm(dataframe.iterrows(), total=len(dataframe)):
                x, y = self.row_to_tensor(self.tokenizer, row, self.class_columns, self.q_length)
                self.X.append(x)
                self.Y.append(y)
        else:
            self.df = dataframe

    @staticmethod
    def row_to_tensor(tokenizer: BertTokenizer, row: pd.Series, class_columns: list, q_length: int = 300) -> Tuple[torch.LongTensor, torch.FloatTensor]:
        """Convert one dataframe row into ``(token_ids, labels)`` tensors.

        Returns:
            A 1-D int64 tensor with at most ``q_length`` token ids and a 1-D
            float32 tensor with one value per entry of ``class_columns``.
        """
        tokens = tokenizer.encode(row["quotation"], add_special_tokens=True)
        if len(tokens) > q_length:
            # Truncate, but keep the last token produced by encode() as the final id.
            tokens = tokens[:q_length-1] + [tokens[-1]]
        x = torch.LongTensor(tokens)
        # A row taken from a mixed-dtype frame is an object-dtype Series (possibly
        # holding bools). Convert it through an explicit float32 array instead of
        # letting torch index the Series as a generic sequence.
        labels = np.array(row[class_columns], dtype=np.float32)
        y = torch.from_numpy(labels)
        return x, y

    def __len__(self):
        """Number of rows available in the dataset."""
        if self.lazy:
            return len(self.df)
        return len(self.X)

    def __getitem__(self, index: int) -> Tuple[torch.LongTensor, torch.FloatTensor]:
        """Return the ``(token_ids, labels)`` pair at position ``index``."""
        if not self.lazy:
            return self.X[index], self.Y[index]
        return self.row_to_tensor(self.tokenizer, self.df.iloc[index], self.class_columns, self.q_length)
            

def collate_fn(batch: List[Tuple[torch.LongTensor, torch.LongTensor]], device: torch.device) \
        -> Tuple[torch.LongTensor, torch.LongTensor]:
    """Merge a list of ``(token_ids, labels)`` samples into one batch on ``device``.

    Token sequences are right-padded with 0 up to the longest sequence in the
    batch; label vectors are stacked along a new leading dimension.
    """
    token_seqs, label_vecs = zip(*batch)
    padded_tokens = pad_sequence(token_seqs, batch_first=True, padding_value=0)
    stacked_labels = torch.stack(label_vecs)
    return padded_tokens.to(device), stacked_labels.to(device)

In [None]:
# Define model class
class BertClassifier(nn.Module):
    """Encoder followed by a single linear layer that produces per-class logits.

    Args:
        bert: encoder module; must expose ``config.hidden_size`` and return a
            sequence whose element at index 1 has shape ``(batch, hidden_size)``.
        num_classes: number of output logits.
    """

    def __init__(self, bert: BertModel, num_classes: int):
        super().__init__()
        self.bert = bert
        self.classifier = nn.Linear(bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, labels=None, criterion=None):
        """Compute logits and, when ``labels`` is given, the loss.

        Args:
            labels: optional target tensor passed to ``criterion``.
            criterion: optional loss callable ``criterion(logits, labels)``;
                ``nn.BCEWithLogitsLoss()`` is used when omitted.

        Returns:
            ``(loss, logits)``. ``loss`` is the float sentinel ``-1.`` when no
            labels are supplied.
        """
        outputs = self.bert(input_ids, attention_mask=attention_mask,
                            token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask)
        # Element 1 of the encoder output feeds the classifier (for Hugging Face
        # BertModel this is the pooled output).
        logits = self.classifier(outputs[1])
        if labels is None:
            return -1., logits
        if criterion is None:
            # Only build the default loss when it is actually needed.
            criterion = nn.BCEWithLogitsLoss()
        return criterion(logits, labels), logits

def train(model, iterator, optimizer, scheduler, criterion=None, iters_to_accumulate=1):
    """Run one pass over ``iterator`` and update ``model`` in place.

    Args:
        model: module returning ``(loss, logits)`` when called with
            ``(x, attention_mask=..., labels=..., criterion=...)``.
        iterator: iterable of ``(x, y)`` batches that supports ``len()``.
        optimizer: optimizer stepped after every ``iters_to_accumulate`` batches.
        scheduler: learning-rate scheduler, stepped once per optimizer step.
        criterion: optional loss callable forwarded to the model.
        iters_to_accumulate: number of batches whose gradients are accumulated
            before one optimizer step. The default of 1 updates on every batch.

    Raises:
        ValueError: if ``iters_to_accumulate`` is smaller than 1.
    """
    if iters_to_accumulate < 1:
        raise ValueError("iters_to_accumulate must be >= 1")

    model.train()
    # Mixed precision is only used when the model lives on a CUDA device; with
    # enabled=False both autocast and GradScaler are plain pass-throughs.
    use_amp = next(model.parameters()).is_cuda
    scaler = GradScaler(enabled=use_amp)
    total_loss = 0.
    n_batches = len(iterator)

    optimizer.zero_grad()
    for step, (x, y) in enumerate(tqdm(iterator), start=1):
        with autocast(enabled=use_amp):
            # Token id 0 is the padding value used by collate_fn.
            mask = (x != 0).float()
            loss, _ = model(x, attention_mask=mask, labels=y, criterion=criterion)
        total_loss += loss.item()
        # Scale each batch so the accumulated gradient is the mean over the group.
        scaler.scale(loss / iters_to_accumulate).backward()
        if step % iters_to_accumulate == 0 or step == n_batches:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            scheduler.step()
    print(f"Train loss {total_loss / max(n_batches, 1)}")

def evaluate(model, iterator, class_columns, criterion=None):
    """Evaluate ``model`` on ``iterator`` and print per-class ROC AUC.

    Args:
        model: module returning ``(loss, logits)`` when called with
            ``(x, attention_mask=..., labels=..., criterion=...)``.
        iterator: iterable of ``(x, y)`` batches that supports ``len()``.
        class_columns: class names, one per label column.
        criterion: optional loss callable forwarded to the model.

    Returns:
        ``(mean_loss, true, pred)``: the mean batch loss as a float (NaN when
        the iterator yields no batches), the label array and the array of
        sigmoid probabilities, both shaped ``(n_samples, n_classes)``.
    """
    model.eval()
    pred = []
    true = []
    total_loss = 0.
    with torch.no_grad():
        for x, y in tqdm(iterator):
            # Token id 0 is the padding value used by collate_fn.
            mask = (x != 0).float()
            loss, outputs = model(x, attention_mask=mask, labels=y, criterion=criterion)
            # Accumulate as a Python float so the code also works with zero batches.
            total_loss += float(loss)
            true.extend(y.cpu().numpy().tolist())
            pred.extend(torch.sigmoid(outputs).cpu().numpy().tolist())
    true = np.array(true)
    pred = np.array(pred)
    if true.size == 0:
        # Keep the arrays two-dimensional so the per-class slicing below is valid.
        true = true.reshape(0, len(class_columns))
        pred = pred.reshape(0, len(class_columns))

    # Report the number of evaluated samples (rows), not the sum of the labels.
    print(f"Amount of eval. samples: {true.shape[0]}")
    for i, name in enumerate(class_columns):
        amount_i = np.sum(true[:, i])
        print(f"Amount of {name}: {amount_i}")
        # ROC AUC needs both positives and negatives; skip near-degenerate classes.
        if amount_i > 1 and amount_i < true.shape[0] - 1:
            print(f"{name} roc_auc {roc_auc_score(true[:, i], pred[:, i])}")
        else:
            print(f"{name} roc_auc ---")

    n_batches = len(iterator)
    mean_loss = total_loss / n_batches if n_batches else float("nan")
    print(f"Evaluate loss {mean_loss}")
    return mean_loss, true, pred

def save_model(model, optimizer, scheduler, eval_loss, epoch, chunk_number, delta_time, path="model_state_dict"):
    """Write a training checkpoint with ``torch.save``.

    The checkpoint is a dict with the keys ``time``, ``epoch``,
    ``chunk_number``, ``eval_loss``, ``model_state_dict``,
    ``optimizer_state_dict`` and ``scheduler_state_dict``.

    Args:
        model, optimizer, scheduler: objects whose ``state_dict()`` is stored.
        eval_loss: latest evaluation loss, stored as given.
        epoch: epoch number, stored as given.
        chunk_number: chunk number within the epoch, stored as given.
        delta_time: elapsed training time in seconds, stored as given.
        path: destination file; defaults to the original fixed location.
    """
    print(f"Start saving the model: epoch {epoch}, chunk_number {chunk_number}, time {delta_time}")
    checkpoint = {
        'time': delta_time,
        'epoch': epoch,
        'chunk_number': chunk_number,
        'eval_loss': eval_loss,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict(),
    }
    torch.save(checkpoint, path)
    print("The model is saved")

In [None]:
# Define useful functions
def set_seed(seed=37):
    """Seed the torch and numpy global random generators with ``seed``."""
    for seed_generator in (torch.manual_seed, np.random.seed):
        seed_generator(seed)
    
def add_dummies(chunk, class_columns):
    """Return the cleaned rows of ``chunk`` with one indicator column per class.

    Keeps the ``quotation``, ``qids``, ``occupation`` and ``class`` columns,
    drops rows with missing values, and appends ``class_<value>`` indicator
    columns. Any name in ``class_columns`` that does not occur in the chunk is
    added as an all-zero column.

    Args:
        chunk: dataframe containing at least the four columns above.
        class_columns: indicator column names expected in the result.

    Returns:
        A new dataframe; ``chunk`` is not modified.
    """
    df = chunk[['quotation', 'qids', 'occupation', 'class']].dropna().reset_index(drop=True)

    labels = df['class']
    # A class column that contained NaN is read as float, which would produce
    # "class_1.0"-style names that never match class_columns. Use integer labels
    # for the indicator names when the values are whole numbers.
    if pd.api.types.is_float_dtype(labels) and (labels % 1 == 0).all():
        labels = labels.astype('int64')

    df = pd.concat([df, pd.get_dummies(labels, prefix='class')], axis=1)
    for cl in class_columns:
        if cl not in df.columns:
            df[cl] = 0
    return df

We decided to build balanced train and test datasets so that the model does not overfit to the most frequent classes and so that validation is not skewed by class imbalance.

In [None]:
# split all data into train and validation (test) parts per each year and save it in files
# We remove quotations with length less than 'q_length_min' letters (50 letters)
# NOTE(review): this cell uses `years`, `chunksize`, `q_length_min` and `test_size_splitting`,
# which are assigned in the "define parameters" cell further down; run that cell first.
# NOTE: output files are opened in append mode, so re-running this cell without
# deleting them first duplicates their rows.

start_time = time.time()
for year in years:
    header_train = True
    header_val = True
    total_len_train = 0
    total_len_val = 0
    print('=' * 50, f"YEAR {year}", '=' * 50)
    path_to_file = "filtered_occupancy_"+ year + ".csv.bz2"     # the files were provided by my colleague
    df_reader = pd.read_csv(path_to_file, compression='bz2', chunksize=chunksize)
    for chunk_number, chunk in enumerate(df_reader, start=1):
        print('=' * 50, f"CHUNK {chunk_number}", '=' * 50)
        print("chunk shape:", chunk.shape)
        # keep only quotations longer than 'q_length_min' letters
        # (rows with a missing quotation compare as False and are dropped too)
        chunk = chunk[chunk['quotation'].str.len() > q_length_min]
        print("chunk shape after dropping short quotes:", chunk.shape)
        if chunk.shape[0] == 0:
            print("continue with next chunk")
            continue
        # set random state and split the data into train and validation (test) parts with ratio 'test_size_splitting'
        set_seed()
        train_df, val_df = train_test_split(chunk, test_size=test_size_splitting)

        # save train part; the index continues across the chunks of one year
        train_df = train_df.reset_index(drop=True)
        train_df.index = train_df.index + total_len_train
        train_df.to_csv("train_10classes_filtered_occupancy_"+ year + ".csv.bz2", header=header_train, mode='a')
        header_train = False
        print("len(train_df): ", len(train_df))
        total_len_train+= len(train_df)

        # save validation part; it has its own header flag so that the first
        # chunk written to the test file gets a header row as well
        val_df = val_df.reset_index(drop=True)
        val_df.index = val_df.index + total_len_val
        val_df.to_csv("test_10classes_filtered_occupancy_"+ year + ".csv.bz2", header=header_val, mode='a')
        header_val = False
        print("len(val_df): ", len(val_df))
        total_len_val+= len(val_df)

        print('=' * 50, f"END CHUNK {chunk_number}", '=' * 50)
    print('=' * 50, f"END YEAR {year}", '=' * 50)
    delta_time = time.time() - start_time
    print('=' * 50, f"TIME {delta_time} sec", '=' * 50)

In [None]:
# create separate files with quotations per each class (for train and test data)
# NOTE(review): this cell uses `years`, `class_columns` and `chunksize`, which are assigned
# in the "define parameters" cell further down; run that cell first.

def split_by_class(split, years, class_columns, chunksize):
    """Distribute the rows of the per-year ``split`` files into one file per class.

    Reads ``<split>_10classes_filtered_occupancy_<year>.csv.bz2`` chunk by chunk,
    adds a ``year`` column and appends the rows whose ``class`` equals ``i + 1``
    to ``<split>_<class_columns[i]>_filtered_occupancy.csv.bz2``.

    Args:
        split: file name prefix, "train" or "test".
        years: year strings used in the input file names.
        class_columns: class names; position ``i`` corresponds to class value ``i + 1``.
        chunksize: number of rows read per chunk.

    Returns:
        List with the number of rows written for each class.
    """
    start_time = time.time()
    total_lens = [0]*len(class_columns)
    headers = [True]*len(class_columns)
    for year in years:
        print('=' * 50, f"YEAR {year}", '=' * 50)
        df_reader = pd.read_csv(split + "_10classes_filtered_occupancy_"+ year + ".csv.bz2", compression='bz2', chunksize=chunksize)
        for chunk_number, chunk in enumerate(df_reader, start=1):
            print('=' * 50, f"CHUNK {chunk_number}", '=' * 50)
            print("chunk shape:", chunk.shape)
            chunk['year'] = year
            for i, cls in enumerate(class_columns):
                chunk_i = chunk[chunk['class'] == i+1].reset_index(drop=True)
                if len(chunk_i) == 0:
                    continue
                # continue the index where the previous chunk of this class stopped
                chunk_i.index = chunk_i.index + total_lens[i]
                total_lens[i]+= len(chunk_i)
                # only the first write to each class file carries the header row
                chunk_i.to_csv(split + "_" + cls + "_filtered_occupancy.csv.bz2", header=headers[i], mode='a')
                headers[i] = False
            print('=' * 50, f"END CHUNK {chunk_number}", '=' * 50)
        print('=' * 50, f"END YEAR {year}", '=' * 50)
        delta_time = time.time() - start_time
        print('=' * 50, f"TIME {delta_time} sec", '=' * 50)
    return total_lens

# create separate files for train data
total_lens = split_by_class("train", years, class_columns, chunksize)
print("total_lens:")
print(total_lens)

# >>> total_lens:
# >>> [1145315, 532178, 3079250, 90736, 75932, 5795132, 1003257, 80187, 72798, 338875]

# create separate files for test data
total_lens = split_by_class("test", years, class_columns, chunksize)
print("total_lens:")
print(total_lens)

# >>> total_lens:
# >>> [1143737, 531858, 3080498, 90205, 76068, 5798577, 1002162, 79443, 72576, 339995]

In [None]:
# Per-class sample budget for the balanced train and test datasets.
# The rarest class (class_9) has 72798 train samples and 72576 test samples,
# so 72000 samples per class can be drawn from both parts.
class_min_size = 72_000

In [None]:
# build the balanced train and test datasets: the same amount of samples per class,
# interleaved so that every block of rows contains all classes
# NOTE(review): `path_to_output` is not defined in any cell visible here; it must exist before
# this cell runs and must point to where the per-class files were written.

def build_balanced_file(split, class_columns, class_min_size, source_prefix, rows_per_turn=2):
    """Sample ``class_min_size`` rows per class and write them interleaved to one file.

    For every class the file ``<source_prefix><split>_<class>_filtered_occupancy.csv.bz2``
    is read, shuffled with the fixed seed of ``set_seed()`` and cut to
    ``class_min_size`` rows. The output takes ``rows_per_turn`` rows from each
    class in turn (class order as in ``class_columns``) and is written to
    ``<split>_balanced_72000x10.csv.bz2`` in a single call.

    Args:
        split: file name prefix, "train" or "test".
        class_columns: class names used in the per-class file names.
        class_min_size: number of rows sampled per class.
        source_prefix: string prepended to the per-class file names.
        rows_per_turn: rows taken from one class before moving to the next.
    """
    samples = []
    for cls in class_columns:
        print(cls)
        path_to_file = source_prefix + split + "_" + cls + "_filtered_occupancy.csv.bz2"
        df = pd.read_csv(path_to_file, compression='bz2')
        # set random state and take 'class_min_size' indexes
        set_seed()
        indexes = np.arange(len(df))
        np.random.shuffle(indexes)
        df = df.iloc[indexes[:class_min_size]]
        print("len(df): ", len(df))
        samples.append(df.reset_index(drop=True))

    # Only complete turns are written, exactly like range(class_min_size // rows_per_turn).
    usable = rows_per_turn * (class_min_size // rows_per_turn)
    samples = [df.iloc[:usable] for df in samples]

    # Order rows by (turn, class); lexsort is stable, so rows of one class keep
    # their shuffled order inside a turn.
    turn = np.concatenate([np.arange(len(df)) // rows_per_turn for df in samples])
    class_id = np.concatenate([np.full(len(df), j) for j, df in enumerate(samples)])
    order = np.lexsort((class_id, turn))

    balanced = pd.concat(samples, axis=0).iloc[order]
    # One write instead of thousands of small appends to a compressed file;
    # mode='w' also keeps the cell safe to re-run.
    balanced.to_csv(split + "_balanced_72000x10.csv.bz2", header=True, mode='w')

# save balanced train dataset
build_balanced_file("train", class_columns, class_min_size, path_to_output)

# save balanced test dataset
build_balanced_file("test", class_columns, class_min_size, path_to_output)

In [None]:
# define parameters
num_of_classes = 10
class_columns = [f"class_{i}" for i in range(1, num_of_classes+1)]
years = [f"20{i}" for i in range(15, 21)]

chunksize = 10000
delta_weights = chunksize // 100    # a parameter needed to calculate class weights in case of unbalanced data 

BATCH_SIZE = 20
q_length = 400
q_length_min = 50

EPOCH_NUM = 2
# number of rows in the balanced training file "train_balanced_72000x10.csv.bz2":
# 72000 samples for each of the classes
data_size = 72000 * num_of_classes
# upper estimates of chunks per epoch and batches per chunk, used to size the LR schedule
n_chunk = data_size // chunksize + 1
n_batch = chunksize // BATCH_SIZE + 1
total_steps = int(n_chunk * n_batch) * EPOCH_NUM

warmup_steps = total_steps // 10
val_in_steps = 6    # run validation and save a checkpoint every 'val_in_steps' training chunks

test_size_splitting = 0.5

In [None]:
# choose a device: the first GPU when CUDA is available, the CPU otherwise
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# set the model: pretrained BERT encoder plus a linear classification layer
bert_model_name = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(bert_model_name)
model = BertClassifier(BertModel.from_pretrained(bert_model_name), num_of_classes)
model = model.to(device)

# set an optimizer and a scheduler
# parameters whose name contains one of these markers get no weight decay
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {
        'params': [param for name, param in model.named_parameters()
                   if all(marker not in name for marker in no_decay)],
        'weight_decay': 0.01,
    },
    {
        'params': [param for name, param in model.named_parameters()
                   if any(marker in name for marker in no_decay)],
        'weight_decay': 0.0,
    },
]
optimizer = AdamW(optimizer_grouped_parameters, lr=3e-5, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(optimizer, warmup_steps, total_steps)

In [None]:
## load previously trained model params (if needed)
# PATH = "model_state_dict"
# checkpoint = torch.load(PATH, map_location=device)
# model.load_state_dict(checkpoint['model_state_dict'])
# optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
# scheduler.load_state_dict(checkpoint['scheduler_state_dict'])

In [None]:
# train the model
# NOTE(review): `path_to_input` is not defined in any cell visible here; it must exist before
# this cell runs and must point to where "test_balanced_72000x10.csv.bz2" is stored.
eval_loss = None
start_time = time.time()

# Bind the target device to the collate function once instead of re-wrapping it for
# every chunk. The bound version keeps the name `collate_fn` because the final
# validation cell passes it to its DataLoader.
if not isinstance(collate_fn, partial):
    collate_fn = partial(collate_fn, device=device)

for epoch in range(EPOCH_NUM):
    print('=' * 50, f"EPOCH {epoch+1} / {EPOCH_NUM}", '=' * 50)
    path_to_file = "train_balanced_72000x10.csv.bz2"
    df_reader = pd.read_csv(path_to_file, compression='bz2', chunksize=chunksize)
    for chunk_number, chunk in enumerate(df_reader, start=1):
        print('=' * 50, f"CHUNK {chunk_number}", '=' * 50)
        print("chunk shape:", chunk.shape)
        df = add_dummies(chunk, class_columns)
        chunk = None # free the memory

        # print classes' percentages into the chunk
        print((df[class_columns].sum() / df.shape[0] * 100).apply(lambda x: f"{x}") + ' %')

        # calculate class weights for loss function: inverse of the smoothed class frequency
        weights = ((df[class_columns]).sum() + delta_weights).to_numpy() / (len(df) + delta_weights*num_of_classes)
        weights = [1./w for w in weights]
        # normalize the weights so that they sum to 1
        sum_weight = sum(weights)
        weights = [w/sum_weight for w in weights]
        print("weights:", weights)
        # set loss function: weighted binary cross entropy
        criterion = nn.BCEWithLogitsLoss(weight=torch.Tensor(weights).to(device))

        # lazy=True: rows are tokenized batch by batch instead of all at once
        train_dataset = QuoteDataset(tokenizer, df, class_columns,  q_length=q_length, lazy=True)
        df = None # free the memory
        train_iterator = DataLoader(train_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn)
        train_dataset = None # free the memory

        train(model, train_iterator, optimizer, scheduler, criterion=criterion)

        delta_time = time.time() - start_time
        print('=' * 50, f"END CHUNK {chunk_number}", '=' * 50)
        print('=' * 50, f"TIME {delta_time} sec", '=' * 50)

        if chunk_number % val_in_steps == 0:
            # validation with a part of test data (only first chunk)
            path_to_file_eval = path_to_input + "test_balanced_72000x10.csv.bz2"
            df_reader_eval = pd.read_csv(path_to_file_eval, compression='bz2', chunksize=chunksize)
            for chunk_eval in df_reader_eval:
                print("chunk_eval shape:", chunk_eval.shape)
                df_eval = add_dummies(chunk_eval, class_columns)
                chunk_eval = None # free the memory
                dev_dataset = QuoteDataset(tokenizer, df_eval, class_columns, q_length=q_length, lazy=True)
                dev_iterator = DataLoader(dev_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn)

                # the loss is weighted with the class weights of the latest training chunk
                eval_loss, true, pred = evaluate(model, dev_iterator, class_columns, criterion=criterion)
                break
            df_reader_eval = None # free the memory
            df_eval = None # free the memory
            delta_time = time.time() - start_time
            print('=' * 50, f"TIME {delta_time} sec", '=' * 50)
            # save the model
            save_model(model, optimizer, scheduler, eval_loss, epoch+1, chunk_number, delta_time)
    print('=' * 50, f"END EPOCH {epoch+1} / {EPOCH_NUM}", '=' * 50)

In [None]:
# final validation with all test data
# NOTE(review): this cell reuses `criterion` and `epoch` left behind by the training cell, and
# `path_to_input` / `path_to_output`, which are not defined in any cell visible here.
path_to_file_eval = path_to_input + "test_balanced_72000x10.csv.bz2"
chunk_eval = pd.read_csv(path_to_file_eval, compression='bz2')
df_eval = add_dummies(chunk_eval, class_columns)
chunk_eval = None # free the memory
dev_dataset = QuoteDataset(tokenizer, df_eval, class_columns, q_length=q_length, lazy=True)
# the DataLoader calls the collate function with the batch only, so the device must be bound;
# reuse an already bound version or bind it here
eval_collate_fn = collate_fn if isinstance(collate_fn, partial) else partial(collate_fn, device=device)
dev_iterator = DataLoader(dev_dataset, batch_size=BATCH_SIZE, collate_fn=eval_collate_fn)
# `evaluate` returns (loss, true labels, predicted probabilities)
eval_loss, true, pred = evaluate(model, dev_iterator, class_columns, criterion=criterion)

# save true and predicted classes to plot ROC curves and compute the confusion matrix later
np.save(path_to_output + f"true__test_72000x10_epoch_{epoch+1}", true)
np.save(path_to_output + f"pred__test_72000x10_epoch_{epoch+1}", pred)