In [1]:
# https://medium.com/udacity-pytorch-challengers/ideas-on-how-to-fine-tune-a-pre-trained-model-in-pytorch-184c47185a20

import re
import torch
import numpy as np
import pandas as pd
import logging
import time
import torch.nn as nn
import tqdm
import math
import ast
import nltk

from nltk.corpus import stopwords
from tensorflow import keras
from tensorflow.keras import layers
from transformers import BertConfig, BertForSequenceClassification, BertTokenizer, BertTokenizerFast, BertModel, AdamW, TFBertModel
from transformers.optimization import get_linear_schedule_with_warmup
from transformers.modeling_bert import BertEmbeddings, BertSelfAttention
from torch.utils.data import Dataset, DataLoader
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import KFold
from apex import amp, optimizers

logging.getLogger("transformers.tokenization_utils").setLevel(logging.ERROR)

COL_NAMES = ['TopNumber', 'AirlineName','ReviewerName','Rating','ReviewDate','ReviewTitle',\
             'ReviewText','Tags', 'DateofTravel', 'Aspects', 'ResponserName', 'ResponseDate', 'ResponseText', 'ReviewerProfileUrl',\
             'AirlineNation', 'CrawlTime']

PRE_TRAINED = 'bert-base-uncased'
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
ASPECT_NAMES = ['LEG', 'SIT', 'ENT', 'CUS', 'VOM', 'CLE', 'CKI', 'FNB']
VOCAB_DIC = BertTokenizerFast.from_pretrained(PRE_TRAINED).get_vocab()
TOPN = 50


# This one is implemented with weight loss per class            
class BertBonzWeightLoss(BertModel):
    def __init__(self, config):
        super(BertBonzWeightLoss, self).__init__(config)
        self.config = config
        self.embeddings.llr_embeddings = nn.ModuleList(nn.Embedding(4, 768, 3) for _ in range(len(ASPECT_NAMES)))
        self.classifier = nn.Linear(768, config.num_aspect*3)
        self.init_weights()
        self.embeddings.llr_embeddings.apply(self._xavier)
        self.pooler.apply(self._xavier)
        self.classifier.apply(self._xavier)
        
    def forward(self, 
                input_ids=None, 
                llr_ids=None, 
                labels=None, 
                token_type_ids=None, 
                position_ids=None,
                weight_loss=None):
        # BERT EMBEDDINGS NEW
        input_shape = input_ids.size()
        seq_length = input_shape[1]
        device = input_ids.device
        
        if position_ids is None:
            position_ids = torch.arange(seq_length, dtype=torch.long, device=device)
            position_ids = position_ids.unsqueeze(0).expand(input_shape)
        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        inputs_embeds = self.embeddings.word_embeddings(input_ids)
        position_embeddings = self.embeddings.position_embeddings(position_ids)
        token_type_embeddings = self.embeddings.token_type_embeddings(token_type_ids)
        
        if llr_ids is not None:
            temp = [self.embeddings.llr_embeddings[i](llr_ids[:,i,:]) for i in range(self.config.num_aspect)]
            llr_embeddings = sum(temp)
        else:
            llr_embeddings = torch.zeros(inputs_embeds.size(), device=device).fill_(3).long()
        
        embeddings = inputs_embeds + position_embeddings + token_type_embeddings + llr_embeddings
        embeddings = self.embeddings.LayerNorm(embeddings)
        embeddings = self.embeddings.dropout(embeddings)
        
        
        # BERT ENCODER
        encoder_outputs = self.encoder(
            embeddings,
            attention_mask=None,
            head_mask=[None]*12,
            encoder_hidden_states=None,
            encoder_attention_mask=None,
            output_attentions=self.config.output_attentions
        )
        sequence_output = encoder_outputs[0]
        
        # CLASSIFIER
        CLS_token = sequence_output[:,0]
        predict = self.classifier(CLS_token)
        
        loss_fn = nn.functional.cross_entropy
        if labels is not None:
            if weight_loss is None:
                loss = loss_fn(predict.view(input_shape[0], 3,-1), labels)
            else:
                loss = torch.tensor(0).float().to(DEVICE)
                for asp_i in range(len(ASPECT_NAMES)):
                    loss += loss_fn(predict.view(input_shape[0], 3,-1)[:,:,asp_i], labels[:,asp_i], weight_loss[asp_i, :])
                loss /= len(ASPECT_NAMES)
                    
            outputs = (predict.view(input_shape[0], 3,-1), loss, CLS_token, sequence_output) + encoder_outputs[1:]  # add hidden_states and attentions if they are here
        else:
            outputs = (predict.view(input_shape[0], 3,-1), CLS_token, sequence_output) + encoder_outputs[1:]
        return outputs
    
    
    def load_pretrained_weight(self):
        sd = self.state_dict()
        sd_bert_pretrained = BertModel.from_pretrained(PRE_TRAINED).state_dict()
        for k in sd_bert_pretrained.keys():
            if k in sd.keys():
                sd[k] = sd_bert_pretrained[k]
        self.load_state_dict(sd)
        print('Succesfully load pre-trained weights')
        
    def llr_embed_pad(self):
        for i in range(len(self.embeddings.llr_embeddings)):
            temp = self.embeddings.llr_embeddings[i].weight.data
            temp[-1,:] = torch.zeros(temp.size(1))
        
    def _xavier(self, module):
        for name, param in module.named_parameters():
            if 'weight' in name:
                nn.init.xavier_normal_(param)
            elif 'bias' in name:
                param.data.zero_()
                
    def unfreeze(self):
        for param in self.parameters():
            param.requires_grad = True
                
    def freeze(self):
        for param in self.parameters():
            param.requires_grad = False
        for param in self.embeddings.llr_embeddings.parameters():
            param.requires_grad = True
        for param in self.pooler.parameters():
            param.requires_grad = True
        for param in self.classifier.parameters():
            param.requires_grad = True            
    

class BonzDataset(Dataset):
    def __init__(self, data, llr_words):
        self.input_ids = torch.LongTensor(list(data.input_ids))
        self.llr_embeddings = torch.LongTensor(list(data.llr_embeddings))
        if 'labels' in data.columns:
            self.labels = torch.LongTensor(list(data.labels))
        else:
            self.labels = None
        self.llr_words = llr_words
        
    def __len__(self):
        return self.input_ids.shape[0]
    
    def __getitem__(self, idx):
        '''
        tokens = self.data.input_ids[idx]
        
        llr_embedding = []
        for aspect in ASPECT_NAMES:
            temp = [3] * tokens.shape[0]
            for j in range(tokens.shape[0]):
                for class_, wordlist in llr_words[aspect].items():
                    if tokens[j] in wordlist:
                        temp[j] = class_
                        break
            llr_embedding.append(temp)
        
        llr_embedding = torch.stack([torch.LongTensor(i) for i in llr_embedding], 0)
        
        
        outputs = (torch.LongTensor(tokens), llr_embedding)
        
        if 'labels' in self.data.columns:
            outputs = (torch.LongTensor(tokens), llr_embedding, torch.LongTensor(self.data.labels[idx]))
        '''
        if self.labels is None:
            outputs = (self.input_ids[idx], self.llr_embeddings[idx])
        else:
            outputs = (self.input_ids[idx], self.llr_embeddings[idx], self.labels[idx])
        
        return outputs
    

    
def split_aspect(data):
    temp = np.full((8, data.shape[0]), 2, np.int)
    for idx in range(data.shape[0]):
        aspect = data[idx]
        for i, asp in enumerate(['Legroom', 'Seat', 'Entertainment', 'Customer', 'Value', 'Cleanliness', 'Check-in', 'Food']):
            for sub_asp in aspect:
                if asp in sub_asp:
                    pol = int(sub_asp[-1])
                    temp[i, idx] = 1 if pol > 3 else 0
                    break
    return temp
            

def tokenize_data(data):
    tokenizer = BertTokenizerFast.from_pretrained(PRE_TRAINED)
    input_ids = tokenizer(list(data))['input_ids']
    input_ids = pad_sequences(input_ids, maxlen=512, padding='post', truncating='post')
    
    return (list(input_ids), tokenizer)
    
    
def get_data(FILE_PATH, COL_NAMES):
    raw_data = pd.read_csv(FILE_PATH, sep='\t', header=None, names=COL_NAMES)
    data = raw_data[['ReviewText', 'Rating', 'Aspects']]
    data = data[data['Aspects'] != 'No filling in'] # Filter none aspects
    data.Aspects = data.Aspects.str.split('|').values
    
    '''Split aspects to new columns'''
    aspects_splitted = split_aspect(data.Aspects.values)
    for i in range(len(ASPECT_NAMES)):
        data[ASPECT_NAMES[i]] = aspects_splitted[i,:]
        
    data['input_ids'], tokenizer = tokenize_data(data.ReviewText.values) # Generate input_ids from review text
    
    return data, tokenizer


def word_class_freq(data, aspect_name, aspect_class=3):
    temp = np.zeros((33000, aspect_class), np.int)
    ids = data.input_ids.values
    labels = data[aspect_name].values

    for sub_ids, sub_lb in zip(ids, labels):
        set_ids = set(sub_ids)
        for ids in set_ids:
            temp[ids, sub_lb] += 1
    
    return temp


def calculate_llr(temp_df, labels):
    N = data.shape[0]
    total_scores = []

    for i in temp_df.index.values:
        llr_scores = []
        for class_ in [0,1,2]:
            num_class_doc = np.sum(labels == class_)
            n11 = temp_df.loc[i, class_]
            n10 = num_class_doc - n11
            n01 = temp_df.loc[i, 'total'] - n11
            n00 = (N - n11 - n10 - n01)
            pt = (1e-10 + n11 + n01)/N
            p1 = n11/(1e-10 + n11 + n10)
            p2 = n01/(1e-10 + n01 + n00)


            try:
                e1 = n11 * (math.log(pt) - math.log(p1))
            except:
                e1 = 0
            try:
                e2 = n10 * (math.log(1-pt) - math.log(1-p1))
            except:
                e2 = 0
            try:
                e3 = n01 * (math.log(pt) - math.log(p2))
            except:
                e3 = 0
            try:
                e4 = n00 * (math.log(1-pt) - math.log(1-p2))
            except:
                e4 = 0

            llr_score = -2 * (e1+e2+e3+e4)
            if n11 < n01:
                llr_score = 0
            llr_scores.append(llr_score)

        total_scores.append(llr_scores)
    
    llr_df = pd.DataFrame(np.array(total_scores), index=temp_df.index, columns=temp_df.columns.values[:-1])

    return llr_df


def generate_llr_score(data, aspect):
    temp = word_class_freq(data, aspect)
    
    temp_df = pd.DataFrame(temp)
    temp_df['total'] = np.sum(temp, -1)
    temp_df = temp_df[temp_df['total'] != 0]
    temp_df = temp_df.drop(0,0)
    
    return calculate_llr(temp_df, data[aspect].values)

Using TensorFlow backend.


# LOAD PRE-PROCESSED DATA (IF ANY)

In [2]:
# Load n process read data
data = pd.read_csv('./data/pre-processed_50.csv', sep='\t', index_col=0)

for col in tqdm.notebook.tqdm(['input_ids', 'labels', 'llr_embeddings']):
    data[col] = [ast.literal_eval(i) for i in tqdm.notebook.tqdm(data[col].values)]


# CALCULATE WEIGHT LOSS
labels = pd.DataFrame([i for i in data.labels])
beta = 0.9999
#beta = (data.shape[0] - 1) / data.shape[0]
weight_loss = []

for i in labels:
    n_sample = labels.loc[:, i].value_counts(0, 0).values
    n_sample = 1.0 - np.power(beta, n_sample)
    n_sample = (1.0 - beta) / n_sample
    n_sample = n_sample / np.sum(n_sample)
    weight_loss.append(n_sample)

weight_loss = torch.tensor(weight_loss, device=DEVICE).float()
weight_loss

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=42867.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=42867.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=42867.0), HTML(value='')))





tensor([[0.0924, 0.0643, 0.8433],
        [0.0776, 0.0554, 0.8670],
        [0.2724, 0.2198, 0.5078],
        [0.1024, 0.0561, 0.8415],
        [0.1517, 0.0956, 0.7527],
        [0.4734, 0.1872, 0.3394],
        [0.4486, 0.1954, 0.3559],
        [0.3372, 0.2571, 0.4057]], device='cuda:0')

# GET DATA

In [2]:
data, tokenizer = get_data('./data/data_v3.txt', COL_NAMES)
data['labels'] = list(data.iloc[:, 3:11].values)

# CALCULATE WEIGHT LOSS
labels = pd.DataFrame([i for i in data.labels])
beta = 0.9999
#beta = (data.shape[0] - 1) / data.shape[0]
weight_loss = []

for i in labels:
    n_sample = labels.loc[:, i].value_counts(0, 0).values
    n_sample = 1.0 - np.power(beta, n_sample)
    n_sample = (1.0 - beta) / n_sample
    n_sample = n_sample / np.sum(n_sample)
    weight_loss.append(n_sample)

weight_loss = torch.tensor(weight_loss, device=DEVICE).float()

data

Unnamed: 0,ReviewText,Rating,Aspects,LEG,SIT,ENT,CUS,VOM,CLE,CKI,FNB,input_ids,labels
0,With everyone trying to get home in the Covid ...,4,"[Legroom:4, Seat comfort:4, In-flight Entertai...",1,1,1,1,1,1,1,1,"[101, 2007, 3071, 2667, 2000, 2131, 2188, 1999...","[1, 1, 1, 1, 1, 1, 1, 1]"
1,"Ad a lot of people did, we had to scramble to ...",5,"[Legroom:4, Seat comfort:4, In-flight Entertai...",1,1,1,1,1,1,1,1,"[101, 4748, 1037, 2843, 1997, 2111, 2106, 1010...","[1, 1, 1, 1, 1, 1, 1, 1]"
2,After coming into Changi airport and worrying ...,4,"[Legroom:4, Seat comfort:4, In-flight Entertai...",1,1,1,1,1,1,1,1,"[101, 2044, 2746, 2046, 11132, 2072, 3199, 199...","[1, 1, 1, 1, 1, 1, 1, 1]"
3,"Great service, great plane, great pricing. We ...",4,"[Legroom:4, Seat comfort:4, In-flight Entertai...",1,1,1,1,1,1,1,1,"[101, 2307, 2326, 1010, 2307, 4946, 1010, 2307...","[1, 1, 1, 1, 1, 1, 1, 1]"
4,My husband and I were to fly home from Houston...,1,"[Legroom:5, Seat comfort:5, In-flight Entertai...",1,1,1,0,1,1,1,1,"[101, 2026, 3129, 1998, 1045, 2020, 2000, 4875...","[1, 1, 1, 0, 1, 1, 1, 1]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
190981,We booked to fly from Heathrow to Newark. The ...,1,"[Legroom:2, Seat comfort:2, In-flight Entertai...",0,0,0,0,0,2,2,2,"[101, 2057, 17414, 2000, 4875, 2013, 9895, 105...","[0, 0, 0, 0, 0, 2, 2, 2]"
190982,"Love Virgin, great staff, food good, quality o...",5,"[Legroom:5, Seat comfort:5, In-flight Entertai...",1,1,1,1,1,2,2,2,"[101, 2293, 6261, 1010, 2307, 3095, 1010, 2833...","[1, 1, 1, 1, 1, 2, 2, 2]"
190983,"Virgin upper class is outstanding, really very...",5,"[Legroom:5, Seat comfort:4, In-flight Entertai...",1,1,1,1,1,2,2,2,"[101, 6261, 3356, 2465, 2003, 5151, 1010, 2428...","[1, 1, 1, 1, 1, 2, 2, 2]"
190984,Virgins premium economy is the best I have com...,5,"[Legroom:3, Seat comfort:5, In-flight Entertai...",0,1,1,0,0,2,2,2,"[101, 6261, 2015, 12882, 4610, 2003, 1996, 219...","[0, 1, 1, 0, 0, 2, 2, 2]"


# CALCULATE LLR SCORES & WORDLIST

In [None]:
# Stopwords in English
stopwords_ids = tokenizer.convert_tokens_to_ids(stopwords.words('english'))

llr_scores = {}

for aspect in tqdm.notebook.tqdm(ASPECT_NAMES, desc='Calculate LLR scores'):
    llr_df = generate_llr_score(data, aspect)
    
    # Clear stopword ids
    llr_df = llr_df.drop(stopwords_ids, 0)
    
    llr_scores[aspect] = llr_df


llr_words = dict()

for aspect in tqdm.notebook.tqdm(ASPECT_NAMES, desc='Generate top LLR words'):
    kw_label = dict()
    for class_ in [0,1,2]:
        # Sort keywords based on aspect, class and top_n words
        kw_list = list(llr_scores[aspect][class_].sort_values(ascending=False)[:TOPN].index)
        
        kw_label[class_] = kw_list
        
    llr_words[aspect] = kw_label

llr_embedding_list = []

for idx in tqdm.notebook.tqdm(data.index):
    tokens = data.input_ids[idx]
    
    llr_embedding = []
    for aspect in ASPECT_NAMES:
        temp = [3] * tokens.shape[0]
        for j in range(tokens.shape[0]):
            for class_, wordlist in llr_words[aspect].items():
                if tokens[j] in wordlist:
                    temp[j] = class_
                    break
        llr_embedding.append(temp)
    
    llr_embedding_list.append(llr_embedding)

#data['llr_embeddings'] = [[[0]*512]*8] * data.shape[0]
data['llr_embeddings'] = llr_embedding_list

# Turn numpy array to list to store easier
for i in data.keys()[-3:]:
    data[i] = data[i].map(list)
    
data

HBox(children=(FloatProgress(value=0.0, description='Calculate LLR scores', max=8.0, style=ProgressStyle(descr…




HBox(children=(FloatProgress(value=0.0, description='Generate top LLR words', max=8.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, max=152574.0), HTML(value='')))

In [None]:
data.iloc[:,-3:].to_csv('./data/pre-processed_50_v3.csv', sep='\t')
data.iloc[:1000,-3:].to_csv('./data/sample_50_v3.csv', sep='\t')

# INITATE MODEL

In [3]:
config = BertConfig.from_pretrained('bert-base-uncased')
config.num_aspect = len(ASPECT_NAMES)
model = BertBonzWeightLoss(config)
model.to(DEVICE)

model.load_pretrained_weight() # Load pre-trained BERT weights for BERT's layers 
model.llr_embed_pad() # Set LLR embedding padding idx to 0-value tensor



''' Using apex for faster training
optimizer_list = []
for i in range(10):
    optimizer_list.append(AdamW(model.parameters(), lr=3e-5, correct_bias=False))

model = amp.initialize(model, opt_level="O2", verbosity=0)
''' 

''' Save origin state dict of Model and Optimizer'''
torch.save(model.state_dict(), 'origin_sd.pth')
origin_sd = torch.load('origin_sd.pth')


Succesfully load pre-trained weights


In [7]:
optimizer = AdamW(model.parameters(), lr=3e-5, correct_bias=False)
_, optimizer = amp.initialize([], optimizer, opt_level='O2')

Selected optimization level O2:  FP16 training with FP32 batchnorm and FP32 master weights.

Defaults for this optimization level are:
enabled                : True
opt_level              : O2
cast_model_type        : torch.float16
patch_torch_functions  : False
keep_batchnorm_fp32    : True
master_weights         : True
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O2
cast_model_type        : torch.float16
patch_torch_functions  : False
keep_batchnorm_fp32    : True
master_weights         : True
loss_scale             : dynamic


# Find Best Learning-Rate

In [4]:
# Training with K-fold
new_data = data
BATCH_SIZE = 7
EPOCH = 5
LEARNING_RATE = 1e-6

# Freeze BERT
model.freeze()

""" TRAINING """
dataset = BonzDataset(data.iloc[:,-3:], None)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=LEARNING_RATE)

''' STORING WHILE TRAINING'''
lr_list = []
loss_list = []

model.train()

for i in range(6):
    # Setup scheduler each period
    scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer, 
                                                  base_lr=0, 
                                                  max_lr=LEARNING_RATE*10, 
                                                  step_size_up=EPOCH, 
                                                  cycle_momentum=False)
    scheduler.step()
    
    for epoch in tqdm.notebook.trange(EPOCH):

        # Load original weights
        model.load_state_dict(origin_sd) 
        loss_train = 0

        for idx, (a, b, c) in enumerate(dataloader):
            optimizer.zero_grad()
            predict, loss = model(a.to(DEVICE), 
                                  b.to(DEVICE), 
                                  c.to(DEVICE), 
                                  weight_loss=weight_loss)[:2]   # This is L-BERT
            loss.backward()
            loss_train += loss.item()
            optimizer.step()

        current_lr = optimizer.state_dict()["param_groups"][0]["lr"]
        print(f'Epoch: {epoch}, Loss = {loss_train:.2f}, Learning Rate = {current_lr:.2e}')

        # Store metrics
        lr_list.append(current_lr)
        loss_list.append(loss_train)

        # Update learning rate
        scheduler.step()
    
    LEARNING_RATE *= 10

    
# Print losss per learning rate
for a, b in zip(lr_list, loss_list):
    print(f'{a:.0e}\t{b:.2f}')



HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

Epoch: 0, Loss = 6244.56, Learning Rate = 2.00e-06
Epoch: 1, Loss = 5926.94, Learning Rate = 4.00e-06
Epoch: 2, Loss = 5830.10, Learning Rate = 6.00e-06
Epoch: 3, Loss = 5766.99, Learning Rate = 8.00e-06
Epoch: 4, Loss = 5761.07, Learning Rate = 1.00e-05



HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

Epoch: 0, Loss = 5547.02, Learning Rate = 2.00e-05
Epoch: 1, Loss = 5400.37, Learning Rate = 4.00e-05
Epoch: 2, Loss = 5379.64, Learning Rate = 6.00e-05
Epoch: 3, Loss = 5304.28, Learning Rate = 8.00e-05
Epoch: 4, Loss = 5252.24, Learning Rate = 1.00e-04



HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

Epoch: 0, Loss = 5216.68, Learning Rate = 2.00e-04
Epoch: 1, Loss = 5138.04, Learning Rate = 4.00e-04
Epoch: 2, Loss = 5089.22, Learning Rate = 6.00e-04
Epoch: 3, Loss = 5098.34, Learning Rate = 8.00e-04
Epoch: 4, Loss = 4983.20, Learning Rate = 1.00e-03



HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

Epoch: 0, Loss = 5140.37, Learning Rate = 2.00e-03
Epoch: 1, Loss = 5198.16, Learning Rate = 4.00e-03
Epoch: 2, Loss = 5744.86, Learning Rate = 6.00e-03
Epoch: 3, Loss = 5603.75, Learning Rate = 8.00e-03
Epoch: 4, Loss = 5859.97, Learning Rate = 1.00e-02



HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

Epoch: 0, Loss = 7149.28, Learning Rate = 2.00e-02
Epoch: 1, Loss = 9230.91, Learning Rate = 4.00e-02
Epoch: 2, Loss = 11248.39, Learning Rate = 6.00e-02
Epoch: 3, Loss = 16239.48, Learning Rate = 8.00e-02
Epoch: 4, Loss = 21634.04, Learning Rate = 1.00e-01



HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

Epoch: 0, Loss = 38416.24, Learning Rate = 2.00e-01
Epoch: 1, Loss = 84251.29, Learning Rate = 4.00e-01
Epoch: 2, Loss = 121610.12, Learning Rate = 6.00e-01
Epoch: 3, Loss = 165575.69, Learning Rate = 8.00e-01
Epoch: 4, Loss = 229921.73, Learning Rate = 1.00e+00

2e-06	6244.56
4e-06	5926.94
6e-06	5830.10
8e-06	5766.99
1e-05	5761.07
2e-05	5547.02
4e-05	5400.37
6e-05	5379.64
8e-05	5304.28
1e-04	5252.24
2e-04	5216.68
4e-04	5138.04
6e-04	5089.22
8e-04	5098.34
1e-03	4983.20
2e-03	5140.37
4e-03	5198.16
6e-03	5744.86
8e-03	5603.75
1e-02	5859.97
2e-02	7149.28
4e-02	9230.91
6e-02	11248.39
8e-02	16239.48
1e-01	21634.04
2e-01	38416.24
4e-01	84251.29
6e-01	121610.12
8e-01	165575.69
1e+00	229921.73


## Training with K-FOLD

In [28]:
# Training with K-fold
#new_data = data.sample(frac=1).reset_index(drop=True)
new_data = data
kf = KFold(10)
BATCH_SIZE = 7
EPOCH = 5
LEARNING_RATE = 1e-5

NUM_TRAINING_STEPS = 73 * 5
NUM_TRAINING_STEPS//10

last_predict = []
i = 0
for train_idx, test_idx in tqdm.notebook.tqdm(kf.split(new_data)):
    train_data = new_data.iloc[train_idx]
    test_data = new_data.iloc[test_idx]
    
    print(model.load_state_dict(origin_sd))
    
    ''' Get optimizer for each KFold
    optimizer = optimizer_list[i]
    optimizer_list[i] = ''
    i += 1
    '''
    '''
    NUM_TRAINING_STEPS = (new_data.shape[0]//BATCH_SIZE + 1) * EPOCH
    scheduler = get_linear_schedule_with_warmup(optimizer, NUM_TRAINING_STEPS//10, NUM_TRAINING_STEPS)
    '''

    """ TRAINING """
    dataset = BonzDataset(train_data.iloc[:,-3:], None)
    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, correct_bias=False)
    model.train()
    for epoch in tqdm.notebook.trange(EPOCH):
        dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
        loss_train = 0
        for idx, (a, b, c) in enumerate(dataloader):
            optimizer.zero_grad()
            predict, loss = model(a.to(DEVICE), 
                                  b.to(DEVICE), 
                                  c.to(DEVICE), 
                                  weight_loss=weight_loss)[:2]   # This is L-BERT
            #predict, loss = model(a.to(DEVICE), None, c.to(DEVICE))[:2]   # This is normal BERT

            ''' Using apex fp16 loss 
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            '''

            ''' normal loss '''
            loss.backward()

            #nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            #scheduler.step()

            loss_train += loss.item()

        print(f'Epoch: {epoch}, Loss = {loss_train:.2f}')
                
        
    ''' TESTING  ''' 
    model.eval()
    dataset = BonzDataset(test_data.iloc[:,-3:], None)
    dataloader = DataLoader(dataset, batch_size=40)

    for idx, (a, b, c) in enumerate(dataloader):
        with torch.no_grad():
            predict = model(a.to(DEVICE), b.to(DEVICE))[0] # This is L-BERT
            #predict = model(a.to(DEVICE), None)[0] # This is normal BERT
        last_predict.extend(predict.detach().cpu().numpy().tolist())


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

<All keys matched successfully>


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

Epoch: 0, Loss = 4423.82
Epoch: 1, Loss = 3950.26
Epoch: 2, Loss = 3725.88
Epoch: 3, Loss = 3474.10
Epoch: 4, Loss = 3177.60

<All keys matched successfully>


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

Epoch: 0, Loss = 4504.44
Epoch: 1, Loss = 4183.29
Epoch: 2, Loss = 3992.96
Epoch: 3, Loss = 3821.90
Epoch: 4, Loss = 3661.60

<All keys matched successfully>


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

Epoch: 0, Loss = 4355.87
Epoch: 1, Loss = 3910.19
Epoch: 2, Loss = 3676.94
Epoch: 3, Loss = 3424.62
Epoch: 4, Loss = 3136.30

<All keys matched successfully>


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

Epoch: 0, Loss = 4321.13
Epoch: 1, Loss = 3876.30
Epoch: 2, Loss = 3656.44
Epoch: 3, Loss = 3391.58
Epoch: 4, Loss = 3085.10

<All keys matched successfully>


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

Epoch: 0, Loss = 4403.65
Epoch: 1, Loss = 3983.06
Epoch: 2, Loss = 3770.46
Epoch: 3, Loss = 3549.74
Epoch: 4, Loss = 3304.46

<All keys matched successfully>


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

Epoch: 0, Loss = 4430.94
Epoch: 1, Loss = 3881.30
Epoch: 2, Loss = 3646.53
Epoch: 3, Loss = 3418.56
Epoch: 4, Loss = 3144.37

<All keys matched successfully>


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

Epoch: 0, Loss = 4310.70
Epoch: 1, Loss = 3850.37
Epoch: 2, Loss = 3602.55
Epoch: 3, Loss = 3324.71
Epoch: 4, Loss = 3024.26

<All keys matched successfully>


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

Epoch: 0, Loss = 4700.06
Epoch: 1, Loss = 4238.62
Epoch: 2, Loss = 4029.17
Epoch: 3, Loss = 3881.45
Epoch: 4, Loss = 3719.69

<All keys matched successfully>


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

Epoch: 0, Loss = 4347.60
Epoch: 1, Loss = 3920.74
Epoch: 2, Loss = 3693.45
Epoch: 3, Loss = 3463.86
Epoch: 4, Loss = 3192.02

<All keys matched successfully>


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

Epoch: 0, Loss = 4513.03
Epoch: 1, Loss = 3919.64
Epoch: 2, Loss = 3674.30
Epoch: 3, Loss = 3408.24
Epoch: 4, Loss = 3103.42




# EVALUATION STEP

In [57]:
"""
#sd = torch.load('./saved_state_dict/epoch5lr2e5.pth')
#model.load_state_dict(sd)
model.eval()

dataset = BonzDataset(new_data.iloc[:,-3:], None)
dataloader = DataLoader(dataset, batch_size=40)

last_predict = []
for idx, (a, b, c) in enumerate(tqdm.notebook.tqdm(dataloader)):
    with torch.no_grad():
        predict = model(a.to(DEVICE), b.to(DEVICE))[0]
    last_predict.extend(predict.detach().cpu().numpy().tolist())
"""

HBox(children=(FloatProgress(value=0.0, max=3528.0), HTML(value='')))




In [29]:
last_predict_ = torch.tensor(last_predict)
last_predict_ = torch.softmax(last_predict_, 1)
y_predict = torch.argmax(last_predict_, 1)
y_true = np.asarray(list(new_data.labels))

for i, asp in enumerate(ASPECT_NAMES):
    print(f'{asp}:\n{classification_report(y_true[:,i], y_predict[:,i])}')
    
for i, asp in enumerate(ASPECT_NAMES):
    print(f'{asp}, {precision_score(y_true[:,i], y_predict[:,i], average="macro")*100:.2f},\
    {recall_score(y_true[:,i], y_predict[:,i], average="macro")*100:.2f},\
    {f1_score(y_true[:,i], y_predict[:,i], average="macro")*100:.2f},\
    {accuracy_score(y_true[:,i], y_predict[:,i])*100:.2f}')

LEG:
              precision    recall  f1-score   support

           0       0.57      0.46      0.51     10949
           1       0.82      0.88      0.85     31161
           2       0.27      0.21      0.23       757

    accuracy                           0.76     42867
   macro avg       0.56      0.52      0.53     42867
weighted avg       0.75      0.76      0.75     42867

SIT:
              precision    recall  f1-score   support

           0       0.64      0.53      0.58     11434
           1       0.84      0.89      0.86     30804
           2       0.26      0.22      0.24       629

    accuracy                           0.79     42867
   macro avg       0.58      0.55      0.56     42867
weighted avg       0.78      0.79      0.78     42867

ENT:
              precision    recall  f1-score   support

           0       0.55      0.49      0.52     13359
           1       0.78      0.81      0.79     24475
           2       0.36      0.40      0.38      5033

    a

In [30]:
torch.save(y_predict, 'result/LCAT_5epoch_1e5_WeightClass_xavier.pt')