In [21]:
# https://medium.com/udacity-pytorch-challengers/ideas-on-how-to-fine-tune-a-pre-trained-model-in-pytorch-184c47185a20
# https://towardsdatascience.com/adaptive-and-cyclical-learning-rates-using-pytorch-2bf904d18dee

import re
import torch
import numpy as np
import pandas as pd
import logging
import time
import torch.nn as nn
import tqdm
import math
import ast
import nltk

from nltk.corpus import stopwords
from tensorflow import keras
from tensorflow.keras import layers
from transformers import BertConfig, BertForSequenceClassification, BertTokenizer, BertTokenizerFast, BertModel, AdamW, TFBertModel
from transformers.optimization import get_linear_schedule_with_warmup
from transformers.modeling_bert import BertEmbeddings, BertSelfAttention
from torch.utils.data import Dataset, DataLoader
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import KFold
from apex import amp, optimizers

logging.getLogger("transformers.tokenization_utils").setLevel(logging.ERROR)

COL_NAMES = ['TopNumber', 'AirlineName','ReviewerName','Rating','ReviewDate','ReviewTitle',\
             'ReviewText','Tags', 'DateofTravel', 'Aspects', 'ResponserName', 'ResponseDate', 'ResponseText', 'ReviewerProfileUrl',\
             'AirlineNation', 'CrawlTime']

PRE_TRAINED = 'bert-base-uncased'
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
ASPECT_NAMES = ['LEG', 'SIT', 'ENT', 'CUS', 'VOM', 'CLE', 'CKI', 'FNB']
VOCAB_DIC = BertTokenizerFast.from_pretrained(PRE_TRAINED).get_vocab()
TOPN = 150


# This one is implemented with weight loss per class            
class BertBonzWeightLoss(BertModel):
    def __init__(self, config):
        super(BertBonzWeightLoss, self).__init__(config)
        self.config = config
        self.embeddings.llr_embeddings = nn.ModuleList(nn.Embedding(4, 768, 3) for _ in range(len(ASPECT_NAMES)))
        self.classifier = nn.Linear(768, config.num_aspect*3)
        self.init_weights()
        self.embeddings.llr_embeddings.apply(self._xavier)
        self.pooler.apply(self._xavier)
        self.classifier.apply(self._xavier)
        
    def forward(self, 
                input_ids=None, 
                llr_ids=None, 
                labels=None, 
                token_type_ids=None, 
                position_ids=None,
                weight_loss=None):
        # BERT EMBEDDINGS NEW
        input_shape = input_ids.size()
        seq_length = input_shape[1]
        device = input_ids.device
        
        if position_ids is None:
            position_ids = torch.arange(seq_length, dtype=torch.long, device=device)
            position_ids = position_ids.unsqueeze(0).expand(input_shape)
        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        inputs_embeds = self.embeddings.word_embeddings(input_ids)
        position_embeddings = self.embeddings.position_embeddings(position_ids)
        token_type_embeddings = self.embeddings.token_type_embeddings(token_type_ids)
        
        if llr_ids is not None:
            temp = [self.embeddings.llr_embeddings[i](llr_ids[:,i,:]) for i in range(self.config.num_aspect)]
            llr_embeddings = sum(temp)
        else:
            llr_embeddings = torch.zeros(inputs_embeds.size(), device=device).fill_(3).long()
        
        embeddings = inputs_embeds + position_embeddings + token_type_embeddings + llr_embeddings
        embeddings = self.embeddings.LayerNorm(embeddings)
        embeddings = self.embeddings.dropout(embeddings)
        
        
        # BERT ENCODER
        encoder_outputs = self.encoder(
            embeddings,
            attention_mask=None,
            head_mask=[None]*12,
            encoder_hidden_states=None,
            encoder_attention_mask=None,
            output_attentions=self.config.output_attentions
        )
        sequence_output = encoder_outputs[0]
        
        # CLASSIFIER
        CLS_token = sequence_output[:,0]
        predict = self.classifier(CLS_token)
        
        loss_fn = nn.functional.cross_entropy
        if labels is not None:
            if weight_loss is None:
                loss = loss_fn(predict.view(input_shape[0], 3,-1), labels)
            else:
                loss = torch.tensor(0).float().to(DEVICE)
                for asp_i in range(len(ASPECT_NAMES)):
                    loss += loss_fn(predict.view(input_shape[0], 3,-1)[:,:,asp_i], labels[:,asp_i], weight_loss[asp_i, :])
                loss /= len(ASPECT_NAMES)
                    
            outputs = (predict.view(input_shape[0], 3,-1), loss, CLS_token, sequence_output) + encoder_outputs[1:]  # add hidden_states and attentions if they are here
        else:
            outputs = (predict.view(input_shape[0], 3,-1), CLS_token, sequence_output) + encoder_outputs[1:]
        return outputs
    
    
    def load_pretrained_weight(self):
        sd = self.state_dict()
        sd_bert_pretrained = BertModel.from_pretrained(PRE_TRAINED).state_dict()
        for k in sd_bert_pretrained.keys():
            if k in sd.keys():
                sd[k] = sd_bert_pretrained[k]
        self.load_state_dict(sd)
        print('Succesfully load pre-trained weights')
        
    def llr_embed_pad(self):
        for i in range(len(self.embeddings.llr_embeddings)):
            temp = self.embeddings.llr_embeddings[i].weight.data
            temp[-1,:] = torch.zeros(temp.size(1))
        
    def _xavier(self, module):
        for name, param in module.named_parameters():
            if 'weight' in name:
                nn.init.xavier_normal_(param)
            elif 'bias' in name:
                param.data.zero_()
                
    def unfreeze(self):
        for param in self.parameters():
            param.requires_grad = True
                
    def freeze(self):
        for param in self.parameters():
            param.requires_grad = False
        for param in self.embeddings.llr_embeddings.parameters():
            param.requires_grad = True
        for param in self.pooler.parameters():
            param.requires_grad = True
        for param in self.classifier.parameters():
            param.requires_grad = True            
    

class BonzDataset(Dataset):
    def __init__(self, data, llr_words):
        self.input_ids = torch.LongTensor(list(data.input_ids))
        self.llr_embeddings = torch.LongTensor(list(data.llr_embeddings))
        if 'llr_embeddings' in data.columns:
            self.llr_embeddings = torch.LongTensor(list(data.llr_embeddings))
        else:
            self.llr_embeddings = torch.zeros(data.shape[0],1).long()
        if 'labels' in data.columns:
            self.labels = torch.LongTensor(list(data.labels))
        else:
            self.labels = None
        self.llr_words = llr_words
        
    def __len__(self):
        return self.input_ids.shape[0]
    
    def __getitem__(self, idx):
        '''
        tokens = self.data.input_ids[idx]
        
        llr_embedding = []
        for aspect in ASPECT_NAMES:
            temp = [3] * tokens.shape[0]
            for j in range(tokens.shape[0]):
                for class_, wordlist in llr_words[aspect].items():
                    if tokens[j] in wordlist:
                        temp[j] = class_
                        break
            llr_embedding.append(temp)
        
        llr_embedding = torch.stack([torch.LongTensor(i) for i in llr_embedding], 0)
        
        
        outputs = (torch.LongTensor(tokens), llr_embedding)
        
        if 'labels' in self.data.columns:
            outputs = (torch.LongTensor(tokens), llr_embedding, torch.LongTensor(self.data.labels[idx]))
        '''
        if self.labels is None:
            outputs = (self.input_ids[idx], self.llr_embeddings[idx])
        else:
            outputs = (self.input_ids[idx], self.llr_embeddings[idx], self.labels[idx])
        
        return outputs
    

    
def split_aspect(data):
    temp = np.full((8, data.shape[0]), 2, np.int)
    for idx in range(data.shape[0]):
        aspect = data[idx]
        for i, asp in enumerate(['Legroom', 'Seat', 'Entertainment', 'Customer', 'Value', 'Cleanliness', 'Check-in', 'Food']):
            for sub_asp in aspect:
                if asp in sub_asp:
                    pol = int(sub_asp[-1])
                    temp[i, idx] = 1 if pol > 3 else 0
                    break
    return temp
            

def tokenize_data(data):
    tokenizer = BertTokenizerFast.from_pretrained(PRE_TRAINED)
    input_ids = tokenizer(list(data))['input_ids']
    input_ids = pad_sequences(input_ids, maxlen=512, padding='post', truncating='post')
    
    return (list(input_ids), tokenizer)
    
    
def get_data(FILE_PATH, COL_NAMES):
    raw_data = pd.read_csv(FILE_PATH, sep='\t', header=None, names=COL_NAMES)
    data = raw_data[['ReviewText', 'Rating', 'Aspects']]
    data = data[data['Aspects'] != 'No filling in'] # Filter none aspects
    data.Aspects = data.Aspects.str.split('|').values
    
    '''Split aspects to new columns'''
    aspects_splitted = split_aspect(data.Aspects.values)
    for i in range(len(ASPECT_NAMES)):
        data[ASPECT_NAMES[i]] = aspects_splitted[i,:]
        
    data['input_ids'], tokenizer = tokenize_data(data.ReviewText.values) # Generate input_ids from review text
    
    return data, tokenizer


def word_class_freq(data, aspect_name, aspect_class=3):
    temp = np.zeros((33000, aspect_class), np.int)
    ids = data.input_ids.values
    labels = data[aspect_name].values

    for sub_ids, sub_lb in zip(ids, labels):
        set_ids = set(sub_ids)
        for ids in set_ids:
            temp[ids, sub_lb] += 1
    
    return temp


def calculate_llr(temp_df, labels):
    N = data.shape[0]
    total_scores = []

    for i in temp_df.index.values:
        llr_scores = []
        for class_ in [0,1,2]:
            num_class_doc = np.sum(labels == class_)
            n11 = temp_df.loc[i, class_]
            n10 = num_class_doc - n11
            n01 = temp_df.loc[i, 'total'] - n11
            n00 = (N - n11 - n10 - n01)
            pt = (1e-10 + n11 + n01)/N
            p1 = n11/(1e-10 + n11 + n10)
            p2 = n01/(1e-10 + n01 + n00)


            try:
                e1 = n11 * (math.log(pt) - math.log(p1))
            except:
                e1 = 0
            try:
                e2 = n10 * (math.log(1-pt) - math.log(1-p1))
            except:
                e2 = 0
            try:
                e3 = n01 * (math.log(pt) - math.log(p2))
            except:
                e3 = 0
            try:
                e4 = n00 * (math.log(1-pt) - math.log(1-p2))
            except:
                e4 = 0

            llr_score = -2 * (e1+e2+e3+e4)
            if n11 < n01:
                llr_score = 0
            llr_scores.append(llr_score)

        total_scores.append(llr_scores)
    
    llr_df = pd.DataFrame(np.array(total_scores), index=temp_df.index, columns=temp_df.columns.values[:-1])

    return llr_df


def generate_llr_score(data, aspect):
    temp = word_class_freq(data, aspect)
    
    temp_df = pd.DataFrame(temp)
    temp_df['total'] = np.sum(temp, -1)
    temp_df = temp_df[temp_df['total'] != 0]
    temp_df = temp_df.drop(0,0)
    
    return calculate_llr(temp_df, data[aspect].values)

# LOAD PRE-PROCESSED DATA (IF ANY)

In [2]:
# Load n process read data
data = pd.read_csv('./data/pre-processed_150_v3.csv', sep='\t', index_col=0)

for col in tqdm.notebook.tqdm(['input_ids', 'labels', 'llr_embeddings']):
    data[col] = [ast.literal_eval(i) for i in tqdm.notebook.tqdm(data[col].values)]


# CALCULATE WEIGHT LOSS
labels = pd.DataFrame([i for i in data.labels])
beta = 0.9999
#beta = (data.shape[0] - 1) / data.shape[0]
weight_loss = []

for i in labels:
    n_sample = labels.loc[:, i].value_counts(0, 0).values
    n_sample = 1.0 - np.power(beta, n_sample)
    n_sample = (1.0 - beta) / n_sample
    n_sample = n_sample / np.sum(n_sample)
    weight_loss.append(n_sample)

weight_loss = torch.tensor(weight_loss, device=DEVICE).float()
weight_loss

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=152574.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=152574.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=152574.0), HTML(value='')))





tensor([[0.1314, 0.1293, 0.7393],
        [0.1193, 0.1175, 0.7633],
        [0.3078, 0.3041, 0.3881],
        [0.1410, 0.1291, 0.7299],
        [0.2203, 0.2106, 0.5691],
        [0.3805, 0.3023, 0.3173],
        [0.3706, 0.3069, 0.3225],
        [0.3339, 0.3270, 0.3391]], device='cuda:0')

In [3]:
new_data = {'index': data.index}
for col in data.columns:
    new_data[col] = torch.tensor(list(data[col]))


torch.save(new_data, './data/pre-processed_150_v3.pt')

In [80]:
a = torch.load('./data/pre-processed_150_v3.pt')
for col in a:
    a[col] = a[col].tolist()

data = pd.DataFrame(a, a['index']).drop('index', 1)

# CALCULATE WEIGHT LOSS
labels = pd.DataFrame([i for i in data.labels])
beta = 0.9999
#beta = (data.shape[0] - 1) / data.shape[0]
weight_loss = []

for i in labels:
    n_sample = labels.loc[:, i].value_counts(0, 0).values
    n_sample = 1.0 - np.power(beta, n_sample)
    n_sample = (1.0 - beta) / n_sample
    n_sample = n_sample / np.sum(n_sample)
    weight_loss.append(n_sample)

weight_loss = torch.tensor(weight_loss, device=DEVICE).float()


data

Unnamed: 0,input_ids,labels,llr_embeddings
0,"[101, 2007, 3071, 2667, 2000, 2131, 2188, 1999...","[1, 1, 1, 1, 1, 1, 1, 1]","[[3, 3, 3, 1, 3, 1, 3, 3, 3, 3, 3, 3, 3, 3, 1,..."
1,"[101, 4748, 1037, 2843, 1997, 2111, 2106, 1010...","[1, 1, 1, 1, 1, 1, 1, 1]","[[3, 3, 3, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3, 1, 3,..."
2,"[101, 2044, 2746, 2046, 11132, 2072, 3199, 199...","[1, 1, 1, 1, 1, 1, 1, 1]","[[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,..."
3,"[101, 2307, 2326, 1010, 2307, 4946, 1010, 2307...","[1, 1, 1, 1, 1, 1, 1, 1]","[[3, 1, 1, 3, 1, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3,..."
4,"[101, 2026, 3129, 1998, 1045, 2020, 2000, 4875...","[1, 1, 1, 0, 1, 1, 1, 1]","[[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 3,..."
...,...,...,...
190981,"[101, 2057, 17414, 2000, 4875, 2013, 9895, 105...","[0, 0, 0, 0, 0, 2, 2, 2]","[[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 3, 3,..."
190982,"[101, 2293, 6261, 1010, 2307, 3095, 1010, 2833...","[1, 1, 1, 1, 1, 2, 2, 2]","[[3, 3, 3, 3, 1, 1, 3, 1, 1, 3, 3, 3, 3, 1, 3,..."
190983,"[101, 6261, 3356, 2465, 2003, 5151, 1010, 2428...","[1, 1, 1, 1, 1, 2, 2, 2]","[[3, 3, 3, 1, 3, 1, 3, 3, 3, 1, 3, 1, 3, 1, 1,..."
190984,"[101, 6261, 2015, 12882, 4610, 2003, 1996, 219...","[0, 1, 1, 0, 0, 2, 2, 2]","[[3, 3, 3, 1, 3, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3,..."


# GET DATA

In [35]:
data, tokenizer = get_data('./data/data_v3.txt', COL_NAMES)
data['labels'] = list(data.iloc[:, 3:11].values)

# CALCULATE WEIGHT LOSS
labels = pd.DataFrame([i for i in data.labels])
beta = 0.9999
#beta = (data.shape[0] - 1) / data.shape[0]
weight_loss = []

for i in labels:
    n_sample = labels.loc[:, i].value_counts(0, 0).values
    n_sample = 1.0 - np.power(beta, n_sample)
    n_sample = (1.0 - beta) / n_sample
    n_sample = n_sample / np.sum(n_sample)
    weight_loss.append(n_sample)

weight_loss = torch.tensor(weight_loss, device=DEVICE).float()

data

(190986, 16)
(152574, 3)


Unnamed: 0,ReviewText,Rating,Aspects,LEG,SIT,ENT,CUS,VOM,CLE,CKI,FNB,input_ids,labels
0,With everyone trying to get home in the Covid ...,4,"[Legroom:4, Seat comfort:4, In-flight Entertai...",1,1,1,1,1,1,1,1,"[101, 2007, 3071, 2667, 2000, 2131, 2188, 1999...","[1, 1, 1, 1, 1, 1, 1, 1]"
1,"Ad a lot of people did, we had to scramble to ...",5,"[Legroom:4, Seat comfort:4, In-flight Entertai...",1,1,1,1,1,1,1,1,"[101, 4748, 1037, 2843, 1997, 2111, 2106, 1010...","[1, 1, 1, 1, 1, 1, 1, 1]"
2,After coming into Changi airport and worrying ...,4,"[Legroom:4, Seat comfort:4, In-flight Entertai...",1,1,1,1,1,1,1,1,"[101, 2044, 2746, 2046, 11132, 2072, 3199, 199...","[1, 1, 1, 1, 1, 1, 1, 1]"
3,"Great service, great plane, great pricing. We ...",4,"[Legroom:4, Seat comfort:4, In-flight Entertai...",1,1,1,1,1,1,1,1,"[101, 2307, 2326, 1010, 2307, 4946, 1010, 2307...","[1, 1, 1, 1, 1, 1, 1, 1]"
4,My husband and I were to fly home from Houston...,1,"[Legroom:5, Seat comfort:5, In-flight Entertai...",1,1,1,0,1,1,1,1,"[101, 2026, 3129, 1998, 1045, 2020, 2000, 4875...","[1, 1, 1, 0, 1, 1, 1, 1]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
190981,We booked to fly from Heathrow to Newark. The ...,1,"[Legroom:2, Seat comfort:2, In-flight Entertai...",0,0,0,0,0,2,2,2,"[101, 2057, 17414, 2000, 4875, 2013, 9895, 105...","[0, 0, 0, 0, 0, 2, 2, 2]"
190982,"Love Virgin, great staff, food good, quality o...",5,"[Legroom:5, Seat comfort:5, In-flight Entertai...",1,1,1,1,1,2,2,2,"[101, 2293, 6261, 1010, 2307, 3095, 1010, 2833...","[1, 1, 1, 1, 1, 2, 2, 2]"
190983,"Virgin upper class is outstanding, really very...",5,"[Legroom:5, Seat comfort:4, In-flight Entertai...",1,1,1,1,1,2,2,2,"[101, 6261, 3356, 2465, 2003, 5151, 1010, 2428...","[1, 1, 1, 1, 1, 2, 2, 2]"
190984,Virgins premium economy is the best I have com...,5,"[Legroom:3, Seat comfort:5, In-flight Entertai...",0,1,1,0,0,2,2,2,"[101, 6261, 2015, 12882, 4610, 2003, 1996, 219...","[0, 1, 1, 0, 0, 2, 2, 2]"


# CALCULATE LLR SCORES & WORDLIST

In [5]:
# Stopwords in English
stopwords_ids = tokenizer.convert_tokens_to_ids(stopwords.words('english'))

llr_scores = {}

for aspect in tqdm.notebook.tqdm(ASPECT_NAMES, desc='Calculate LLR scores'):
    llr_df = generate_llr_score(data, aspect)
    
    # Clear stopword ids
    llr_df = llr_df.drop(stopwords_ids, 0)
    
    llr_scores[aspect] = llr_df


llr_words = dict()

for aspect in tqdm.notebook.tqdm(ASPECT_NAMES, desc='Generate top LLR words'):
    kw_label = dict()
    for class_ in [0,1,2]:
        # Sort keywords based on aspect, class and top_n words
        kw_list = list(llr_scores[aspect][class_].sort_values(ascending=False)[:TOPN].index)
        
        kw_label[class_] = kw_list
        
    llr_words[aspect] = kw_label

llr_embedding_list = []

for idx in tqdm.notebook.tqdm(data.index):
    tokens = data.input_ids[idx]
    
    llr_embedding = []
    for aspect in ASPECT_NAMES:
        temp = [3] * tokens.shape[0]
        for j in range(tokens.shape[0]):
            for class_, wordlist in llr_words[aspect].items():
                if tokens[j] in wordlist:
                    temp[j] = class_
                    break
        llr_embedding.append(temp)
    
    llr_embedding_list.append(llr_embedding)

#data['llr_embeddings'] = [[[0]*512]*8] * data.shape[0]
data['llr_embeddings'] = llr_embedding_list

# Turn numpy array to list to store easier
for i in data.keys()[-3:]:
    data[i] = data[i].map(list)
    
data

HBox(children=(FloatProgress(value=0.0, description='Calculate LLR scores', max=8.0, style=ProgressStyle(descr…




HBox(children=(FloatProgress(value=0.0, description='Generate top LLR words', max=8.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, max=152574.0), HTML(value='')))




Unnamed: 0,ReviewText,Rating,Aspects,LEG,SIT,ENT,CUS,VOM,CLE,CKI,FNB,input_ids,labels,llr_embeddings
0,With everyone trying to get home in the Covid ...,4,"[Legroom:4, Seat comfort:4, In-flight Entertai...",1,1,1,1,1,1,1,1,"[101, 2007, 3071, 2667, 2000, 2131, 2188, 1999...","[1, 1, 1, 1, 1, 1, 1, 1]","[[3, 3, 3, 1, 3, 1, 3, 3, 3, 3, 3, 3, 3, 3, 1,..."
1,"Ad a lot of people did, we had to scramble to ...",5,"[Legroom:4, Seat comfort:4, In-flight Entertai...",1,1,1,1,1,1,1,1,"[101, 4748, 1037, 2843, 1997, 2111, 2106, 1010...","[1, 1, 1, 1, 1, 1, 1, 1]","[[3, 3, 3, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3, 1, 3,..."
2,After coming into Changi airport and worrying ...,4,"[Legroom:4, Seat comfort:4, In-flight Entertai...",1,1,1,1,1,1,1,1,"[101, 2044, 2746, 2046, 11132, 2072, 3199, 199...","[1, 1, 1, 1, 1, 1, 1, 1]","[[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,..."
3,"Great service, great plane, great pricing. We ...",4,"[Legroom:4, Seat comfort:4, In-flight Entertai...",1,1,1,1,1,1,1,1,"[101, 2307, 2326, 1010, 2307, 4946, 1010, 2307...","[1, 1, 1, 1, 1, 1, 1, 1]","[[3, 1, 1, 3, 1, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3,..."
4,My husband and I were to fly home from Houston...,1,"[Legroom:5, Seat comfort:5, In-flight Entertai...",1,1,1,0,1,1,1,1,"[101, 2026, 3129, 1998, 1045, 2020, 2000, 4875...","[1, 1, 1, 0, 1, 1, 1, 1]","[[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 3,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190981,We booked to fly from Heathrow to Newark. The ...,1,"[Legroom:2, Seat comfort:2, In-flight Entertai...",0,0,0,0,0,2,2,2,"[101, 2057, 17414, 2000, 4875, 2013, 9895, 105...","[0, 0, 0, 0, 0, 2, 2, 2]","[[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 3, 3,..."
190982,"Love Virgin, great staff, food good, quality o...",5,"[Legroom:5, Seat comfort:5, In-flight Entertai...",1,1,1,1,1,2,2,2,"[101, 2293, 6261, 1010, 2307, 3095, 1010, 2833...","[1, 1, 1, 1, 1, 2, 2, 2]","[[3, 3, 3, 3, 1, 1, 3, 1, 1, 3, 3, 3, 3, 1, 3,..."
190983,"Virgin upper class is outstanding, really very...",5,"[Legroom:5, Seat comfort:4, In-flight Entertai...",1,1,1,1,1,2,2,2,"[101, 6261, 3356, 2465, 2003, 5151, 1010, 2428...","[1, 1, 1, 1, 1, 2, 2, 2]","[[3, 3, 3, 1, 3, 1, 3, 3, 3, 1, 3, 1, 3, 1, 1,..."
190984,Virgins premium economy is the best I have com...,5,"[Legroom:3, Seat comfort:5, In-flight Entertai...",0,1,1,0,0,2,2,2,"[101, 6261, 2015, 12882, 4610, 2003, 1996, 219...","[0, 1, 1, 0, 0, 2, 2, 2]","[[3, 3, 3, 1, 3, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3,..."


In [6]:
data.iloc[:,-3:].to_csv('./data/pre-processed_150_v3.csv', sep='\t')
data.iloc[:1000,-3:].to_csv('./data/sample_150_v3.csv', sep='\t')

# INITATE MODEL

In [4]:
config = BertConfig.from_pretrained('bert-base-uncased')
config.num_aspect = len(ASPECT_NAMES)
model = BertBonzWeightLoss(config)
model.to(DEVICE)

model.load_pretrained_weight() # Load pre-trained BERT weights for BERT's layers 
model.llr_embed_pad() # Set LLR embedding padding idx to 0-value tensor



''' Using apex for faster training
optimizer_list = []
for i in range(10):
    optimizer_list.append(AdamW(model.parameters(), lr=3e-5, correct_bias=False))

model = amp.initialize(model, opt_level="O2", verbosity=0)
''' 

''' Save origin state dict of Model and Optimizer'''
#torch.save(model.state_dict(), 'origin_sd.pth')
origin_sd = torch.load('./state_dict/freeze.pth')


Succesfully load pre-trained weights


# Find Best Learning-Rate

In [32]:
new_data = data
BATCH_SIZE = 7
EPOCH = 3
LEARNING_RATE = 1e-7
STEP = 100
SKIP = 1
smooth = 0.05

# Freeze BERT
model.unfreeze()

""" TRAINING """
dataset = BonzDataset(data.iloc[:,-3:], None)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=LEARNING_RATE)

''' STORING WHILE TRAINING'''
lr_list = []
loss_list = []

model.train()

'''
for i in range(3):
    # Setup scheduler each period
    scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer, 
                                                  base_lr=0, 
                                                  max_lr=LEARNING_RATE*10, 
                                                  step_size_up=EPOCH, 
                                                  cycle_momentum=False)
    scheduler.step()
    
    for epoch in tqdm.notebook.trange(EPOCH):

        # Load original weights
        model.load_state_dict(origin_sd) 
        loss_train = 0

        for idx, (a, b, c) in enumerate(dataloader):
            optimizer.zero_grad()
            predict, loss = model(a.to(DEVICE), 
                                  b.to(DEVICE), 
                                  c.to(DEVICE), 
                                  weight_loss=weight_loss)[:2]   # This is L-BERT
            loss.backward()
            loss_train += loss.item()
            optimizer.step()

        current_lr = optimizer.state_dict()["param_groups"][0]["lr"]
        print(f'Epoch: {epoch}, Loss = {loss_train:.2f}, Learning Rate = {current_lr:.2e}')

        # Store metrics
        lr_list.append(current_lr)
        loss_list.append(loss_train)

        # Update learning rate
        scheduler.step()
    
    LEARNING_RATE *= 10
'''

'''
lr_lambda = lambda x: math.exp(x * math.log(LEARNING_RATE*1e4 / LEARNING_RATE) / (STEP-1))
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

for epoch in tqdm.notebook.trange(EPOCH*4):
    # Load original weights
    loss_train = 0

    for idx, (a, b, c) in enumerate(dataloader):
        optimizer.zero_grad()
        predict, loss = model(a.to(DEVICE), 
                              b.to(DEVICE), 
                              c.to(DEVICE), 
                              weight_loss=weight_loss)[:2]   # This is L-BERT
        loss.backward()
        loss_train += loss.item()
        optimizer.step()

    current_lr = optimizer.state_dict()["param_groups"][0]["lr"]
    print(f'Epoch: {epoch}, Loss = {loss_train:.2f}, Learning Rate = {current_lr:.2e}')

    # Store metrics
    lr_list.append(current_lr)
    loss_list.append(loss_train)

    # Update learning rate
    scheduler.step()
'''

model.load_state_dict(torch.load('freeze.pth')) 

lr_lambda = lambda x: math.exp(x * math.log(LEARNING_RATE*1e3 / LEARNING_RATE) / (STEP))
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

for idx, (a, b, c) in enumerate(dataloader):
    optimizer.zero_grad()
    predict, loss = model(a.to(DEVICE), 
                          b.to(DEVICE), 
                          c.to(DEVICE), 
                          weight_loss=weight_loss)[:2]   # This is L-BERT
    loss.backward()
    optimizer.step()

    
    # Update learning rate
    if idx >= SKIP:
        scheduler.step()
        
        current_lr = optimizer.state_dict()["param_groups"][0]["lr"]
        lr_list.append(current_lr)
        
        if idx == SKIP:
            loss_list.append(loss.item())
        elif idx > SKIP:
            temp = loss.item()*smooth + (1-smooth)*loss_list[-1]
            loss_list.append(temp)
    
        # Stop when reach the step
        if idx >= (STEP+SKIP):
            break
    
# Print losss per learning rate
for a, b in zip(lr_list, loss_list):
    print(f'{a:.2e},{b:.4f}')

1.07e-07,0.8326
1.15e-07,0.8241
1.23e-07,0.8335
1.32e-07,0.8196
1.41e-07,0.8121
1.51e-07,0.7947
1.62e-07,0.7743
1.74e-07,0.7625
1.86e-07,0.7493
2.00e-07,0.7390
2.14e-07,0.7329
2.29e-07,0.7304
2.45e-07,0.7064
2.63e-07,0.6962
2.82e-07,0.6835
3.02e-07,0.6786
3.24e-07,0.6720
3.47e-07,0.6671
3.72e-07,0.6766
3.98e-07,0.6653
4.27e-07,0.6769
4.57e-07,0.6708
4.90e-07,0.6823
5.25e-07,0.6745
5.62e-07,0.6962
6.03e-07,0.6905
6.46e-07,0.6836
6.92e-07,0.6770
7.41e-07,0.6663
7.94e-07,0.6585
8.51e-07,0.6620
9.12e-07,0.6713
9.77e-07,0.6643
1.05e-06,0.6492
1.12e-06,0.6377
1.20e-06,0.6426
1.29e-06,0.6377
1.38e-06,0.6262
1.48e-06,0.6246
1.58e-06,0.6175
1.70e-06,0.6237
1.82e-06,0.6112
1.95e-06,0.6040
2.09e-06,0.5995
2.24e-06,0.5996
2.40e-06,0.6003
2.57e-06,0.6070
2.75e-06,0.6068
2.95e-06,0.6097
3.16e-06,0.6106
3.39e-06,0.6096
3.63e-06,0.6079
3.89e-06,0.6146
4.17e-06,0.6151
4.47e-06,0.6133
4.79e-06,0.6287
5.13e-06,0.6245
5.50e-06,0.6202
5.89e-06,0.6276
6.31e-06,0.6497
6.76e-06,0.6444
7.24e-06,0.6563
7.76e-06

# Training with FREEZE BERT

In [28]:
BATCH_SIZE = 7
EPOCH = 9
BASE_LR = 5e-5
MAX_LR = 5e-4
CYCLE = 3

# Freeze BERT
model.freeze()

""" TRAINING """
dataset = BonzDataset(data.iloc[:,-3:], None)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

model.load_state_dict(torch.load('origin_sd.pth'))
optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=BASE_LR)
scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer, 
                                              base_lr=BASE_LR, 
                                              max_lr=MAX_LR, 
                                              step_size_up=(dataset.__len__()*EPOCH/BATCH_SIZE) // (CYCLE*2), 
                                              cycle_momentum=False)

model.train()

for epoch in tqdm.notebook.trange(EPOCH):
    # Load original weights
    loss_train = 0

    for idx, (a, b, c) in enumerate(dataloader):
        optimizer.zero_grad()
        predict, loss = model(a.to(DEVICE), 
                              b.to(DEVICE), 
                              c.to(DEVICE), 
                              weight_loss=weight_loss)[:2]   # This is L-BERT
        loss.backward()
        loss_train += loss.item()
        optimizer.step()
        scheduler.step()

    current_lr = optimizer.state_dict()["param_groups"][0]["lr"]
    print(f'Epoch: {epoch}, Loss = {loss_train:.2f}, Learning Rate = {current_lr:.2e}')

# SAVE AND LOAD PRE-TRAINED LLR EMBEDDING
torch.save(model.state_dict(), 'freeze.pth')
origin_sd = torch.load('freeze.pth')

HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

Epoch: 0, Loss = 13939.86, Learning Rate = 3.50e-04
Epoch: 1, Loss = 14089.19, Learning Rate = 3.50e-04
Epoch: 2, Loss = 13919.82, Learning Rate = 5.00e-05
Epoch: 3, Loss = 13891.37, Learning Rate = 3.50e-04
Epoch: 4, Loss = 14064.57, Learning Rate = 3.50e-04
Epoch: 5, Loss = 13918.16, Learning Rate = 5.01e-05
Epoch: 6, Loss = 13869.50, Learning Rate = 3.50e-04
Epoch: 7, Loss = 14059.37, Learning Rate = 3.50e-04
Epoch: 8, Loss = 13893.49, Learning Rate = 5.01e-05



## Training with K-FOLD and UNFREEZE BERT

In [83]:
# Training with K-fold
new_data = data.sample(frac=1).reset_index(drop=True)
#new_data = data
kf = KFold(10)
BATCH_SIZE = 7
EPOCH = 3
BASE_LR = 2e-5
MAX_LR = 5e-5
CYCLE = 2

# Unfreeze BERT
model.unfreeze()

last_predict = []
i = 0
for train_idx, test_idx in tqdm.notebook.tqdm(kf.split(new_data)):
    train_data = new_data.iloc[train_idx]
    test_data = new_data.iloc[test_idx]
    
    model.load_state_dict(torch.load('./state_dict/freeze.pth'))
    """ TRAINING """
    dataset = BonzDataset(train_data.iloc[:,-3:], None)
    optimizer = torch.optim.AdamW(model.parameters(), lr=BASE_LR)
    '''
    try:
        scheduler.load_state_dict(torch.load('./state_dict/scheduler_sd.pth'))
        print('Load scheduler')
    except:
        scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer, 
                                                      base_lr=BASE_LR, 
                                                      max_lr=MAX_LR, 
                                                      step_size_up=(dataset.__len__()*EPOCH/BATCH_SIZE) // (CYCLE*2), 
                                                      cycle_momentum=False)
        torch.save(scheduler.state_dict(), 'scheduler_sd.pth')
        print('Create scheduler')
    
    scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer, 
                                                  base_lr=BASE_LR, 
                                                  max_lr=MAX_LR, 
                                                  step_size_up=(dataset.__len__()*EPOCH/BATCH_SIZE) // (CYCLE*2), 
                                                  cycle_momentum=False)
    print(torch.cuda.memory_summary())   
    time.sleep(2)
    '''
    
    model.train()
    for epoch in tqdm.notebook.trange(EPOCH):
        dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
        loss_train = 0
        for a, b, c in dataloader:
            optimizer.zero_grad()
            predict, loss = model(a.to(DEVICE), 
                                  b.to(DEVICE), 
                                  c.to(DEVICE), 
                                  weight_loss=weight_loss)[:2]   # This is L-BERT
            #predict, loss = model(a.to(DEVICE), None, c.to(DEVICE))[:2]   # This is normal BERT

            loss.backward()

            optimizer.step()
            #scheduler.step()

            loss_train += loss.item()
        
        current_lr = optimizer.state_dict()["param_groups"][0]["lr"]
        print(f'Epoch: {epoch}, Loss = {loss_train:.2f}, Learning Rate = {current_lr:.2e}')
                
        
    ''' TESTING  ''' 
    model.eval()
    dataset = BonzDataset(test_data.iloc[:,-3:], None)
    dataloader = DataLoader(dataset, batch_size=BATCH_SIZE)

    for a, b, c in dataloader:
        with torch.no_grad():
            predict = model(a.to(DEVICE), b.to(DEVICE))[0] # This is L-BERT
            #predict = model(a.to(DEVICE), None)[0] # This is normal BERT
        last_predict.extend(predict.detach().cpu().numpy().tolist())
        


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

Epoch: 0, Loss = 12482.23, Learning Rate = 2.00e-05
Epoch: 1, Loss = 11694.65, Learning Rate = 2.00e-05
Epoch: 2, Loss = 11016.31, Learning Rate = 2.00e-05



HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

Epoch: 0, Loss = 12519.23, Learning Rate = 2.00e-05
Epoch: 1, Loss = 11700.66, Learning Rate = 2.00e-05
Epoch: 2, Loss = 11071.92, Learning Rate = 2.00e-05



HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

Epoch: 0, Loss = 12501.33, Learning Rate = 2.00e-05
Epoch: 1, Loss = 11718.18, Learning Rate = 2.00e-05
Epoch: 2, Loss = 11046.63, Learning Rate = 2.00e-05



HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

Epoch: 0, Loss = 12502.21, Learning Rate = 2.00e-05
Epoch: 1, Loss = 11727.24, Learning Rate = 2.00e-05
Epoch: 2, Loss = 11093.66, Learning Rate = 2.00e-05



HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

Epoch: 0, Loss = 12600.18, Learning Rate = 2.00e-05
Epoch: 1, Loss = 11795.26, Learning Rate = 2.00e-05
Epoch: 2, Loss = 11170.70, Learning Rate = 2.00e-05



HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

Epoch: 0, Loss = 12516.23, Learning Rate = 2.00e-05
Epoch: 1, Loss = 11736.89, Learning Rate = 2.00e-05
Epoch: 2, Loss = 11077.52, Learning Rate = 2.00e-05



HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

Epoch: 0, Loss = 12523.42, Learning Rate = 2.00e-05
Epoch: 1, Loss = 11768.38, Learning Rate = 2.00e-05
Epoch: 2, Loss = 11116.14, Learning Rate = 2.00e-05



HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

Epoch: 0, Loss = 12474.36, Learning Rate = 2.00e-05
Epoch: 1, Loss = 11656.35, Learning Rate = 2.00e-05
Epoch: 2, Loss = 11009.30, Learning Rate = 2.00e-05



HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

Epoch: 0, Loss = 12476.58, Learning Rate = 2.00e-05
Epoch: 1, Loss = 11639.35, Learning Rate = 2.00e-05
Epoch: 2, Loss = 10973.73, Learning Rate = 2.00e-05



HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

Epoch: 0, Loss = 12481.16, Learning Rate = 2.00e-05
Epoch: 1, Loss = 11704.76, Learning Rate = 2.00e-05
Epoch: 2, Loss = 11032.38, Learning Rate = 2.00e-05




# TRAIN FULL DATA TO PREDICT

In [5]:
# Training with K-fold
#new_data = data.sample(frac=1).reset_index(drop=True)
new_data = data
BATCH_SIZE = 7
EPOCH = 5
BASE_LR = 2e-5

# Unfreeze BERT
model.unfreeze()

last_predict = []

model.load_state_dict(torch.load('./state_dict/freeze.pth'))
""" TRAINING """
dataset = BonzDataset(new_data.iloc[:,-3:], None)
optimizer = torch.optim.AdamW(model.parameters(), lr=BASE_LR)

model.train()
for epoch in tqdm.notebook.trange(EPOCH):
    dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
    loss_train = 0
    for a, b, c in dataloader:
        optimizer.zero_grad()
        predict, loss = model(a.to(DEVICE), 
                              b.to(DEVICE), 
                              c.to(DEVICE), 
                              weight_loss=weight_loss)[:2]   # This is L-BERT
        #predict, loss = model(a.to(DEVICE), None, c.to(DEVICE))[:2]   # This is normal BERT

        loss.backward()

        optimizer.step()
        #scheduler.step()

        loss_train += loss.item()

    current_lr = optimizer.state_dict()["param_groups"][0]["lr"]
    print(f'Epoch: {epoch}, Loss = {loss_train:.2f}, Learning Rate = {current_lr:.2e}')




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

Epoch: 0, Loss = 13855.07, Learning Rate = 2.00e-05
Epoch: 1, Loss = 13024.75, Learning Rate = 2.00e-05
Epoch: 2, Loss = 12348.88, Learning Rate = 2.00e-05
Epoch: 3, Loss = 11501.65, Learning Rate = 2.00e-05
Epoch: 4, Loss = 10414.48, Learning Rate = 2.00e-05



In [84]:
last_predict_ = torch.tensor(last_predict)
last_predict_ = torch.softmax(last_predict_, 1)
y_predict = torch.argmax(last_predict_, 1)
y_true = np.asarray(list(new_data.labels))


for i, asp in enumerate(ASPECT_NAMES):
    print(f'{asp}:\n{classification_report(y_true[:,i], y_predict[:,i])}')
    
for i, asp in enumerate(ASPECT_NAMES):
    print(f'{asp}, {precision_score(y_true[:,i], y_predict[:,i], average="macro")*100:.2f},\
    {recall_score(y_true[:,i], y_predict[:,i], average="macro")*100:.2f},\
    {f1_score(y_true[:,i], y_predict[:,i], average="macro")*100:.2f},\
    {accuracy_score(y_true[:,i], y_predict[:,i])*100:.2f}')

LEG:
              precision    recall  f1-score   support

           0       0.62      0.50      0.55     41143
           1       0.82      0.89      0.85    109509
           2       0.28      0.24      0.25      1922

    accuracy                           0.77    152574
   macro avg       0.57      0.54      0.55    152574
weighted avg       0.76      0.77      0.77    152574

SIT:
              precision    recall  f1-score   support

           0       0.67      0.53      0.59     41886
           1       0.83      0.90      0.87    109017
           2       0.28      0.27      0.27      1671

    accuracy                           0.79    152574
   macro avg       0.59      0.57      0.58    152574
weighted avg       0.78      0.79      0.78    152574

ENT:
              precision    recall  f1-score   support

           0       0.58      0.54      0.56     44245
           1       0.81      0.86      0.83     93024
           2       0.45      0.34      0.38     15305

    a

In [10]:
torch.save(y_predict, 'result/LCAT_5epoch_1e5_WeightClass_xavier.pt')

# PREDICT NO FILLING IN DATA

In [31]:
full_data = pd.read_csv('./data//data_v3.txt', sep='\t', header=None, names=COL_NAMES)
full_data = full_data[['AirlineName', 'ReviewDate', 'ReviewText', 'Aspects']]
full_data['Predicted'] = (full_data['Aspects'] == 'No filling in')
full_data.Aspects = full_data.Aspects.str.split('|').values

'''Split aspects to new columns'''
aspects_splitted = split_aspect(full_data.Aspects.values)
for i in range(len(ASPECT_NAMES)):
    full_data[ASPECT_NAMES[i]] = aspects_splitted[i,:]

full_data['input_ids'], tokenizer = tokenize_data(full_data.ReviewText.values)

full_data

Unnamed: 0,AirlineName,ReviewDate,ReviewText,Aspects,Predicted,LEG,SIT,ENT,CUS,VOM,CLE,CKI,FNB,input_ids
0,Singapore Airlines,Sep 2020,With everyone trying to get home in the Covid ...,"[Legroom:4, Seat comfort:4, In-flight Entertai...",False,1,1,1,1,1,1,1,1,"[101, 2007, 3071, 2667, 2000, 2131, 2188, 1999..."
1,Singapore Airlines,Sep 2020,"Ad a lot of people did, we had to scramble to ...","[Legroom:4, Seat comfort:4, In-flight Entertai...",False,1,1,1,1,1,1,1,1,"[101, 4748, 1037, 2843, 1997, 2111, 2106, 1010..."
2,Singapore Airlines,Sep 2020,After coming into Changi airport and worrying ...,"[Legroom:4, Seat comfort:4, In-flight Entertai...",False,1,1,1,1,1,1,1,1,"[101, 2044, 2746, 2046, 11132, 2072, 3199, 199..."
3,Singapore Airlines,Sep 2020,"Great service, great plane, great pricing. We ...","[Legroom:4, Seat comfort:4, In-flight Entertai...",False,1,1,1,1,1,1,1,1,"[101, 2307, 2326, 1010, 2307, 4946, 1010, 2307..."
4,Singapore Airlines,Sep 2020,My husband and I were to fly home from Houston...,"[Legroom:5, Seat comfort:5, In-flight Entertai...",False,1,1,1,0,1,1,1,1,"[101, 2026, 3129, 1998, 1045, 2020, 2000, 4875..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190981,Virgin Atlantic Airways,Jan 2016,We booked to fly from Heathrow to Newark. The ...,"[Legroom:2, Seat comfort:2, In-flight Entertai...",False,0,0,0,0,0,2,2,2,"[101, 2057, 17414, 2000, 4875, 2013, 9895, 105..."
190982,Virgin Atlantic Airways,Jan 2016,"Love Virgin, great staff, food good, quality o...","[Legroom:5, Seat comfort:5, In-flight Entertai...",False,1,1,1,1,1,2,2,2,"[101, 2293, 6261, 1010, 2307, 3095, 1010, 2833..."
190983,Virgin Atlantic Airways,Jan 2016,"Virgin upper class is outstanding, really very...","[Legroom:5, Seat comfort:4, In-flight Entertai...",False,1,1,1,1,1,2,2,2,"[101, 6261, 3356, 2465, 2003, 5151, 1010, 2428..."
190984,Virgin Atlantic Airways,Jan 2016,Virgins premium economy is the best I have com...,"[Legroom:3, Seat comfort:5, In-flight Entertai...",False,0,1,1,0,0,2,2,2,"[101, 6261, 2015, 12882, 4610, 2003, 1996, 219..."


In [33]:
predicted_data = full_data[full_data['Predicted'] == True]
predicted_data

Unnamed: 0,AirlineName,ReviewDate,ReviewText,Aspects,Predicted,LEG,SIT,ENT,CUS,VOM,CLE,CKI,FNB,input_ids
5,Singapore Airlines,Aug 2020,I cant rate Singspore Air any higher for their...,[No filling in],True,2,2,2,2,2,2,2,2,"[101, 1045, 2064, 2102, 3446, 10955, 26691, 22..."
6,Singapore Airlines,Aug 2020,They told me it only take 4-6 weeks for the am...,[No filling in],True,2,2,2,2,2,2,2,2,"[101, 2027, 2409, 2033, 2009, 2069, 2202, 1018..."
7,Singapore Airlines,Aug 2020,"This is a lovely airline, the stewardesses in ...",[No filling in],True,2,2,2,2,2,2,2,2,"[101, 2023, 2003, 1037, 8403, 8582, 1010, 1996..."
9,Singapore Airlines,Aug 2020,We were due to fly to Australia in October. We...,[No filling in],True,2,2,2,2,2,2,2,2,"[101, 2057, 2020, 2349, 2000, 4875, 2000, 2660..."
12,Singapore Airlines,Aug 2020,Our flights were cancelled and 3 months later ...,[No filling in],True,2,2,2,2,2,2,2,2,"[101, 2256, 7599, 2020, 8014, 1998, 1017, 2706..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190862,Virgin Atlantic Airways,Feb 2016,Boarding at the airport was rather disorganise...,[No filling in],True,2,2,2,2,2,2,2,2,"[101, 9405, 2012, 1996, 3199, 2001, 2738, 4487..."
190890,Virgin Atlantic Airways,Feb 2016,My wife and I recently booked a return flight ...,[No filling in],True,2,2,2,2,2,2,2,2,"[101, 2026, 2564, 1998, 1045, 3728, 17414, 103..."
190895,Virgin Atlantic Airways,Feb 2016,Arriving at the Upper Class Wing to the lounge...,[No filling in],True,2,2,2,2,2,2,2,2,"[101, 7194, 2012, 1996, 3356, 2465, 3358, 2000..."
190949,Virgin Atlantic Airways,Feb 2016,We booked our flights for a family holiday for...,[No filling in],True,2,2,2,2,2,2,2,2,"[101, 2057, 17414, 2256, 7599, 2005, 1037, 215..."


In [36]:
# Stopwords in English
stopwords_ids = tokenizer.convert_tokens_to_ids(stopwords.words('english'))

llr_scores = {}

for aspect in tqdm.notebook.tqdm(ASPECT_NAMES, desc='Calculate LLR scores'):
    llr_df = generate_llr_score(data, aspect)
    
    # Clear stopword ids
    llr_df = llr_df.drop(stopwords_ids, 0)
    
    llr_scores[aspect] = llr_df


llr_words = dict()

for aspect in tqdm.notebook.tqdm(ASPECT_NAMES, desc='Generate top LLR words'):
    kw_label = dict()
    for class_ in [0,1,2]:
        # Sort keywords based on aspect, class and top_n words
        kw_list = list(llr_scores[aspect][class_].sort_values(ascending=False)[:TOPN].index)
        
        kw_label[class_] = kw_list
        
    llr_words[aspect] = kw_label

llr_embedding_list = []

for idx in tqdm.notebook.tqdm(predicted_data.index):
    tokens = predicted_data.input_ids[idx]
    
    llr_embedding = []
    for aspect in ASPECT_NAMES:
        temp = [3] * tokens.shape[0]
        for j in range(tokens.shape[0]):
            for class_, wordlist in llr_words[aspect].items():
                if tokens[j] in wordlist:
                    temp[j] = class_
                    break
        llr_embedding.append(temp)
    
    llr_embedding_list.append(llr_embedding)

predicted_data['llr_embeddings'] = llr_embedding_list

HBox(children=(FloatProgress(value=0.0, description='Calculate LLR scores', max=8.0, style=ProgressStyle(descr…




HBox(children=(FloatProgress(value=0.0, description='Generate top LLR words', max=8.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, max=38412.0), HTML(value='')))




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [45]:
model.eval()
dataset = BonzDataset(predicted_data.iloc[:,-3:], None)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE)

last_predict = []

for a, b in dataloader:
    with torch.no_grad():
        predict = model(a.to(DEVICE), b.to(DEVICE))[0] # This is L-BERT
        #predict = model(a.to(DEVICE), None)[0] # This is normal BERT
    last_predict.extend(predict.detach().cpu().numpy().tolist())

In [55]:
pred = torch.tensor(last_predict)
pred = torch.softmax(pred, 1)
pred = torch.argmax(pred, 1)
pred.shape

torch.Size([38412, 8])

In [70]:
for idx, asp in enumerate(ASPECT_NAMES):
    full_data.loc[predicted_data.index, asp] = pred[:,idx].tolist()

In [78]:
full_data.loc[3:7]

Unnamed: 0,AirlineName,ReviewDate,ReviewText,Aspects,Predicted,LEG,SIT,ENT,CUS,VOM,CLE,CKI,FNB,input_ids
3,Singapore Airlines,Sep 2020,"Great service, great plane, great pricing. We ...","[Legroom:4, Seat comfort:4, In-flight Entertai...",False,1,1,1,1,1,1,1,1,"[101, 2307, 2326, 1010, 2307, 4946, 1010, 2307..."
4,Singapore Airlines,Sep 2020,My husband and I were to fly home from Houston...,"[Legroom:5, Seat comfort:5, In-flight Entertai...",False,1,1,1,0,1,1,1,1,"[101, 2026, 3129, 1998, 1045, 2020, 2000, 4875..."
5,Singapore Airlines,Aug 2020,I cant rate Singspore Air any higher for their...,[No filling in],True,1,1,1,1,1,1,1,1,"[101, 1045, 2064, 2102, 3446, 10955, 26691, 22..."
6,Singapore Airlines,Aug 2020,They told me it only take 4-6 weeks for the am...,[No filling in],True,0,0,0,0,0,0,0,0,"[101, 2027, 2409, 2033, 2009, 2069, 2202, 1018..."
7,Singapore Airlines,Aug 2020,"This is a lovely airline, the stewardesses in ...",[No filling in],True,1,1,1,1,1,1,1,1,"[101, 2023, 2003, 1037, 8403, 8582, 1010, 1996..."


In [76]:
full_data.iloc[:,:-1].to_csv('./data/full_data.txt', sep='\t', index=False)