In [1]:
# https://medium.com/udacity-pytorch-challengers/ideas-on-how-to-fine-tune-a-pre-trained-model-in-pytorch-184c47185a20

import re
import torch
import numpy as np
import pandas as pd
import logging
import time
import torch.nn as nn
import tqdm
import math
import ast
import nltk

from nltk.corpus import stopwords
from tensorflow import keras
from tensorflow.keras import layers
from transformers import BertConfig, BertForSequenceClassification, BertTokenizer, BertTokenizerFast, BertModel, AdamW, TFBertModel
from transformers.optimization import get_linear_schedule_with_warmup
from transformers.modeling_bert import BertEmbeddings, BertSelfAttention
from torch.utils.data import Dataset, DataLoader
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import KFold
from apex import amp, optimizers

# Only surface errors from the transformers tokenization utilities.
logging.getLogger("transformers.tokenization_utils").setLevel(logging.ERROR)

# Column names applied to the header-less, tab-separated review dump.
COL_NAMES = [
    'TopNumber', 'AirlineName', 'ReviewerName', 'Rating', 'ReviewDate', 'ReviewTitle',
    'ReviewText', 'Tags', 'DateofTravel', 'Aspects',
    'ResponserName', 'ResponseDate', 'ResponseText', 'ReviewerProfileUrl',
    'AirlineNation', 'CrawlTime',
]

# Name of the pre-trained BERT checkpoint used for both tokenizer and weights.
PRE_TRAINED = 'bert-base-uncased'
# Run on the GPU when one is visible to torch, otherwise on the CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
# Short codes of the rated aspects; their order defines the label column order.
ASPECT_NAMES = ['LEG', 'SIT', 'ENT', 'CUS', 'VOM', 'CLE', 'CKI', 'FNB']
# Vocabulary as returned by the pre-trained tokenizer's get_vocab().
VOCAB_DIC = BertTokenizerFast.from_pretrained(PRE_TRAINED).get_vocab()
TOPN = 50


        
# This variant supports a weighted cross-entropy loss (one class-weight vector per aspect).
class BertBonzWeightLoss(BertModel):
    """BERT encoder with extra per-aspect "LLR" embeddings and a multi-aspect head.

    For every aspect in ``ASPECT_NAMES`` an additional embedding table is added
    to the standard BERT embeddings. The embeddings of all aspects are summed
    into the token representation before the encoder runs. The hidden state of
    the first token is fed to a linear classifier that emits three scores for
    each of ``config.num_aspect`` aspects.

    ``config`` must provide ``num_aspect`` in addition to the regular BERT
    configuration fields.
    """

    # Last row of every LLR embedding table; also used as its ``padding_idx``.
    LLR_PAD_IDX = 3
    # Number of scores produced per aspect by the classifier.
    NUM_POLARITIES = 3

    def __init__(self, config):
        super(BertBonzWeightLoss, self).__init__(config)
        self.config = config
        # Sized from the config (768 for bert-base) so the extra layers always
        # match the width of the word embeddings they are added to.
        hidden_size = config.hidden_size
        self.embeddings.llr_embeddings = nn.ModuleList(
            nn.Embedding(self.LLR_PAD_IDX + 1, hidden_size, self.LLR_PAD_IDX)
            for _ in range(len(ASPECT_NAMES))
        )
        self.classifier = nn.Linear(hidden_size, config.num_aspect * self.NUM_POLARITIES)
        self.init_weights()
        # The newly added / fine-tuned layers are re-initialised with Xavier.
        # This also overwrites the padding row of the LLR tables; call
        # llr_embed_pad() to zero it again.
        self.embeddings.llr_embeddings.apply(self._xavier)
        self.pooler.apply(self._xavier)
        self.classifier.apply(self._xavier)

    def forward(self,
                input_ids=None,
                llr_ids=None,
                labels=None,
                token_type_ids=None,
                position_ids=None,
                weight_loss=None):
        """Run embeddings, encoder and classifier; optionally compute the loss.

        Args:
            input_ids: token ids, indexed as (batch, seq_len).
            llr_ids: LLR class ids, indexed as (batch, aspect, seq_len). When
                omitted, every position uses the padding id ``LLR_PAD_IDX``,
                i.e. the call behaves like an all-padding ``llr_ids``.
            labels: target class per aspect, indexed as (batch, aspect).
            token_type_ids: segment ids; defaults to all zeros.
            position_ids: position ids; defaults to ``0..seq_len-1``.
            weight_loss: optional class weights, indexed as (aspect, class),
                used for a per-aspect weighted cross-entropy.

        Returns:
            Without ``labels``: ``(logits, CLS_token, sequence_output, *extra)``.
            With ``labels``: ``(logits, loss, CLS_token, sequence_output, *extra)``.
            ``logits`` is the classifier output viewed as (batch, 3, aspects);
            ``extra`` is whatever the encoder returns after its first element.
        """
        # BERT EMBEDDINGS NEW
        input_shape = input_ids.size()
        batch_size, seq_length = input_shape[0], input_shape[1]
        device = input_ids.device

        if position_ids is None:
            position_ids = torch.arange(seq_length, dtype=torch.long, device=device)
            position_ids = position_ids.unsqueeze(0).expand(input_shape)
        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        inputs_embeds = self.embeddings.word_embeddings(input_ids)
        position_embeddings = self.embeddings.position_embeddings(position_ids)
        token_type_embeddings = self.embeddings.token_type_embeddings(token_type_ids)

        if llr_ids is None:
            # Previously a constant tensor of 3s was added to the embeddings
            # here; the intent is "no LLR information", which is the padding id.
            llr_ids = torch.zeros(
                (batch_size, self.config.num_aspect, seq_length),
                dtype=torch.long, device=device,
            ).fill_(self.LLR_PAD_IDX)
        llr_embeddings = sum(
            self.embeddings.llr_embeddings[i](llr_ids[:, i, :])
            for i in range(self.config.num_aspect)
        )

        embeddings = inputs_embeds + position_embeddings + token_type_embeddings + llr_embeddings
        embeddings = self.embeddings.LayerNorm(embeddings)
        embeddings = self.embeddings.dropout(embeddings)

        # BERT ENCODER
        # NOTE(review): no attention mask is passed, so padded positions are
        # presumably attended to like real tokens - confirm this is intended.
        encoder_outputs = self.encoder(
            embeddings,
            attention_mask=None,
            head_mask=[None] * self.config.num_hidden_layers,
            encoder_hidden_states=None,
            encoder_attention_mask=None,
            output_attentions=self.config.output_attentions
        )
        sequence_output = encoder_outputs[0]

        # CLASSIFIER
        CLS_token = sequence_output[:, 0]
        predict = self.classifier(CLS_token)
        logits = predict.view(batch_size, self.NUM_POLARITIES, -1)

        if labels is None:
            return (logits, CLS_token, sequence_output) + encoder_outputs[1:]

        loss_fn = nn.functional.cross_entropy
        if weight_loss is None:
            loss = loss_fn(logits, labels)
        else:
            # Average of the per-aspect weighted losses. The loop is sized from
            # the logits so it can never run past the classifier's aspects, and
            # the accumulator lives on the same device as the inputs.
            num_aspect = logits.size(-1)
            loss = torch.zeros((), dtype=torch.float, device=device)
            for asp_i in range(num_aspect):
                loss += loss_fn(logits[:, :, asp_i], labels[:, asp_i], weight=weight_loss[asp_i, :])
            loss /= num_aspect

        # add hidden_states and attentions if they are here
        return (logits, loss, CLS_token, sequence_output) + encoder_outputs[1:]

    def load_pretrained_weight(self):
        """Copy every matching tensor of the ``PRE_TRAINED`` BertModel into this model."""
        sd = self.state_dict()
        sd_bert_pretrained = BertModel.from_pretrained(PRE_TRAINED).state_dict()
        for k, pretrained_tensor in sd_bert_pretrained.items():
            # Keys that only exist in this model (LLR tables, classifier) keep
            # their current values.
            if k in sd:
                sd[k] = pretrained_tensor
        self.load_state_dict(sd)
        print('Succesfully load pre-trained weights')

    def llr_embed_pad(self):
        """Zero the padding row (last row) of every LLR embedding table in place."""
        for table in self.embeddings.llr_embeddings:
            table.weight.data[self.LLR_PAD_IDX].zero_()

    def _xavier(self, module):
        """Xavier-normal init for weights and zero init for biases of ``module``."""
        for name, param in module.named_parameters():
            if 'weight' in name:
                nn.init.xavier_normal_(param)
            elif 'bias' in name:
                param.data.zero_()

    def unfreeze(self):
        """Make every parameter of the model trainable."""
        for param in self.parameters():
            param.requires_grad = True

    def freeze(self):
        """Freeze everything except the LLR embeddings, the pooler and the classifier."""
        for param in self.parameters():
            param.requires_grad = False
        for trainable in (self.embeddings.llr_embeddings, self.pooler, self.classifier):
            for param in trainable.parameters():
                param.requires_grad = True
    


class BonzDataset(Dataset):
    """Tensor-backed dataset over pre-computed token ids and LLR ids.

    ``data`` must expose ``input_ids`` and ``llr_embeddings`` columns and may
    expose a ``labels`` column; each column is materialised once as a
    LongTensor. ``llr_words`` is only stored on the instance.
    """

    def __init__(self, data, llr_words):
        self.llr_words = llr_words
        self.input_ids = torch.LongTensor(list(data['input_ids']))
        self.llr_embeddings = torch.LongTensor(list(data['llr_embeddings']))
        # Labels are optional so the same class serves unlabeled data.
        has_labels = 'labels' in data.columns
        self.labels = torch.LongTensor(list(data['labels'])) if has_labels else None

    def __len__(self):
        """Number of samples, i.e. rows of the input id tensor."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return ``(input_ids, llr_ids)`` plus ``labels`` when they exist."""
        sample = (self.input_ids[idx], self.llr_embeddings[idx])
        if self.labels is not None:
            sample = sample + (self.labels[idx],)
        return sample
    

    
def split_aspect(data):
    """Turn per-review aspect rating strings into a polarity matrix.

    Args:
        data: sequence with one entry per review; each entry is an iterable of
            strings whose last character is the rating digit
            (e.g. ``'Legroom:4'``).

    Returns:
        Integer array with one row per aspect keyword and one column per
        review. A cell is 1 when the first entry containing the keyword has a
        rating above 3, 0 when that rating is 3 or below, and 2 when no entry
        mentions the keyword.
    """
    aspect_keywords = ('Legroom', 'Seat', 'Entertainment', 'Customer',
                       'Value', 'Cleanliness', 'Check-in', 'Food')
    num_reviews = len(data)
    # `np.int` was removed from NumPy; the builtin `int` is the same dtype.
    polarity = np.full((len(aspect_keywords), num_reviews), 2, dtype=int)
    for idx in range(num_reviews):
        rated_aspects = data[idx]
        for i, keyword in enumerate(aspect_keywords):
            for entry in rated_aspects:
                if keyword in entry:
                    # Only the first matching entry counts for this aspect.
                    polarity[i, idx] = 1 if int(entry[-1]) > 3 else 0
                    break
    return polarity
            

def tokenize_data(data):
    """Tokenize texts with the pre-trained BERT tokenizer and pad them to 512 ids.

    Args:
        data: iterable of review texts.

    Returns:
        Tuple ``(input_ids, tokenizer)`` where ``input_ids`` is a list with one
        fixed-length id array per text and ``tokenizer`` is the tokenizer used.
    """
    bert_tokenizer = BertTokenizerFast.from_pretrained(PRE_TRAINED)
    encoded = bert_tokenizer(list(data))
    # Padding and truncation both happen at the end of each sequence.
    # NOTE(review): truncating after tokenization presumably cuts the trailing
    # special token off over-long reviews - confirm this is acceptable.
    padded_ids = pad_sequences(encoded['input_ids'], maxlen=512, padding='post', truncating='post')

    return (list(padded_ids), bert_tokenizer)
    
    
def get_data(FILE_PATH, COL_NAMES):
    """Load the review dump and derive aspect polarities and BERT input ids.

    Args:
        FILE_PATH: path of the tab-separated, header-less review file.
        COL_NAMES: column names to assign to the file's columns.

    Returns:
        Tuple ``(data, tokenizer)``. ``data`` holds ``ReviewText``, ``Rating``,
        ``Aspects`` (split on ``'|'``), one polarity column per entry of
        ``ASPECT_NAMES`` (in that order) and ``input_ids``; rows whose
        ``Aspects`` equals ``'No filling in'`` are dropped. ``tokenizer`` is
        the tokenizer returned by ``tokenize_data``.
    """
    raw_data = pd.read_csv(FILE_PATH, sep='\t', header=None, names=COL_NAMES)

    # Filter reviews without aspect ratings. The explicit copy makes `data` an
    # independent frame, so the column assignments below do not write into a
    # slice of `raw_data` (chained assignment).
    has_aspects = raw_data['Aspects'] != 'No filling in'
    data = raw_data.loc[has_aspects, ['ReviewText', 'Rating', 'Aspects']].copy()
    data['Aspects'] = data['Aspects'].str.split('|')

    # Split aspects to new columns, one per aspect code.
    aspects_splitted = split_aspect(data['Aspects'].values)
    for i, aspect_name in enumerate(ASPECT_NAMES):
        data[aspect_name] = aspects_splitted[i, :]

    # Generate input_ids from review text
    input_ids, tokenizer = tokenize_data(data['ReviewText'].values)
    data['input_ids'] = input_ids

    return data, tokenizer


def word_class_freq(data, aspect_name, aspect_class=3, vocab_size=33000):
    """Count, per token id and class, the reviews that contain the token.

    Args:
        data: frame with an ``input_ids`` column (one id sequence per review)
            and a class column named ``aspect_name``.
        aspect_name: name of the column holding each review's class index.
        aspect_class: number of classes (columns of the result).
        vocab_size: number of rows of the result; must exceed the largest
            token id. The default keeps the previously hard-coded bound.

    Returns:
        Integer array of shape ``(vocab_size, aspect_class)`` where cell
        ``[t, c]`` is the number of reviews of class ``c`` containing token
        ``t`` at least once.
    """
    # `np.int` was removed from NumPy; the builtin `int` is the same dtype.
    doc_freq = np.zeros((vocab_size, aspect_class), dtype=int)
    all_ids = data.input_ids.values
    labels = data[aspect_name].values

    for review_ids, review_label in zip(all_ids, labels):
        # A token counts once per review, however often it occurs in it.
        for token_id in set(review_ids):
            doc_freq[token_id, review_label] += 1

    return doc_freq


def _llr_term(count, p_pooled, p_group):
    """Return ``count * (log(p_pooled) - log(p_group))``, or 0 if a log is undefined."""
    try:
        return count * (math.log(p_pooled) - math.log(p_group))
    except ValueError:
        # math.log rejects zero and negative arguments; such terms contribute nothing.
        return 0


def calculate_llr(temp_df, labels):
    """Score every word against each class with a log-likelihood ratio.

    Args:
        temp_df: frame indexed by word id with one document-count column per
            class (``0``, ``1``, ``2``) followed by a ``'total'`` column.
        labels: array with the class index of every document.

    Returns:
        Frame with the index of ``temp_df`` and one score column per class
        (all columns of ``temp_df`` except the last). A score is forced to 0
        when the word occurs in fewer documents of the class than outside it.
    """
    # The corpus size comes from the labels that were passed in, not from a
    # notebook-global frame, so the function also works on subsets.
    N = len(labels)
    classes = [0, 1, 2]
    # Class sizes do not depend on the word: compute them once.
    docs_per_class = {class_: np.sum(labels == class_) for class_ in classes}
    total_scores = []

    for i in temp_df.index.values:
        llr_scores = []
        for class_ in classes:
            # 2x2 contingency table of (word present?, document in class?).
            n11 = temp_df.loc[i, class_]
            n10 = docs_per_class[class_] - n11
            n01 = temp_df.loc[i, 'total'] - n11
            n00 = (N - n11 - n10 - n01)
            # The small constants avoid exact zeros in the ratios.
            pt = (1e-10 + n11 + n01)/N
            p1 = n11/(1e-10 + n11 + n10)
            p2 = n01/(1e-10 + n01 + n00)

            e1 = _llr_term(n11, pt, p1)
            e2 = _llr_term(n10, 1-pt, 1-p1)
            e3 = _llr_term(n01, pt, p2)
            e4 = _llr_term(n00, 1-pt, 1-p2)

            llr_score = -2 * (e1+e2+e3+e4)
            if n11 < n01:
                llr_score = 0
            llr_scores.append(llr_score)

        total_scores.append(llr_scores)

    llr_df = pd.DataFrame(np.array(total_scores), index=temp_df.index, columns=temp_df.columns.values[:-1])

    return llr_df


def generate_llr_score(data, aspect):
    """Compute per-word, per-class LLR scores for one aspect.

    Args:
        data: frame with an ``input_ids`` column and a class column ``aspect``.
        aspect: name of the aspect column to score against.

    Returns:
        The frame produced by ``calculate_llr`` for every token id that occurs
        in at least one review, excluding token id 0.
    """
    freq = word_class_freq(data, aspect)

    freq_df = pd.DataFrame(freq)
    freq_df['total'] = np.sum(freq, -1)
    # Keep only token ids that actually occur.
    freq_df = freq_df[freq_df['total'] != 0]
    # Token id 0 is excluded (presumably the padding id - confirm). The row is
    # dropped by keyword because `axis` is keyword-only in recent pandas, and
    # tolerantly because it is already gone when no review contains id 0.
    freq_df = freq_df.drop(index=0, errors='ignore')

    return calculate_llr(freq_df, data[aspect].values)

Using TensorFlow backend.


In [2]:
data, tokenizer = get_data('./data/data_v3.txt', COL_NAMES)
# Pack the per-aspect polarity columns into one label vector per review.
# Selecting the columns by name instead of by position (3:11) keeps this
# correct even if columns are added or reordered in get_data.
data['labels'] = list(data[ASPECT_NAMES].values)

data

Unnamed: 0,ReviewText,Rating,Aspects,LEG,SIT,ENT,CUS,VOM,CLE,CKI,FNB,input_ids,labels
0,With everyone trying to get home in the Covid ...,4,"[Legroom:4, Seat comfort:4, In-flight Entertai...",1,1,1,1,1,1,1,1,"[101, 2007, 3071, 2667, 2000, 2131, 2188, 1999...","[1, 1, 1, 1, 1, 1, 1, 1]"
1,"Ad a lot of people did, we had to scramble to ...",5,"[Legroom:4, Seat comfort:4, In-flight Entertai...",1,1,1,1,1,1,1,1,"[101, 4748, 1037, 2843, 1997, 2111, 2106, 1010...","[1, 1, 1, 1, 1, 1, 1, 1]"
2,After coming into Changi airport and worrying ...,4,"[Legroom:4, Seat comfort:4, In-flight Entertai...",1,1,1,1,1,1,1,1,"[101, 2044, 2746, 2046, 11132, 2072, 3199, 199...","[1, 1, 1, 1, 1, 1, 1, 1]"
3,"Great service, great plane, great pricing. We ...",4,"[Legroom:4, Seat comfort:4, In-flight Entertai...",1,1,1,1,1,1,1,1,"[101, 2307, 2326, 1010, 2307, 4946, 1010, 2307...","[1, 1, 1, 1, 1, 1, 1, 1]"
4,My husband and I were to fly home from Houston...,1,"[Legroom:5, Seat comfort:5, In-flight Entertai...",1,1,1,0,1,1,1,1,"[101, 2026, 3129, 1998, 1045, 2020, 2000, 4875...","[1, 1, 1, 0, 1, 1, 1, 1]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
190981,We booked to fly from Heathrow to Newark. The ...,1,"[Legroom:2, Seat comfort:2, In-flight Entertai...",0,0,0,0,0,2,2,2,"[101, 2057, 17414, 2000, 4875, 2013, 9895, 105...","[0, 0, 0, 0, 0, 2, 2, 2]"
190982,"Love Virgin, great staff, food good, quality o...",5,"[Legroom:5, Seat comfort:5, In-flight Entertai...",1,1,1,1,1,2,2,2,"[101, 2293, 6261, 1010, 2307, 3095, 1010, 2833...","[1, 1, 1, 1, 1, 2, 2, 2]"
190983,"Virgin upper class is outstanding, really very...",5,"[Legroom:5, Seat comfort:4, In-flight Entertai...",1,1,1,1,1,2,2,2,"[101, 6261, 3356, 2465, 2003, 5151, 1010, 2428...","[1, 1, 1, 1, 1, 2, 2, 2]"
190984,Virgins premium economy is the best I have com...,5,"[Legroom:3, Seat comfort:5, In-flight Entertai...",0,1,1,0,0,2,2,2,"[101, 6261, 2015, 12882, 4610, 2003, 1996, 219...","[0, 1, 1, 0, 0, 2, 2, 2]"


In [3]:
from sklearn.svm import SVC 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multioutput import MultiOutputClassifier

# TF-IDF features of the raw review texts for the classical baselines.
vectorizer = TfidfVectorizer()
review_texts = data['ReviewText'].values
X = vectorizer.fit_transform(review_texts)
# One row per review, one column per aspect.
labels = np.array([aspect_labels.tolist() for aspect_labels in data['labels']])

In [4]:
N_SPLITS = 10
# Fixed seed so the tree-based baselines give the same scores on every run.
RANDOM_STATE = 42


def print_aspect_scores(y_true, y_predict):
    """Print macro precision, recall, F1 and accuracy (in %) for every aspect column."""
    for i, asp in enumerate(ASPECT_NAMES):
        precision = precision_score(y_true[:, i], y_predict[:, i], average="macro") * 100
        recall = recall_score(y_true[:, i], y_predict[:, i], average="macro") * 100
        f1 = f1_score(y_true[:, i], y_predict[:, i], average="macro") * 100
        accuracy = accuracy_score(y_true[:, i], y_predict[:, i]) * 100
        print(f'{asp},\t{precision:.2f},    {recall:.2f},    {f1:.2f},    {accuracy:.2f}')


kf = KFold(N_SPLITS)

# Out-of-fold predictions of every baseline, in the order of the list below.
method_predict = []

for method in [DecisionTreeClassifier(random_state=RANDOM_STATE),
               RandomForestClassifier(random_state=RANDOM_STATE),
               KNeighborsClassifier()]:
    # TRAINING PHASE
    # Predictions are written back at the test indices of each fold, so they
    # stay aligned with `labels` regardless of how the folds are ordered.
    y_predict = np.empty_like(labels)
    for train_idx, test_idx in tqdm.notebook.tqdm(kf.split(X), total=kf.get_n_splits()):
        # Take train and test data
        x_train = X[train_idx]
        y_train = labels[train_idx]
        x_test = X[test_idx]
        y_test = labels[test_idx]

        # Initiate model: one independent classifier per aspect column
        multi_clf = MultiOutputClassifier(method)

        multi_clf.fit(x_train, y_train)
        y_predict[test_idx] = multi_clf.predict(x_test)

    # SAVE PREDICT
    method_predict.append(torch.tensor(y_predict))

    # VALIDATION PHASE
    y_true = labels
    print_aspect_scores(y_true, y_predict)


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


LEG,	40.51,    40.10,    40.27,    66.29
SIT,	41.83,    41.30,    41.52,    67.91
ENT,	44.77,    44.48,    44.60,    58.15
CUS,	45.83,    45.41,    45.61,    81.21
VOM,	42.60,    42.44,    42.50,    72.53
CLE,	39.81,    39.38,    39.57,    58.41
CKI,	40.31,    40.02,    40.16,    57.43
FNB,	39.93,    39.88,    39.90,    46.14


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


LEG,	46.93,    35.15,    31.88,    72.85


  _warn_prf(average, modifier, msg_start, len(result))


SIT,	48.50,    36.97,    35.20,    73.76
ENT,	51.36,    37.74,    34.42,    63.59
CUS,	57.15,    38.15,    38.94,    84.59
VOM,	52.55,    37.45,    36.71,    78.66
CLE,	48.47,    33.45,    27.63,    69.65
CKI,	62.72,    34.41,    29.31,    68.89
FNB,	51.58,    39.03,    33.79,    56.23


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


LEG,	53.44,    39.27,    39.68,    70.66
SIT,	54.27,    40.27,    40.87,    71.25
ENT,	47.86,    42.39,    42.37,    61.61
CUS,	46.87,    45.76,    46.22,    83.33
VOM,	47.08,    42.98,    43.02,    76.36
CLE,	42.08,    39.79,    38.27,    66.16
CKI,	43.32,    43.25,    40.51,    64.59
FNB,	42.70,    41.87,    39.64,    52.43


In [None]:
# After the training loop `y_predict` holds the predictions of the LAST method
# (KNeighborsClassifier). The random-forest predictions are the second entry
# of `method_predict`, so save those under the RF file name.
torch.save(method_predict[1], './result/RF.pt')

In [None]:
# Re-print the per-aspect scores of every stored method, in training order.
# A dedicated loop variable is used so the global `y_predict` is not clobbered.
for stored_predict in method_predict:
    # The predictions are stored as torch tensors; sklearn metrics work on arrays.
    method_pred = np.asarray(stored_predict)
    for i, asp in enumerate(ASPECT_NAMES):
        precision = precision_score(labels[:, i], method_pred[:, i], average="macro") * 100
        recall = recall_score(labels[:, i], method_pred[:, i], average="macro") * 100
        f1 = f1_score(labels[:, i], method_pred[:, i], average="macro") * 100
        accuracy = accuracy_score(labels[:, i], method_pred[:, i]) * 100
        print(f'{asp}, {precision:.2f},        {recall:.2f},        {f1:.2f},        {accuracy:.2f}')
    print('------------\n')