In [49]:
import re
import torch
import numpy as np
import pandas as pd
import logging
import time
import torch.nn as nn
import tqdm
import math
import ast

from tensorflow import keras
from tensorflow.keras import layers
from transformers import BertConfig, BertForSequenceClassification, BertTokenizer, BertTokenizerFast, BertModel, AdamW, TFBertModel
from transformers.optimization import get_linear_schedule_with_warmup
from transformers.modeling_bert import BertEmbeddings, BertSelfAttention
from torch.utils.data import Dataset, DataLoader
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import KFold
from apex import amp, optimizers

# Only let ERROR-and-above records through from this transformers logger.
logging.getLogger("transformers.tokenization_utils").setLevel(logging.ERROR)


# Checkpoint name shared by the tokenizer and by BertModel.from_pretrained.
PRE_TRAINED = 'bert-base-uncased'
# Use the GPU when torch can see one, otherwise fall back to the CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
# Short codes for the rated aspects; the order matches the keyword list inside split_aspect.
ASPECT_NAMES = ['LEG', 'SIT', 'ENT', 'CUS', 'VOM', 'CLE', 'CKI', 'FNB']
# Vocabulary as returned by the tokenizer's get_vocab() (presumably token -> id; confirm in the transformers docs).
VOCAB_DIC = BertTokenizerFast.from_pretrained(PRE_TRAINED).get_vocab()
# NOTE(review): not referenced in the visible cells; presumably the number of top LLR words kept per class -- confirm.
TOPN = 50


class BertBonz(BertModel):
    """BERT encoder extended with per-aspect LLR embeddings and a joint polarity head.

    On top of the stock ``BertModel`` this adds:

    * ``embeddings.llr_embeddings`` -- one 4-entry embedding table per entry of
      ``ASPECT_NAMES`` (index 3 is the padding index); the tables are summed into
      the input embeddings before LayerNorm and dropout.
    * ``classifier`` -- a linear layer applied to the first-token hidden state,
      producing ``config.num_aspect * 3`` logits.

    ``config`` must provide ``num_aspect`` in addition to the usual BERT fields.
    ``forward`` indexes the first ``config.num_aspect`` LLR tables, so
    ``num_aspect`` must not exceed ``len(ASPECT_NAMES)``.
    """

    def __init__(self, config):
        super(BertBonz, self).__init__(config)
        self.config = config
        # Widths come from the config instead of a literal 768 so that checkpoints
        # with a different hidden size also work.
        self.embeddings.llr_embeddings = nn.ModuleList(
            nn.Embedding(4, config.hidden_size, padding_idx=3) for _ in range(len(ASPECT_NAMES))
        )
        self.classifier = nn.Linear(config.hidden_size, config.num_aspect*3)
        self.init_weights()


    def forward(self,
                input_ids=None,
                llr_ids=None,
                labels=None,
                token_type_ids=None,
                position_ids=None):
        """Run embeddings -> encoder -> classifier.

        Args:
            input_ids: LongTensor (batch, seq_len) of token ids. Required.
            llr_ids: optional LongTensor indexed as ``llr_ids[:, aspect, :]``, i.e.
                (batch, num_aspect, seq_len), holding indices 0..3 into the LLR tables.
                When omitted no LLR embedding is added.
            labels: optional targets passed to ``nn.CrossEntropyLoss`` together with
                logits of shape (batch, 3, num_aspect) -- so presumably
                (batch, num_aspect) class ids in 0..2; confirm against the caller.
            token_type_ids: optional; defaults to all zeros.
            position_ids: optional; defaults to 0..seq_len-1 for every row.

        Returns:
            With labels: ``(logits, loss, cls_state, sequence_output, *extras)``.
            Without labels: ``(logits, cls_state, sequence_output, *extras)``.
            ``extras`` is whatever the encoder returns after its first element.
        """
        # BERT EMBEDDINGS NEW
        input_shape = input_ids.size()
        batch_size, seq_length = input_shape[0], input_shape[1]
        device = input_ids.device

        if position_ids is None:
            position_ids = torch.arange(seq_length, dtype=torch.long, device=device)
            position_ids = position_ids.unsqueeze(0).expand(input_shape)
        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        inputs_embeds = self.embeddings.word_embeddings(input_ids)
        position_embeddings = self.embeddings.position_embeddings(position_ids)
        token_type_embeddings = self.embeddings.token_type_embeddings(token_type_ids)

        embeddings = inputs_embeds + position_embeddings + token_type_embeddings
        if llr_ids is not None:
            llr_embeddings = sum(
                self.embeddings.llr_embeddings[i](llr_ids[:, i, :])
                for i in range(self.config.num_aspect)
            )
            embeddings = embeddings + llr_embeddings
        # else: nothing is added. The previous code added a LongTensor filled with the
        # padding *index* 3 here; a constant shift is cancelled by the LayerNorm below,
        # so the output is numerically unchanged, but the intent (no LLR signal) is now explicit.

        embeddings = self.embeddings.LayerNorm(embeddings)
        embeddings = self.embeddings.dropout(embeddings)


        # BERT ENCODER
        # NOTE(review): attention_mask is None, so padded positions are attended to.
        encoder_outputs = self.encoder(
            embeddings,
            attention_mask=None,
            # One (empty) head mask per encoder layer; sized from the config, not a literal 12.
            head_mask=[None]*self.config.num_hidden_layers,
            encoder_hidden_states=None,
            encoder_attention_mask=None,
            output_attentions=self.config.output_attentions
        )
        sequence_output = encoder_outputs[0]

        # CLASSIFIER
        CLS_token = sequence_output[:,0]
        # Reshape the flat logits so the class dimension (3) is dim 1, as CrossEntropyLoss expects.
        predict = self.classifier(CLS_token).view(batch_size, 3, -1)

        if labels is not None:
            loss = nn.CrossEntropyLoss()(predict, labels)
            outputs = (predict, loss, CLS_token, sequence_output) + encoder_outputs[1:]  # add hidden_states and attentions if they are here
        else:
            outputs = (predict, CLS_token, sequence_output) + encoder_outputs[1:]
        return outputs


    def load_pretrained_weight(self):
        """Copy every tensor whose key also exists in the pretrained BertModel into this model.

        Keys present only in this model (LLR tables, classifier) keep their current values.
        """
        sd = self.state_dict()
        sd_bert_pretrained = BertModel.from_pretrained(PRE_TRAINED).state_dict()
        for k, pretrained_tensor in sd_bert_pretrained.items():
            if k in sd:
                sd[k] = pretrained_tensor
        self.load_state_dict(sd)
        print('Succesfully load pre-trained weights')

    def llr_embed_pad(self):
        """Zero the last row (the padding index) of every LLR embedding table."""
        for table in self.embeddings.llr_embeddings:
            table.weight.data[-1, :] = 0
        



class BonzDataset(Dataset):
    """Tensor-backed dataset over a frame with pre-computed token ids and LLR ids.

    The frame must expose ``input_ids`` and ``llr_embeddings`` columns; a ``labels``
    column is optional. All columns are converted to LongTensors once, up front.
    """

    def __init__(self, data, llr_words):
        has_labels = 'labels' in data.columns
        self.input_ids = torch.LongTensor(list(data.input_ids))
        self.llr_embeddings = torch.LongTensor(list(data.llr_embeddings))
        self.labels = torch.LongTensor(list(data.labels)) if has_labels else None
        # Kept for reference only; nothing in this class reads it back.
        self.llr_words = llr_words

    def __len__(self):
        """Number of samples (rows of ``input_ids``)."""
        return self.input_ids.size(0)

    def __getitem__(self, idx):
        """Return ``(input_ids, llr_embeddings)`` for ``idx``, plus ``labels`` when available."""
        sample = (self.input_ids[idx], self.llr_embeddings[idx])
        if self.labels is not None:
            sample = sample + (self.labels[idx],)
        return sample
    

    
def split_aspect(data):
    """Turn per-review aspect strings into an (n_aspects, n_reviews) polarity matrix.

    Args:
        data: 1-D array whose elements are lists of strings such as
            ``'Legroom:4'`` (aspect description followed by a single-digit rating).

    Returns:
        Integer ndarray of shape (8, len(data)). Each cell is 1 when the matching
        rating is above 3, 0 when it is 3 or lower, and 2 when the review does not
        mention that aspect. Rows follow the keyword order below.
    """
    keywords = ['Legroom', 'Seat', 'Entertainment', 'Customer', 'Value', 'Cleanliness', 'Check-in', 'Food']
    # The builtin int replaces np.int, which was removed in NumPy 1.24 (same dtype).
    temp = np.full((len(keywords), data.shape[0]), 2, int)
    for idx in range(data.shape[0]):
        aspect = data[idx]
        for i, asp in enumerate(keywords):
            for sub_asp in aspect:
                if asp in sub_asp:
                    # The rating is the last character; strip first so trailing
                    # whitespace does not break the int() parse.
                    pol = int(sub_asp.strip()[-1])
                    temp[i, idx] = 1 if pol > 3 else 0
                    break  # only the first matching entry counts
    return temp
            

def tokenize_data(data):
    """Encode raw texts into fixed-length (512) BERT token-id rows.

    Each text goes through the pretrained fast tokenizer; the id sequences are
    then truncated or padded at the end to exactly 512 entries. Returns a list
    with one padded row per input text.
    """
    bert_tokenizer = BertTokenizerFast.from_pretrained(PRE_TRAINED)
    encoded = bert_tokenizer(list(data))
    padded = pad_sequences(encoded['input_ids'], maxlen=512, padding='post', truncating='post')

    return [row for row in padded]
    
    
def get_data(path='./data/airline.txt'):
    """Load the airline review dump and derive model-ready columns.

    Args:
        path: tab-separated, header-less review file. Defaults to the location
            the notebook has always used, so ``get_data()`` behaves as before.

    Returns:
        DataFrame with ``ReviewText``, ``Rating``, ``Aspects`` (list of strings),
        one polarity column per entry of ``ASPECT_NAMES`` and ``input_ids``.
        Reviews whose ``Aspects`` field equals ``'No filling in'`` are dropped.
    """
    col_names = ['TopNumber', 'AirlineName','ReviewerName','Rating','ReviewDate','ReviewTitle',\
                 'ReviewText','Tags', 'DateofTravel', 'Aspects', 'ResponserName', 'ResponseDate', 'ResponseText', 'ReviewerProfileUrl',\
                 'AirlineUrl','CrawlTime']
    raw_data = pd.read_csv(path, sep='\t', header=None, names=col_names)
    data = raw_data[['ReviewText', 'Rating', 'Aspects']]
    data = data[data['Aspects'] != 'No filling in'].reset_index(drop=True) # Filter none aspects
    # Item assignment (not attribute assignment) so the column is unambiguously replaced.
    data['Aspects'] = data['Aspects'].str.split('|').values

    # Split aspects to new columns
    aspects_splitted = split_aspect(data['Aspects'].values)
    for i, aspect_name in enumerate(ASPECT_NAMES):
        data[aspect_name] = aspects_splitted[i,:]

    data['input_ids'] = tokenize_data(data.ReviewText.values) # Generate input_ids from review text

    return data


def word_class_freq(data, aspect_name, aspect_class=3, vocab_size=33000):
    """Count, per token id and class, how many documents contain that token.

    Args:
        data: frame with an ``input_ids`` column (one id sequence per document)
            and a column named ``aspect_name`` holding the class id per document.
        aspect_name: column providing the class id (0 .. aspect_class-1).
        aspect_class: number of classes, i.e. columns of the result.
        vocab_size: number of rows of the result; must exceed the largest token id.
            Defaults to the previously hard-coded 33000.

    Returns:
        Integer ndarray (vocab_size, aspect_class). Cell ``[t, c]`` is the number
        of documents of class ``c`` that contain token ``t`` at least once.
    """
    # The builtin int replaces np.int, which was removed in NumPy 1.24.
    temp = np.zeros((vocab_size, aspect_class), int)
    doc_ids = data.input_ids.values
    doc_labels = data[aspect_name].values

    for sub_ids, sub_lb in zip(doc_ids, doc_labels):
        # set(): document frequency, so repeated tokens count once per document.
        for token_id in set(sub_ids):
            temp[token_id, sub_lb] += 1

    return temp


def _llr_term(count, p_pooled, p_cond):
    """Return ``count * (log(p_pooled) - log(p_cond))``, or 0 when a log is undefined."""
    try:
        return count * (math.log(p_pooled) - math.log(p_cond))
    except ValueError:
        # math.log raises ValueError for non-positive input; such a term contributes nothing.
        return 0


def calculate_llr(temp_df, labels):
    """Compute a log-likelihood-ratio score for every (token, class) pair.

    Args:
        temp_df: frame indexed by token id with one document-frequency column per
            class (0, 1, 2) followed by a final ``'total'`` column.
        labels: class id of every document; its length is the corpus size N.

    Returns:
        DataFrame with the same index as ``temp_df`` and the class columns only.
        A score is forced to 0 when the token occurs in fewer documents of the
        class than of all other classes together (``n11 < n01``).
    """
    labels = np.asarray(labels)
    # Corpus size comes from the argument; the previous code read the notebook
    # global `data`, which silently coupled this function to kernel state.
    N = len(labels)
    classes = [0, 1, 2]
    # Loop-invariant: documents per class.
    class_doc_counts = {class_: np.sum(labels == class_) for class_ in classes}
    total_scores = []

    for i in temp_df.index.values:
        llr_scores = []
        for class_ in classes:
            n11 = temp_df.loc[i, class_]              # class docs containing the token
            n10 = class_doc_counts[class_] - n11      # class docs without the token
            n01 = temp_df.loc[i, 'total'] - n11       # other-class docs containing the token
            n00 = (N - n11 - n10 - n01)               # other-class docs without the token
            # The 1e-10 terms keep the ratios defined when a count is zero.
            pt = (1e-10 + n11 + n01)/N
            p1 = n11/(1e-10 + n11 + n10)
            p2 = n01/(1e-10 + n01 + n00)

            e1 = _llr_term(n11, pt, p1)
            e2 = _llr_term(n10, 1-pt, 1-p1)
            e3 = _llr_term(n01, pt, p2)
            e4 = _llr_term(n00, 1-pt, 1-p2)

            llr_score = -2 * (e1+e2+e3+e4)
            if n11 < n01:
                llr_score = 0
            llr_scores.append(llr_score)

        total_scores.append(llr_scores)

    # reshape keeps the result 2-D even when temp_df has no rows.
    scores = np.array(total_scores, dtype=float).reshape(-1, len(classes))
    llr_df = pd.DataFrame(scores, index=temp_df.index, columns=temp_df.columns.values[:-1])

    return llr_df


def generate_llr_score(data, aspect):
    """Build the token/class document-frequency table for ``aspect`` and score it.

    Tokens that never occur are removed, as is token id 0 (the value used to pad
    ``input_ids``), before the table is handed to ``calculate_llr``.
    """
    temp = word_class_freq(data, aspect)

    temp_df = pd.DataFrame(temp)
    temp_df['total'] = np.sum(temp, -1)
    temp_df = temp_df[temp_df['total'] != 0]
    # Keyword form: a positional `axis` (the old `drop(0, 0)`) is rejected by pandas >= 2.0.
    # errors='ignore' covers the case where id 0 was already filtered out above.
    temp_df = temp_df.drop(index=0, errors='ignore')

    return calculate_llr(temp_df, data[aspect].values)

In [2]:
data = get_data()
# Select the aspect columns by name; the previous positional slice (3:11) would
# silently pick the wrong columns if get_data ever changed its column order.
data['labels'] = list(data[ASPECT_NAMES].values)
data

Unnamed: 0,ReviewText,Rating,Aspects,LEG,SIT,ENT,CUS,VOM,CLE,CKI,FNB,input_ids,labels
0,"So, I had this trip aligned for family leisure...",5,"[Legroom:4, Seat comfort:5, In-flight Entertai...",1,1,1,1,1,1,1,1,"[101, 2061, 1010, 1045, 2018, 2023, 4440, 1311...","[1, 1, 1, 1, 1, 1, 1, 1]"
1,Refund agreed to months ago but basically been...,1,"[Legroom:1, Seat comfort:1, In-flight Entertai...",0,0,0,0,0,0,0,0,"[101, 25416, 8630, 3530, 2000, 2706, 3283, 202...","[0, 0, 0, 0, 0, 0, 0, 0]"
2,"Flying to London on Singapore Airlines, we had...",4,"[Legroom:3, Seat comfort:4, In-flight Entertai...",0,1,1,1,1,1,1,1,"[101, 3909, 2000, 2414, 2006, 5264, 7608, 1010...","[0, 1, 1, 1, 1, 1, 1, 1]"
3,I thought we were making a safe choice booking...,1,[Customer service:1],2,2,2,0,2,2,2,2,"[101, 1045, 2245, 2057, 2020, 2437, 1037, 3647...","[2, 2, 2, 0, 2, 2, 2, 2]"
4,Wonderful service on our trip out to New Zeala...,4,"[Legroom:5, Seat comfort:4, In-flight Entertai...",1,1,1,1,1,1,1,1,"[101, 6919, 2326, 2006, 2256, 4440, 2041, 2000...","[1, 1, 1, 1, 1, 1, 1, 1]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
141111,ANA is partnered with Air Canada for their fli...,4,"[Legroom:4, Seat comfort:4, In-flight Entertai...",1,1,0,1,1,2,2,2,"[101, 9617, 2003, 12404, 2007, 2250, 2710, 200...","[1, 1, 0, 1, 1, 2, 2, 2]"
141112,This is my first time flying with ANA. Overall...,5,"[Legroom:4, Seat comfort:4, In-flight Entertai...",1,1,1,1,0,2,2,2,"[101, 2023, 2003, 2026, 2034, 2051, 3909, 2007...","[1, 1, 1, 1, 0, 2, 2, 2]"
141113,"Excellent Airline to fly with, nice staff and ...",4,"[Legroom:5, Seat comfort:5, In-flight Entertai...",1,1,1,1,1,2,2,2,"[101, 6581, 8582, 2000, 4875, 2007, 1010, 3835...","[1, 1, 1, 1, 1, 2, 2, 2]"
141114,We traveled on ANA with our 1 year old baby. I...,5,"[Legroom:5, Seat comfort:4, In-flight Entertai...",1,1,1,1,1,2,2,2,"[101, 2057, 6158, 2006, 9617, 2007, 2256, 1015...","[1, 1, 1, 1, 1, 2, 2, 2]"


In [35]:
from sklearn.svm import SVC 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multioutput import MultiOutputClassifier

# TF-IDF features over the raw review text (sparse matrix, one row per review).
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data.ReviewText.values)
# Stack the per-review label vectors into one 2-D array. Going through
# torch.tensor(Series) indexed the Series by integer label, which is slow and
# fails on any non-default index.
labels = np.array(data.labels.tolist())

In [None]:
# `import tqdm` alone does not guarantee the notebook submodule is loaded; it only
# worked before because another package happened to import it first.
import tqdm.notebook

N_SPLITS = 10
kf = KFold(N_SPLITS)
last_predict = []

# KFold without shuffling yields contiguous, ordered test folds, so extending
# `last_predict` fold by fold keeps it aligned with the row order of `labels`.
# `total` lets the progress bar show real progress instead of an indeterminate bar.
for train_idx, test_idx in tqdm.notebook.tqdm(kf.split(X), total=N_SPLITS):
    # Take train and test data
    x_train = X[train_idx]
    y_train = labels[train_idx]
    x_test = X[test_idx]
    y_test = labels[test_idx]
    
    # Initiate model: one independent SVC per aspect column
    clf = SVC()
    multi_clf = MultiOutputClassifier(clf)
    
    multi_clf.fit(x_train, y_train)
    predicted = multi_clf.predict(x_test).tolist()
    last_predict.extend(predicted)
    
    

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

In [55]:
# Ground truth and out-of-fold predictions, one column per aspect.
y_true = labels
y_predict = np.array(torch.tensor(last_predict))


# Full per-class report for every aspect.
for i, asp in enumerate(ASPECT_NAMES):
    report = classification_report(y_true[:,i], y_predict[:,i])
    print(f'{asp}:\n{report}')


# Compact table per aspect: macro precision, macro recall, macro F1, accuracy.
for i, asp in enumerate(ASPECT_NAMES):
    truth, guess = y_true[:,i], y_predict[:,i]
    macro_precision = precision_score(truth, guess, average="macro")
    macro_recall = recall_score(truth, guess, average="macro")
    macro_f1 = f1_score(truth, guess, average="macro")
    accuracy = accuracy_score(truth, guess)
    print(f'{asp}:\t{macro_precision:.2f}\t{macro_recall:.2f}\t{macro_f1:.2f}\t{accuracy:.2f}')

LEG:
              precision    recall  f1-score   support

           0       0.69      0.06      0.11     38111
           1       0.73      0.99      0.84    101257
           2       0.00      0.00      0.00      1748

    accuracy                           0.73    141116
   macro avg       0.47      0.35      0.32    141116
weighted avg       0.71      0.73      0.63    141116



  _warn_prf(average, modifier, msg_start, len(result))


SIT:
              precision    recall  f1-score   support

           0       0.71      0.11      0.19     38710
           1       0.74      0.99      0.84    100884
           2       0.00      0.00      0.00      1522

    accuracy                           0.74    141116
   macro avg       0.48      0.37      0.35    141116
weighted avg       0.72      0.74      0.66    141116

ENT:
              precision    recall  f1-score   support

           0       0.55      0.24      0.33     42524
           1       0.64      0.95      0.76     82363
           2       0.46      0.01      0.01     16229

    accuracy                           0.63    141116
   macro avg       0.55      0.40      0.37    141116
weighted avg       0.59      0.63      0.55    141116

CUS:
              precision    recall  f1-score   support

           0       0.86      0.14      0.24     22629
           1       0.85      1.00      0.91    116697
           2       0.00      0.00      0.00      1790

    a

  _warn_prf(average, modifier, msg_start, len(result))


SIT:	0.48	0.37	0.35	0.74
ENT:	0.55	0.40	0.37	0.63
CUS:	0.57	0.38	0.39	0.85
VOM:	0.53	0.37	0.36	0.79
CLE:	0.59	0.33	0.28	0.70
CKI:	0.63	0.34	0.29	0.69
FNB:	0.53	0.39	0.34	0.56


In [56]:
# Persist the out-of-fold predictions as a tensor.
# NOTE(review): the file is named RF.pt although the visible training cell fits an SVC -- confirm the intended name.
torch.save(torch.tensor(y_predict), './result/RF.pt')

In [41]:
# Shape of the TF-IDF matrix produced by vectorizer.fit_transform on the review texts.
X.shape

(141116, 63789)