In [1]:
import re
import torch
import numpy as np
import pandas as pd
import logging
import time
import torch.nn as nn
import tqdm
import math
import ast

from tensorflow import keras
from tensorflow.keras import layers
from transformers import BertConfig, BertForSequenceClassification, BertTokenizer, BertTokenizerFast, BertModel, AdamW, TFBertModel
from transformers.modeling_bert import BertEmbeddings, BertSelfAttention
from torch.utils.data import Dataset, DataLoader
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.model_selection import KFold

# Only let ERROR-and-above records through from the `transformers.tokenization_utils` logger.
logging.getLogger("transformers.tokenization_utils").setLevel(logging.ERROR)


# Checkpoint name passed to every `from_pretrained` call that uses this constant.
PRE_TRAINED = 'bert-base-uncased'
# Use the GPU when torch can see one, otherwise the CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
# Column names for the eight aspects; `get_data` assigns row i of `split_aspect`'s
# output to ASPECT_NAMES[i], so the order here must match the keyword order used there.
ASPECT_NAMES = ['LEG', 'SIT', 'ENT', 'CUS', 'VOM', 'CLE', 'CKI', 'FNB']
# Vocabulary mapping returned by the pre-trained tokenizer (the tokenizer is loaded when this line runs).
VOCAB_DIC = BertTokenizerFast.from_pretrained(PRE_TRAINED).get_vocab()
# Number of highest-scoring tokens kept per (aspect, class) when the LLR word lists are built.
TOPN = 50


class MyDataset(Dataset):
    """Dataset that tokenizes raw sentences on the fly and pads them to 512 ids.

    Args:
        x: indexable collection of sentences (list, 1-D array, ...).
        y: optional indexable collection of targets aligned with ``x``.

    Each item is a ``LongTensor`` of 512 token ids, or a
    ``(LongTensor ids, FloatTensor([y[i]]))`` pair when ``y`` is given.
    """

    def __init__(self, x, y=None):
        super(MyDataset, self).__init__()
        self.x = x
        self.y = y
        self.tokenizer = BertTokenizerFast.from_pretrained(PRE_TRAINED)

    def __getitem__(self, i):
        sen = self.x[i]
        encoded = self.tokenizer.encode(sen)
        # Truncate at the tail (as `tokenize_data` does) so the leading [CLS]
        # token, which the classifier reads at position 0, is never cut off.
        encoded = pad_sequences([encoded], maxlen=512, padding='post', truncating='post')
        # Token ids index an embedding table, so they must be integer in both branches.
        input_ids = torch.LongTensor(encoded[0])
        if self.y is None:
            return input_ids
        return input_ids, torch.FloatTensor([self.y[i]])

    def __len__(self):
        # len() works for lists as well as arrays (`.size` is array-only).
        return len(self.x)
    

class BertBonz(BertModel):
    """BERT encoder extended with per-aspect LLR embeddings and a sentiment head.

    On top of the stock ``BertModel`` this class adds:

    * ``embeddings.llr_embeddings`` - one 4-row ``nn.Embedding`` per aspect
      (``config.num_aspect`` tables, padding index 3) whose outputs are summed
      into the word/position/token-type embeddings;
    * ``classifier`` - a linear layer mapping the first token's hidden state to
      ``config.num_aspect * 3`` logits.

    ``config`` must carry a ``num_aspect`` attribute in addition to the regular
    BERT fields.
    """

    def __init__(self, config):
        super(BertBonz, self).__init__(config)
        self.config = config
        # Sizes come from the config so the extra layers always agree with the
        # encoder and with the `config.num_aspect` loop in `forward`.
        self.embeddings.llr_embeddings = nn.ModuleList(
            nn.Embedding(4, config.hidden_size, 3) for _ in range(config.num_aspect)
        )
        self.classifier = nn.Linear(config.hidden_size, config.num_aspect*3)
        self.init_weights()

    def forward(self,
                input_ids=None,
                llr_ids=None,
                labels=None,
                token_type_ids=None,
                position_ids=None,
                attention_mask=None):
        """Run the embeddings, encoder and classification head.

        Args:
            input_ids: ``(batch, seq)`` token ids.
            llr_ids: optional ``(batch, num_aspect, seq)`` ids into the LLR tables.
            labels: optional ``(batch, num_aspect)`` class indices; when given,
                the cross-entropy loss is added to the outputs.
            token_type_ids: optional segment ids, default all zeros.
            position_ids: optional position ids, default ``0..seq-1``.
            attention_mask: optional ``(batch, seq)`` mask with 1 for real tokens
                and 0 for padding. Defaults to ``None`` (no masking), which keeps
                the previous behaviour in which padded positions are attended to.

        Returns:
            ``(logits, loss, cls, sequence_output, *extra)`` when ``labels`` is
            given, otherwise ``(logits, cls, sequence_output, *extra)``.
            ``logits`` has shape ``(batch, 3, num_aspect)``; ``extra`` is whatever
            the encoder returns after its first element.
        """
        # BERT EMBEDDINGS NEW
        input_shape = input_ids.size()
        seq_length = input_shape[1]
        device = input_ids.device

        if position_ids is None:
            position_ids = torch.arange(seq_length, dtype=torch.long, device=device)
            position_ids = position_ids.unsqueeze(0).expand(input_shape)
        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        embeddings = (
            self.embeddings.word_embeddings(input_ids)
            + self.embeddings.position_embeddings(position_ids)
            + self.embeddings.token_type_embeddings(token_type_ids)
        )
        if llr_ids is not None:
            # Sum the contribution of every aspect-specific LLR table.
            embeddings = embeddings + sum(
                self.embeddings.llr_embeddings[i](llr_ids[:, i, :])
                for i in range(self.config.num_aspect)
            )
        embeddings = self.embeddings.LayerNorm(embeddings)
        embeddings = self.embeddings.dropout(embeddings)

        if attention_mask is not None:
            # The encoder adds the mask to the raw attention scores, so turn the
            # 0/1 mask into 0 / -10000 and make it broadcastable over heads and queries.
            attention_mask = attention_mask.to(device=device, dtype=embeddings.dtype)
            attention_mask = (1.0 - attention_mask[:, None, None, :]) * -10000.0

        # BERT ENCODER
        encoder_outputs = self.encoder(
            embeddings,
            attention_mask=attention_mask,
            head_mask=[None]*self.config.num_hidden_layers,
            encoder_hidden_states=None,
            encoder_attention_mask=None,
            output_attentions=self.config.output_attentions
        )
        sequence_output = encoder_outputs[0]

        # CLASSIFIER
        CLS_token = sequence_output[:, 0]
        predict = self.classifier(CLS_token).view(input_shape[0], 3, -1)

        if labels is not None:
            loss = nn.CrossEntropyLoss()(predict, labels)
            outputs = (predict, loss, CLS_token, sequence_output) + encoder_outputs[1:]  # add hidden_states and attentions if they are here
        else:
            outputs = (predict, CLS_token, sequence_output) + encoder_outputs[1:]
        return outputs

    def load_pretrained_weight(self):
        """Copy every tensor of the pre-trained ``BertModel`` whose key exists in this model."""
        sd = self.state_dict()
        sd_bert_pretrained = BertModel.from_pretrained(PRE_TRAINED).state_dict()
        for k, v in sd_bert_pretrained.items():
            if k in sd:
                sd[k] = v
        self.load_state_dict(sd)
        print('Succesfully load pre-trained weights')

    def llr_embed_pad(self):
        """Zero the padding row of every LLR embedding table.

        ``init_weights`` may overwrite the padding row with random values, so
        this resets it explicitly.
        """
        with torch.no_grad():
            for table in self.embeddings.llr_embeddings:
                # Address the row through `padding_idx` instead of assuming it is the last one.
                table.weight[table.padding_idx].zero_()

    def fit(self,
            optimizer=None,
            lr=2e-5,
            loss=None):
        """Attach an optimizer (and optional loss) for a later ``train_`` call.

        Args:
            optimizer: optimizer class called as ``optimizer(params, lr)``;
                ``torch.optim.AdamW`` is used when omitted.
            lr: learning rate.
            loss: stored on ``self.loss_fn``; ``forward`` computes its own loss.
        """
        if optimizer is None:
            optimizer = torch.optim.AdamW
        self.optimizer = optimizer(self.parameters(), lr)
        self.loss_fn = loss

    def train_(self,
              inputs=None,
              labels=None,
              epochs=None,
              batch_size=None):
        """Train on raw sentences with the optimizer configured by ``fit``.

        Args:
            inputs: indexable collection of sentences.
            labels: indexable collection of per-aspect class indices.
            epochs: number of passes over the data.
            batch_size: mini-batch size handed to the ``DataLoader``.

        Raises:
            RuntimeError: if ``fit`` has not been called first.
        """
        if getattr(self, 'optimizer', None) is None:
            raise RuntimeError('Call fit() to configure an optimizer before train_().')

        self.to(DEVICE)
        self.train()

        # Build the dataset once: constructing it reloads the tokenizer, and the
        # DataLoader already reshuffles on every pass.
        my_dataset = MyDataset(inputs, labels)
        dataloader = DataLoader(my_dataset, batch_size=batch_size, shuffle=True)

        for epoch in range(epochs):
            s = time.time()
            loss_train = 0
            for x, y in dataloader:
                # MyDataset yields float targets with an extra axis; the
                # cross-entropy loss needs (batch, num_aspect) integer class ids.
                target = y.to(DEVICE).long().view(y.size(0), -1)
                self.optimizer.zero_grad()
                outputs = self(input_ids=x.to(DEVICE), labels=target)
                loss = outputs[1]
                loss.backward()
                self.optimizer.step()
                loss_train += loss.item()
            print(f'Finish epoch {epoch+1}, loss = {loss_train:.2f}, running time {time.time()-s:.2f}')

    
    
def train(model=None, epochs=None):
    """Fine-tune ``model`` on the notebook-level ``data`` frame and print a report.

    NOTE(review): this helper reads the global ``data`` (columns
    "Review's Content" and ``sentiment``) and calls ``model(x=..., y=...)``, so
    it presumably targets a different model than ``BertBonz`` - confirm before use.

    Args:
        model: module called as ``model(x=..., y=...)`` (first output is the loss)
            and as ``model(x=...)`` (returns scores compared against 0.5).
        epochs: number of training passes.

    Returns:
        The trained model, left in eval mode.
    """
    optimizer = AdamW(model.parameters(), lr=1e-5)
    model.train()
    model.to(DEVICE)

    # Built once: the dataset reloads the tokenizer on construction, and creating
    # it inside the epoch loop left `dataloader` undefined when epochs == 0.
    my_dataset = MyDataset(x=data['Review\'s Content'].values, y=data.sentiment.values)
    dataloader = DataLoader(my_dataset, batch_size=4, shuffle=True)

    for epoch in range(epochs):
        s = time.time()
        for x, y in dataloader:
            optimizer.zero_grad()
            loss = model(x=x.to(DEVICE), y=y.to(DEVICE))[0]
            loss.backward()
            optimizer.step()
        print('Finish epoch {}, running time {}'.format(epoch+1, time.time()-s))

    model.eval()
    predicts = []
    y_true = []
    with torch.no_grad():
        for x, y in dataloader:
            predict = model(x=x.to(DEVICE))
            predict = predict.detach().cpu().numpy()
            predict = predict > 0.5
            predicts.extend(predict.tolist())
            y_true.extend(y.numpy().tolist())

    print(classification_report(y_true, predicts))
    return model


def split_aspect(data):
    """Turn per-review aspect strings into a (num_aspects, num_reviews) label matrix.

    Args:
        data: sequence in which each element is an iterable of strings such as
            ``'Legroom:4'``; the last character of a string is read as its rating.

    Returns:
        Integer array of shape ``(8, len(data))`` holding 1 when the rating is
        above 3, 0 when it is 3 or below, and 2 when the aspect is not mentioned.
        Rows follow the keyword order below.
    """
    keywords = ['Legroom', 'Seat', 'Entertainment', 'Customer', 'Value', 'Cleanliness', 'Check-in', 'Food']
    n_reviews = len(data)
    # Builtin `int` replaces the `np.int` alias, which NumPy 1.24 removed.
    temp = np.full((len(keywords), n_reviews), 2, int)
    for idx in range(n_reviews):
        aspect = data[idx]
        for i, asp in enumerate(keywords):
            for sub_asp in aspect:
                if asp in sub_asp:
                    pol = int(sub_asp[-1])
                    temp[i, idx] = 1 if pol > 3 else 0
                    break  # only the first matching entry counts
    return temp
            

def tokenize_data(data):
    """Tokenize every text in ``data`` and return one fixed-length id row per text.

    The texts are encoded with the pre-trained fast tokenizer, then padded and
    truncated at the end to exactly 512 ids. The result is a list of rows.
    """
    bert_tokenizer = BertTokenizerFast.from_pretrained(PRE_TRAINED)
    encoding = bert_tokenizer(list(data))
    padded_ids = pad_sequences(
        encoding['input_ids'],
        maxlen=512,
        padding='post',
        truncating='post',
    )
    return list(padded_ids)
    
    
def get_data():
    """Load the raw airline reviews and return the modelling frame.

    Reads ``./data/airline.txt`` (tab separated, no header), keeps the review
    text, rating and aspect columns, drops rows whose aspects are
    ``'No filling in'``, splits the aspect string on ``'|'``, adds one label
    column per entry of ``ASPECT_NAMES`` and finally an ``input_ids`` column.
    """
    header = [
        'TopNumber', 'AirlineName', 'ReviewerName', 'Rating', 'ReviewDate', 'ReviewTitle',
        'ReviewText', 'Tags', 'DateofTravel', 'Aspects', 'ResponserName', 'ResponseDate',
        'ResponseText', 'ReviewerProfileUrl', 'AirlineUrl', 'CrawlTime',
    ]
    reviews = pd.read_csv('./data/airline.txt', sep='\t', header=None, names=header)
    reviews = reviews[['ReviewText', 'Rating', 'Aspects']]

    # Filter none aspects
    has_aspects = reviews['Aspects'] != 'No filling in'
    reviews = reviews[has_aspects].reset_index(drop=True)
    reviews['Aspects'] = reviews['Aspects'].str.split('|').values

    # Split aspects to new columns
    polarity = split_aspect(reviews['Aspects'].values)
    for row, name in enumerate(ASPECT_NAMES):
        reviews[name] = polarity[row, :]

    # Generate input_ids from review text
    reviews['input_ids'] = tokenize_data(reviews['ReviewText'].values)

    return reviews


def word_class_freq(data, aspect_name, aspect_class=3, vocab_size=33000):
    """Count, per token id and class, the documents in which the token occurs.

    Args:
        data: DataFrame with an ``input_ids`` column (one sequence of token ids
            per row) and a label column named ``aspect_name``.
        aspect_name: name of the label column.
        aspect_class: number of classes (columns of the result).
        vocab_size: number of rows of the result; must exceed the largest token id.

    Returns:
        Integer array of shape ``(vocab_size, aspect_class)``. A token is counted
        at most once per document, however often it repeats inside it.
    """
    # Builtin `int` replaces the `np.int` alias, which NumPy 1.24 removed.
    counts = np.zeros((vocab_size, aspect_class), int)

    for token_ids, label in zip(data.input_ids.values, data[aspect_name].values):
        # np.unique de-duplicates, so fancy-indexed `+=` adds exactly one per token.
        # The explicit integer dtype keeps an empty sequence usable as an index.
        unique_ids = np.unique(np.asarray(token_ids, dtype=np.int64))
        counts[unique_ids, label] += 1

    return counts


def _llr_term(count, p_null, p_alt):
    """Return ``count * (log(p_null) - log(p_alt))``, or 0 when a log is undefined."""
    try:
        return count * (math.log(p_null) - math.log(p_alt))
    except ValueError:
        # math.log rejects zero or negative probabilities; such terms count as 0.
        return 0


def calculate_llr(temp_df, labels):
    """Compute a log-likelihood-ratio score for every (token, class) pair.

    Args:
        temp_df: DataFrame indexed by token id with one document-count column per
            class followed by a final ``'total'`` column.
        labels: array of class labels, one per document; its length is the
            number of documents.

    Returns:
        DataFrame with the same index as ``temp_df`` and one column per class.
        A score is forced to 0 when the token occurs in fewer documents of the
        class than of the other classes.
    """
    labels = np.asarray(labels)
    # The document count comes from the arguments, not from a notebook global.
    N = len(labels)
    class_cols = list(temp_df.columns.values[:-1])
    # Loop-invariant: documents per class.
    docs_per_class = {class_: np.sum(labels == class_) for class_ in class_cols}

    total_scores = []
    for i in temp_df.index.values:
        llr_scores = []
        for class_ in class_cols:
            n11 = temp_df.loc[i, class_]            # class docs containing the token
            n10 = docs_per_class[class_] - n11      # class docs without the token
            n01 = temp_df.loc[i, 'total'] - n11     # other docs containing the token
            n00 = (N - n11 - n10 - n01)             # other docs without the token
            pt = (1e-10 + n11 + n01)/N
            p1 = n11/(1e-10 + n11 + n10)
            p2 = n01/(1e-10 + n01 + n00)

            log_lambda = (
                _llr_term(n11, pt, p1)
                + _llr_term(n10, 1-pt, 1-p1)
                + _llr_term(n01, pt, p2)
                + _llr_term(n00, 1-pt, 1-p2)
            )

            llr_score = -2 * log_lambda
            if n11 < n01:
                llr_score = 0
            llr_scores.append(llr_score)

        total_scores.append(llr_scores)

    # reshape keeps the (n_tokens, n_classes) layout even when there are no tokens.
    score_matrix = np.array(total_scores, dtype=float).reshape(-1, len(class_cols))
    llr_df = pd.DataFrame(score_matrix, index=temp_df.index, columns=class_cols)

    return llr_df


def generate_llr_score(data, aspect):
    """Build the token/class document counts for ``aspect`` and score them.

    Args:
        data: DataFrame with ``input_ids`` and a label column named ``aspect``.
        aspect: name of the label column.

    Returns:
        The DataFrame produced by ``calculate_llr`` for every token that occurs
        at least once, excluding token id 0.
    """
    temp = word_class_freq(data, aspect)

    temp_df = pd.DataFrame(temp)
    temp_df['total'] = np.sum(temp, -1)
    temp_df = temp_df[temp_df['total'] != 0]
    # Keyword form: pandas 2.0 no longer accepts `axis` positionally.
    # errors='ignore' covers the case where id 0 never occurs and was filtered above.
    temp_df = temp_df.drop(index=0, errors='ignore')

    return calculate_llr(temp_df, data[aspect].values)

Using TensorFlow backend.


In [4]:
# Reload the cached frame; the three list-valued columns were written out as
# text and are parsed back into Python objects with a progress bar per column.
data = pd.read_csv('new_data.csv', sep='\t', index_col=0)

for col in tqdm.notebook.tqdm(['input_ids', 'labels', 'llr_embeddings']):
    data[col] = list(map(ast.literal_eval, tqdm.notebook.tqdm(data[col].values)))

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=141116.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=141116.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=141116.0), HTML(value='')))





# GET DATA

In [None]:
data = get_data()
# Select the label columns by name rather than by position (3:11) so the cell
# keeps working if `get_data` ever adds or reorders columns.
data['labels'] = list(data[ASPECT_NAMES].values)
data

# CALCULATE LLR SCORES & WORDLIST

In [4]:
# One LLR score table per aspect, keyed by the aspect's column name.
llr_scores = dict()
for aspect in tqdm.notebook.tqdm(ASPECT_NAMES, desc='Calculate LLR scores'):
    llr_scores[aspect] = generate_llr_score(data, aspect)

HBox(children=(FloatProgress(value=0.0, description='Calculate LLR scores', max=8.0, style=ProgressStyle(descr…




In [5]:
# For each aspect and class keep the TOPN token ids with the highest LLR score.
llr_words = {}

for aspect in tqdm.notebook.tqdm(ASPECT_NAMES, desc='Generate top LLR words'):
    aspect_scores = llr_scores[aspect]
    llr_words[aspect] = {
        class_: list(aspect_scores[class_].sort_values(ascending=False)[:TOPN].index)
        for class_ in [0, 1, 2]
    }

HBox(children=(FloatProgress(value=0.0, description='Generate top LLR words', max=8.0, style=ProgressStyle(des…




In [20]:
# Per aspect, map token id -> class once, so each token costs one dict lookup
# instead of a scan over three word lists. `setdefault` keeps the first class
# that lists a token, matching the original first-match-wins order.
llr_lookup = {}
for aspect in ASPECT_NAMES:
    token_to_class = {}
    for class_, wordlist in llr_words[aspect].items():
        for token in wordlist:
            token_to_class.setdefault(token, class_)
    llr_lookup[aspect] = token_to_class

llr_embedding_list = []

for idx in tqdm.notebook.trange(data.shape[0]):
    tokens = data.input_ids[idx]

    # One row per aspect; 3 marks tokens that are in none of the aspect's word lists.
    llr_embedding = [
        [llr_lookup[aspect].get(token, 3) for token in tokens]
        for aspect in ASPECT_NAMES
    ]
    llr_embedding_list.append(llr_embedding)

data['llr_embeddings'] = llr_embedding_list

data

HBox(children=(FloatProgress(value=0.0, max=141116.0), HTML(value='')))




Unnamed: 0,ReviewText,Rating,Aspects,LEG,SIT,ENT,CUS,VOM,CLE,CKI,FNB,input_ids,labels,llr_embeddings
0,"So, I had this trip aligned for family leisure...",5,"[Legroom:4, Seat comfort:5, In-flight Entertai...",1,1,1,1,1,1,1,1,"[101, 2061, 1010, 1045, 2018, 2023, 4440, 1311...","[1, 1, 1, 1, 1, 1, 1, 1]","[[3, 3, 3, 3, 1, 3, 3, 3, 1, 3, 3, 3, 3, 3, 3,..."
1,Refund agreed to months ago but basically been...,1,"[Legroom:1, Seat comfort:1, In-flight Entertai...",0,0,0,0,0,0,0,0,"[101, 25416, 8630, 3530, 2000, 2706, 3283, 202...","[0, 0, 0, 0, 0, 0, 0, 0]","[[3, 3, 3, 3, 3, 3, 3, 1, 3, 3, 0, 3, 3, 1, 3,..."
2,"Flying to London on Singapore Airlines, we had...",4,"[Legroom:3, Seat comfort:4, In-flight Entertai...",0,1,1,1,1,1,1,1,"[101, 3909, 2000, 2414, 2006, 5264, 7608, 1010...","[0, 1, 1, 1, 1, 1, 1, 1]","[[3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 3, 3, 3, 3, 3,..."
3,I thought we were making a safe choice booking...,1,[Customer service:1],2,2,2,0,2,2,2,2,"[101, 1045, 2245, 2057, 2020, 2437, 1037, 3647...","[2, 2, 2, 0, 2, 2, 2, 2]","[[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,..."
4,Wonderful service on our trip out to New Zeala...,4,"[Legroom:5, Seat comfort:4, In-flight Entertai...",1,1,1,1,1,1,1,1,"[101, 6919, 2326, 2006, 2256, 4440, 2041, 2000...","[1, 1, 1, 1, 1, 1, 1, 1]","[[3, 3, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
141111,ANA is partnered with Air Canada for their fli...,4,"[Legroom:4, Seat comfort:4, In-flight Entertai...",1,1,0,1,1,2,2,2,"[101, 9617, 2003, 12404, 2007, 2250, 2710, 200...","[1, 1, 0, 1, 1, 2, 2, 2]","[[3, 3, 3, 3, 3, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3,..."
141112,This is my first time flying with ANA. Overall...,5,"[Legroom:4, Seat comfort:4, In-flight Entertai...",1,1,1,1,0,2,2,2,"[101, 2023, 2003, 2026, 2034, 2051, 3909, 2007...","[1, 1, 1, 1, 0, 2, 2, 2]","[[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 3,..."
141113,"Excellent Airline to fly with, nice staff and ...",4,"[Legroom:5, Seat comfort:5, In-flight Entertai...",1,1,1,1,1,2,2,2,"[101, 6581, 8582, 2000, 4875, 2007, 1010, 3835...","[1, 1, 1, 1, 1, 2, 2, 2]","[[3, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,..."
141114,We traveled on ANA with our 1 year old baby. I...,5,"[Legroom:5, Seat comfort:4, In-flight Entertai...",1,1,1,1,1,2,2,2,"[101, 2057, 6158, 2006, 9617, 2007, 2256, 1015...","[1, 1, 1, 1, 1, 2, 2, 2]","[[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,..."


# CREATE DATASET & DATALOADER

In [5]:
class BonzDataset(Dataset):
    """In-memory dataset of token ids, LLR ids and (optionally) labels.

    Args:
        data: DataFrame with ``input_ids`` and ``llr_embeddings`` columns and an
            optional ``labels`` column; every cell holds an equally sized
            (nested) sequence of integers.
        llr_words: kept on the instance for reference only; it is not used when
            items are built, so it may be omitted.

    Items are ``(input_ids, llr_ids)`` tuples, or ``(input_ids, llr_ids, labels)``
    when the frame has a ``labels`` column. All tensors are ``torch.long``.
    """

    def __init__(self, data, llr_words=None):
        self.input_ids = self._as_long_tensor(data.input_ids)
        self.llr_embeddings = self._as_long_tensor(data.llr_embeddings)
        if 'labels' in data.columns:
            self.labels = self._as_long_tensor(data.labels)
        else:
            self.labels = None
        self.llr_words = llr_words

    @staticmethod
    def _as_long_tensor(column):
        """Stack a column of equally shaped integer sequences into one long tensor."""
        # Going through a single ndarray avoids the slow element-wise path torch
        # takes for a Python list of arrays.
        return torch.as_tensor(np.asarray(list(column)), dtype=torch.long)

    def __len__(self):
        return self.input_ids.shape[0]

    def __getitem__(self, idx):
        """Return the tensors of sample ``idx`` (labels last, when available)."""
        if self.labels is None:
            return self.input_ids[idx], self.llr_embeddings[idx]
        return self.input_ids[idx], self.llr_embeddings[idx], self.labels[idx]
    

# Pick the model inputs by name; a positional `iloc[:, -3:]` silently grabs the
# wrong columns as soon as the frame gains or reorders a column.
dataset = BonzDataset(data[['input_ids', 'labels', 'llr_embeddings']], None)
dataloader = DataLoader(dataset, batch_size=7, shuffle=True)

# INITIATE MODEL

In [2]:
config = BertConfig.from_pretrained(PRE_TRAINED)
config.num_aspect = len(ASPECT_NAMES)
model = BertBonz(config)
model.to(DEVICE)

model.load_pretrained_weight() # Load pre-trained BERT weights for BERT's layers 
model.llr_embed_pad() # Set LLR embedding padding idx to 0-value tensor

# `state_dict()` returns references to the live parameters, so keeping it as-is
# would follow the weights through training and make a later
# `load_state_dict(origin_sd)` a no-op. Store detached CPU copies instead so the
# snapshot really is the untrained starting point.
origin_sd = {name: tensor.detach().cpu().clone() for name, tensor in model.state_dict().items()}

Succesfully load pre-trained weights


## Training with K-FOLD

In [None]:
# Training with K-fold

# Fixed seed: the shuffled order defines the folds and must match the order the
# evaluation cell reads `new_data.labels` in, so the run has to be repeatable.
SEED = 42
torch.manual_seed(SEED)
new_data = data.sample(frac=1, random_state=SEED).reset_index(drop=True)
kf = KFold(10)

# `origin_sd` may hold references to the live parameters (that is what
# `state_dict()` returns), in which case reloading it would not undo training
# and every fold would start from the previous fold's weights. Take an
# independent copy before any training happens.
initial_state = {name: tensor.detach().cpu().clone() for name, tensor in origin_sd.items()}
model_columns = ['input_ids', 'labels', 'llr_embeddings']

last_predict = []
for train_idx, test_idx in tqdm.notebook.tqdm(kf.split(new_data), total=kf.get_n_splits()):
    train_data = new_data.iloc[train_idx]
    test_data = new_data.iloc[test_idx]
    model.load_state_dict(initial_state)
    print('Load origin state dict succesfully!!!')

    """ TRAINING """
    model.train()
    dataset = BonzDataset(train_data[model_columns], None)
    dataloader = DataLoader(dataset, batch_size=7, shuffle=True)

    # A fresh optimizer per fold so no moment estimates carry over.
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    for epoch in tqdm.notebook.trange(5):
        loss_train = 0
        for a, b, c in dataloader:
            optimizer.zero_grad()
            predict, loss = model(a.to(DEVICE), b.to(DEVICE), c.to(DEVICE))[:2]
            loss.backward()
            optimizer.step()

            loss_train += loss.item()

        print(f'Epoch: {epoch}, Loss = {loss_train:.2f}')

    """ TESTING """
    model.eval()
    dataset = BonzDataset(test_data[model_columns], None)
    dataloader = DataLoader(dataset, batch_size=40)

    # KFold without shuffling yields consecutive test blocks, so appending in
    # loop order keeps `last_predict` aligned with the rows of `new_data`.
    with torch.no_grad():
        for a, b, c in dataloader:
            predict = model(a.to(DEVICE), b.to(DEVICE))[0]
            last_predict.extend(predict.cpu().numpy().tolist())

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Load origin state dict succesfully!!!


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

In [54]:
# Persist the weights currently held by `model` (after the K-fold loop these are
# the weights of the last fold that was trained).
# NOTE(review): the file name says "lr1e5" while the visible K-fold cell trains
# with lr=2e-5 - confirm which value this checkpoint was produced with.
torch.save(model.state_dict(), './saved_state_dict/L-BERT_epoch5_lr1e5_padembed_10Fold.pth')

# EVALUATION STEP

In [57]:
# Disabled alternative evaluation: everything below is wrapped in a string
# literal, so none of it runs. It would score the whole shuffled frame with the
# current model instead of using the per-fold predictions.
"""
#sd = torch.load('./saved_state_dict/epoch5lr2e5.pth')
#model.load_state_dict(sd)
model.eval()

dataset = BonzDataset(new_data.iloc[:,-3:], None)
dataloader = DataLoader(dataset, batch_size=40)

last_predict = []
for idx, (a, b, c) in enumerate(tqdm.notebook.tqdm(dataloader)):
    with torch.no_grad():
        predict = model(a.to(DEVICE), b.to(DEVICE))[0]
    last_predict.extend(predict.detach().cpu().numpy().tolist())
"""

HBox(children=(FloatProgress(value=0.0, max=3528.0), HTML(value='')))




In [None]:
# Turn the collected logits into class probabilities along the class axis
# (dim 1) and take the most likely class for every aspect.
last_predict_ = torch.softmax(torch.tensor(last_predict), 1)
y_predict = torch.argmax(last_predict_, 1)
y_true = np.asarray(list(new_data.labels))

# Full per-class report for each aspect.
for i, asp in enumerate(ASPECT_NAMES):
    report = classification_report(y_true[:,i], y_predict[:,i])
    print(f'{asp}:\n{report}')


# Compact summary: accuracy and macro-averaged F1 per aspect.
for i, asp in enumerate(ASPECT_NAMES):
    accuracy = accuracy_score(y_true[:,i], y_predict[:,i])
    macro_f1 = f1_score(y_true[:,i], y_predict[:,i], average="macro")
    print(f'{asp}:\t{accuracy:.2f}\t{macro_f1:.2f}')

In [63]:
# Preview the first rows of the shuffled frame that the K-fold run was evaluated on.
new_data.head()

Unnamed: 0,ReviewText,Rating,Aspects,LEG,SIT,ENT,CUS,VOM,CLE,CKI,FNB,input_ids,labels,llr_embeddings
0,"When I am planning a trip, first thing I do is...",5,"['Legroom:5', 'Seat comfort:5', 'In-flight Ent...",1,1,1,1,1,1,1,1,"[101, 2043, 1045, 2572, 4041, 1037, 4440, 1010...","[1, 1, 1, 1, 1, 1, 1, 1]","[[3, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,..."
1,Recently flew to Tenerife with Jet2. The fligh...,4,"['Legroom:4', 'Seat comfort:4', 'Customer serv...",1,1,2,1,1,1,1,1,"[101, 3728, 5520, 2000, 2702, 11124, 7959, 200...","[1, 1, 2, 1, 1, 1, 1, 1]","[[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,..."
2,Once again easy airline to use especially if g...,5,"['Legroom:4', 'Seat comfort:5', 'In-flight Ent...",1,1,1,1,1,1,1,1,"[101, 2320, 2153, 3733, 8582, 2000, 2224, 2926...","[1, 1, 1, 1, 1, 1, 1, 1]","[[3, 3, 3, 3, 3, 3, 3, 3, 1, 3, 3, 3, 3, 3, 3,..."
3,A truly international airline. The crew repres...,4,"['Legroom:4', 'Seat comfort:4', 'In-flight Ent...",1,1,1,1,0,2,2,2,"[101, 1037, 5621, 2248, 8582, 1012, 1996, 3626...","[1, 1, 1, 1, 0, 2, 2, 2]","[[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 3,..."
4,"As always, they are efficient and clever. Boar...",5,"['Legroom:3', 'Seat comfort:3', 'In-flight Ent...",0,0,0,1,1,1,1,0,"[101, 2004, 2467, 1010, 2027, 2024, 8114, 1998...","[0, 0, 0, 1, 1, 1, 1, 0]","[[3, 3, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3, 1, 3, 3,..."


In [64]:
# Preview the first rows of the original (un-shuffled) frame for comparison.
data.head()

Unnamed: 0,ReviewText,Rating,Aspects,LEG,SIT,ENT,CUS,VOM,CLE,CKI,FNB,input_ids,labels,llr_embeddings
0,"So, I had this trip aligned for family leisure...",5,"['Legroom:4', 'Seat comfort:5', 'In-flight Ent...",1,1,1,1,1,1,1,1,"[101, 2061, 1010, 1045, 2018, 2023, 4440, 1311...","[1, 1, 1, 1, 1, 1, 1, 1]","[[3, 3, 3, 3, 1, 3, 3, 3, 1, 3, 3, 3, 3, 3, 3,..."
1,Refund agreed to months ago but basically been...,1,"['Legroom:1', 'Seat comfort:1', 'In-flight Ent...",0,0,0,0,0,0,0,0,"[101, 25416, 8630, 3530, 2000, 2706, 3283, 202...","[0, 0, 0, 0, 0, 0, 0, 0]","[[3, 3, 3, 3, 3, 3, 3, 1, 3, 3, 0, 3, 3, 1, 3,..."
2,"Flying to London on Singapore Airlines, we had...",4,"['Legroom:3', 'Seat comfort:4', 'In-flight Ent...",0,1,1,1,1,1,1,1,"[101, 3909, 2000, 2414, 2006, 5264, 7608, 1010...","[0, 1, 1, 1, 1, 1, 1, 1]","[[3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 3, 3, 3, 3, 3,..."
3,I thought we were making a safe choice booking...,1,['Customer service:1'],2,2,2,0,2,2,2,2,"[101, 1045, 2245, 2057, 2020, 2437, 1037, 3647...","[2, 2, 2, 0, 2, 2, 2, 2]","[[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,..."
4,Wonderful service on our trip out to New Zeala...,4,"['Legroom:5', 'Seat comfort:4', 'In-flight Ent...",1,1,1,1,1,1,1,1,"[101, 6919, 2326, 2006, 2256, 4440, 2041, 2000...","[1, 1, 1, 1, 1, 1, 1, 1]","[[3, 3, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,..."
