In [1]:
import torch
import numpy as np
import pandas as pd
import logging
import time
import torch.nn as nn
import tensorflow as tf 

from tensorflow import keras
from tensorflow.keras import layers
from transformers import BertConfig, BertForSequenceClassification, BertTokenizer, BertTokenizerFast, BertModel, AdamW, TFBertModel
from transformers.modeling_bert import BertEmbeddings, BertSelfAttention
from torch.utils.data import Dataset, DataLoader
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report, accuracy_score, f1_score
from tqdm import trange, tqdm, tqdm_notebook, tqdm_pandas, tqdm_gui

# Only let ERROR-level (and above) messages through from the tokenizer utilities logger.
logging.getLogger("transformers.tokenization_utils").setLevel(logging.ERROR)


# Checkpoint name used in this file for both the tokenizer and the pre-trained weights.
PRE_TRAINED = 'bert-base-uncased'
# Run on the GPU when torch reports one, otherwise on the CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'


class MyDataset(Dataset):
    """Map-style dataset that turns sentences into fixed-length token-id tensors.

    Every sentence is encoded with the ``PRE_TRAINED`` tokenizer and then
    padded (with 0) or truncated to ``MAX_LEN`` ids.

    Args:
        x: Indexable collection of sentences (e.g. a 1-D numpy array or a list).
        y: Optional indexable collection of numeric labels aligned with ``x``.
            When omitted, items are returned without a label.
    """

    # Fixed length every encoded sentence is padded/truncated to.
    MAX_LEN = 512

    def __init__(self, x, y=None):
        super(MyDataset, self).__init__()
        self.x = x
        self.y = y
        self.tokenizer = BertTokenizerFast.from_pretrained(PRE_TRAINED)

    def __getitem__(self, i):
        """Return the ids of sentence ``i`` and, when labels exist, its label.

        Returns:
            ``LongTensor`` of ``MAX_LEN`` ids when ``y`` is None, otherwise a
            ``(LongTensor ids, FloatTensor [label])`` pair.
        """
        encoded = self.tokenizer.encode(self.x[i])
        # Truncate at the end (the Keras default is the front) so that the
        # start of the sequence survives: the classifier in this file reads
        # position 0 of the encoder output.
        padded = pad_sequences(
            [encoded],
            maxlen=self.MAX_LEN,
            padding='post',
            truncating='post',
        )
        # Token ids are embedding indices, so they must be integer typed in
        # both the labelled and the unlabelled case.
        input_ids = torch.LongTensor(padded[0])
        if self.y is None:
            return input_ids
        return input_ids, torch.FloatTensor([self.y[i]])

    def __len__(self):
        """Return the number of sentences."""
        # len() works for lists as well as numpy arrays (``.size`` does not).
        return len(self.x)
    

class BertBonz(BertModel):
    """BERT encoder with an extra ``llr`` embedding table and a sigmoid head.

    The embedding output is the sum of word, position, token-type and ``llr``
    embeddings. The encoder output at position 0 is fed through a linear layer
    and a sigmoid to produce one score per sequence.

    Typical use: ``load_pretrained_weight()`` -> ``fit(optimizer, loss)`` ->
    ``train_(inputs, labels, epochs, batch_size)``.

    Args:
        config: ``BertConfig`` describing the encoder; layer sizes are taken
            from ``config.hidden_size`` and ``config.num_hidden_layers``.
    """

    def __init__(self, config):
        super(BertBonz, self).__init__(config)
        self.config = config
        # Sized from the config instead of hard-coding bert-base's 768.
        self.embeddings.add_module(
            'llr_embeddings', nn.Embedding(3, config.hidden_size, 0)
        )
        self.classifier = nn.Linear(config.hidden_size, 1)
        self.activation = nn.Sigmoid()
        # Both are set by fit(); declaring them here lets forward()/train_()
        # report a clear error instead of an opaque AttributeError.
        self.optimizer = None
        self.loss_fn = None
        self.init_weights()

    def forward(self,
                input_ids=None,
                llr_ids=None,
                labels=None,
                token_type_ids=None,
                position_ids=None,
                attention_mask=None):
        """Score a batch of token-id sequences.

        Args:
            input_ids: Integer tensor indexed as ``(batch, seq)``. Required.
            llr_ids: Optional ids into the ``llr`` embedding table; zeros when
                omitted.
            labels: Optional targets passed to the loss set by ``fit()``.
            token_type_ids: Optional segment ids; zeros when omitted.
            position_ids: Optional position ids; ``0..seq-1`` when omitted.
            attention_mask: Optional ``(batch, seq)`` mask, non-zero for
                tokens that may be attended to. When omitted it is derived
                from ``input_ids`` by masking out the padding id.

        Returns:
            ``(predict, loss, CLS_token, sequence_output) + encoder_outputs[1:]``
            when ``labels`` is given, otherwise the same tuple without
            ``loss``.

        Raises:
            ValueError: If ``input_ids`` is missing.
            RuntimeError: If ``labels`` is given before ``fit()`` set a loss.
        """
        if input_ids is None:
            raise ValueError('input_ids is required')

        # BERT EMBEDDINGS NEW
        input_shape = input_ids.size()
        seq_length = input_shape[1]
        device = input_ids.device

        if position_ids is None:
            position_ids = torch.arange(seq_length, dtype=torch.long, device=device)
            position_ids = position_ids.unsqueeze(0).expand(input_shape)
        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
        if llr_ids is None:
            llr_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
        if attention_mask is None:
            # Without a mask every token would attend to the padding that
            # fills the sequence up to its fixed length.
            pad_id = getattr(self.config, 'pad_token_id', None)
            attention_mask = input_ids != (0 if pad_id is None else pad_id)

        inputs_embeds = self.embeddings.word_embeddings(input_ids)
        position_embeddings = self.embeddings.position_embeddings(position_ids)
        token_type_embeddings = self.embeddings.token_type_embeddings(token_type_ids)
        llr_embeddings = self.embeddings.llr_embeddings(llr_ids)

        embeddings = inputs_embeds + position_embeddings + token_type_embeddings + llr_embeddings
        embeddings = self.embeddings.LayerNorm(embeddings)
        embeddings = self.embeddings.dropout(embeddings)

        # Additive mask broadcastable over heads and query positions:
        # 0 where attention is allowed, a large negative value where it is not.
        extended_mask = attention_mask[:, None, None, :].to(dtype=embeddings.dtype)
        extended_mask = (1.0 - extended_mask) * -10000.0

        # BERT ENCODER
        encoder_outputs = self.encoder(
            embeddings,
            attention_mask=extended_mask,
            head_mask=[None] * self.config.num_hidden_layers,
            encoder_hidden_states=None,
            encoder_attention_mask=None,
            output_attentions=self.config.output_attentions
        )
        sequence_output = encoder_outputs[0]

        # CLASSIFIER
        CLS_token = sequence_output[:, 0]
        predict = self.activation(self.classifier(CLS_token))

        if labels is None:
            return (predict, CLS_token, sequence_output) + encoder_outputs[1:]

        if self.loss_fn is None:
            raise RuntimeError('Call fit() to set a loss before passing labels')
        loss = self.loss_fn(predict, labels)
        # add hidden_states and attentions if they are here
        return (predict, loss, CLS_token, sequence_output) + encoder_outputs[1:]

    def load_pretrained_weight(self):
        """Copy every ``PRE_TRAINED`` BertModel tensor whose key exists in this model.

        Keys that only exist here (the ``llr`` embeddings and the classifier)
        keep their current values.
        """
        sd = self.state_dict()
        sd_bert_pretrained = BertModel.from_pretrained(PRE_TRAINED).state_dict()
        for key, tensor in sd_bert_pretrained.items():
            if key in sd:
                sd[key] = tensor
        self.load_state_dict(sd)
        print('Succesfully load pre-trained weights')

    def fit(self,
            optimizer=None,
            loss=None):
        """Attach the optimizer and loss used by ``train_()`` and ``forward()``.

        Args:
            optimizer: Optimizer class/factory called as
                ``optimizer(self.parameters(), 2e-5)``.
            loss: Callable invoked as ``loss(predict, labels)``.
        """
        self.optimizer = optimizer(self.parameters(), 2e-5)
        self.loss_fn = loss

    def train_(self,
               inputs=None,
               labels=None,
               epochs=None,
               batch_size=None):
        """Train on ``inputs``/``labels`` for ``epochs`` passes over the data.

        Prints the predictions of every batch and, per epoch, the summed
        batch loss and the elapsed time.

        Args:
            inputs: Sentences, forwarded to ``MyDataset``.
            labels: Labels aligned with ``inputs``.
            epochs: Number of passes over the data.
            batch_size: Batch size handed to the ``DataLoader``.

        Raises:
            RuntimeError: If ``fit()`` has not been called first.
        """
        if self.optimizer is None or self.loss_fn is None:
            raise RuntimeError('Call fit() before train_()')

        self.to(DEVICE)
        self.train()

        # Built once: constructing MyDataset loads the tokenizer, and a
        # shuffling DataLoader already reshuffles each time it is iterated.
        dataloader = DataLoader(
            MyDataset(inputs, labels), batch_size=batch_size, shuffle=True
        )

        for epoch in range(epochs):
            s = time.time()
            loss_train = 0
            for x, y in dataloader:
                self.optimizer.zero_grad()
                outputs = self(input_ids=x.to(DEVICE), labels=y.to(DEVICE))
                loss = outputs[1]
                loss.backward()
                self.optimizer.step()
                loss_train += loss.item()

                predict = outputs[0]
                print(predict.detach().cpu().numpy().squeeze(-1).tolist())
            print(f'Finish epoch {epoch+1}, loss = {loss_train:.2f}, running time {time.time()-s:.2f}')

    
    
def get_data(path='data.csv'):
    """Load the tab-separated review file and add two derived columns.

    Malformed rows are skipped (with a warning) instead of aborting the load.

    Args:
        path: Location of the tab-separated file. Defaults to ``'data.csv'``.

    Returns:
        The loaded frame with two extra columns:
        ``sentiment`` is 1 when the star rating is greater than 3, else 0;
        ``num_words`` is the number of space-separated pieces in the review
        text, or 0 when the text is missing.
    """
    read_kwargs = dict(sep='\t', encoding='utf8')
    try:
        # Current pandas spelling; 'warn' keeps the old behaviour of skipping
        # bad rows while still reporting them.
        data = pd.read_csv(path, on_bad_lines='warn', **read_kwargs)
    except TypeError:
        # Older pandas releases only know the legacy keyword.
        data = pd.read_csv(path, error_bad_lines=False, **read_kwargs)

    data['sentiment'] = [
        1 if rating > 3 else 0
        for rating in data["Review's Star Rating"].values
    ]
    # An empty text cell is read as NaN (a float), which has no .split().
    data['num_words'] = [
        len(content.split(' ')) if isinstance(content, str) else 0
        for content in data["Review's Content"].values
    ]

    return data


def train(model=None, epochs=None):
    """Fine-tune ``model`` on the global ``data`` frame, then print a report.

    Reads the review text and ``sentiment`` columns of the module-level
    ``data`` frame, trains with AdamW (lr 1e-5, batch size 4) and finally
    prints a ``classification_report`` computed on the same data.

    NOTE(review): the model is called as ``model(x=..., y=...)`` (first item
    of the result used as the loss) and ``model(x=...)`` (result used as a
    tensor of scores). That does not match ``BertBonz.forward`` in this file,
    which takes ``input_ids``/``labels`` -- presumably this helper targets a
    different model; confirm before use.

    Args:
        model: Module following the call convention described above.
        epochs: Number of passes over the data.

    Returns:
        The trained model, left in eval mode.
    """
    optimizer = AdamW(model.parameters(), lr=1e-5)
    model.train()
    model.to(DEVICE)

    # Built once, before the loop: constructing MyDataset loads the tokenizer,
    # a shuffling DataLoader reshuffles on every iteration anyway, and the
    # evaluation below needs the loader even when ``epochs`` is 0.
    my_dataset = MyDataset(x=data['Review\'s Content'].values, y=data.sentiment.values)
    dataloader = DataLoader(my_dataset, batch_size=4, shuffle=True)

    for epoch in range(epochs):
        s = time.time()
        for x, y in dataloader:
            optimizer.zero_grad()
            loss = model(x=x.to(DEVICE), y=y.to(DEVICE))[0]
            loss.backward()
            optimizer.step()
        print('Finish epoch {}, running time {}'.format(epoch+1, time.time()-s))

    model.eval()
    predicts = []
    y_true = []
    for x, y in dataloader:
        with torch.no_grad():
            predict = model(x=x.to(DEVICE))
        predict = predict.detach().cpu().numpy()
        # Scores above 0.5 count as the positive class.
        predict = predict > 0.5
        predicts.extend(predict.tolist())
        y_true.extend(y.numpy().tolist())

    print(classification_report(y_true, predicts))
    return model

In [23]:
# Load the reviews, shuffle the rows and keep only the first two.
data = get_data()
data = data.sample(frac=1).reset_index(drop=True).iloc[:2]
config = BertConfig(num_labels=1)

# Build the model, copy in the pre-trained weights, then attach AdamW and a BCE loss.
model = BertBonz(config)
model.load_pretrained_weight()
model.fit(optimizer=AdamW, loss=nn.BCELoss())

Succesfully load pre-trained weights


In [24]:
# Train for three epochs on the review text with the binary sentiment labels.
model.train_(inputs=data['Review\'s Content'].values, labels=data.sentiment.values, epochs=3, batch_size=5)

[0.5155052542686462, 0.4985799789428711]
Finish epoch 1, loss = 0.68, running time 0.29
[0.8045018315315247, 0.7413556575775146]
Finish epoch 2, loss = 0.26, running time 0.29
[0.8913280367851257, 0.9040607810020447]
Finish epoch 3, loss = 0.11, running time 0.30


In [18]:
# Column names for the airline review file, which is tab-separated and read without a header row.
col_names = ['TopNumber', 'AirlineName','ReviewerName','Rating','ReviewDate','ReviewTitle',\
             'ReviewText','Tags', 'DateofTravel', 'Aspects', 'ResponserName', 'ResponseDate', 'ResponseText', 'ReviewerProfileUrl',\
             'AirlineUrl','CrawlTime']
raw_data = pd.read_csv('./data/airline.txt', sep='\t', header=None, names=col_names)

In [19]:
# Keep only the text, rating and aspects columns.
data = raw_data[['ReviewText', 'Rating', 'Aspects']]
# Count the ratings that equal the string 'No filling in'.
sum(data.Rating == 'No filling in')

  res_values = method(rvalues)


0

In [35]:
# Drop the rows whose Aspects value is 'No filling in'.
aspects = data.Aspects[data.Aspects != 'No filling in']
# Print only the first remaining entry, split on '|'.
for i in aspects:
    print(i.split('|'))
    break

['Legroom:4', 'Seat comfort:5', 'In-flight Entertainment:5', 'Customer service:5', 'Value for money:5', 'Cleanliness:5', 'Check-in and boarding:5', 'Food and Beverage:5']


In [6]:
# Load the fast tokenizer for the PRE_TRAINED checkpoint and show its vocabulary size.
t = BertTokenizerFast.from_pretrained(PRE_TRAINED)
len(t.get_vocab())

30522

In [11]:
# Encode one sample sentence with the tokenizer.
a = 'Today is a good day submission'
encoded = t.encode(a)

In [2]:
# Reload the review frame; this rebinds the notebook-global ``data``.
data = get_data()

b'Skipping line 252160: expected 12 fields, saw 13\n'


In [31]:
# Time one batched tokenizer call over the first 100000 review texts (seconds).
s = time.time()
b = t.batch_encode_plus(data.ReviewText.values[:100000].tolist())
time.time() - s

7.3368353843688965

In [28]:
# Time 1000 separate encode() calls, one review text per call (seconds).
s = time.time()
for i in range(1000):
    # The one-element slice followed by [0] selects the review at index i.
    t.encode(data.ReviewText.values[i:i+1][0])
time.time() - s

0.35000133514404297

In [21]:
# Display the current ``data`` frame.
data

Unnamed: 0,ReviewText,Rating,Aspects
0,"So, I had this trip aligned for family leisure...",5,Legroom:4|Seat comfort:5|In-flight Entertainme...
1,Refund agreed to months ago but basically been...,1,Legroom:1|Seat comfort:1|In-flight Entertainme...
2,"Flying to London on Singapore Airlines, we had...",4,Legroom:3|Seat comfort:4|In-flight Entertainme...
3,This was the return leg of a weekend trip to M...,3,No filling in
4,I had booked flights to travel with my family ...,4,No filling in
...,...,...,...
175843,ANA is partnered with Air Canada for their fli...,4,Legroom:4|Seat comfort:4|In-flight Entertainme...
175844,This is my first time flying with ANA. Overall...,5,Legroom:4|Seat comfort:4|In-flight Entertainme...
175845,"Excellent Airline to fly with, nice staff and ...",4,Legroom:5|Seat comfort:5|In-flight Entertainme...
175846,We traveled on ANA with our 1 year old baby. I...,5,Legroom:5|Seat comfort:4|In-flight Entertainme...
