In [2]:
import tensorflow as tf
import tensorflow_hub as hub
import pandas as pd
import numpy as np
import torch
import transformers
import inspect
import time
import logging
import tmunlp as nlp
import random

from tqdm import trange, tqdm, tqdm_notebook, tqdm_pandas, tqdm_gui
from datetime import datetime
from tqdm import tqdm
from transformers import BertConfig, BertModel, BertTokenizer, BertForSequenceClassification, AdamW, BertPreTrainedModel
from transformers import get_constant_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report, accuracy_score, f1_score
from torch import nn
from torch.nn import MSELoss, CrossEntropyLoss

FOLDER_PATH = './dataset/new/'

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)

'GeForce RTX 2080 Ti'

# 1. Load data & pre-processing

In [7]:
def preprocessing(df):
    df.polarity = [label_dict[i] for i in df.polarity]
    return df

def shuffle(df):
    index = [i for i in range(df.shape[0])]
    random.shuffle(index)
    df = df.set_index([index]).sort_index()
    return df


train = pd.read_csv(FOLDER_PATH+'train.txt', sep='\t', names=['polarity','sentence'])
test = pd.read_csv(FOLDER_PATH+'test.txt', sep='\t', names=['polarity','sentence'])

train = shuffle(train)
test = shuffle(test)

label_list = sorted(list(set(train.polarity)))
label_dict = {}
for i in range(len(label_list)):
    label_dict[label_list[i]] = i

train = preprocessing(train)
test = preprocessing(test)
'''
def combine(df):
    df.sentence = [df.title[i]+' '+str(df.sentence[i]) for i in range(df.shape[0])]
    return df

train = pd.read_csv('./dataset/reader_emotion/train.txt', sep='\t', names=['polarity', 'title', 'sentence'], header=None)
#train = train.append([train]*2, ignore_index=True)
test = pd.read_csv('./dataset/reader_emotion/test.txt', sep='\t', names=['polarity', 'title', 'sentence'], header=None)

train = shuffle(train)
test = shuffle(test)

label_list = sorted(list(set(train.polarity)))
label_dict = {}
for i in range(len(label_list)):
    label_dict[label_list[i]] = i

train = preprocessing(train)
test = preprocessing(test)

train = combine(train)
test = combine(test)
'''

print('Label dict:\n{}\n Train data:\n{}\n\n Test data:\n{}'.format(label_dict, train.head(5), test.head(5)))

Label dict:
{'edu': 0, 'health': 1, 'politics': 2, 'sports': 3, 'tech': 4, 'travel': 5}
 Train data:
   polarity                                           sentence
0         1  半數癌症 疑是不良習慣導致 新頭殼newtalk 2011.12.07 徐千雅/綜合報導一項...
1         3  MLB／葛蘭基若跳脫合約　身價可望再攀新高 記者吳婷雯／綜合報導今年季末花錢補強不手軟的洛杉...
2         5  苑裡油菜花季 花田玩藝文 【聯合報╱記者祁容玉╱苑裡報導】 苑裡首度舉辦「稻苑裡賞花趣」油菜...
3         4  營運不佳 諾基亞分割壓力大增 不到2個月前，諾基亞執行長埃洛普(Stephen Flop)在...
4         5  苗栗╱拱天宮香客大樓 可遠眺好望角海岸 【聯合報╱記者祁容玉╱通霄報導】 頂樓可遠眺後龍好望...

 Test data:
   polarity                                           sentence
0         3  火箭遭蜂螫 苦吞6連敗 （路透加州聖克拉拉19日電）美國職籃NBA西區球隊休士頓火箭（Roc...
1         4  奧運商機 中國電信推手機遊戲 （中央社記者鄭崇生上海26日電）倫敦奧運在大陸手機遊戲市場也有...
2         4  超級月亮 台灣白天無緣見 當晚好天氣才有機會賞月〔自由時報記者林嘉琪／台北報導〕今年首波梅雨...
3         5  網紋陶片出土 疑牛罵頭遺址   台中市日前在安和重劃區內，發現大批陶片，研判可能是牛罵頭文化...
4         3  國民隊成敗 就看王建民肩膀 記者陳浚錡／綜合報導華盛頓國民隊今年先發投手陣容齊全，堪稱本季全...


# 2. Build BERT model

In [8]:
NUM_LABELS = len(label_list)

class BertForSequenceClassificationBonz(BertPreTrainedModel):
   
    def __init__(self, config):
        super(BertForSequenceClassificationBonz, self).__init__(config)
        self.num_labels = config.num_labels

        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)

        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        weights=None
    ):

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds
        )

        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here

        if labels is not None:
            if self.num_labels == 1:
                #  We are doing regression
                loss_fct = MSELoss()
                loss = loss_fct(logits.view(-1), labels.view(-1))
            else:
                loss_fct = CrossEntropyLoss(weight=weights)
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            outputs = (loss,) + outputs

        return outputs  # (loss), logits, (hidden_states), (attentions)

class BertModelBonz():
    def __init__(self, model='bert-base-chinese', batch_size=6, num_labels=2):
        self.pre_trained_model = model
        self.num_labels = num_labels
        self.config = BertConfig.from_pretrained(self.pre_trained_model, output_hidden_states=True, num_labels=self.num_labels)
        
        self.batch_size = batch_size
        self.tokenizer = BertTokenizer.from_pretrained(self.pre_trained_model)
        
        #self.model = BertForSequenceClassification(config=self.config)
        self.model = BertForSequenceClassificationBonz.from_pretrained(self.pre_trained_model, config=self.config)
        self.max_len = self.model.config.max_position_embeddings
        self.optimizer = AdamW(params = self.model.parameters(), lr=1e-5)
        
        self.train_loss = []
        self.train_accuracy = []
        self.test_f1 = []
        self.test_accuracy = []
    
    def create_ids(self, sentences):
        logging.getLogger("transformers.tokenization_utils").setLevel(logging.ERROR) #Disable tokenizer logs, it's really annoy
        input_ids = []
        for sen in tqdm(sentences, desc="Create Ids"):
            tmp = self.tokenizer.encode(sen)
            input_ids.append(tmp)
        input_ids = pad_sequences(input_ids, 
                                  maxlen=self.max_len, 
                                  dtype='int64', 
                                  truncating='post', 
                                  padding='post')
        return input_ids
    
    def prepare_data(self, input_ids, input_labels=None):
        input_ids = torch.tensor(self.create_ids(input_ids))
        if input_labels is None:
            return DataLoader(TensorDataset(input_ids), 
                              batch_size=self.batch_size,
                              shuffle=True
                             )
        else:
            input_labels = torch.tensor(input_labels)
            return DataLoader(TensorDataset(input_ids, input_labels), 
                              batch_size=self.batch_size
                             )
    
    def train(self, train_dataloader, test_dataloader=None, weights=None, epochs=4):
        self.model.to(device)
        for i in trange(epochs, desc="Epoch"):
            # Training model
            self.model.train()
            tr_loss = []
            
            for input_ids, input_labels in tqdm_notebook(train_dataloader, desc='Training'):
                self.optimizer.zero_grad()
                loss = self.model(input_ids=input_ids.cuda(), labels=input_labels.cuda(), weights=weights.cuda())[0] 
                loss.backward()
                self.optimizer.step()
                
                tr_loss.append(loss.item())
            
            loss_score = sum(tr_loss)/len(tr_loss)
            self.train_loss.append(loss_score)

            # Evaluation
            self.model.eval()
            predictions = []
            labels = []
            
            for input_ids, input_labels in tqdm_notebook(train_dataloader, desc='Evaluating'):
                with torch.no_grad():
                    logits = self.model(input_ids=input_ids.cuda())[0]
                logits = logits.detach().cpu().numpy()
                predictions.extend(logits)
                labels.extend(input_labels)
            
            predictions = np.argmax(predictions, axis=1)
            acc_score = accuracy_score(labels, predictions)
            self.train_accuracy.append(acc_score)
            
            # Print result
            print('EPOCH', i)
            print('Train loss: ', loss_score)
            print('Train accuracy: ',acc_score)
            print(classification_report(labels, predictions, digits=4))
            
            #Save model for each epoch:
            filename = 'bert_512_epoch'+str(i)+'.sd'
            filepath = FOLDER_PATH+filename
            torch.save(self.model.state_dict(), filepath)
            
            if test_dataloader is not None:
                # Predict test data
                self.model.eval()
                predictions = []

                for input_ids, input_labels in tqdm_notebook(test_dataloader, desc="Predicting"):
                    with torch.no_grad():
                        logits = self.model(input_ids=input_ids.cuda())[0] #This is for generate predict only
                    logits = logits.detach().cpu().numpy()
                    predictions.extend(logits)

                predictions = np.argmax(predictions, axis=1)

                self.test_f1.append(f1_score(test.polarity, predictions, average='macro'))
                self.test_accuracy.append(accuracy_score(test.polarity, predictions))
                
                print(classification_report(test.polarity, predictions, digits=4))
            
    def generate_cls_vectors(self, dataloader):
        self.model.to('cuda')
        self.model.eval()
        cls_vectors = []
        for input_ids, input_labels in tqdm_notebook(dataloader):
            with torch.no_grad():
                outputs = self.model(input_ids.cuda())
                last_hidden_layer = outputs[1][12]
                cls_vector = last_hidden_layer[:,0,:]
            cls_vector = cls_vector.detach().cpu().numpy()
            cls_vectors.extend(cls_vector)
        return cls_vectors
                
   
        
        
#Create model
bert_model = BertModelBonz(model='bert-base-chinese', batch_size=6, num_labels=NUM_LABELS)
bert_model.model.config

{
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 6,
  "output_attentions": false,
  "output_hidden_states": true,
  "output_past": true,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 2,
  "use_bfloat16": false,
  "vocab_size": 21128
}

## 2.1. Create train & test dataloader

In [9]:
train_dataloader = bert_model.prepare_data(input_ids=train.sentence[:6], input_labels=train.polarity[:6])
test_dataloader = bert_model.prepare_data(input_ids=test.sentence[:6], input_labels=test.polarity[:6])

Create Ids: 100%|███████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 154.26it/s]
Create Ids: 100%|███████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 240.64it/s]


## 2.2. Train model & Predict data

In [11]:
#Train model

bert_model.train(train_dataloader, 
                 test_dataloader,
                 weights=torch.tensor([4,4,1,1,1,2], dtype=torch.float),
                 epochs=4)




Epoch:   0%|                                                                                     | 0/4 [00:00<?, ?it/s]

HBox(children=(IntProgress(value=0, description='Training', max=1, style=ProgressStyle(description_width='init…







RuntimeError: Expected object of device type cuda but got device type cpu for argument #3 'weight' in call to _thnn_nll_loss_forward

In [23]:
print(bert_model.train_loss, '\n', bert_model.train_accuracy, '\n', bert_model.test_f1, '\n', bert_model.test_accuracy)

[1.8303304069182451, 1.5800249225953047, 1.4178658303092508, 1.1401410874198465, 0.32545239252850144, 0.21437513451076115, 0.1526120551652296, 0.11626514505056765] 
 [0.39, 0.55, 0.76, 0.92, 0.9266406692664066, 0.9459554044595541, 0.96000399960004, 0.9724027597240276] 
 [0.711570464874406] 
 [0.7460433312845728]


In [13]:
test_dataloader = bert_model.prepare_data(input_ids=test.sentence, input_labels=test.polarity)

Create Ids: 100%|███████████████████████████████████████████████████████████████| 78096/78096 [06:54<00:00, 198.50it/s]


In [14]:
bert_model.model.eval()
predictions = []

for input_ids, input_labels in tqdm_notebook(test_dataloader, desc="Predicting"):
    with torch.no_grad():
        logits = bert_model.model(input_ids=input_ids.cuda())[0] #This is for generate predict only
    logits = logits.detach().cpu().numpy()
    predictions.extend(logits)

predictions = np.argmax(predictions, axis=1)

bert_model.test_f1.append(f1_score(test.polarity, predictions, average='macro'))
bert_model.test_accuracy.append(accuracy_score(test.polarity, predictions))

print(classification_report(test.polarity, predictions, digits=4))

HBox(children=(IntProgress(value=0, description='Predicting', max=13016, style=ProgressStyle(description_width…


              precision    recall  f1-score   support

           0     0.2989    0.8794    0.4461      5023
           1     0.7195    0.9764    0.8285      5844
           2     0.8041    0.8508    0.8268     19023
           3     0.9270    0.8802    0.9030     18919
           4     0.8774    0.6247    0.7298     17031
           5     0.9019    0.3805    0.5352     12256

   micro avg     0.7460    0.7460    0.7460     78096
   macro avg     0.7548    0.7653    0.7116     78096
weighted avg     0.8264    0.7460    0.7540     78096



## 2.3. Generate CLS vectors

In [8]:
# Load state dict from trained model
#filepath = './dataset/reader_emotion/bert512_epoch3.sd'
#bert_model.model.load_state_dict(torch.load(filepath))

cls_vectors = bert_model.generate_cls_vectors(train_dataloader)
cls_vectors_test = bert_model.generate_cls_vectors(test_dataloader)

HBox(children=(IntProgress(value=0, max=1946), HTML(value='')))




HBox(children=(IntProgress(value=0, max=5934), HTML(value='')))




# 3. Feature Extraction with TMUNLP

## 3.1. Build negative & positive vectors

In [11]:
label_list = ['0', '1']

result = nlp.get_label_term_weighting('./dataset/imdb/tmunlp_file.txt', label_list)
VECTOR_LEN = 70

#Create negative word list
negative_list =  nlp.get_keyword('0', result, VECTOR_LEN)
temp_max = negative_list[list(negative_list.keys())[0]]
for i in negative_list.keys():
    negative_list[i] = negative_list[i] #/ temp_max


#Create postive word list
positive_list =  nlp.get_keyword('1', result, VECTOR_LEN)
temp_max = positive_list[list(positive_list.keys())[0]]
for i in positive_list.keys():
    positive_list[i] = positive_list[i] #/ temp_max



def embedding_1hot(df, words):
    arr = []
    for i in range(df.shape[0]):
        temp = [words[word] if word in df.sentence[i] else 0 for word in words.keys()]
        arr.append(temp)
    return arr

train['negative_embedding'] = embedding_1hot(train, negative_list)
train['positive_embedding'] = embedding_1hot(train, positive_list)

test['negative_embedding'] = embedding_1hot(test, negative_list)
test['positive_embedding'] = embedding_1hot(test, positive_list)

## 3.2. Build bert_llr model

In [12]:
class BertLLR(nn.Module):
    def __init__(self, VECTOR_LEN):
        super(BertLLR, self).__init__()
        self.bert = nn.Linear(768,768)
        self.bert_activation = nn.Tanh()
        
        self.llr = nn.Linear(2*VECTOR_LEN, 2*VECTOR_LEN)
        self.llr_activation = nn.Tanh()
        
        self.dropout = nn.Dropout(0.1)
        
        self.classifier = nn.Linear(768+2*VECTOR_LEN, 2)
    
    def forward(self, cls_vectors=None, neg_embed=None, pos_embed=None, labels=None):
        tanh_cls_vectors = self.bert_activation(self.bert(cls_vectors))
        
        llr_vectors = torch.cat([neg_embed, pos_embed], dim=1)
        tanh_llr_vectors = self.llr_activation(self.llr(llr_vectors))
        
        concat_vectors = torch.cat([tanh_cls_vectors, tanh_llr_vectors], dim=1)
        concat_vectors = self.dropout(concat_vectors)
        
        logits = self.classifier(concat_vectors)
        
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, 2), labels.view(-1))
            outputs = (logits, loss)
        else:
            outputs = (logits,)
        
        return outputs
    
bert_llr_model = BertLLR(VECTOR_LEN)
bert_llr_optimizer = torch.optim.Adam(params = bert_llr_model.parameters(), 
                                      lr = 1e-5
                                     )


bert_llr_model   

BertLLR(
  (bert): Linear(in_features=768, out_features=768, bias=True)
  (bert_activation): Tanh()
  (llr): Linear(in_features=140, out_features=140, bias=True)
  (llr_activation): Tanh()
  (dropout): Dropout(p=0.1, inplace=False)
  (classifier): Linear(in_features=908, out_features=2, bias=True)
)

## 3.3. Train bert_llr model

In [13]:
dataloader = DataLoader(TensorDataset(torch.tensor(cls_vectors, dtype=torch.float),
                                      torch.tensor(train.negative_embedding, dtype=torch.float),
                                      torch.tensor(train.positive_embedding, dtype=torch.float),
                                      torch.tensor(train.polarity)
                                     ),
                        batch_size=16
                       )

dataloader_test = DataLoader(TensorDataset(torch.tensor(cls_vectors_test, dtype=torch.float),
                                          torch.tensor(test.negative_embedding, dtype=torch.float),
                                          torch.tensor(test.positive_embedding, dtype=torch.float),
                                          torch.tensor(test.polarity)
                                         ),
                            batch_size=16
                           )


bert_llr_model.to('cuda')
train_loss = []
train_accuracy = []
test_f1 = []
test_accuracy = []
for _ in range(10):
    # Training
    bert_llr_model.train()
    tr_loss = []
    for cls_vector, neg_embed, pos_embed, labels in tqdm(dataloader, desc='Training'):
        #print(labels)
        bert_llr_optimizer.zero_grad()
        logits, loss = bert_llr_model(cls_vectors = cls_vector.cuda(), 
                                      neg_embed = neg_embed.cuda(), 
                                      pos_embed = pos_embed.cuda(), 
                                      labels = labels.cuda())
        loss.backward()
        bert_llr_optimizer.step()
        tr_loss.append(loss.item())
    loss_score = sum(tr_loss)/len(tr_loss)
    train_loss.append(loss_score)
    
    # Evaluation
    bert_llr_model.eval()
    predictions = []
    for cls_vector, neg_embed, pos_embed, labels in tqdm(dataloader, desc='Predicting'):
        with torch.no_grad():
            logits = bert_llr_model(cls_vectors = cls_vector.cuda(),
                                    neg_embed = neg_embed.cuda(),
                                    pos_embed = pos_embed.cuda())[0] #This is for generate predict only
        logits = logits.detach().cpu().numpy()
        predictions.append(logits)
    predictions = [j for i in predictions for j in i]
    predictions = np.argmax(predictions, axis=1)
    acc_score = accuracy_score(train.polarity, predictions)
    train_accuracy.append(acc_score)
    
    # Predict data
    predictions = []
    for cls_vector, neg_embed, pos_embed, labels in tqdm(dataloader_test, desc='Predicting'):
        with torch.no_grad():
            logits = bert_llr_model(cls_vectors = cls_vector.cuda(),
                                    neg_embed = neg_embed.cuda(),
                                    pos_embed = pos_embed.cuda())[0] #This is for generate predict only
        logits = logits.detach().cpu().numpy()
        predictions.append(logits)
    preds = [j for i in predictions for j in i]
    preds = np.argmax(preds, axis=1)
    test_f1.append(f1_score(test.polarity, preds, average='macro'))
    test_accuracy.append(accuracy_score(test.polarity, preds))
    
    # Print result
    print('Train loss: ', loss_score)
    print('Train accuracy: ',acc_score)
    print(classification_report(test.polarity, preds, digits=4))
        

Training: 100%|███████████████████████████████████████████| 1563/1563 [00:10<00:00, 146.91it/s]
Predicting: 100%|█████████████████████████████████████████| 1563/1563 [00:01<00:00, 891.17it/s]
Predicting: 100%|█████████████████████████████████████████| 1563/1563 [00:01<00:00, 809.83it/s]


Train loss:  0.03237039205795172
Train accuracy:  0.99588
              precision    recall  f1-score   support

           0     0.9365    0.9309    0.9337     12500
           1     0.9313    0.9369    0.9341     12500

   micro avg     0.9339    0.9339    0.9339     25000
   macro avg     0.9339    0.9339    0.9339     25000
weighted avg     0.9339    0.9339    0.9339     25000



Training: 100%|███████████████████████████████████████████| 1563/1563 [00:10<00:00, 148.19it/s]
Predicting: 100%|█████████████████████████████████████████| 1563/1563 [00:01<00:00, 896.28it/s]
Predicting: 100%|█████████████████████████████████████████| 1563/1563 [00:01<00:00, 913.68it/s]


Train loss:  0.015601529313760238
Train accuracy:  0.99596
              precision    recall  f1-score   support

           0     0.9373    0.9307    0.9340     12500
           1     0.9312    0.9378    0.9345     12500

   micro avg     0.9342    0.9342    0.9342     25000
   macro avg     0.9343    0.9342    0.9342     25000
weighted avg     0.9343    0.9342    0.9342     25000



Training: 100%|███████████████████████████████████████████| 1563/1563 [00:10<00:00, 148.45it/s]
Predicting: 100%|█████████████████████████████████████████| 1563/1563 [00:01<00:00, 872.93it/s]
Predicting: 100%|█████████████████████████████████████████| 1563/1563 [00:01<00:00, 992.66it/s]


Train loss:  0.014916918573570952
Train accuracy:  0.99632
              precision    recall  f1-score   support

           0     0.9379    0.9304    0.9341     12500
           1     0.9310    0.9384    0.9347     12500

   micro avg     0.9344    0.9344    0.9344     25000
   macro avg     0.9344    0.9344    0.9344     25000
weighted avg     0.9344    0.9344    0.9344     25000



Training: 100%|███████████████████████████████████████████| 1563/1563 [00:10<00:00, 146.02it/s]
Predicting: 100%|█████████████████████████████████████████| 1563/1563 [00:01<00:00, 867.03it/s]
Predicting: 100%|█████████████████████████████████████████| 1563/1563 [00:01<00:00, 875.57it/s]


Train loss:  0.014337929889264201
Train accuracy:  0.99668
              precision    recall  f1-score   support

           0     0.9385    0.9307    0.9346     12500
           1     0.9313    0.9390    0.9351     12500

   micro avg     0.9348    0.9348    0.9348     25000
   macro avg     0.9349    0.9348    0.9348     25000
weighted avg     0.9349    0.9348    0.9348     25000



Training: 100%|███████████████████████████████████████████| 1563/1563 [00:10<00:00, 150.83it/s]
Predicting: 100%|█████████████████████████████████████████| 1563/1563 [00:01<00:00, 977.18it/s]
Predicting: 100%|█████████████████████████████████████████| 1563/1563 [00:01<00:00, 945.34it/s]


Train loss:  0.01392129101979374
Train accuracy:  0.99676
              precision    recall  f1-score   support

           0     0.9387    0.9308    0.9347     12500
           1     0.9314    0.9392    0.9353     12500

   micro avg     0.9350    0.9350    0.9350     25000
   macro avg     0.9350    0.9350    0.9350     25000
weighted avg     0.9350    0.9350    0.9350     25000



Training: 100%|███████████████████████████████████████████| 1563/1563 [00:11<00:00, 136.84it/s]
Predicting: 100%|█████████████████████████████████████████| 1563/1563 [00:01<00:00, 821.69it/s]
Predicting: 100%|█████████████████████████████████████████| 1563/1563 [00:01<00:00, 913.49it/s]


Train loss:  0.013439818499317859
Train accuracy:  0.9968
              precision    recall  f1-score   support

           0     0.9392    0.9306    0.9349     12500
           1     0.9313    0.9398    0.9355     12500

   micro avg     0.9352    0.9352    0.9352     25000
   macro avg     0.9352    0.9352    0.9352     25000
weighted avg     0.9352    0.9352    0.9352     25000



Training: 100%|███████████████████████████████████████████| 1563/1563 [00:10<00:00, 150.01it/s]
Predicting: 100%|█████████████████████████████████████████| 1563/1563 [00:01<00:00, 968.46it/s]
Predicting: 100%|████████████████████████████████████████| 1563/1563 [00:01<00:00, 1008.38it/s]


Train loss:  0.013197794522653957
Train accuracy:  0.99696
              precision    recall  f1-score   support

           0     0.9390    0.9309    0.9349     12500
           1     0.9315    0.9395    0.9355     12500

   micro avg     0.9352    0.9352    0.9352     25000
   macro avg     0.9352    0.9352    0.9352     25000
weighted avg     0.9352    0.9352    0.9352     25000



Training: 100%|███████████████████████████████████████████| 1563/1563 [00:11<00:00, 136.65it/s]
Predicting: 100%|█████████████████████████████████████████| 1563/1563 [00:01<00:00, 824.12it/s]
Predicting: 100%|█████████████████████████████████████████| 1563/1563 [00:01<00:00, 891.93it/s]


Train loss:  0.012920671569866319
Train accuracy:  0.99704
              precision    recall  f1-score   support

           0     0.9391    0.9307    0.9349     12500
           1     0.9313    0.9396    0.9354     12500

   micro avg     0.9352    0.9352    0.9352     25000
   macro avg     0.9352    0.9352    0.9352     25000
weighted avg     0.9352    0.9352    0.9352     25000



Training: 100%|███████████████████████████████████████████| 1563/1563 [00:10<00:00, 151.77it/s]
Predicting: 100%|█████████████████████████████████████████| 1563/1563 [00:01<00:00, 867.76it/s]
Predicting: 100%|█████████████████████████████████████████| 1563/1563 [00:01<00:00, 904.05it/s]


Train loss:  0.012620894917628358
Train accuracy:  0.9972
              precision    recall  f1-score   support

           0     0.9394    0.9306    0.9350     12500
           1     0.9313    0.9399    0.9356     12500

   micro avg     0.9353    0.9353    0.9353     25000
   macro avg     0.9353    0.9353    0.9353     25000
weighted avg     0.9353    0.9353    0.9353     25000



Training: 100%|███████████████████████████████████████████| 1563/1563 [00:10<00:00, 147.44it/s]
Predicting: 100%|█████████████████████████████████████████| 1563/1563 [00:01<00:00, 960.28it/s]
Predicting: 100%|█████████████████████████████████████████| 1563/1563 [00:01<00:00, 925.05it/s]


Train loss:  0.01231458126993341
Train accuracy:  0.9972
              precision    recall  f1-score   support

           0     0.9393    0.9304    0.9348     12500
           1     0.9311    0.9398    0.9354     12500

   micro avg     0.9351    0.9351    0.9351     25000
   macro avg     0.9352    0.9351    0.9351     25000
weighted avg     0.9352    0.9351    0.9351     25000



In [15]:
print(train_loss)
print(train_accuracy)
print(test_f1)
print(test_accuracy)

[0.03237039205795172, 0.015601529313760238, 0.014916918573570952, 0.014337929889264201, 0.01392129101979374, 0.013439818499317859, 0.013197794522653957, 0.012920671569866319, 0.012620894917628358, 0.01231458126993341]
[0.99588, 0.99596, 0.99632, 0.99668, 0.99676, 0.9968, 0.99696, 0.99704, 0.9972, 0.9972]
[0.9338794049146442, 0.9342391851972003, 0.9343989503832062, 0.9348388939293212, 0.9349988533797737, 0.9351986525467015, 0.9351987906539108, 0.935158721744977, 0.9352786065742882, 0.9351185545452054]
[0.93388, 0.93424, 0.9344, 0.93484, 0.935, 0.9352, 0.9352, 0.93516, 0.93528, 0.93512]


## Test new method

In [9]:
label_list = list(label_dict.keys())

result = nlp.get_label_term_weighting(FOLDER_PATH+'tmunlp_file.txt', label_list)
VECTOR_LEN = 70
total_word_dict = {}

for label in label_list:
    word_dict = nlp.get_keyword(label, result, VECTOR_LEN)
    total_word_dict.update(word_dict)
    
def embedding_1hot(df, words):
    arr = []
    for i in trange(df.shape[0]):
        temp = [words[word] if word in df.sentence[i] else 0 for word in words.keys()]
        arr.append(temp)
    return arr

train['llr_vector'] = embedding_1hot(train, total_word_dict)
test['llr_vector'] = embedding_1hot(test, total_word_dict)

100%|███████████████████████████████████████████████████| 11671/11671 [00:59<00:00, 195.54it/s]
100%|███████████████████████████████████████████████████| 35604/35604 [03:03<00:00, 194.41it/s]


In [14]:
len(total_word_dict)

560

In [10]:
class BertLLR(nn.Module):
    def __init__(self, VECTOR_LEN, num_labels=2):
        super(BertLLR, self).__init__()
        self.bert = nn.Linear(768,768)
        self.bert_activation = nn.Tanh()
        
        self.llr = nn.Linear(len(label_list)*VECTOR_LEN, len(label_list)*VECTOR_LEN)
        self.llr_activation = nn.Tanh()
        
        self.dropout = nn.Dropout(0.1)
        
        self.classifier = nn.Linear(768+len(label_list)*VECTOR_LEN, num_labels)
    
    def forward(self, cls_vectors=None, llr_vectors=None, labels=None):
        tanh_cls_vectors = self.bert_activation(self.bert(cls_vectors))
        tanh_llr_vectors = self.llr_activation(self.llr(llr_vectors))
        
        concat_vectors = torch.cat([tanh_cls_vectors, tanh_llr_vectors], dim=1)
        concat_vectors = self.dropout(concat_vectors)
        
        logits = self.classifier(concat_vectors)
        
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, 8), labels.view(-1))
            outputs = (logits, loss)
        else:
            outputs = (logits,)
        
        return outputs
    
bert_llr_model = BertLLR(VECTOR_LEN, num_labels=bert_model.model.config.num_labels)
bert_llr_optimizer = torch.optim.Adam(params = bert_llr_model.parameters(), 
                                      lr = 1e-5
                                     )


bert_llr_model   

BertLLR(
  (bert): Linear(in_features=768, out_features=768, bias=True)
  (bert_activation): Tanh()
  (llr): Linear(in_features=560, out_features=560, bias=True)
  (llr_activation): Tanh()
  (dropout): Dropout(p=0.1, inplace=False)
  (classifier): Linear(in_features=1328, out_features=8, bias=True)
)

In [11]:
dataloader = DataLoader(TensorDataset(torch.tensor(cls_vectors, dtype=torch.float),
                                      torch.tensor(train.llr_vector, dtype=torch.float),
                                      torch.tensor(train.polarity)
                                     ),
                        batch_size=16
                       )

dataloader_test = DataLoader(TensorDataset(torch.tensor(cls_vectors_test, dtype=torch.float),
                                          torch.tensor(test.llr_vector, dtype=torch.float),
                                          torch.tensor(test.polarity)
                                         ),
                            batch_size=16
                           )


bert_llr_model.to('cuda')
train_loss = []
train_accuracy = []
test_f1 = []
test_accuracy = []
for _ in range(10):
    # Training
    bert_llr_model.train()
    tr_loss = []
    for cls_vector, llr_vector, labels in tqdm(dataloader, desc='Training'):
        #print(labels)
        bert_llr_optimizer.zero_grad()
        logits, loss = bert_llr_model(cls_vectors=cls_vector.cuda(), 
                                      llr_vectors=llr_vector.cuda(), 
                                      labels=labels.cuda())
        loss.backward()
        bert_llr_optimizer.step()
        tr_loss.append(loss.item())
    loss_score = sum(tr_loss)/len(tr_loss)
    train_loss.append(loss_score)
    
    # Evaluation
    bert_llr_model.eval()
    predictions = []
    for cls_vector, llr_vector, labels in tqdm(dataloader, desc='Predicting'):
        with torch.no_grad():
            logits = bert_llr_model(cls_vectors=cls_vector.cuda(),
                                    llr_vectors=llr_vector.cuda())[0] #This is for generate predict only
        logits = logits.detach().cpu().numpy()
        predictions.append(logits)
    predictions = [j for i in predictions for j in i]
    predictions = np.argmax(predictions, axis=1)
    acc_score = accuracy_score(train.polarity, predictions)
    train_accuracy.append(acc_score)
    
    # Predict data
    predictions = []
    for cls_vector, llr_vector, labels in tqdm(dataloader_test, desc='Predicting'):
        with torch.no_grad():
            logits = bert_llr_model(cls_vectors=cls_vector.cuda(),
                                    llr_vectors=llr_vector.cuda())[0] #This is for generate predict only
        logits = logits.detach().cpu().numpy()
        predictions.append(logits)
    preds = [j for i in predictions for j in i]
    preds = np.argmax(preds, axis=1)
    test_f1.append(f1_score(test.polarity, preds, average='macro'))
    test_accuracy.append(accuracy_score(test.polarity, preds))
    
    # Print result
    print('Train loss: ', loss_score)
    print('Train accuracy: ',acc_score)
    print(classification_report(test.polarity, preds, digits=4))
        

Training: 100%|█████████████████████████████████████████████| 730/730 [00:04<00:00, 134.00it/s]
Predicting: 100%|██████████████████████████████████████████| 730/730 [00:00<00:00, 1110.72it/s]
Predicting: 100%|████████████████████████████████████████| 2226/2226 [00:01<00:00, 1164.73it/s]


Train loss:  0.6860183621922584
Train accuracy:  0.929397652300574
              precision    recall  f1-score   support

           0     0.5596    0.5772    0.5683      4326
           1     0.3584    0.9104    0.5143      1473
           2     0.4874    0.9803    0.6510      1573
           3     0.6234    0.5285    0.5720      7344
           4     0.8665    0.6030    0.7111     18266
           5     0.4063    0.9240    0.5645      1526
           6     0.6146    0.9473    0.7455       835
           7     0.3567    0.7395    0.4813       261

   micro avg     0.6367    0.6367    0.6367     35604
   macro avg     0.5341    0.7763    0.6010     35604
weighted avg     0.7119    0.6367    0.6471     35604



Training: 100%|█████████████████████████████████████████████| 730/730 [00:04<00:00, 168.91it/s]
Predicting: 100%|██████████████████████████████████████████| 730/730 [00:00<00:00, 1082.48it/s]
Predicting: 100%|████████████████████████████████████████| 2226/2226 [00:01<00:00, 1130.69it/s]


Train loss:  0.25122400152764907
Train accuracy:  0.9392511352926056
              precision    recall  f1-score   support

           0     0.5711    0.5608    0.5659      4326
           1     0.3370    0.9253    0.4940      1473
           2     0.5051    0.9784    0.6662      1573
           3     0.6120    0.5327    0.5696      7344
           4     0.8721    0.5818    0.6980     18266
           5     0.4036    0.9318    0.5633      1526
           6     0.5762    0.9557    0.7189       835
           7     0.2969    0.8851    0.4447       261

   micro avg     0.6269    0.6269    0.6269     35604
   macro avg     0.5218    0.7940    0.5901     35604
weighted avg     0.7123    0.6269    0.6385     35604



Training: 100%|█████████████████████████████████████████████| 730/730 [00:04<00:00, 154.91it/s]
Predicting: 100%|██████████████████████████████████████████| 730/730 [00:00<00:00, 1060.25it/s]
Predicting: 100%|████████████████████████████████████████| 2226/2226 [00:02<00:00, 1063.68it/s]


Train loss:  0.2046105886873317
Train accuracy:  0.944049353097421
              precision    recall  f1-score   support

           0     0.5763    0.5564    0.5662      4326
           1     0.3275    0.9321    0.4846      1473
           2     0.5101    0.9790    0.6707      1573
           3     0.6102    0.5344    0.5698      7344
           4     0.8755    0.5727    0.6925     18266
           5     0.4027    0.9384    0.5636      1526
           6     0.5597    0.9605    0.7072       835
           7     0.2790    0.9042    0.4264       261

   micro avg     0.6229    0.6229    0.6229     35604
   macro avg     0.5176    0.7972    0.5851     35604
weighted avg     0.7136    0.6229    0.6351     35604



Training: 100%|█████████████████████████████████████████████| 730/730 [00:04<00:00, 164.17it/s]
Predicting: 100%|██████████████████████████████████████████| 730/730 [00:00<00:00, 1173.13it/s]
Predicting: 100%|████████████████████████████████████████| 2226/2226 [00:01<00:00, 1131.45it/s]


Train loss:  0.184876923895862
Train accuracy:  0.948761888441436
              precision    recall  f1-score   support

           0     0.5809    0.5534    0.5668      4326
           1     0.3229    0.9362    0.4802      1473
           2     0.5130    0.9797    0.6734      1573
           3     0.6103    0.5362    0.5708      7344
           4     0.8766    0.5682    0.6895     18266
           5     0.4025    0.9450    0.5645      1526
           6     0.5499    0.9641    0.7003       835
           7     0.2762    0.9195    0.4248       261

   micro avg     0.6212    0.6212    0.6212     35604
   macro avg     0.5165    0.8003    0.5838     35604
weighted avg     0.7144    0.6212    0.6337     35604



Training: 100%|█████████████████████████████████████████████| 730/730 [00:04<00:00, 158.68it/s]
Predicting: 100%|██████████████████████████████████████████| 730/730 [00:00<00:00, 1144.07it/s]
Predicting: 100%|████████████████████████████████████████| 2226/2226 [00:02<00:00, 1090.80it/s]


Train loss:  0.17203001229934498
Train accuracy:  0.951846457030246
              precision    recall  f1-score   support

           0     0.5807    0.5525    0.5662      4326
           1     0.3210    0.9382    0.4784      1473
           2     0.5150    0.9803    0.6753      1573
           3     0.6086    0.5366    0.5704      7344
           4     0.8775    0.5651    0.6875     18266
           5     0.4037    0.9502    0.5666      1526
           6     0.5439    0.9653    0.6957       835
           7     0.2771    0.9310    0.4271       261

   micro avg     0.6200    0.6200    0.6200     35604
   macro avg     0.5159    0.8024    0.5834     35604
weighted avg     0.7144    0.6200    0.6325     35604



Training: 100%|█████████████████████████████████████████████| 730/730 [00:04<00:00, 150.31it/s]
Predicting: 100%|██████████████████████████████████████████| 730/730 [00:00<00:00, 1111.45it/s]
Predicting: 100%|████████████████████████████████████████| 2226/2226 [00:02<00:00, 1079.24it/s]


Train loss:  0.16206110637788088
Train accuracy:  0.9548453431582555
              precision    recall  f1-score   support

           0     0.5808    0.5550    0.5676      4326
           1     0.3181    0.9430    0.4758      1473
           2     0.5162    0.9828    0.6769      1573
           3     0.6095    0.5342    0.5694      7344
           4     0.8771    0.5620    0.6851     18266
           5     0.4048    0.9522    0.5681      1526
           6     0.5373    0.9665    0.6906       835
           7     0.2779    0.9349    0.4284       261

   micro avg     0.6187    0.6187    0.6187     35604
   macro avg     0.5152    0.8038    0.5827     35604
weighted avg     0.7143    0.6187    0.6312     35604



Training: 100%|█████████████████████████████████████████████| 730/730 [00:04<00:00, 155.56it/s]
Predicting: 100%|██████████████████████████████████████████| 730/730 [00:00<00:00, 1145.55it/s]
Predicting: 100%|████████████████████████████████████████| 2226/2226 [00:02<00:00, 1059.87it/s]


Train loss:  0.15470511343023957
Train accuracy:  0.958101276668666
              precision    recall  f1-score   support

           0     0.5819    0.5550    0.5681      4326
           1     0.3200    0.9470    0.4784      1473
           2     0.5165    0.9835    0.6773      1573
           3     0.6079    0.5357    0.5695      7344
           4     0.8780    0.5605    0.6842     18266
           5     0.4043    0.9541    0.5680      1526
           6     0.5347    0.9677    0.6888       835
           7     0.2776    0.9349    0.4281       261

   micro avg     0.6186    0.6186    0.6186     35604
   macro avg     0.5151    0.8048    0.5828     35604
weighted avg     0.7145    0.6186    0.6309     35604



Training: 100%|█████████████████████████████████████████████| 730/730 [00:03<00:00, 187.67it/s]
Predicting: 100%|██████████████████████████████████████████| 730/730 [00:00<00:00, 1142.86it/s]
Predicting: 100%|████████████████████████████████████████| 2226/2226 [00:02<00:00, 1077.27it/s]


Train loss:  0.1471286101241226
Train accuracy:  0.9595578785022706
              precision    recall  f1-score   support

           0     0.5818    0.5525    0.5668      4326
           1     0.3193    0.9491    0.4779      1473
           2     0.5162    0.9841    0.6772      1573
           3     0.6073    0.5357    0.5692      7344
           4     0.8778    0.5586    0.6827     18266
           5     0.4031    0.9554    0.5670      1526
           6     0.5315    0.9689    0.6865       835
           7     0.2776    0.9349    0.4281       261

   micro avg     0.6175    0.6175    0.6175     35604
   macro avg     0.5143    0.8049    0.5819     35604
weighted avg     0.7141    0.6175    0.6298     35604



Training: 100%|█████████████████████████████████████████████| 730/730 [00:05<00:00, 142.78it/s]
Predicting: 100%|██████████████████████████████████████████| 730/730 [00:00<00:00, 1082.75it/s]
Predicting: 100%|████████████████████████████████████████| 2226/2226 [00:01<00:00, 1149.46it/s]


Train loss:  0.14172933775013033
Train accuracy:  0.9612715277182761
              precision    recall  f1-score   support

           0     0.5815    0.5525    0.5666      4326
           1     0.3186    0.9518    0.4774      1473
           2     0.5153    0.9847    0.6766      1573
           3     0.6077    0.5343    0.5687      7344
           4     0.8775    0.5575    0.6818     18266
           5     0.4039    0.9574    0.5682      1526
           6     0.5288    0.9689    0.6841       835
           7     0.2773    0.9349    0.4277       261

   micro avg     0.6168    0.6168    0.6168     35604
   macro avg     0.5138    0.8052    0.5814     35604
weighted avg     0.7139    0.6168    0.6291     35604



Training: 100%|█████████████████████████████████████████████| 730/730 [00:05<00:00, 144.24it/s]
Predicting: 100%|██████████████████████████████████████████| 730/730 [00:00<00:00, 1128.35it/s]
Predicting: 100%|████████████████████████████████████████| 2226/2226 [00:02<00:00, 1038.51it/s]


Train loss:  0.1361000793462951
Train accuracy:  0.9625567646302802
              precision    recall  f1-score   support

           0     0.5837    0.5532    0.5680      4326
           1     0.3189    0.9538    0.4780      1473
           2     0.5151    0.9847    0.6764      1573
           3     0.6067    0.5349    0.5685      7344
           4     0.8769    0.5555    0.6802     18266
           5     0.4031    0.9587    0.5676      1526
           6     0.5277    0.9701    0.6835       835
           7     0.2778    0.9387    0.4287       261

   micro avg     0.6162    0.6162    0.6162     35604
   macro avg     0.5137    0.8062    0.5814     35604
weighted avg     0.7136    0.6162    0.6284     35604



In [12]:
print(train_loss)
print(train_accuracy)
print(test_f1)
print(test_accuracy)

[0.6860183621922584, 0.25122400152764907, 0.2046105886873317, 0.184876923895862, 0.17203001229934498, 0.16206110637788088, 0.15470511343023957, 0.1471286101241226, 0.14172933775013033, 0.1361000793462951]
[0.929397652300574, 0.9392511352926056, 0.944049353097421, 0.948761888441436, 0.951846457030246, 0.9548453431582555, 0.958101276668666, 0.9595578785022706, 0.9612715277182761, 0.9625567646302802]
[0.6010029888971391, 0.5900777530863088, 0.5851235517798916, 0.5837785103884101, 0.5833905851589674, 0.5827412881051215, 0.5828136081460616, 0.5819087260351077, 0.5813863297486113, 0.58136705646002]
[0.6367262105381418, 0.6268677676665543, 0.6228513650151668, 0.6211942478373217, 0.6200426918323784, 0.6187226154364678, 0.6185821817773284, 0.617458712504213, 0.6168408044039996, 0.6162228963037861]
