In [1]:
! pip install transformers



In [16]:
from transformers import BertModel
from transformers import BertTokenizer
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import pandas as pd
from torch.utils.data import Dataset
from time import time
import random
import numpy as np

# 一个简单的BERT处理输入的例子

In [17]:
# 加载预训练BERT模型和分词器
bert_model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# 分词
sentence = 'I really enjoyed this movie a lot.'
tokens = tokenizer.tokenize(sentence)
print(tokens)
# Out: ['i', 'really', 'enjoyed', 'this', 'movie', 'a', 'lot', '.']

# Token embedding: 添加开头和结尾的tokens
tokens = ['[CLS]'] + tokens + ['[SEP]']
print(tokens)
# Out: ['[CLS]', 'i', 'really', 'enjoyed', 'this', 'movie', 'a', 'lot', '.', '[SEP]']

# 进行Padding保证所有输入的文本长度相同
MAX_LEN = 12
padded_tokens = tokens + ['[PAD]' for _ in range(MAX_LEN - len(tokens))]
print(padded_tokens)

# 用0标注告诉BERT哪些是PAD
attn_mask = [1 if token != '[PAD]' else 0 for token in padded_tokens]
print(attn_mask)
# Out: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]

# segment embedding
seg_ids = [0 for _ in range(len(padded_tokens))] #Since we only have a single sequence as input

# positional embedding: 获取tokens在vocabulary中的id
sent_ids = tokenizer.convert_tokens_to_ids(padded_tokens)
print(sent_ids)

# Converting everything to torch tensors before feeding them to bert_model
sent_ids = torch.tensor(sent_ids).unsqueeze(0) #Shape : [1, 12]
attn_mask = torch.tensor(attn_mask).unsqueeze(0) #Shape : [1, 12]
seg_ids   = torch.tensor(seg_ids).unsqueeze(0) #Shape : [1, 12]

#Feed them to bert
hidden_reps, cls_head = bert_model(sent_ids, attention_mask = attn_mask,\
                                  token_type_ids = seg_ids)

['i', 'really', 'enjoyed', 'this', 'movie', 'a', 'lot', '.']
['[CLS]', 'i', 'really', 'enjoyed', 'this', 'movie', 'a', 'lot', '.', '[SEP]']
['[CLS]', 'i', 'really', 'enjoyed', 'this', 'movie', 'a', 'lot', '.', '[SEP]', '[PAD]', '[PAD]']
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]
[101, 1045, 2428, 5632, 2023, 3185, 1037, 2843, 1012, 102, 0, 0]


# 数据类，用于加载csv

In [18]:
class OFFdataset(Dataset):
    def __init__(self, dataframe, maxlen):

        #Store the contents of the file in a pandas dataframe
        self.df = dataframe

        self.mapping = {'OFF': 0, 'NOT': 1, 'TIN': 0, 'UNT': 1, 'IND': 0, 'GRP': 1, 'OTH': 2}
        
        #Initialize the BERT tokenizer
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        self.maxlen = maxlen

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        #Selecting the sentence and label at the specified index in the data frame
        sentence = self.df['text'][index]
        label = self.df.iloc[:,2][index]
        label =  torch.tensor(int(self.mapping[label]))

        #Preprocessing the text to be suitable for BERT
        try:
          tokens = self.tokenizer.tokenize(sentence) #Tokenize the sentence
        except:
          #  避免文本为nan值报错
          tokens = []
        tokens = ['[CLS]'] + tokens + ['[SEP]'] #Insering the CLS and SEP token in the beginning and end of the sentence
        if len(tokens) < self.maxlen:
            tokens = tokens + ['[PAD]' for _ in range(self.maxlen - len(tokens))] #Padding sentences
        else:
            tokens = tokens[:self.maxlen-1] + ['[SEP]'] #Prunning the list to be of specified max length

        tokens_ids = self.tokenizer.convert_tokens_to_ids(tokens) #Obtaining the indices of the tokens in the BERT Vocabulary
        tokens_ids_tensor = torch.tensor(tokens_ids) #Converting the list to a pytorch tensor

        #Obtaining the attention mask i.e a tensor containing 1s for no padded tokens and 0s for padded ones
        attn_mask = (tokens_ids_tensor != 0).long()
        
        return tokens_ids_tensor, attn_mask, label

# 指定task，准备数据，分配sampler权重

In [19]:
# 指定task和文件名
path = r'C:\Users\Mukuu\Desktop\MSc Proj'
#TASK = 'c'
#data_df = pd.read_csv(path + 'final_train_%s.csv' % TASK)[['tweet','subtask_%s' % TASK]]
#test_df = pd.read_csv(path + 'final_test_%s.csv' % TASK)

#data_df = pd.read_csv(path + '\sample2_cleaned.csv')
#test_df = pd.read_csv(path + '\sample2_cleaned_test.csv')
# task_a_distant_cleaned.csv
data_df = pd.read_csv(path + r'\task_b_distant_cleaned.csv')
test_df = pd.read_csv(path + r'\task_a_distant_cleaned_test.csv')

# 定义类名和数字的映射关系
mapping = {'OFF': 0, 'NOT': 1, 'TIN': 0, 'UNT': 1, 'IND': 0, 'GRP': 1, 'OTH': 2}
        
train_split = 0.995  # Defines the ratio of train/valid
train_size = int(len(data_df) * train_split)
valid_size = int(len(data_df) * (1-train_split))
print('train size:',train_size)
train_df = data_df[:train_size].reset_index(drop=True)
valid_df = data_df[train_size:].reset_index(drop=True)
#Creating instances of training and validation set
train_set = OFFdataset(train_df, maxlen = 30)
val_set   = OFFdataset(valid_df, maxlen = 30)
test_set  = OFFdataset(test_df, maxlen = 30)

# adjust the weight for unbalanced class distribution
train_batch_size = 32
#class_count = train_df['subtask_%s' % TASK].value_counts()
# weights = 1 / torch.Tensor([class_count[0],class_count[1]])
#weights = 1 / torch.Tensor([cnt for cnt in class_count])
# sample_weights = train_df['subtask_%s' % TASK].map(lambda x: weights[1] if x == 1 else weights[0])
#sample_weights = train_df['subtask_%s' % TASK].map(lambda x: weights[mapping[x]])
#sample_weights = torch.tensor(sample_weights)
#sampler = torch.utils.data.sampler.WeightedRandomSampler(sample_weights, len(sample_weights))
# trainloader = data_utils.DataLoader(train_dataset, batch_size = batch_size, shuffle=True, sampler = sampler)

# 要使用重新分配权重的版本，在train_loader的参数重添加 sampler=sampler
#Creating intsances of training and validation dataloaders
train_loader = DataLoader(train_set, batch_size = train_batch_size, num_workers = 0)
val_loader =   DataLoader(val_set, batch_size = 64, num_workers = 0)
test_loader =  DataLoader(test_set, batch_size = 64, num_workers = 0)

train size: 188029


# 合并预训练的BERT和输出用全连层，准备进行Fine-tuning

In [20]:
# 为BERT端添加用于最后输出的全连接层以构成分类器
class SentimentClassifier(nn.Module):
    def __init__(self, freeze_bert = True):
        super(SentimentClassifier, self).__init__()
        #Instantiating BERT model object 
        self.bert_layer = BertModel.from_pretrained('bert-base-uncased')
        
        #Freeze bert layers
        if freeze_bert:
            for p in self.bert_layer.parameters():
                if (random.random()>0.3):
                    p.requires_grad = False
        
        #Classification layer
        self.cls_layer = nn.Linear(768, 1536)
        self.dp1 = nn.Dropout(0.3)
        self.cls_layer2 = nn.Linear(4096, 2048)
        self.dp2 = nn.Dropout(0.3)
        self.bn = nn.BatchNorm1d(768, momentum=0.01)
        self.bn2 = nn.BatchNorm1d(1536, momentum=0.01)
        self.cls_layer3 = nn.Linear(1536, 3)
        

    def forward(self, seq, attn_masks):
        '''
        Inputs:
            -seq : Tensor of shape [B, T] containing token ids of sequences
            -attn_masks : Tensor of shape [B, T] containing attention masks to be used to avoid contibution of PAD tokens
        '''

        #Feeding the input to BERT model to obtain contextualized representations
        cont_reps, _ = self.bert_layer(seq, attention_mask = attn_masks)

        #Obtaining the representation of [CLS] head
        logits = cont_reps[:, 0]
        # logits = self.dp1(logits)
        #Feeding cls_rep to the classifier layer
        # cls_rep = self.bn(cls_rep)
        logits = self.cls_layer(logits)
        
        # # logits = self.cls_layer2(logits)
        logits = self.bn2(logits)
        # logits = self.dp2(logits)
        
        logits = self.cls_layer3(logits)
        return logits

# 备选结构

In [21]:
# 为BERT端添加用于最后输出的全连接层以构成分类器
from torch.nn.utils.rnn import pack_padded_sequence
class BertLSTM(nn.Module):
    def __init__(self, freeze_bert = True):
        super(BertLSTM, self).__init__()
        #Instantiating BERT model object 
        self.bert_layer = BertModel.from_pretrained('bert-base-uncased')
        
        #Freeze bert layers
        if freeze_bert:
            for p in self.bert_layer.parameters():
                if (random.random()>0.3):
                    p.requires_grad = False
        
        self.embed_size = 768
        self.hidden_size = 512
        self.num_layers = 1
        
        #Classification layer
        self.cls_layer = nn.Linear(768, 1536)
        self.dp1 = nn.Dropout(0.3)
        self.cls_layer2 = nn.Linear(4096, 2048)
        self.dp2 = nn.Dropout(0.5)
        self.cls_layer3 = nn.Linear(self.hidden_size * 2, 2)

        
        self.lstm = nn.LSTM(self.embed_size, self.hidden_size, self.num_layers, bidirectional=True, batch_first=True)
        

    def forward(self, seq, attn_masks):
        '''
        Inputs:
            -seq : Tensor of shape [B, T] containing token ids of sequences
            -attn_masks : Tensor of shape [B, T] containing attention masks to be used to avoid contibution of PAD tokens
        '''

        #Feeding the input to BERT model to obtain contextualized representations
        with torch.no_grad():
            logits, _ = self.bert_layer(seq, attention_mask = attn_masks)

        #Obtaining the representation of [CLS] head
        # logits = self.dp1(cont_reps)

        _,(logits,_) = self.lstm(logits)
        logits = self.dp1(
            torch.cat((logits[-2, :, :], logits[-1, :, :]), dim=1))
        logits = self.cls_layer3(logits)
        return logits

In [22]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

net = SentimentClassifier(freeze_bert = False)
net = net.to(device)
# 指定损失函数和优化器
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr = 0.001,momentum=0.9)

In [23]:
import copy
best_model_acc = copy.deepcopy(net)
best_model_loss = copy.deepcopy(net)
best_acc = 0
best_loss = 1

In [24]:
device

device(type='cuda', index=0)

In [25]:
import time

# 开始训练

In [26]:
start = time.time()
epochs = 1
for ep in range(epochs):
    correct = 0
    train_loss_sum = 0
    for it, (seq, attn_masks, labels) in enumerate(train_loader):
        #Clear gradients
        optimizer.zero_grad()
        #Converting these to cuda tensors
        seq, attn_masks, labels = seq.cuda(), attn_masks.cuda(), labels.cuda()

        #Obtaining the logits from the model
        logits = net(seq, attn_masks)

        #Computing loss
        loss = criterion(logits, labels)
        train_loss_sum += loss.item()

        #Backpropagating the gradients
        loss.backward()

        #Optimization step
        optimizer.step()
        logits[logits>0.0] = 0.0
        logits[logits<0.0] = 1.0
        correct += torch.sum(logits[:,0] == labels)
        if (it + 1.01) % (train_size/train_batch_size/10) < 1:
            acc = correct.float()/((it+1)*train_batch_size)
            print('time:',time.time()- start)
            print("\rIteration {} of epoch {}. Loss : {} Accuracy : {}".format(it+1, ep+1, train_loss_sum/(it+1), acc))
            val_correct = 0
            val_loss_sum = 0
            total_valid = 0
            for it2, (seq2, attn_masks2, labels2) in enumerate(val_loader):
                seq2, attn_masks2, labels2 = seq2.cuda(), attn_masks2.cuda(), labels2.cuda()
                val_outputs = net(seq2, attn_masks2)
                val_loss = criterion(val_outputs, labels2)
                val_loss_sum += val_loss.item()
                _, val_preds = torch.max(val_outputs, 1)
                val_correct += torch.sum(val_preds == labels2)
                # val_outputs[val_outputs>-0.1] = 0.0
                # val_outputs[val_outputs<-0.1] = 1.0
                total_valid += labels2.size(0)
                # val_correct += torch.sum(val_outputs[:,0] == labels2)
                
            val_epoch_acc = val_correct.float() / total_valid
            val_epoch_loss = val_loss_sum/ (total_valid/64)
            if val_epoch_acc > best_acc:
                best_acc = val_epoch_acc
                best_model_acc = copy.deepcopy(net)
            if val_epoch_loss < best_loss:
                best_loss = val_epoch_loss
                best_model_loss = copy.deepcopy(net)
            print("\r\t\t\t\t\t\t\t\t\t\t\t\tValLoss : {} ValAcc : {}".format(val_epoch_loss, val_epoch_acc))
duration = time.time() - start
duration

time: 79.27011585235596
Iteration 588 of epoch 1. Loss : 0.5858968700580045 Accuracy : 0.7943770885467529
												ValLoss : 0.4459746951148624 ValAcc : 0.8201058506965637
time: 169.4553325176239
Iteration 1176 of epoch 1. Loss : 0.4871995422963788 Accuracy : 0.8073979616165161
												ValLoss : 0.37317042981506027 ValAcc : 0.8433862328529358
time: 259.14591693878174
Iteration 1763 of epoch 1. Loss : 0.44546877965964987 Accuracy : 0.8076964020729065
												ValLoss : 0.37139619292405546 ValAcc : 0.8232804536819458
time: 356.86969327926636
Iteration 2351 of epoch 1. Loss : 0.4207242337732506 Accuracy : 0.8071299195289612
												ValLoss : 0.3500153314499628 ValAcc : 0.8455026745796204
time: 450.5688166618347
Iteration 2938 of epoch 1. Loss : 0.40217939846171985 Accuracy : 0.8074583411216736
												ValLoss : 0.3855709237396402 ValAcc : 0.7968254089355469
time: 542.6551461219788
Iteration 3526 of epoch 1. Loss : 0.3892349101945344 Accuracy : 0.8070316910743713
								

898.8251445293427

In [27]:
torch.max(val_outputs, 1)[1],labels2

(tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0,
         1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
         0], device='cuda:0'),
 tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
         0], device='cuda:0'))

In [28]:
test_df = pd.read_csv(path + r'\test_b_tweets_cleaned.csv')
test_set  = OFFdataset(test_df, maxlen = 30)
test_loader =  DataLoader(test_set, batch_size = 64, num_workers = 0)

# 测试best model在test set上的成绩

In [29]:
output_data = np.array([])
val_correct = 0
val_loss = 0
total_valid = 0
true_label = np.array([])
TP,FP,TN,FN =0,0,0,0

for it2, (seq2, attn_masks2, labels2) in enumerate(test_loader):
    seq2, attn_masks2, labels2 = seq2.cuda(), attn_masks2.cuda(), labels2.cuda()
    true_label= np.append(true_label,labels2.cpu().detach().numpy())
    val_outputs = best_model_acc(seq2, attn_masks2)
    val_loss1 = criterion(val_outputs, labels2)
    val_loss += val_loss1.item()
    val_outputs[:,0] += 0.0
    # val_outputs[val_outputs>-0.0] = 0.0
    # val_outputs[val_outputs<-0.0] = 1.0
    _, val_preds = torch.max(val_outputs, 1)
    val_correct += torch.sum(val_preds == labels2)
    valans = val_preds
    
    total_valid += labels2.size(0)
    TP += torch.sum(valans[valans==1.0] == labels2[valans==1.0]).item()
    FP += torch.sum(valans[valans==1.0] != labels2[valans==1.0]).item()
    TN += torch.sum(valans[valans==0.0] == labels2[valans==0.0]).item()
    FN += torch.sum(valans[valans==0.0] != labels2[valans==0.0]).item()

    output_data=np.append(output_data, valans.cpu().detach().numpy())

print('TP:%d, FP:%d, TN:%d, FN:%d' % (TP,FP,TN,FN))

p_recall = TP/(TP+FN)
n_recall = TN/(TN+FP) 

p_precision = TP/(TP+FP)
n_precision = TN/(TN+FN) 

p_f1 = 2*TP/(2*TP+FP+FN)
n_f1 = 2*TN/(2*TN+FN+FP) 

print('Recall(P):%.3f, Recall(N):%.3f, Recall-macro:%.3f '% (p_recall, n_recall,(p_recall + n_recall)/2))
print('Precision(P):%.3f, Precision(N):%.3f, Precision-macro:%.3f '% (p_precision, n_precision,(p_precision+n_precision)/2))
print('F1(P):%.3f, F1(N):%.3f, F1-macro:%.3f' % (p_f1, n_f1, (p_f1+n_f1)/2))
val_epoch_acc = val_correct.float() / total_valid
val_epoch_loss = val_loss/ (total_valid/64)
print("Test Loss : %.3f Test acc : %.3f" % (val_epoch_loss, val_epoch_acc))

TP:185, FP:82, TN:768, FN:387
Recall(P):0.323, Recall(N):0.904, Recall-macro:0.613 
Precision(P):0.693, Precision(N):0.665, Precision-macro:0.679 
F1(P):0.441, F1(N):0.766, F1-macro:0.604
Test Loss : 0.978 Test acc : 0.670


In [32]:
len(true_label)
from sklearn.metrics import confusion_matrix

In [33]:
cm = confusion_matrix(true_label, output_data)
print("confusion matrix->\n ", confusion_matrix(true_label, output_data))
target_names = ['OFF', 'NOT']
plot_confusion_matrix(cm,
                          target_names,
                          title='Confusion matrix',
                          cmap=None,
                          normalize=True)

confusion matrix->
  [[768  82]
 [387 185]]


NameError: name 'plot_confusion_matrix' is not defined

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import plot_confusion_matrix

# Plot non-normalized confusion matrix
titles_options = [("Confusion matrix, without normalization", None),
                  ("Normalized confusion matrix", 'true')]
for title, normalize in titles_options:
    disp = plot_confusion_matrix(classifier, X_test, y_test,
                                 display_labels=class_names,
                                 cmap=plt.cm.Blues,
                                 normalize=normalize)
    disp.ax_.set_title(title)

    print(title)
    print(disp.confusion_matrix)

plt.show()

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

output_data = np.array([])
val_correct = 0
val_loss = 0
total_valid = 0
TP,FP,TN,FN =0,0,0,0
for it2, (seq2, attn_masks2, labels2) in enumerate(test_loader):
    seq2, attn_masks2, labels2 = seq2.cuda(), attn_masks2.cuda(), labels2.cuda()
    val_outputs = best_model_acc(seq2, attn_masks2)
    val_loss1 = criterion(val_outputs, labels2)
    val_loss += val_loss1.item()
    val_outputs[:,0] += 0.0
    # val_outputs[val_outputs>-0.0] = 0.0
    # val_outputs[val_outputs<-0.0] = 1.0
    _, val_preds = torch.max(val_outputs, 1)
    val_correct += torch.sum(val_preds == labels2)
    valans = val_preds
    
    total_valid += labels2.size(0)

    output_data=np.append(output_data, valans.cpu().detach().numpy())


ANS_PATH = path + 'labels-level%s.csv' % TASK
answer = pd.read_csv(ANS_PATH, header=None)
mapping = {'a': {'OFF': 1, 'NOT': 0}, 'b': {'TIN': 1, 'UNT': 0}, 'c': {'IND': 0, 'GRP': 1, 'OTH': 2}}
answer[1] = answer[1].map(lambda x: mapping[TASK][x])

print("Accuracy Score -> ", accuracy_score(answer[1], output_data))
print("precision Score -> ", precision_score(answer[1], output_data, average='macro'))
print("recall Split -> ", recall_score(answer[1], output_data, average=None))
print("recall Macro -> ", recall_score(answer[1], output_data, average='macro'))
print("F1-Split -> ", f1_score(answer[1], output_data, average=None))
print("F1-Macro -> ", f1_score(answer[1], output_data, average='macro'))
print("confusion matrix->\n ", confusion_matrix(answer[1], output_data))
val_epoch_acc = val_correct.float() / total_valid
val_epoch_loss = val_loss/ (total_valid/64)
print("Test Loss : %.3f Test acc : %.3f" % (val_epoch_loss, val_epoch_acc))

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

output_data = np.array([])
val_correct = 0
val_loss = 0
total_valid = 0
TP,FP,TN,FN =0,0,0,0
for it2, (seq2, attn_masks2, labels2) in enumerate(test_loader):
    seq2, attn_masks2, labels2 = seq2.cuda(), attn_masks2.cuda(), labels2.cuda()
    val_outputs = best_model_loss(seq2, attn_masks2)
    val_loss1 = criterion(val_outputs, labels2)
    val_loss += val_loss1.item()
    val_outputs[:,0] += 0.0
    # val_outputs[val_outputs>-0.0] = 0.0
    # val_outputs[val_outputs<-0.0] = 1.0
    _, val_preds = torch.max(val_outputs, 1)
    val_correct += torch.sum(val_preds == labels2)
    valans = val_preds
    
    total_valid += labels2.size(0)

    output_data=np.append(output_data, valans.cpu().detach().numpy())


ANS_PATH = path + 'labels-level%s.csv' % TASK
answer = pd.read_csv(ANS_PATH, header=None)
mapping = {'a': {'OFF': 1, 'NOT': 0}, 'b': {'TIN': 1, 'UNT': 0}, 'c': {'IND': 0, 'GRP': 1, 'OTH': 2}}
answer[1] = answer[1].map(lambda x: mapping[TASK][x])

print("Accuracy Score -> ", accuracy_score(answer[1], output_data))
print("precision Score -> ", precision_score(answer[1], output_data, average='macro'))
print("recall Split -> ", recall_score(answer[1], output_data, average=None))
print("recall Macro -> ", recall_score(answer[1], output_data, average='macro'))
print("F1-Split -> ", f1_score(answer[1], output_data, average=None))
print("F1-Macro -> ", f1_score(answer[1], output_data, average='macro'))
print("confusion matrix->\n ", confusion_matrix(answer[1], output_data))
val_epoch_acc = val_correct.float() / total_valid
val_epoch_loss = val_loss/ (total_valid/64)
print("Test Loss : %.3f Test acc : %.3f" % (val_epoch_loss, val_epoch_acc))

In [None]:
val_preds

In [None]:
output_data = np.array([])
val_correct = 0
val_loss = 0
total_valid = 0
TP,FP,TN,FN =0,0,0,0
for it2, (seq2, attn_masks2, labels2) in enumerate(test_loader):
    seq2, attn_masks2, labels2 = seq2.cuda(), attn_masks2.cuda(), labels2.cuda()
    val_outputs = best_model_loss(seq2, attn_masks2)
    val_loss1 = criterion(val_outputs, labels2)
    val_loss += val_loss1.item()
    val_outputs[:,1] += 0.0
    # val_outputs[val_outputs>-0.0] = 0.0
    # val_outputs[val_outputs<-0.0] = 1.1
    _, val_preds = torch.max(val_outputs, 1)
    val_correct += torch.sum(val_preds == labels2)
    valans = val_preds
    
    total_valid += labels2.size(0)
    TP += torch.sum(valans[valans==1.0] == labels2[valans==1.0]).item()
    FP += torch.sum(valans[valans==1.0] != labels2[valans==1.0]).item()
    TN += torch.sum(valans[valans==0.0] == labels2[valans==0.0]).item()
    FN += torch.sum(valans[valans==0.0] != labels2[valans==0.0]).item()

    output_data=np.append(output_data, valans.cpu().detach().numpy())

print('TP:%d, FP:%d, TN:%d, FN:%d' % (TP,FP,TN,FN))

p_recall = TP/(TP+FN)
n_recall = TN/(TN+FP) 

p_precision = TP/(TP+FP)
n_precision = TN/(TN+FN) 

p_f1 = 2*TP/(2*TP+FP+FN)
n_f1 = 2*TN/(2*TN+FN+FP) 

print('Recall(OFF):%.3f, Recall(NOT):%.3f, Recall-macro:%.3f '% (p_recall, n_recall,(p_recall + n_recall)/2))
print('Precision(OFF):%.3f, Precision(NOT):%.3f, Precision-macro:%.3f '% (p_precision, n_precision,(p_precision+n_precision)/2))
print('F1(OFF):%.3f, F1(NOT):%.3f, F1-macro:%.3f' % (p_f1, n_f1, (p_f1+n_f1)/2))
val_epoch_acc = val_correct.float() / total_valid
val_epoch_loss = val_loss/ (total_valid/32)
print("Test Loss : {} Test acc : {}".format(val_epoch_loss, val_epoch_acc))

In [None]:
labels2

In [None]:
val_outputs

tensor([[ 0.9579, -1.1993],
        [ 1.2467, -1.4037],
        [ 0.8473, -0.7620],
        [ 0.4997, -0.8071],
        [ 1.1288, -1.8218],
        [ 1.1589, -1.0631],
        [ 1.1304, -1.3127],
        [ 1.4880, -1.0801],
        [ 1.2937, -1.6105],
        [ 0.8625, -1.1630],
        [ 0.7311, -1.0373],
        [ 1.5489, -1.4207],
        [ 1.1483, -1.2670],
        [ 1.0335, -1.1892],
        [ 0.8731, -1.2020],
        [ 1.4522, -1.4780],
        [ 1.5092, -1.3331],
        [ 0.5293, -0.9135],
        [ 1.4824, -1.1390],
        [ 0.9275, -1.1968],
        [ 1.0774, -0.7347],
        [ 1.6984, -1.6076],
        [ 1.2736, -1.6458],
        [ 1.0814, -1.3105],
        [ 0.7948, -1.2855],
        [ 1.1271, -1.1959],
        [ 0.6966, -0.9912],
        [ 1.6566, -1.3208],
        [ 1.0835, -0.9662],
        [ 1.2925, -1.4433],
        [ 1.3297, -1.6926],
        [ 0.5157, -0.7049],
        [ 0.7713, -0.6838],
        [ 1.6383, -1.6433],
        [ 1.6215, -0.8151],
        [-0.3981, -0

输出预测csv

In [None]:
output_csv = pd.DataFrame(output_data,columns=['predicted'])
output_csv['predicted']=output_csv['predicted'].map(lambda x: 'OFF' if x==1 else 'NOT') 
output_csv.to_csv('output_bert_b.csv',index=None)

In [None]:
numDataPoints = 1000
data_dim = 5
bs = 100
 
# Create dummy data with class imbalance 9 to 1
data = torch.FloatTensor(numDataPoints, data_dim)
target = np.hstack((np.zeros(int(numDataPoints * 0.9), dtype=np.int32),
     np.ones(int(numDataPoints * 0.1), dtype=np.int32)))
 

class_sample_count = np.array(
 [len(np.where(target == t)[0]) for t in np.unique(target)])
weight = 1. / class_sample_count
samples_weight = np.array([weight[t] for t in target])
 
samples_weight = torch.from_numpy(samples_weight)
samples_weight = samples_weight.double()
sampler = torch.utils.data.sampler.WeightedRandomSampler(samples_weight, len(samples_weight))
 
target = torch.from_numpy(target).long()
train_dataset = torch.utils.data.TensorDataset(data, target)
 
train_loader = DataLoader(
 train_dataset, batch_size=bs, num_workers=1, sampler=sampler)

In [None]:
sample_weights

tensor([0.0001, 0.0001, 0.0001,  ..., 0.0001, 0.0001, 0.0001])