<a href="https://colab.research.google.com/github/BBB-WU/NLP/blob/BBB-WU-patch-1/NLP_with_BERT_multi_class_text__classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [56]:
pip install transformers



In [57]:
import torch
from tqdm.notebook import tqdm

from transformers import BertTokenizer
from torch.utils.data import TensorDataset

from transformers import BertForSequenceClassification

In [58]:
import pandas as pd

In [59]:
import numpy as np


In [60]:
df = pd.read_csv('/content/train.tsv',sep='\t',header=None)

dt = pd.read_csv('/content/valid.tsv',sep='\t',header=None)

dtt = pd.read_csv('/content/test.tsv',sep='\t',header=None)

In [61]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.
2,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver
3,1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release
4,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN


In [62]:
#去掉原来资料库的其他资料，留下label和text
df=df[[1,2]]
df.columns=['Tvalue','Title']

dt=dt[[1,2]]
dt.columns=['Tvalue','Title']

dtt=dtt[[1,2]]
dtt.columns=['Tvalue','Title']

In [63]:
#将六类label分成两类
label_dict = {'barely-true': 0,
 'false': 0,
 'half-true': 0,
 'mostly-true': 1,
 'pants-fire': 0,
 'true': 1}

In [64]:
df['label'] = df.Tvalue.replace(label_dict)
dt['label'] = dt.Tvalue.replace(label_dict)
dtt['label'] = dtt.Tvalue.replace(label_dict)

In [65]:
dt['label'].value_counts()

0    864
1    420
Name: label, dtype: int64

In [66]:
#为了让数据比较平均，选择去掉了两类。

df=df[ ~ df['Tvalue'].str.contains('half-true') ]
df=df[ ~ df['Tvalue'].str.contains('barely-true') ]

dt=dt[ ~ dt['Tvalue'].str.contains('half-true') ]
dt=dt[ ~ dt['Tvalue'].str.contains('barely-true') ]

dtt=dtt[ ~ dtt['Tvalue'].str.contains('barely-true') ]
dtt=dtt[ ~ dtt['Tvalue'].str.contains('barely-true') ]

In [67]:
df.head(10)

Unnamed: 0,Tvalue,Title,label
0,false,Says the Annies List political group supports ...,0
2,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",1
3,false,Health care reform legislation is likely to ma...,0
5,true,The Chicago Bears have had more starting quart...,1
9,mostly-true,Says GOP primary opponents Glenn Grothman and ...,1
10,mostly-true,"For the first time in history, the share of th...",1
12,false,When Mitt Romney was governor of Massachusetts...,0
13,mostly-true,The economy bled $24 billion due to the govern...,1
16,true,McCain opposed a requirement that the governme...,1
19,mostly-true,"Almost 100,000 people left Puerto Rico last year.",1


In [68]:
df['label'].value_counts()

1    3638
0    2834
Name: label, dtype: int64

In [69]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', 
                                          do_lower_case=True)

In [70]:
#利用tokenizer，把句子变成Bert能处理的token形式。

encoded_data_train = tokenizer.batch_encode_plus(
    df.Title.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=32, 
    return_tensors='pt'
)

encoded_data_val = tokenizer.batch_encode_plus(
    dt.Title.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=32, 
    return_tensors='pt'
)

encoded_data_test = tokenizer.batch_encode_plus(
    dt.Title.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=32, 
    return_tensors='pt'
)

input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(df.label.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(dt.label.values)

input_ids_test = encoded_data_test['input_ids']
attention_masks_test = encoded_data_test['attention_mask']
labels_test = torch.tensor(dt.label.values)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [71]:
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)

dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

dataset_test = TensorDataset(input_ids_test, attention_masks_test, labels_test)

In [72]:
len(dataset_train), len(dataset_val), len(dataset_test)

(6472, 799, 799)

In [73]:
#应用已经写好的model做二元分类。这个model是在bert-base上在加一个nn.linear做分类。

model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=2,
                                                      output_attentions=False,
                                                      output_hidden_states=False)

print("""
name            module
----------------------""")
for name, module in model.named_children():
    if name == "bert":
        for n, _ in module.named_children():
            print(f"{name}:{n}")
    else:
        print("{:15} {}".format(name, module))


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at


name            module
----------------------
bert:embeddings
bert:encoder
bert:pooler
dropout         Dropout(p=0.1, inplace=False)
classifier      Linear(in_features=768, out_features=2, bias=True)


In [74]:
#把数据分成一个个batch组成的dataloader。

from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 32

dataloader_train = DataLoader(dataset_train, 
                              sampler=RandomSampler(dataset_train), 
                              batch_size=batch_size)

dataloader_validation = DataLoader(dataset_val, 
                                   sampler=SequentialSampler(dataset_val), 
                                   batch_size=batch_size)

dataloader_test = DataLoader(dataset_test, 
                                   sampler=SequentialSampler(dataset_test), 
                                   batch_size=batch_size)

In [75]:
#optimizer应用AdamW，还有一个让learning rate逐渐变小的scheduler。
from transformers import AdamW, get_linear_schedule_with_warmup

optimizer = AdamW(model.parameters(),
                  lr=1e-6, 
                  eps=1e-8)

In [76]:
epochs = 120

scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)


In [77]:
#加入f1_score来做model预测效果评估。

from sklearn.metrics import f1_score

def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

def accuracy_per_class(preds, labels):
    label_dict_inverse = {v: k for k, v in label_dict.items()}
    
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy: {len(y_preds[y_preds==label])}/{len(y_true)}\n')

In [78]:
import random

seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [79]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

print(device)

cuda


In [80]:
#用来评估test的结果，会记录下每次预测的结果和正确率。

def evaluate(dataloader_val):

    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in dataloader_val:
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals

In [81]:
#这是model train 的部分，每个epoch后，都会把loss和f1_score显示。
#然后，每个epoch， train 完的model都会存下来。在test测试哪个更好。
for epoch in tqdm(range(1, epochs+1)):
    
    model.train()
    
    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        model.zero_grad()
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }       

        outputs = model(**inputs)
        
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()
        
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
    if epoch%5==0:
      torch.save(model.state_dict(), f'data_volume/finetuned_BERT_epoch_{epoch}.model')
        
    tqdm.write(f'\nEpoch {epoch}')
    
    loss_train_avg = loss_train_total/len(dataloader_train)            
    tqdm.write(f'Training loss: {loss_train_avg}')
    
    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')

HBox(children=(FloatProgress(value=0.0, max=120.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='Epoch 1', max=203.0, style=ProgressStyle(description_widt…


Epoch 1
Training loss: 0.6785140959500092
Validation loss: 0.6712562131881714
F1 Score (Weighted): 0.5418317519140671


HBox(children=(FloatProgress(value=0.0, description='Epoch 2', max=203.0, style=ProgressStyle(description_widt…


Epoch 2
Training loss: 0.6597746933622314
Validation loss: 0.6535996437072754
F1 Score (Weighted): 0.628871731578029


HBox(children=(FloatProgress(value=0.0, description='Epoch 3', max=203.0, style=ProgressStyle(description_widt…


Epoch 3
Training loss: 0.6480594885173102
Validation loss: 0.6437593364715576
F1 Score (Weighted): 0.6326691600763299


HBox(children=(FloatProgress(value=0.0, description='Epoch 4', max=203.0, style=ProgressStyle(description_widt…


Epoch 4
Training loss: 0.6391171058410494
Validation loss: 0.6358687591552734
F1 Score (Weighted): 0.6416584928078951


HBox(children=(FloatProgress(value=0.0, description='Epoch 5', max=203.0, style=ProgressStyle(description_widt…


Epoch 5
Training loss: 0.6300435952952342
Validation loss: 0.631873025894165
F1 Score (Weighted): 0.6426786278017468


HBox(children=(FloatProgress(value=0.0, description='Epoch 6', max=203.0, style=ProgressStyle(description_widt…


Epoch 6
Training loss: 0.6208144909348982
Validation loss: 0.6248991560935974
F1 Score (Weighted): 0.6407317598485458


HBox(children=(FloatProgress(value=0.0, description='Epoch 7', max=203.0, style=ProgressStyle(description_widt…


Epoch 7
Training loss: 0.608823107234363
Validation loss: 0.6209845995903015
F1 Score (Weighted): 0.6457564904800346


HBox(children=(FloatProgress(value=0.0, description='Epoch 8', max=203.0, style=ProgressStyle(description_widt…


Epoch 8
Training loss: 0.6025480506161751
Validation loss: 0.6162431979179382
F1 Score (Weighted): 0.6542225567330913


HBox(children=(FloatProgress(value=0.0, description='Epoch 9', max=203.0, style=ProgressStyle(description_widt…


Epoch 9
Training loss: 0.5975321529827682
Validation loss: 0.6168119215965271
F1 Score (Weighted): 0.6534116636833657


HBox(children=(FloatProgress(value=0.0, description='Epoch 10', max=203.0, style=ProgressStyle(description_wid…


Epoch 10
Training loss: 0.5846113057559347
Validation loss: 0.6167315137386322
F1 Score (Weighted): 0.6539879304096822


HBox(children=(FloatProgress(value=0.0, description='Epoch 11', max=203.0, style=ProgressStyle(description_wid…


Epoch 11
Training loss: 0.5770710091579137
Validation loss: 0.6146855366230011
F1 Score (Weighted): 0.651752619771053


HBox(children=(FloatProgress(value=0.0, description='Epoch 12', max=203.0, style=ProgressStyle(description_wid…


Epoch 12
Training loss: 0.5650892099136202
Validation loss: 0.6122776043415069
F1 Score (Weighted): 0.6579454638447236


HBox(children=(FloatProgress(value=0.0, description='Epoch 13', max=203.0, style=ProgressStyle(description_wid…


Epoch 13
Training loss: 0.5582490191964681
Validation loss: 0.611037653684616
F1 Score (Weighted): 0.6611489809958658


HBox(children=(FloatProgress(value=0.0, description='Epoch 14', max=203.0, style=ProgressStyle(description_wid…


Epoch 14
Training loss: 0.552019413468873
Validation loss: 0.6106222748756409
F1 Score (Weighted): 0.6629572519202588


HBox(children=(FloatProgress(value=0.0, description='Epoch 15', max=203.0, style=ProgressStyle(description_wid…


Epoch 15
Training loss: 0.5432107541067847
Validation loss: 0.6145997714996337
F1 Score (Weighted): 0.6657422486732003


HBox(children=(FloatProgress(value=0.0, description='Epoch 16', max=203.0, style=ProgressStyle(description_wid…


Epoch 16
Training loss: 0.5327325610104453
Validation loss: 0.6134751987457275
F1 Score (Weighted): 0.6644320471511375


HBox(children=(FloatProgress(value=0.0, description='Epoch 17', max=203.0, style=ProgressStyle(description_wid…


Epoch 17
Training loss: 0.5285175856698323
Validation loss: 0.6129067873954773
F1 Score (Weighted): 0.6705423687940467


HBox(children=(FloatProgress(value=0.0, description='Epoch 18', max=203.0, style=ProgressStyle(description_wid…


Epoch 18
Training loss: 0.5174122401352587
Validation loss: 0.6159235250949859
F1 Score (Weighted): 0.67396773539875


HBox(children=(FloatProgress(value=0.0, description='Epoch 19', max=203.0, style=ProgressStyle(description_wid…


Epoch 19
Training loss: 0.509496054479054
Validation loss: 0.6166990303993225
F1 Score (Weighted): 0.675865167482695


HBox(children=(FloatProgress(value=0.0, description='Epoch 20', max=203.0, style=ProgressStyle(description_wid…


Epoch 20
Training loss: 0.4977314815732646
Validation loss: 0.6207395696640015
F1 Score (Weighted): 0.6752626756687868


HBox(children=(FloatProgress(value=0.0, description='Epoch 21', max=203.0, style=ProgressStyle(description_wid…


Epoch 21
Training loss: 0.4844226650710176
Validation loss: 0.6263114011287689
F1 Score (Weighted): 0.6772877816775669


HBox(children=(FloatProgress(value=0.0, description='Epoch 22', max=203.0, style=ProgressStyle(description_wid…


Epoch 22
Training loss: 0.4749851521893675
Validation loss: 0.6322383725643158
F1 Score (Weighted): 0.6825301730231181


HBox(children=(FloatProgress(value=0.0, description='Epoch 23', max=203.0, style=ProgressStyle(description_wid…


Epoch 23
Training loss: 0.4708878753220507
Validation loss: 0.6412220525741578
F1 Score (Weighted): 0.6707566989531987


HBox(children=(FloatProgress(value=0.0, description='Epoch 24', max=203.0, style=ProgressStyle(description_wid…


Epoch 24
Training loss: 0.4499226017244931
Validation loss: 0.6438527810573578
F1 Score (Weighted): 0.6797261091530621


HBox(children=(FloatProgress(value=0.0, description='Epoch 25', max=203.0, style=ProgressStyle(description_wid…


Epoch 25
Training loss: 0.44781659191171524
Validation loss: 0.6424801850318909
F1 Score (Weighted): 0.6784055029105523


HBox(children=(FloatProgress(value=0.0, description='Epoch 26', max=203.0, style=ProgressStyle(description_wid…


Epoch 26
Training loss: 0.4395357145758098
Validation loss: 0.6461624228954315
F1 Score (Weighted): 0.6705423687940467


HBox(children=(FloatProgress(value=0.0, description='Epoch 27', max=203.0, style=ProgressStyle(description_wid…


Epoch 27
Training loss: 0.43922592171013647
Validation loss: 0.6571436631679535
F1 Score (Weighted): 0.6712834920429223


HBox(children=(FloatProgress(value=0.0, description='Epoch 28', max=203.0, style=ProgressStyle(description_wid…


Epoch 28
Training loss: 0.42731556780819824
Validation loss: 0.6645307743549347
F1 Score (Weighted): 0.6700746796092761


HBox(children=(FloatProgress(value=0.0, description='Epoch 29', max=203.0, style=ProgressStyle(description_wid…


Epoch 29
Training loss: 0.417920347798634
Validation loss: 0.668642406463623
F1 Score (Weighted): 0.6702474274165672


HBox(children=(FloatProgress(value=0.0, description='Epoch 30', max=203.0, style=ProgressStyle(description_wid…


Epoch 30
Training loss: 0.40604402969036196
Validation loss: 0.6723187482357025
F1 Score (Weighted): 0.6656557758526856


HBox(children=(FloatProgress(value=0.0, description='Epoch 31', max=203.0, style=ProgressStyle(description_wid…


Epoch 31
Training loss: 0.3979977686416927
Validation loss: 0.6805819034576416
F1 Score (Weighted): 0.6666804001211631


HBox(children=(FloatProgress(value=0.0, description='Epoch 32', max=203.0, style=ProgressStyle(description_wid…


Epoch 32
Training loss: 0.3935429509637391
Validation loss: 0.6842266774177551
F1 Score (Weighted): 0.6666804001211631


HBox(children=(FloatProgress(value=0.0, description='Epoch 33', max=203.0, style=ProgressStyle(description_wid…


Epoch 33
Training loss: 0.3692384897341282
Validation loss: 0.6917610383033752
F1 Score (Weighted): 0.6681602269698939


HBox(children=(FloatProgress(value=0.0, description='Epoch 34', max=203.0, style=ProgressStyle(description_wid…


Epoch 34
Training loss: 0.37161612635469204
Validation loss: 0.7052259719371796
F1 Score (Weighted): 0.6690343178453317


HBox(children=(FloatProgress(value=0.0, description='Epoch 35', max=203.0, style=ProgressStyle(description_wid…


Epoch 35
Training loss: 0.36330739735382533
Validation loss: 0.7058292150497436
F1 Score (Weighted): 0.6735557224643004


HBox(children=(FloatProgress(value=0.0, description='Epoch 36', max=203.0, style=ProgressStyle(description_wid…


Epoch 36
Training loss: 0.35361511675007823
Validation loss: 0.7233794951438903
F1 Score (Weighted): 0.6638479584651803


HBox(children=(FloatProgress(value=0.0, description='Epoch 37', max=203.0, style=ProgressStyle(description_wid…


Epoch 37
Training loss: 0.34496196752111313
Validation loss: 0.7317623996734619
F1 Score (Weighted): 0.6671663487276919


HBox(children=(FloatProgress(value=0.0, description='Epoch 38', max=203.0, style=ProgressStyle(description_wid…


Epoch 38
Training loss: 0.33375015040042955
Validation loss: 0.7356278729438782
F1 Score (Weighted): 0.6731691292043106


HBox(children=(FloatProgress(value=0.0, description='Epoch 39', max=203.0, style=ProgressStyle(description_wid…

KeyboardInterrupt: ignored

In [None]:
#后面就是拿train完的model在test做测试
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=2,
                                                      output_attentions=False,
                                                      output_hidden_states=False)

model.to(device)

In [None]:

model.load_state_dict(torch.load('data_volume/finetuned_BERT_epoch_10.model', map_location=torch.device('cuda')))

In [None]:
_, predictions, true_vals = evaluate(dataloader_test)

In [None]:
accuracy_per_class(predictions, true_vals)