In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
source_folder = '/content/drive/My Drive/data/bert_classifier_1/'
destination_folder = '/content/drive/My Drive/data/bert_classifier_1/'

In [3]:
!pip install transformers



In [4]:
!pip install torchtext==0.6.0



In [5]:
import pandas as pd
import numpy as np

import torch
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

from transformers import BertTokenizer, BertForSequenceClassification

In [6]:
df_train = pd.read_csv(source_folder + 'train.tsv', sep='\t')
df_test = pd.read_csv(source_folder + 'test.tsv', sep='\t')

In [7]:
train_titles = {row['index']: row['title'] for _, row in df_train.iterrows()}
train_classes = {row['index']: row['class'] for _, row in df_train.iterrows()}

valid_titles = {row['index']: row['title'] for _, row in df_test.iterrows()}
valid_classes = {row['index']: row['class'] for _, row in df_test.iterrows()}

In [16]:
train_titles

{0: '亞洲杯奪冠賠率：日本、伊朗領銜 中國竟與泰國並列',
 1: '9輪4球本土射手僅次武磊 黃紫昌要搶最強U23頭銜',
 2: '如果今年勇士奪冠，下賽季詹姆斯何去何從？',
 3: '超級替補！科斯塔本賽季替補出場貢獻7次助攻',
 4: '騎士6天里發生了啥？從首輪搶七到次輪3-0猛龍',
 5: '如果朗多進入轉會市場，哪些球隊適合他？',
 6: '詹姆斯G3決殺，你怎麼看？',
 7: '大魔王帶頭唱歌！火箭這像是打季後賽？爵士神帥這話已提前投降了',
 8: '馬夏爾要去切爾西？可以商量，不過穆里尼奧的要價是4000萬加威廉',
 9: '利希施泰納宣佈賽季結束後離隊：我需要新的挑戰',
 10: '怎麼樣看待大連一方在中超聯賽第九輪取得的賽季首勝？',
 11: '科勒·卡戴珊與男友TT共進午餐，曾在他懷孕期間偷腥的渣男被原',
 12: '作為央視體育體育頻道，CCTV5一到週末就直播馬拉松你怎麼看？',
 13: '如果2018騎士奪冠，詹姆斯這個冠軍的含金量有多大？',
 14: '昔日中超金靴半場獨造6球虐爆遼足 華夏送走他後悔嗎？',
 15: 'NBA歷史排名前十都有誰？',
 16: '你希望利物浦贏得歐冠嗎？巴薩主帥巴爾韋德的回答耐人尋味',
 17: '絕殺！詹姆斯38+7再創傳奇一刻，兩護法創另類神跡更功不可沒',
 18: '再現絕殺！今天的老詹怎麼吹',
 19: '拜仁3比1逆轉科隆，J羅現世界級做餅',
 20: '白送一個點球！恆大都不要？卡納瓦羅笑容真有意思',
 21: '劉詩雯戴罪立功取勝平野美宇為中國女乒拿到世乒賽冠軍，怎麼看待這場比賽？',
 22: '天堂與地獄是有多近？看看比賽最後8秒的猛龍就知道了',
 23: '魯能隊員的身高',
 24: 'WKG&M-1世界綜合格鬥賽明星見面會引爆鵬城',
 25: '如何看待第一詹黑皮爾斯，公開贊美詹姆斯？',
 26: '粵媒：缺少外援的恆大還是不行',
 27: '熱刺作死的節奏！連墊底隊都輸，歐冠席位懸了？',
 28: '德帥談沒球隊面試布茲德里克：出於自私的原因，他在這讓我開心',
 29: '雙刃劍！山東魯能高中鋒讓人又愛又恨',
 30: '克洛普：希望凱塔能迅速起到作用，可能會邀請他去基輔',
 31: '在歐洲是奪歐冠難，還是奪聯賽難？這

In [8]:
ALL_NEWS_CLASSES = ['體育', '財經', '科技', '旅遊', '農業', '遊戲']

In [9]:
MODEL_NAME = 'bert-base-chinese'

In [10]:
# 建立數據集
class NewsDataset(Dataset):
    def __init__(self, tokenizer, titles, classes):
        self.tokenizer = tokenizer
        self.indexes = []
        self.texts = []
        self.labels = []
        for index in titles:
            self.indexes.append(index)
            self.texts.append(titles[index])
            self.labels.append(classes[index])

    def __getitem__(self, idx):
        text = self.texts[idx]

        input = self.tokenizer(text, return_tensors='pt')
        label = torch.tensor(ALL_NEWS_CLASSES.index(self.labels[idx]))

        return input, label

    def __len__(self):
        return len(self.indexes)


def create_mini_batch(samples):
    input_ids = []
    token_type_ids = []
    attention_mask = []
    labels = []
    for s in samples:
        input_ids.append(s[0]['input_ids'].squeeze(0))
        token_type_ids.append(s[0]['token_type_ids'].squeeze(0))
        attention_mask.append(s[0]['attention_mask'].squeeze(0))
        labels.append(s[1])

    # zero pad 到同一序列長度
    input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=0)
    token_type_ids = torch.nn.utils.rnn.pad_sequence(token_type_ids, batch_first=True, padding_value=0)
    attention_mask = torch.nn.utils.rnn.pad_sequence(attention_mask, batch_first=True, padding_value=0)
 
    labels = torch.stack(labels)

    return input_ids, token_type_ids, attention_mask, labels

In [11]:
batch_size = 32

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME, do_lower_case=False)

train_dataset = NewsDataset(tokenizer, train_titles, train_classes)
valid_dataset = NewsDataset(tokenizer, valid_titles, valid_classes)

train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, collate_fn=create_mini_batch, shuffle=True)
valid_loader = DataLoader(dataset=valid_dataset, batch_size=batch_size, collate_fn=create_mini_batch)

In [12]:
def train_batch(model, data, optimizer, device):
    model.train()
    input_ids, token_type_ids, attention_mask, labels = [d.to(device) for d in data]

    outputs = model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask, labels=labels)
    loss = outputs.loss

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()

In [13]:
def evaluate(model, valid_loader):
    model.eval()
    device = 'cuda' if next(model.parameters()).is_cuda else 'cpu'

    tot_count = 0
    tot_loss = 0
    tot_correct = 0

    with torch.no_grad():
        for data in valid_loader:
            input_ids, token_type_ids, attention_mask, labels = [d.to(device) for d in data]

            outputs = model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask, labels=labels)
            
            tot_count += input_ids.size(0)
            tot_loss += outputs.loss.item()
            tot_correct += (outputs.logits.argmax(dim=-1) == labels).sum().item()
    
    evaluation = {
        'loss': tot_loss / tot_count,
        'acc': tot_correct / tot_count
    }
    return evaluation

In [14]:
# 訓練模型
epochs = 5
lr = 0.0001

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=6, return_dict=True)
model.to(device)

optimizer = optim.Adam(model.parameters(), lr=lr)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.1)

for epoch in range(1, epochs + 1):
    print(f'epoch: {epoch}')

    for i, train_data in enumerate(train_loader):
        loss = train_batch(model, train_data, optimizer, device)
        train_size = train_data[0].size(0)

        if i % 10 == 0:
            print('train_loss: ', loss / train_size)

    evaluation = evaluate(model, valid_loader)
    print('valid_evaluation: loss={loss}, acc={acc}'.format(**evaluation))

    scheduler.step()

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

epoch: 1
train_loss:  0.06013582646846771
train_loss:  0.022713661193847656
train_loss:  0.028812989592552185
train_loss:  0.027789849787950516
train_loss:  0.01726982556283474
train_loss:  0.012641426175832748
valid_evaluation: loss=0.012522723814472557, acc=0.89
epoch: 2
train_loss:  0.01023040246218443
train_loss:  0.012173905968666077
train_loss:  0.014499874785542488
train_loss:  0.0020464013796299696
train_loss:  0.004572609439492226
train_loss:  0.020038317888975143
valid_evaluation: loss=0.012667245777944723, acc=0.8933333333333333
epoch: 3
train_loss:  0.007667989004403353
train_loss:  0.004125501029193401
train_loss:  0.005154577549546957
train_loss:  0.005361606832593679
train_loss:  0.001187838613986969
train_loss:  0.006689980626106262
valid_evaluation: loss=0.011159320523341497, acc=0.895
epoch: 4
train_loss:  0.002403319114819169
train_loss:  0.0011242839973419905
train_loss:  0.0007949933642521501
train_loss:  0.009006734006106853
train_loss:  0.004859760869294405
train

In [34]:
test_text = {0: '旅行達人告訴你，旅行不管短途或是長途別忘了帶這些，乾淨還衛生'}

test_data_for_model = NewsDataset(tokenizer, test_text, ALL_NEWS_CLASSES)

In [40]:
# for test
with torch.no_grad():
  input_ids, token_type_ids, attention_mask, labels = create_mini_batch(test_data_for_model)
  outputs = model(input_ids=input_ids.to(device), token_type_ids=token_type_ids.to(device), attention_mask=attention_mask.to(device), labels=labels.to(device))
  print(ALL_NEWS_CLASSES[outputs[1].argmax(dim=-1)[0]])

旅遊
