# GPUデバイスの確認
最新の Tesla T4 でも動かせなかったので、それ以下の GPU ではおそらく動きません。

In [None]:
# Show the GPU(s) visible to this runtime as reported by the NVIDIA driver tool.
!nvidia-smi

In [None]:
import torch

# Choose the compute device once: CUDA when PyTorch can see a GPU, CPU otherwise.
gpu_found = torch.cuda.is_available()
device = torch.device("cuda" if gpu_found else "cpu")

# Report which device was picked.
if gpu_found:
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

# 準備
## 乾研の日本語BERT用にMeCabをインストール
## Hugging Faceのtransformersなど、Pythonパッケージをインストール

In [None]:
# Install MeCab, its development headers, the UTF-8 IPA dictionary and helper tools via apt.
!apt-get -q -y install sudo file mecab libmecab-dev mecab-ipadic-utf8 git curl python-mecab

# Shallow-clone the mecab-ipadic-NEologd dictionary and run its installer.
# `echo yes |` presumably answers the installer's confirmation prompt — TODO confirm.
!git clone --depth 1 https://github.com/neologd/mecab-ipadic-neologd.git
!echo yes | mecab-ipadic-neologd/bin/install-mecab-ipadic-neologd -n

# Write a copy of /etc/mecabrc whose dictionary path points at the NEologd dictionary.
!sed -e "s!/var/lib/mecab/dic/debian!/usr/lib/x86_64-linux-gnu/mecab/dic/mecab-ipadic-neologd!g" /etc/mecabrc > /etc/mecabrc.new

# Keep a backup of the original config, then activate the rewritten one.
!cp /etc/mecabrc /etc/mecabrc.org
!cp /etc/mecabrc.new /etc/mecabrc

# Build prerequisite (swig) and the Python bindings for MeCab.
!apt-get -q -y install swig
!pip install mecab-python3

In [None]:
!pip install pytorch-transformers transformers torchtext nltk neologdn emoji

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import torch.nn.functional as F

import torchtext
import pandas as pd
from sklearn.model_selection import train_test_split
import pickle
import string
import re

from torchtext.vocab import Vectors
from pytorch_transformers import BertModel, BertConfig, BertForSequenceClassification, BertTokenizer

import random
import math
import numpy as np
import json
import time

# Seed every random number generator used in this notebook with the same value.
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)

# First CUDA device when available, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

# Hugging Faceのtransformersから日本語BERTを取得

In [None]:
# Name of the pretrained Japanese BERT checkpoint handed to `from_pretrained`.
# NOTE(review): newer transformers releases may require the namespaced id
# `cl-tohoku/bert-base-japanese-whole-word-masking` — confirm against the installed version.
bert = 'bert-base-japanese-whole-word-masking'

In [None]:
from transformers import BertForSequenceClassification
# Load the checkpoint named in `bert` with a 9-class sequence-classification head
# (one class per livedoor category directory).
# NOTE(review): the built-in cross-entropy loss expects integer class ids in
# 0..num_labels-1 — make sure the label column uses that range.
model = BertForSequenceClassification.from_pretrained(bert, num_labels=9)
# Move the model parameters to the selected device.
model.to(device)
# Print the classification head for a quick visual check.
print(model.classifier)

In [None]:
from transformers import BertJapaneseTokenizer
# Load the tokenizer for the same checkpoint as the model by reusing `bert`,
# so model and tokenizer cannot drift apart if the checkpoint name is changed.
tokenizer = BertJapaneseTokenizer.from_pretrained(bert)
# Smoke test: tokenize a sample sentence and display the resulting tokens.
tokenizer.tokenize('お腹が痛いので遅れます。')

# livedoorニュースコーパス取得
## pandasで読み込んで、使う

In [None]:
!mkdir dataset
!cd dataset
!wget https://www.rondhuit.com/download/ldcc-20140209.tar.gz
!tar zxvf ldcc-20140209.tar.gz

In [None]:
# Create (or truncate) ./text/livedoor.tsv and write the tab-separated header row.
# Run this cell before the per-category loops, which append to the same file.
!echo -e "filename\ttitle\tarticle\tlabel" > ./text/livedoor.tsv

In [None]:
%%bash
# Append one TSV row per article to ./text/livedoor.tsv:
#   filename <TAB> title (3rd line of the file) <TAB> body (everything after line 3) <TAB> label
# Categories are numbered 1..9 in the order listed below.
# The command substitutions are intentionally unquoted so that the shell collapses
# newlines and runs of whitespace, keeping every article on a single TSV line.
label=1
for category in dokujo-tsushin it-life-hack kaden-channel livedoor-homme movie-enter peachy smax sports-watch topic-news; do
    for filename in `basename -a ./text/$category/$category-*`; do
        echo -n "$filename"
        echo -ne "\t"
        echo -n `sed -n '3p' ./text/$category/$filename`
        echo -ne "\t"
        echo -n `sed -e '1,3d' ./text/$category/$filename`
        echo -e "\t$label"
    done
    label=$((label + 1))
done >> ./text/livedoor.tsv

In [None]:
mkdir data

# 読み込み

In [None]:
# Load the combined corpus (columns: filename, title, article, label).
df = pd.read_csv("./text/livedoor.tsv", delimiter='\t')
# The TSV numbers the categories 1..9, but a 9-class classification head with a
# cross-entropy loss needs class ids 0..8 (label 9 would be out of range).
# Shifting here, in the same cell as the read, keeps the cell idempotent.
df['label'] = df['label'] - 1

In [None]:
# Display the loaded DataFrame as a visual sanity check.
df

In [None]:
# Hold out 10% of the corpus as the test set. A fixed random_state keeps the split
# identical on every run instead of depending on the global NumPy RNG state.
train, test = train_test_split(df, test_size=0.1, random_state=0)

In [None]:
# Carve the validation set (10% of the remaining rows) out of the training data,
# again with a fixed random_state so the split is repeatable.
# NOTE: this rebinds `train`; re-run the train/test split cell before re-running this one.
train, val = train_test_split(train, test_size=0.1, random_state=0)

In [None]:
# The model input is capped at 128 tokens, so the short `title` column is the safe choice.
# NOTE(review): the original note says `article` would also work, but that part of
# the text would not make it into the input — confirm before switching columns.
train, val, test = (frame[['title', 'label']] for frame in (train, val, test))

In [None]:
# Write each split as a tab-separated file without the index column.
for frame, path in ((train, 'data/train.tsv'),
                    (val, 'data/val.tsv'),
                    (test, 'data/test.tsv')):
    frame.to_csv(path, sep='\t', index=False)

In [None]:
import nltk
import neologdn
import unicodedata
import emoji

def cleaning(sentence):
    """Normalize a raw sentence before it is handed to the tokenizer.

    Steps, in order: lower-case, remove http(s) URLs, replace every number with
    a single ``0``, apply ``neologdn`` and Unicode NFKC normalization, and
    finally delete every character listed in ``symbols``.

    Args:
        sentence: Text to clean. An iterable of strings is concatenated first;
            a plain ``str`` passes through that step unchanged.

    Returns:
        str: The cleaned text.
    """
    # Characters to delete. Normalization runs before the deletion, so full-width
    # variants have already been folded to their ASCII forms by then.
    symbols = ("◆□■△▲▽▼※〒→←↑↓〓∈∋⊆⊇⊂⊃∪∩∧∨￢⇒⇔∀∃∠⊥"
                   "⌒∂∇≡≒≪≫√∽∝∵∫∬Å‰♯♭♪†‡¶◯①②③④⑤⑥⑦⑧⑨"
                   "⑩⑪⑫⑬⑭⑮⑯⑰⑱⑲⑳ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩ㍉㌔㌢㍍㌘㌧㌃㌶㍑㍗"
                   "㌍㌦㌣㌫㍊㌻㎜㎝㎞㎎㎏㏄㎡㍻〝〟№㏍℡㊤㊥㊦㊧㊨㈱㈲㈹㍾㍽㍼∮"
                   "∑∟⊿ⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹ￤＇＂◇◎●○★☆§＠＊＆＃％￡￠＄"
                   "￥℃″′°♀♂∴∞≧≦＞＜≠＝÷×±－＋】【』『」「》《〉〈｝"
                   "｛］［〕〔）（”“’‘‥…｜∥～＼／‐―〆々仝〃ゞゝヾヽ＿￣＾"
                   "¨｀´゜゛；：・，　╂┸┥┰┝┿┷┨┯┠╋┻┫┳┣┗"
                   "┛┓┏┃━┼┴┤┬├└┘┐┌│─〇"
                   "\"#$%&'()*+,-/:;<=>@[\\]^_`{|}~")
    sentence = ''.join(sentence)
    sentence = sentence.lower()
    # Drop URLs before anything else so their punctuation does not survive as noise.
    sentence = re.sub(r'https?://[\w/:%#\$&\?\(\)~\.=\+\-]+', '', sentence)
    # Collapse every integer/decimal number to a single "0".
    sentence = re.sub(r'\d+\.*\d*', '0', sentence)
    sentence = neologdn.normalize(sentence)
    sentence = unicodedata.normalize("NFKC", sentence)
    # Escape the symbols so each one is matched literally inside the character
    # class. Unescaped, ",-/" formed a range that also deleted "." (which is not
    # in the list), and "\]" was read as an escaped "]" so "\" was never removed.
    sentence = re.sub("[" + re.escape(symbols) + "]", '', sentence)
    return sentence

def tokenizer_with_preprocessing(text, tokenizer=tokenizer.tokenize):
    """Clean ``text`` with ``cleaning`` and split the result into tokens.

    Args:
        text: Raw input text.
        tokenizer: Callable that maps a string to a list of tokens. Defaults to
            the ``tokenize`` method of the module-level ``tokenizer``, bound
            when this function is defined.

    Returns:
        Whatever ``tokenizer`` returns for the cleaned text.
    """
    return tokenizer(cleaning(text))

def get_DataLoaders_and_TEXT(max_length, batch_size):
    """Build the train/val/test iterators and the text field they share.

    Reads ``train.tsv``, ``val.tsv`` and ``test.tsv`` from ``./data/`` (header row
    skipped; first column is the text, second the label).

    Args:
        max_length: Fixed sequence length passed to the text field as ``fix_length``.
        batch_size: Batch size used for all three iterators.

    Returns:
        tuple: ``(train_dl, val_dl, test_dl, TEXT)``.
    """
    # Text field: tokenized by the cleaning tokenizer, with BERT's special tokens.
    text_field_options = dict(
        sequential=True,
        tokenize=tokenizer_with_preprocessing,
        use_vocab=True,
        include_lengths=True,
        batch_first=True,
        fix_length=max_length,
        init_token='[CLS]',
        eos_token='[SEP]',
        pad_token='[PAD]',
        unk_token='[UNK]',
    )
    TEXT = torchtext.data.Field(**text_field_options)
    # Label field: a single value per example that bypasses the vocabulary.
    LABEL = torchtext.data.Field(sequential=False, use_vocab=False)

    # Load the three splits.
    column_fields = [('Text', TEXT), ('Label', LABEL)]
    train_ds, val_ds, test_ds = torchtext.data.TabularDataset.splits(
        path='./data/',
        train='train.tsv',
        validation='val.tsv',
        test='test.tsv',
        format='tsv',
        skip_header=True,
        fields=column_fields,
    )

    # Build a throw-away vocabulary first (the original note says this avoids an
    # error), then replace its token-to-id mapping with the BERT tokenizer's vocabulary.
    TEXT.build_vocab(train_ds, min_freq=1)
    TEXT.vocab.stoi = tokenizer.vocab

    # Iterators: training mode for the train split, unsorted evaluation mode otherwise.
    train_dl = torchtext.data.Iterator(train_ds, batch_size=batch_size, train=True)
    eval_options = dict(batch_size=batch_size, train=False, sort=False)
    val_dl = torchtext.data.Iterator(val_ds, **eval_options)
    test_dl = torchtext.data.Iterator(test_ds, **eval_options)

    return train_dl, val_dl, test_dl, TEXT

In [None]:
# A max_length of 128 is the safe setting; the batch size can be chosen freely.
max_length = 128
batch_size = 32
train_dl, val_dl, test_dl, TEXT = get_DataLoaders_and_TEXT(max_length, batch_size)

# The training loop looks its iterators up by phase name.
dataloaders_dict = dict(train=train_dl, val=val_dl)

# BERTモデルへの入力例

In [None]:
# Take one mini-batch and run a single forward pass as a smoke test.
batch = next(iter(train_dl))
inputs = batch.Text[0].to(device)  # token ids
labels = batch.Label.to(device)  # class labels
print(inputs, labels)
# Index the output instead of tuple-unpacking it: integer indexing works both for
# plain tuples and for the dict-like output objects of newer transformers releases,
# whereas unpacking the latter yields its keys rather than the tensors.
outputs = model(input_ids=inputs, labels=labels)
loss, logit = outputs[0], outputs[1]

# BERTを文書分類でFinetuning

In [None]:
class EarlyStopping:
    """
    Early stops the training if validation loss doesn't improve after a given patience.
    based on: https://github.com/Bjarten/early-stopping-pytorch

    Call the instance once per epoch with the validation loss and the model.
    Whenever the loss improves the model's ``state_dict`` is saved to ``path``;
    after ``patience`` consecutive calls without improvement ``early_stop``
    becomes True.
    """
    def __init__(self, patience=7, verbose=False, delta=0, path='checkpoint.pt'):
        """
        Args:
            patience (int): How long to wait after last time validation loss improved.
                            Default: 7
            verbose (bool): If True, prints a message for each validation loss improvement
                            and for each epoch without improvement.
                            Default: False
            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                            Default: 0
            path (str): File the best model's state_dict is written to.
                            Default: 'checkpoint.pt'
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0          # consecutive calls without improvement
        self.best_score = None    # best (negated) validation loss seen so far
        self.early_stop = False
        # np.inf instead of the np.Inf alias, which NumPy 2.0 removed.
        self.val_loss_min = np.inf
        self.delta = delta
        self.path = path

    def __call__(self, val_loss, model):
        """Record ``val_loss`` for this epoch and update the stopping state."""
        # Negate so that "higher is better" in the comparisons below.
        score = -val_loss

        if self.best_score is None:
            # First call: always counts as the best result so far.
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            # No sufficient improvement this epoch.
            self.counter += 1
            if self.verbose:
                print('EarlyStopping counter: {} out of {}'.format(self.counter, self.patience))
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        """Save ``model.state_dict()`` to ``self.path`` and remember ``val_loss`` as the minimum."""
        if self.verbose:
            print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
                self.val_loss_min, val_loss))
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss

In [None]:
def train_model(net, dataloaders_dict, criterion, optimizer, num_epochs, patience):
    """Fine-tune ``net`` with early stopping on the validation loss.

    Args:
        net: Model called as ``net(input_ids=..., labels=...)`` whose output holds
            the loss at index 0 and the logits at index 1.
        dataloaders_dict: Mapping with ``'train'`` and ``'val'`` iterators. Each
            batch must expose ``batch.Text[0]`` (token ids) and ``batch.Label``,
            and each iterator a ``dataset`` attribute supporting ``len()``.
        criterion: Unused — the loss comes from ``net`` itself. Kept so existing
            callers keep working.
        optimizer: Optimizer stepping the trainable parameters of ``net``.
        num_epochs: Maximum number of epochs.
        patience: Number of epochs without validation improvement before stopping.

    Returns:
        ``net``. If early stopping triggers, the best weights saved in
        ``checkpoint.pt`` are restored first; otherwise the weights are those of
        the last epoch.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    net.to(device)

    # Let cuDNN pick the fastest kernels; worthwhile when input shapes are fixed.
    torch.backends.cudnn.benchmark = True

    early_stopping = EarlyStopping(patience=patience, verbose=True)

    for epoch in range(num_epochs):
        # Each epoch runs a training pass followed by a validation pass.
        for phase in ['train', 'val']:
            is_training = phase == 'train'
            if is_training:
                net.train()
            else:
                net.eval()

            loader = dataloaders_dict[phase]
            epoch_loss = 0.0     # sum of per-sample losses
            epoch_corrects = 0   # number of correct predictions

            for batch in loader:
                inputs = batch.Text[0].to(device)
                labels = batch.Label.to(device)

                optimizer.zero_grad()

                # Gradients are only tracked during the training phase.
                with torch.set_grad_enabled(is_training):
                    # NOTE(review): no attention_mask is passed, so [PAD] positions
                    # are attended to. `predict` does the same; if a mask is
                    # introduced, change both places together.
                    outputs = net(input_ids=inputs, labels=labels)
                    # Index rather than unpack so both tuple outputs and dict-like
                    # output objects are handled.
                    loss, logit = outputs[0], outputs[1]
                    _, preds = torch.max(logit, 1)

                    if is_training:
                        loss.backward()
                        optimizer.step()

                # Weight the mean batch loss by the actual number of samples in
                # this batch; the last batch may be smaller than the nominal
                # batch size.
                n_samples = inputs.size(0)
                epoch_loss += loss.item() * n_samples
                epoch_corrects += torch.sum(preds == labels.data).item()

            n_total = len(loader.dataset)
            epoch_loss = epoch_loss / n_total
            epoch_acc = epoch_corrects / n_total

            print('Epoch {}/{} | {:^5} |  Loss: {:.4f} Acc: {:.4f}'.format(epoch+1, num_epochs,
                                                                        phase, epoch_loss, epoch_acc))

            if phase == 'val':
                early_stopping(epoch_loss, net)

            if early_stopping.early_stop:
                print("Early stopping")
                # Restore the best weights saved by EarlyStopping.
                net.load_state_dict(torch.load('checkpoint.pt'))
                torch.cuda.empty_cache()
                return net

    torch.cuda.empty_cache()
    return net

In [None]:
def predict(net, test_dl):
    """Return softmax class probabilities for every example yielded by ``test_dl``.

    Args:
        net: Model called as ``net(input_ids=...)`` whose output holds the logits
            at index 0.
        test_dl: Iterable of batches exposing ``batch.Text[0]`` (token ids).

    Returns:
        numpy.ndarray: Probabilities of all batches concatenated along axis 0.
    """
    # Use the first CUDA device when available, otherwise the CPU.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    net.eval()
    net.to(device)

    batch_probs = []
    # Inference only: no gradients are needed.
    with torch.no_grad():
        for batch in test_dl:
            token_ids = batch.Text[0].to(device)
            outputs = net(input_ids=token_ids)
            probs = F.softmax(outputs[0], dim=1)
            batch_probs.append(probs.cpu().numpy())
    return np.concatenate(batch_probs, axis=0)

In [None]:
def fit(net, dataset,  num_epochs, early_stopping_rounds=10, fine_tuning_type='fast'):
    """Fine-tune ``net`` on ``dataset`` and return the trained model.

    Args:
        net: Model exposing ``net.bert.encoder.layer`` and ``net.classifier``.
        dataset: Mapping with ``'train'`` and ``'val'`` iterators, passed straight
            to ``train_model``.
        num_epochs: Maximum number of epochs.
        early_stopping_rounds: Patience for early stopping.
        fine_tuning_type: ``'fast'`` trains only the last encoder layer and the
            classifier; ``'full'`` trains every parameter.

    Returns:
        The model returned by ``train_model``.

    Raises:
        ValueError: If ``fine_tuning_type`` is neither ``'fast'`` nor ``'full'``.
    """
    if fine_tuning_type == 'fast':
        # 1. Freeze everything.
        for param in net.parameters():
            param.requires_grad = False
        # 2. Unfreeze the last encoder layer.
        for param in net.bert.encoder.layer[-1].parameters():
            param.requires_grad = True
        # 3. Unfreeze the classification head.
        for param in net.classifier.parameters():
            param.requires_grad = True
    elif fine_tuning_type == 'full':
        for param in net.parameters():
            param.requires_grad = True
    else:
        raise ValueError(
            "fine_tuning_type must be 'fast' or 'full', got {!r}".format(fine_tuning_type))

    # Optimise exactly the parameters that were unfrozen above. In 'fast' mode
    # this is the last encoder layer plus the classifier; in 'full' mode it is
    # the whole network, so 'full' really trains every layer.
    trainable_params = [param for param in net.parameters() if param.requires_grad]
    optimizer = optim.Adam(trainable_params, lr=5e-5, betas=(0.9, 0.999))

    # Passed through for interface compatibility with train_model.
    criterion = nn.CrossEntropyLoss()

    # Train and validate on the loaders given by the caller, not on a global.
    net = train_model(
        net, dataset, criterion, optimizer, num_epochs=num_epochs,
        patience=early_stopping_rounds)
    return net

In [None]:
# Fine-tune for at most 100 epochs, stopping early after 10 epochs without
# validation improvement; uses the default fine_tuning_type of `fit`.
model = fit(model, dataloaders_dict, num_epochs=100, early_stopping_rounds=10)

# predict

In [None]:
# Class probabilities for every test example, concatenated over all batches.
y_proba = predict(model, test_dl)

In [None]:
# Gather the gold labels of every test example. `test_dl` was built without
# shuffling or sorting, so iterating it again yields the same order that
# `predict` consumed and the rows of `y_proba` line up with these labels.
y_true = np.concatenate([batch.Label.cpu().numpy() for batch in test_dl], axis=0)
# Predicted class = index of the highest probability. A distinct name is used so
# the `predict` function is not shadowed.
y_pred = np.argmax(y_proba, axis=1)

for true_label, predicted_label in zip(y_true, y_pred):
    print(true_label, predicted_label)

print('Accuracy: {:.4f}'.format((y_true == y_pred).mean()))

# モデルのsaveとload

In [None]:
# Write the fine-tuned model to ./classification_model/.
model.save_pretrained('./classification_model/') # save
# Rebuild the model from that directory; this rebinds `model` to the reloaded copy.
# NOTE(review): the reloaded model is not moved to `device` here — `predict`
# moves it itself, other code may need an explicit `.to(device)`.
model = BertForSequenceClassification.from_pretrained('./classification_model') # load