<a href="https://colab.research.google.com/github/diqnfl777/2022F-Ajou-ML-TEAM3/blob/main/LibraryModelMultiLang.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##도서관 다국어 제목 분류 모델

### 패키지 설치 및 드라이브 마운트

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
# 트랜스포머 설치
!pip install transformers

In [None]:
from transformers import AutoTokenizer
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb

from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm, tqdm_notebook


### 데이터 가공 파트


In [None]:
#데이터 가공
#청구기호 숫자 앞 2개만 따와서 각각 매핑. 65~70은 예외적으로 경영학
data = pd.read_csv("/content/gdrive/MyDrive/LibraryCsv/hapbontocsv.CSV",encoding = 'cp949')
data = data.loc[:,['제목', '청구번호']]
data = data.dropna()
data

Unnamed: 0,제목,청구번호
0,海南 黃山理 分吐遺蹟.1,951.988 a
1,대학수학능력시험의 영어 독해문제 분석 및 개선방안,000 이66대
2,韓民族 統一聯邦國家 建設을 위한 올바른 歷史의 認識과 그 成就를 위한 提言 : 지...,000 이883한
3,The consultant's calling : bringing who you ar...,001 B445c
4,The flawless consulting fieldbook and companio...,001 B651f
...,...,...
436311,파라과이 便覽,R 989.2 대658파
436312,우루과이 便覽,R 989.5 대658우 `96
436313,뉴질랜드 便覽,R 993 대658뉴'96
436314,Year book Australia,R 994 C346y `86


In [None]:
import re 
data['청구번호'] = data['청구번호'].replace({r'(.*?)(\d{2})\d.*' : r'\2'}, regex=True)
data

Unnamed: 0,제목,청구번호
0,海南 黃山理 分吐遺蹟.1,95
1,대학수학능력시험의 영어 독해문제 분석 및 개선방안,00
2,韓民族 統一聯邦國家 建設을 위한 올바른 歷史의 認識과 그 成就를 위한 提言 : 지...,00
3,The consultant's calling : bringing who you ar...,00
4,The flawless consulting fieldbook and companio...,00
...,...,...
436311,파라과이 便覽,98
436312,우루과이 便覽,98
436313,뉴질랜드 便覽,99
436314,Year book Australia,99


In [None]:
data['청구번호'] = pd.to_numeric(data['청구번호'],errors = 'coerce')
data = data.dropna()
data.loc[(data['청구번호'] < 10), ['청구번호']] = 1000
data.loc[(data['청구번호'] < 20), ['청구번호']] = 1001
data.loc[(data['청구번호'] < 30), ['청구번호']] = 1002
data.loc[(data['청구번호'] < 40), ['청구번호']] = 1003
data.loc[(data['청구번호'] < 50), ['청구번호']] = 1004
data.loc[(data['청구번호'] < 60), ['청구번호']] = 1005
data.loc[(data['청구번호'] < 65), ['청구번호']] = 1006 #경영학쪽은 따로 분류이기 떄문에 65 사용
data.loc[(data['청구번호'] < 70), ['청구번호']] = 1007 #즉, 1007 쪽이 경영학책
data.loc[(data['청구번호'] < 80), ['청구번호']] = 1008
data.loc[(data['청구번호'] < 90), ['청구번호']] = 1009
data.loc[(data['청구번호'] <  100), ['청구번호']] = 1010

labels = {'0':'총류',
          '1':'철학',
          '2':'종교',
          '3':'사회학',
          '4':'언어',
          '5':'자연과학',
          '6':'기술과학',
          '7':'경영학',
          '8':'예술',
          '9':'문학',
          '10':'역사'
          }

In [None]:
data['청구번호'] = data['청구번호']%1000
data['청구번호'] = data['청구번호'].astype(int)

In [None]:
'''
data['청구번호'] = data['청구번호'].apply(str) #스트링 변환, 참고용. 모델 돌리려면 실행하면 안됩니다
data['청구번호'] = data['청구번호'].replace(labels)
data
'''

In [None]:
#input_string = "Peace-building and development in Guatemala and Northern Ireland"
regex = '[0-9|A-Z|a-z|ㄱ-ㅎ|ㅏ-ㅣ|가-힣|\s]*[ㄱ-ㅎ|ㅏ-ㅣ|가-힣][0-9|A-Z|a-z|ㄱ-ㅎ|ㅏ-ㅣ|가-힣|\s]*'
kor_data = data[data.제목.str.fullmatch(regex)]                                 # 한글이 있다면 따로 빼내기
eng_data = pd.concat([data, kor_data, kor_data]).drop_duplicates(keep=False)

kor_data

Unnamed: 0,제목,청구번호
1,대학수학능력시험의 영어 독해문제 분석 및 개선방안,0
5,위험한 생각들,0
34,무지의 사전,0
38,21세기 지구에 등장한 새로운 지식,0
42,두 문화,0
...,...,...
436189,독도연감,10
436250,중국현대사사전,10
436276,이슬람 사전,10
436281,싱가포르 편람,10


### 학습

In [None]:
from transformers import BertModel

class BertClassifier(nn.Module):

    def __init__(self, dropout=0.7):
        super(BertClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-multilingual-cased')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 11)
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):
        _, pooled_output = self.bert(input_ids= input_id, attention_mask=mask,return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.relu(linear_output)

        return final_layer

In [None]:
from tokenizers import Tokenizer
from transformers import BertTokenizer, BertForSequenceClassification, BertConfig


en_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

class EN_Dataset(torch.utils.data.Dataset):
    def __init__(self, df):
        self.labels =  df['청구번호'].values
        self.dic = [en_tokenizer(text, padding='max_length', max_length = 64, truncation=True, return_tensors="pt") for text in df['제목']]
    
    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def __getitem__(self, idx):

        batch_texts = self.dic[idx]
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y

In [None]:
eng_data_sample = eng_data#.sample(frac = 0.8,random_state= 321) 
np.random.seed(212)
df_train, df_val, df_test = np.split(eng_data_sample.sample(frac=1, random_state=42), 
                                     [int(.8*len(eng_data_sample)), int(.9*len(eng_data_sample))])
print("train:", len(df_train), "val:",len(df_val),"test:", len(df_test))

train: 250820 val: 31353 test: 31353


In [None]:
from torch.optim import AdamW
from tqdm import tqdm
from transformers.optimization import get_cosine_schedule_with_warmup


def train(model, train_data, val_data, learning_rate, epochs):

    train, val = EN_Dataset(train_data), EN_Dataset(val_data)

    warmup_rate = 0.1
    batch_size=64
    num_total_steps = (len(df_train) // batch_size) * epochs
    num_warmup_steps = num_total_steps * warmup_rate

    train_dataloader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True,num_workers=3)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=batch_size)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    criterion = nn.CrossEntropyLoss()
    optimizer = AdamW(model.parameters(), lr= learning_rate)
    scheduler = get_cosine_schedule_with_warmup(optimizer, 
                                                         num_warmup_steps=num_warmup_steps, 
                                                         num_training_steps=num_total_steps)
    if use_cuda:
            print("using cuda!")
            model = model.cuda()
            criterion = criterion.cuda()

    for epoch_num in range(epochs):

            total_acc_train = 0
            total_loss_train = 0

            for i, (train_input, train_label) in enumerate(tqdm(train_dataloader)) :

                train_label = train_label.to(device)
                mask = train_input['attention_mask'].to(device)
                input_id = train_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)
                
                batch_loss = criterion(output, train_label.long())
                total_loss_train += batch_loss.item()
                
                acc = (output.argmax(dim=1) == train_label).sum().item()
                total_acc_train += acc

                model.zero_grad()
                batch_loss.backward()
                optimizer.step()
                scheduler.step()
            
            total_acc_val = 0
            total_loss_val = 0

            with torch.no_grad():

                for val_input, val_label in val_dataloader:

                    val_label = val_label.to(device)
                    mask = val_input['attention_mask'].to(device)
                    input_id = val_input['input_ids'].squeeze(1).to(device)

                    output = model(input_id, mask)

                    batch_loss = criterion(output, val_label.long())
                    total_loss_val += batch_loss.item()
                    
                    acc = (output.argmax(dim=1) == val_label).sum().item()
                    total_acc_val += acc
            
            print(
                f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .4f} \
                | Train Accuracy: {total_acc_train / len(train_data): .3f} \
                | Val Loss: {total_loss_val / len(val_data): .4f} \
                | Val Accuracy: {total_acc_val / len(val_data): .3f}') 
                  
EPOCHS = 4
model = BertClassifier()
LR = 0.000005          
train(model, df_train, df_val, LR, EPOCHS)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


using cuda!


100%|██████████| 3920/3920 [44:14<00:00,  1.48it/s]


Epochs: 1 | Train Loss:  0.018                 | Train Accuracy:  0.630                 | Val Loss:  0.013                 | Val Accuracy:  0.731


100%|██████████| 3920/3920 [43:57<00:00,  1.49it/s]


Epochs: 2 | Train Loss:  0.012                 | Train Accuracy:  0.767                 | Val Loss:  0.012                 | Val Accuracy:  0.753


100%|██████████| 3920/3920 [44:27<00:00,  1.47it/s]


Epochs: 3 | Train Loss:  0.009                 | Train Accuracy:  0.813                 | Val Loss:  0.012                 | Val Accuracy:  0.759


100%|██████████| 3920/3920 [44:25<00:00,  1.47it/s]


Epochs: 4 | Train Loss:  0.008                 | Train Accuracy:  0.841                 | Val Loss:  0.012                 | Val Accuracy:  0.757


###테스트

In [None]:
def evaluate(model, test_data):

    test = EN_Dataset(test_data)

    test_dataloader = torch.utils.data.DataLoader(test, batch_size=2)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:

        model = model.cuda()

    total_acc_test = 0
    with torch.no_grad():

        for test_input, test_label in test_dataloader:

              test_label = test_label.to(device)
              mask = test_input['attention_mask'].to(device)
              input_id = test_input['input_ids'].squeeze(1).to(device)

              output = model(input_id, mask)

              acc = (output.argmax(dim=1) == test_label).sum().item()
              total_acc_test += acc
    
    print(f'Test Accuracy: {total_acc_test / len(test_data): .3f}')
    
evaluate(model, df_test)

Test Accuracy:  0.762


In [None]:
def get_top_values(model, text):
  use_cuda = torch.cuda.is_available()
  device = torch.device("cuda" if use_cuda else "cpu")
  if use_cuda:
      model = model.cuda()
  with torch.no_grad():
    dic = en_tokenizer(text, padding='max_length', max_length = 256, truncation=True, return_tensors="pt")
    input_id = dic['input_ids']
    input_id = input_id.to(device)
    mask = dic['attention_mask']
    mask = mask.to(device)
    output = model(input_id, mask)
    output = F.softmax(output[0],dim=-1)
    print("max: ",torch.topk(output,3))
get_top_values(model, "history") # 확인해보고 싶은 제목 입력
labels

### 저장 및 불러오기

In [None]:
def save_checkpoint(save_path, model, valid_loss):# ------ 모델 평가를 위해 훈련 과정을 저장
    if save_path == None:
        return
    state_dict = {'model_state_dict': model.state_dict(), 'valid_loss': valid_loss}
    torch.save(state_dict, save_path)
    print(f'Model saved to ==> {save_path}')

def load_checkpoint(load_path, model):# ------ save_checkpoint 함수에서 저장된 모델을 가져오기.
    if load_path == None:
        return
    state_dict = torch.load(load_path, map_location=device)
    print(f'Model loaded from <== {load_path}')
    model.load_state_dict(state_dict['model_state_dict'])
    return state_dict['valid_loss']

In [None]:
save_checkpoint("/content/gdrive/MyDrive/Learned/fullLearn.multiLangModel", model, 0.012)

Model saved to ==> /content/gdrive/MyDrive/Learned/fullLearn.multiLangModel


In [None]:
device =torch.device("cuda")
load_checkpoint("/content/gdrive/MyDrive/Learned/0.1_loss1e-6_epoch5.multiLangModel",model)