목표 : 청와대 국민 청원 글 카테고리 분류

참고 : https://dacon.io/competitions/open/235597/codeshare/1803?page=1&dtype=recent


## 1. 필수 라이브러리

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
tf.random.set_seed(777)

import os
import re
from tqdm.auto import tqdm
tqdm.pandas()

## 2. Data 불러오기

In [8]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [9]:
!ls gdrive/MyDrive

 노래   3학년   colab  'Colab Notebooks'   dataset  'Untitled Diagram.drawio'


In [10]:
# Dataset directory on the mounted Google Drive, relative to the current working directory.
# NOTE(review): the directory listing above shows `MyDrive`; confirm `My Drive` resolves on this runtime.
path = "gdrive/My Drive/dataset/bluehouse"

In [11]:
# Keep only the category and data columns (the first CSV column is dropped),
# then remove any row that has a missing value.
train_data = pd.read_csv(os.path.join(path, "train.csv")).iloc[:, 1:].dropna(how='any')

In [12]:
# Dropping rows can leave gaps in the index, so renumber it from 0.
train_data = train_data.reset_index(drop=True)

In [12]:
# Load the test set without its first column. Unlike the training set, rows
# with missing values are NOT removed here.
test_data = pd.read_csv(os.path.join(path, "test.csv")).iloc[:, 1:]

In [13]:
train_data.shape, test_data.shape

((39992, 2), (5000, 1))

## 3. 전처리 

In [None]:
def clean_text(text):
    r"""Normalize one raw petition text before tokenization.

    Steps, in order: replace the literal two-character ``\n`` markers with a
    space, delete common punctuation, delete digit runs, lowercase, delete
    HTML-like ``<...>`` tags, replace the remaining special symbols with a
    space, and finally collapse whitespace.

    Args:
        text: Raw text. ``None`` and NaN (what pandas yields for an empty
            cell) produce ``''``; any other non-string goes through ``str``.

    Returns:
        The cleaned string, without leading, trailing or repeated whitespace.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Coerce first so that non-string cells cannot break `.replace`.
    text = str(text).replace('\\n', ' ')  # literal backslash + "n" -> space

    text = re.sub(r'[@%\\*=()/~#&\+á?\xc3\xa1\-\|\.\:\;\!\-\,\_\~\$\'\"]', '', text)  # remove punctuation
    text = re.sub(r'\d+', '', text)  # remove numbers
    text = text.lower()  # lower case
    text = re.sub(r'<[^>]+>', '', text)  # remove HTML tags
    # Raw string: same regex as before, but without the invalid "\?" string escape.
    text = re.sub(r'[-=+,#:;//●<>▲\?:^$.☆!★()Ⅰ@*\"※~>`\'…》]', ' ', text)  # special symbols -> space

    # Normalize whitespace last so the spaces inserted just above are
    # collapsed and trimmed as well.
    return re.sub(r'\s+', ' ', text).strip()

In [None]:
# Clean every raw training text (progress_map shows a tqdm progress bar).
train_data['clear_text'] = train_data['data'].progress_map(clean_text)

  0%|          | 0/39992 [00:00<?, ?it/s]

In [None]:
# Apply the same cleaning to the test texts.
test_data['clear_text'] = test_data['data'].progress_map(clean_text)

  0%|          | 0/5000 [00:00<?, ?it/s]

## 4. tokenizer 및 불용어 제거
- Using Mecab for tokenizing

In [None]:
!curl -s https://raw.githubusercontent.com/teddylee777/machine-learning/master/99-Misc/01-Colab/mecab-colab.sh | bash

--2022-05-01 11:01:52--  https://www.dropbox.com/s/9xls0tgtf3edgns/mecab-0.996-ko-0.9.2.tar.gz?dl=1
Resolving www.dropbox.com (www.dropbox.com)... 162.125.1.18, 2620:100:6016:18::a27d:112
Connecting to www.dropbox.com (www.dropbox.com)|162.125.1.18|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /s/dl/9xls0tgtf3edgns/mecab-0.996-ko-0.9.2.tar.gz [following]
--2022-05-01 11:01:52--  https://www.dropbox.com/s/dl/9xls0tgtf3edgns/mecab-0.996-ko-0.9.2.tar.gz
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uca7d019981501b773cb26a39042.dl.dropboxusercontent.com/cd/0/get/BkY9UUqsnApIv9Ah3JjKLlwquEp09Hm-g-fTCvHVXqTOpyna_3NYfcDHQOE6B566aGClCS2eD8_WxxPL5GpchjG5C3hgMZJAdk-saRA6GydQS_mRWwc0Y0rva3YWSnUbj1Y9824Y51X6yzHwd20kqsIDRSeMRVX-8gK_jQohmgEVZCBNz0Io8_4hXnVLIaLUKGI/file?dl=1# [following]
--2022-05-01 11:01:53--  https://uca7d019981501b773cb26a39042.dl.dropboxusercontent.com/cd/0/get/

In [None]:
from konlpy.tag import Mecab

mecab = Mecab()

In [None]:
# Load the stop-word table (tab-separated, no header row); the columns are
# named form (형태), part of speech (품사) and ratio (비율).
stop_df = pd.read_csv(os.path.join(path, '한국어불용어100.txt'), sep = '\t', header = None, names = ['형태','품사','비율'])
stop_df.head()

Unnamed: 0,형태,품사,비율
0,이,VCP,0.01828
1,있,VA,0.011699
2,하,VV,0.009774
3,것,NNB,0.009733
4,들,XSN,0.006898


In [None]:
# Stop-word surface forms as a plain list. The table repeats some forms
# (e.g. '이'), so the list contains duplicates.
stop_words = list(stop_df.형태)
stop_words

['이',
 '있',
 '하',
 '것',
 '들',
 '그',
 '되',
 '수',
 '이',
 '보',
 '않',
 '없',
 '나',
 '사람',
 '주',
 '아니',
 '등',
 '같',
 '우리',
 '때',
 '년',
 '가',
 '한',
 '지',
 '대하',
 '오',
 '말',
 '일',
 '그렇',
 '위하',
 '때문',
 '그것',
 '두',
 '말하',
 '알',
 '그러나',
 '받',
 '못하',
 '일',
 '그런',
 '또',
 '문제',
 '더',
 '사회',
 '많',
 '그리고',
 '좋',
 '크',
 '따르',
 '중',
 '나오',
 '가지',
 '씨',
 '시키',
 '만들',
 '지금',
 '생각하',
 '그러',
 '속',
 '하나',
 '집',
 '살',
 '모르',
 '적',
 '월',
 '데',
 '자신',
 '안',
 '어떤',
 '내',
 '내',
 '경우',
 '명',
 '생각',
 '시간',
 '그녀',
 '다시',
 '이런',
 '앞',
 '보이',
 '번',
 '나',
 '다른',
 '어떻',
 '여자',
 '개',
 '전',
 '들',
 '사실',
 '이렇',
 '점',
 '싶',
 '말',
 '정도',
 '좀',
 '원',
 '잘',
 '통하',
 '소리',
 '놓']

In [None]:
def tokenizer_and_stopword(text):
    """Extract nouns from *text* with Mecab and join the kept ones with spaces.

    A noun is kept when it is not in the module-level ``stop_words`` and is
    longer than one character.
    """
    kept_nouns = [
        noun
        for noun in mecab.nouns(text)  # noun extraction
        if noun not in stop_words and len(noun) > 1
    ]
    return ' '.join(kept_nouns)

In [None]:
# Quick check of Mecab noun extraction on the first cleaned training text
print(mecab.nouns(train_data.loc[0, 'clear_text']))

['신혼', '부부', '주택', '정책', '보육', '시설', '국민', '세금', '일부', '정책', '보편', '국민', '수긍', '수', '복지', '정책', '저', '신혼', '부부', '당첨', '사람', '로또', '주택', '정책', '반대', '국민', '세금', '일부', '사람', '식', '세금', '우리', '수', '보육', '시설', '전국', '설치', '기업', '솔선수범', '사업장', '의무', '설치', '수', '집', '애', '데', '경력', '단절', '게', '집', '개인', '능력', '게', '능력', '수', '육아', '전담', '힘', '게', '우리', '아이', '부모', '거', '이제', '국가', '시대', '게', '부동산', '가격', '게', '정부', '정책', '부동산', '역효과']


In [None]:
# Noun extraction + stop-word filtering for every cleaned training text.
train_data['clear_text2'] = train_data['clear_text'].progress_map(tokenizer_and_stopword)

  0%|          | 0/39992 [00:00<?, ?it/s]

In [None]:
# Same noun extraction + stop-word filtering for the test texts.
test_data['clear_text2'] = test_data['clear_text'].progress_map(tokenizer_and_stopword)

  0%|          | 0/5000 [00:00<?, ?it/s]

## 단어 집합 만들기 및 인코딩

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer()

In [None]:
# Convert the pandas Series to a list and build the word index from it
tokenizer.fit_on_texts(train_data['clear_text2'].tolist())

In [None]:
tokenizer.word_index

{'아이': 1,
 '학교': 2,
 '국민': 3,
 '학생': 4,
 '교육': 5,
 '나라': 6,
 '여성': 7,
 '선수': 8,
 '청원': 9,
 '국가': 10,
 '교사': 11,
 '대한민국': 12,
 '처벌': 13,
 '사건': 14,
 '폐지': 15,
 '대통령': 16,
 '정부': 17,
 '청소년': 18,
 '필요': 19,
 '이상': 20,
 '이유': 21,
 '부모': 22,
 '피해자': 23,
 '한국': 24,
 '인권': 25,
 '유치원': 26,
 '조사': 27,
 '방송': 28,
 '어린이집': 29,
 '대학': 30,
 '지원': 31,
 '남성': 32,
 '저희': 33,
 '제도': 34,
 '선생': 35,
 '상황': 36,
 '대표': 37,
 '관련': 38,
 '정책': 39,
 '범죄': 40,
 '내용': 41,
 '올림픽': 42,
 '남자': 43,
 '기사': 44,
 '반대': 45,
 '가족': 46,
 '피해': 47,
 '평등': 48,
 '경찰': 49,
 '축구': 50,
 '개인': 51,
 '사용': 52,
 '의무': 53,
 '게임': 54,
 '가해자': 55,
 '언론': 56,
 '이번': 57,
 '공부': 58,
 '문화': 59,
 '현실': 60,
 '감사': 61,
 '보호': 62,
 '초등': 63,
 '가능': 64,
 '뉴스': 65,
 '폭력': 66,
 '생활': 67,
 '마음': 68,
 '시험': 69,
 '감독': 70,
 '운영': 71,
 '폭행': 72,
 '평가': 73,
 '수업': 74,
 '가정': 75,
 '차별': 76,
 '수사': 77,
 '보육': 78,
 '엄마': 79,
 '자유': 80,
 '겁니다': 81,
 '자기': 82,
 '해결': 83,
 '수능': 84,
 '요청': 85,
 '기관': 86,
 '결과': 87,
 '누구': 88,
 '과정': 89,
 '이해': 90,
 '부탁': 9

In [None]:
total_cnt = len(tokenizer.word_index) # number of distinct words
total_cnt

45441

In [None]:
vocab_size = 30000 # fix the vocabulary size at 30000; very rare words are dropped
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(train_data['clear_text2'].tolist())

# word_index is not truncated by num_words: it still lists every word seen.
len(tokenizer.word_index)

45441

In [None]:
# Encode the tokenized texts as sequences of integer word indices.
X_train = tokenizer.texts_to_sequences(train_data['clear_text2'].tolist())
X_test = tokenizer.texts_to_sequences(test_data['clear_text2'].tolist())

In [None]:
max_len = 300 # fixed sequence length of 300; shorter sequences are padded

pad_X_train = pad_sequences(X_train, maxlen = max_len)
pad_X_test = pad_sequences(X_test, maxlen = max_len)

In [None]:
# One-hot encode the category labels.
y_train = to_categorical(np.array(train_data['category']))

## 간단한 모델로 학습 및 테스트

In [None]:
from tensorflow.keras.layers import *
from tensorflow.keras.models import Sequential, load_model, Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint,  ReduceLROnPlateau
from tensorflow.keras.regularizers import *
from tensorflow.keras.optimizers import Adam, RMSprop

In [None]:
embedding_dim = 64

# 1-D CNN text classifier: embedding -> Conv1D -> global max pooling -> 3-way softmax.
# NOTE(review): TruncatedNormal's first positional argument is `mean`, so
# TruncatedNormal(0.02) sets mean=0.02 with the default stddev — confirm that
# stddev=0.02 was not what was intended.
model1 = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_len),
    Conv1D(
        filters=64,
        kernel_size=5,
        activation='relu',
        padding='same',
        kernel_regularizer=l2(0.01),
        kernel_initializer=tf.keras.initializers.TruncatedNormal(0.02),
    ),
    GlobalMaxPooling1D(),
    Dense(
        3,
        activation='softmax',
        kernel_initializer=tf.keras.initializers.TruncatedNormal(0.02),
    ),
])

model1.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, 300, 64)           1920000   
                                                                 
 conv1d_6 (Conv1D)           (None, 300, 64)           20544     
                                                                 
 global_max_pooling1d_6 (Glo  (None, 64)               0         
 balMaxPooling1D)                                                
                                                                 
 dense_6 (Dense)             (None, 3)                 195       
                                                                 
Total params: 1,940,739
Trainable params: 1,940,739
Non-trainable params: 0
_________________________________________________________________


In [None]:
# `lr` is a deprecated alias; pass the rate as `learning_rate`.
model1.compile(optimizer=RMSprop(learning_rate=.0005), loss='categorical_crossentropy', metrics=['acc'])

# NOTE(review): ReduceLROnPlateau waits 5 epochs (on its default monitor) while
# EarlyStopping stops after 2 epochs without val_acc improvement, so the LR
# reduction will rarely get a chance to fire — confirm the patience values.
reLR = ReduceLROnPlateau(patience=5, verbose=1, factor=.2)
# restore_best_weights is an EarlyStopping option, so it is set here instead
# of being passed to ModelCheckpoint.
es = EarlyStopping(monitor='val_acc', mode='max', verbose=1, patience=2, restore_best_weights=True)
# Save the model to Drive whenever val_acc improves.
mc = ModelCheckpoint(filepath=os.path.join(path, '1028_1.h5'), monitor='val_acc', mode='max', verbose=1, save_best_only=True)

  super(RMSprop, self).__init__(name, **kwargs)


In [None]:
# Fraction of the training samples held out for validation.
val_rate = 0.2

# Train for up to 30 epochs with early stopping, checkpointing and LR reduction.
history1 = model1.fit(pad_X_train, y_train, epochs=30, batch_size= 64, shuffle = True, validation_split=val_rate, verbose = 1, callbacks=[es, mc, reLR])

In [None]:
# Reload the checkpoint written by ModelCheckpoint.
model1 = load_model( os.path.join(path, '1028_1.h5')) # val_acc = 0.8692

# Class probabilities for the test set; the arg-max is the predicted category id.
y_prob = model1.predict(pad_X_test, verbose=0) 
predicted = y_prob.argmax(axis=-1)

In [None]:
predicted

array([0, 2, 1, ..., 1, 0, 2])

In [None]:
test_data['category'] = predicted

In [None]:
# Submitting this file was rejected because its format did not match.
# NOTE(review): this writes every column of test_data (raw text, cleaned text,
# tokenized text, category) — presumably the competition expects only an id
# and the category; confirm against the sample submission file.
test_data.to_csv(os.path.join(path, 'submission.csv'), encoding='utf-8', index = False)

## 실제 예시로 테스트 해보기

In [37]:
# Run one hand-written petition through the same pipeline as the training
# data: clean -> nouns/stop words -> integer sequence -> padding.
test_example = '유소년 아이스하키 선수들의 꿈을 짓밟은 ‘인천 **국제빙상장위탁업체를’를 고발 합니다.'
test_example = clean_text(test_example)
test_example = tokenizer_and_stopword(test_example)
test_arr = tokenizer.texts_to_sequences([test_example])
test_arr = pad_sequences(test_arr, maxlen = max_len)

In [38]:
test_example

'유소년 아이스하키 선수 인천 국제 상장 위탁 업체 고발'

In [39]:
model1.predict([test_arr], verbose = 0).argmax(axis=-1)

# 0 : human rights / gender equality
# 1 : culture / arts / sports / media
# 2 : childcare / education

array([1])

## LSTM 으로 학습 및 테스트

In [None]:
# The embedding layer was built with width 32 while `embedding_dim` was set
# to 64 and never read; use one variable so the two cannot drift apart.
embedding_dim = 32

# Embedding -> dropout -> Conv1D -> max pooling -> LSTM -> 3-way softmax.
model2 = Sequential()

model2.add(Embedding(vocab_size, embedding_dim, input_length=max_len))
model2.add(Dropout(0.3))
model2.add(Conv1D(32, 5, activation='relu'))
model2.add(MaxPooling1D(pool_size=4))
model2.add(LSTM(32))
model2.add(Dense(3, activation='softmax'))

model2.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 300, 32)           960000    
                                                                 
 dropout (Dropout)           (None, 300, 32)           0         
                                                                 
 conv1d (Conv1D)             (None, 296, 32)           5152      
                                                                 
 max_pooling1d (MaxPooling1D  (None, 74, 32)           0         
 )                                                               
                                                                 
 lstm_1 (LSTM)               (None, 32)                8320      
                                                                 
 dense_1 (Dense)             (None, 3)                 99        
                                                      

In [35]:
# `lr` is a deprecated alias; pass the rate as `learning_rate`.
model2.compile(loss='categorical_crossentropy', optimizer=RMSprop(learning_rate=.0005), metrics=['acc'])

# NOTE(review): ReduceLROnPlateau waits 5 epochs (on its default monitor) while
# EarlyStopping stops after 2 epochs without val_acc improvement, so the LR
# reduction will rarely get a chance to fire — confirm the patience values.
reLR = ReduceLROnPlateau(patience=5, verbose=1, factor=.2)
# restore_best_weights is an EarlyStopping option, so it is set here instead
# of being passed to ModelCheckpoint.
es = EarlyStopping(monitor='val_acc', mode='max', verbose=1, patience=2, restore_best_weights=True)
# Save the model to Drive whenever val_acc improves.
mc = ModelCheckpoint(filepath=os.path.join(path, 'lstm.h5'), monitor='val_acc', mode='max', verbose=1, save_best_only=True)

  super(RMSprop, self).__init__(name, **kwargs)


In [36]:
# Fraction of the training samples held out for validation.
val_rate = 0.2

# Train for up to 30 epochs with early stopping, checkpointing and LR reduction.
history2 = model2.fit(pad_X_train, y_train, epochs=30, batch_size= 64, shuffle = True, validation_split=val_rate, verbose = 1, callbacks=[es, mc, reLR])

Epoch 1/30
Epoch 1: val_acc improved from -inf to 0.86848, saving model to gdrive/My Drive/dataset/bluehouse/lstm.h5
Epoch 2/30
Epoch 2: val_acc did not improve from 0.86848
Epoch 3/30
Epoch 3: val_acc did not improve from 0.86848
Epoch 3: early stopping


## kobert 사용해보기

In [None]:
!pip install mxnet
!pip install gluonnlp pandas tqdm
!pip install sentencepiece
!pip install transformers==3 # 최신 버전으로 설치하면 "Input: must be Tensor, not str" 라는 에러 발생
!pip install torch

!pip install git+https://git@github.com/SKTBrain/KoBERT.git@master


In [1]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm import tqdm, tqdm_notebook

from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model

from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

In [2]:
# Use the first GPU when one is available; otherwise fall back to the CPU so
# the notebook still runs on a runtime without CUDA.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Fetch the KoBERT model and its vocabulary (the call returns both as a pair).
bertmodel, vocab = get_pytorch_kobert_model()

using cached model. /content/.cache/kobert_v1.zip
using cached model. /content/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [3]:
class BERTDataset(Dataset):
    """Dataset that pre-tokenizes (sentence, label) rows for BERT.

    Each row of ``dataset`` is indexed with ``sent_idx`` to get the text and
    with ``label_idx`` to get the label. Every text is transformed up front
    in ``__init__``; nothing is tokenized lazily.
    """

    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len,
                 pad, pair):
        to_bert_input = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        # First pass: transform every sentence.
        self.sentences = []
        for row in dataset:
            self.sentences.append(to_bert_input([row[sent_idx]]))

        # Second pass: convert every label to int32.
        self.labels = []
        for row in dataset:
            self.labels.append(np.int32(row[label_idx]))

    def __getitem__(self, i):
        # One sample is the transform's output tuple with the label appended.
        label_tail = (self.labels[i],)
        return self.sentences[i] + label_tail

    def __len__(self):
        return len(self.labels)

In [42]:
# Setting parameters
# NOTE: this rebinds the global `max_len` (300 for the Keras models above) to 100.
max_len = 100
batch_size = 32
warmup_ratio = 0.1  # fraction of the total steps used for LR warm-up
num_epochs = 2
max_grad_norm = 1  # gradient-clipping threshold
log_interval = 200  # print training stats every 200 batches
learning_rate =  5e-5

In [43]:
# Build [text, label-as-string] rows from the raw (uncleaned) petition text
# and its category. A comprehension avoids leaving loop temporaries behind as
# globals.
dataset_list = [
    [text, str(category)]
    for text, category in zip(train_data['data'], train_data['category'])
]

In [44]:
dataset_list[0]

['신혼부부위한 주택정책 보다 보육시설 늘려주세요.. 국민세금으로 일부를 위한 정책펴지 마시고\\n보편적으로 모든국민이 수긍할  수 있는 복지정책 펴 주시길 바랍니다.\\n저도 신혼부부이지만 당첨되는 사람 로또되는 이런주택정책 반대합니다.\\n국민세금을 일부 사람들에게 퍼주기식이 되면 안되죠..\\n그 세금으로 우리아이 안전하게 맡길 수 있는 보육시설을 전국에 설치해 주세요..\\n대기업들은 솔선수범해서 모든 사업장에 의무설치 할 수 있도록 하시구요..\\n집 보다 애 맡길데가 없어 경력단절 되는게 더 괴롭습니다.!\\n집은 개인의 능력을 키워 사는게 맞습니다.\\n그 능력을 키울수 있도록 육아 전담에 힘을 기울이는게 맞습니다.\\n우리아이 부모가 키우는거 맞지만 이제는 국가가\\n책임지는 시대로 가는게 맞다고 봅니다.\\n그렇잖아도 부동산 가격 자꾸 올라가는게 정부정책이 잘못 되었다고 봅니다.\\n부동산은 그냥 내버려 두세요!  좀!\\n건들수록 역효과네요..',
 '2']

In [50]:
from sklearn.model_selection import train_test_split
                                                         
# Hold out 25% of the labelled rows for evaluation; the fixed random_state
# keeps the split reproducible.
dataset_train, dataset_test = train_test_split(dataset_list, test_size=0.25, random_state=0)

In [51]:
# Tokenization
# NOTE: this rebinds the global `tokenizer`, which previously held the Keras Tokenizer.
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

# Arguments: sentence index 0, label index 1, pad=True, pair=False.
data_train = BERTDataset(dataset_train, 0, 1, tok, max_len, True, False)
data_test = BERTDataset(dataset_test, 0, 1, tok, max_len, True, False)

using cached model. /content/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [52]:
data_train[0]

(array([   2, 2149, 7641, 6116, 2355, 6197, 1683, 1419, 5936, 3439, 6137,
        5772,  517,   54, 4958, 6923, 7088,  517, 6186, 6122, 4924, 6749,
        6705, 6999,  517,    0,  517,  425, 6643, 6730, 5337, 6896,  517,
           0,  517,  425, 6629, 6749, 5712, 5874, 3079, 6797, 6971, 3260,
        7202, 4955, 6643, 5337, 6896, 2135, 5377, 7748, 7096, 1772, 6629,
        3867,  517,    0,  517,  425, 7848, 6995, 2877, 1133, 5936, 2149,
        7641, 6116, 3093, 5758, 5439, 4939, 3135, 5782, 5439, 4986, 3295,
        6122, 7659,  517, 7482, 6903, 3676, 6855, 7344, 4873,  517,  463,
         517,  463,  517,  463, 4099,  517,    0,  517,  425, 6573, 7344,
           3], dtype=int32),
 array(100, dtype=int32),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [53]:
# Wrap the datasets in torch DataLoaders.
# The training loader reshuffles every epoch so batches differ between epochs;
# the evaluation loader keeps a fixed order.
train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, num_workers=2, shuffle=True)
test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=2)

In [36]:
class BERTClassifier(nn.Module):
    """Linear classification head on top of a BERT encoder's pooled output.

    Args:
        bert: Encoder. It is called with ``input_ids``, ``token_type_ids``
            and ``attention_mask`` and must return a pair whose second
            element is the pooled output.
        hidden_size: Width of the pooled output fed to the linear layer.
        num_classes: Number of output classes.
        dr_rate: Dropout probability applied to the pooled output; a falsy
            value (``None`` or ``0``) disables dropout.
        params: Unused; kept so existing call sites keep working.
    """

    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=3,
                 dr_rate=None,
                 params=None):
        super().__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Return a float mask shaped like ``token_ids``.

        Row ``i`` is 1 for its first ``valid_length[i]`` positions and 0
        for the rest.
        """
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return the classifier output for a batch of token ids."""
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        # Without dropout the pooled output goes straight to the classifier;
        # previously `out` was left undefined in that case.
        out = self.dropout(pooler) if self.dr_rate else pooler
        return self.classifier(out)

In [37]:
# Build the classifier on top of KoBERT and move it to `device`.
# NOTE: the next cell builds `model` again with the same arguments.
model = BERTClassifier(bertmodel, dr_rate=0.5).to(device)

In [38]:
# Build the BERT classifier
model = BERTClassifier(bertmodel,  dr_rate=0.5).to(device)

# Set up the optimizer and the learning-rate schedule.
# Parameters whose name contains 'bias' or 'LayerNorm.weight' get no weight
# decay; all other parameters use 0.01.
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

# Total optimizer steps = batches per epoch * epochs; warm up over the first
# `warmup_ratio` of them.
t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)

scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

# Helper for measuring accuracy
def calc_accuracy(X, Y):
    """Return the fraction of rows of ``X`` whose arg-max along dim 1 equals ``Y``."""
    _, predicted = torch.max(X, 1)
    n_correct = (predicted == Y).sum().detach().cpu().numpy()
    return n_correct / predicted.shape[0]
    
train_dataloader



<torch.utils.data.dataloader.DataLoader at 0x7f48ca230cd0>

In [54]:
# Run Python garbage collection and release cached CUDA memory before training.
import gc
gc.collect()
torch.cuda.empty_cache()

In [55]:
# Fine-tune for `num_epochs` epochs; after each epoch, measure accuracy on the
# held-out split. Reported accuracies are the mean of per-batch accuracies.
for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0

    # ---- training pass ----
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))

    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))

    # ---- evaluation pass ----
    model.eval()
    # No gradients are needed for evaluation; disabling autograd avoids
    # building a graph for every batch.
    with torch.no_grad():
        for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm(test_dataloader)):
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            label = label.long().to(device)
            out = model(token_ids, valid_length, segment_ids)
            test_acc += calc_accuracy(out, label)
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))

  0%|          | 0/938 [00:00<?, ?it/s]

epoch 1 batch id 1 loss 0.0708918496966362 train acc 0.96875
epoch 1 batch id 201 loss 0.2665047347545624 train acc 0.9151119402985075
epoch 1 batch id 401 loss 0.30643728375434875 train acc 0.9146664588528678
epoch 1 batch id 601 loss 0.07293830811977386 train acc 0.9139455074875208
epoch 1 batch id 801 loss 0.1143142357468605 train acc 0.9125702247191011
epoch 1 train acc 0.912366737739872


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/313 [00:00<?, ?it/s]

epoch 1 test acc 0.9189297124600639


  0%|          | 0/938 [00:00<?, ?it/s]

epoch 2 batch id 1 loss 0.049975987523794174 train acc 0.96875
epoch 2 batch id 201 loss 0.20473061501979828 train acc 0.9401430348258707
epoch 2 batch id 401 loss 0.19345524907112122 train acc 0.9400716957605985
epoch 2 batch id 601 loss 0.03632565960288048 train acc 0.9394238768718802
epoch 2 batch id 801 loss 0.18430432677268982 train acc 0.937539013732834
epoch 2 train acc 0.9379930703624734


  0%|          | 0/313 [00:00<?, ?it/s]

epoch 2 test acc 0.9205984710178001


In [56]:
# Save only the fine-tuned weights (state_dict) into the dataset directory.
torch.save(model.state_dict(),  os.path.join(path, "news.pt"))


In [57]:
# Rebuild the classifier and load the saved weights; `device` is passed as
# torch.load's map_location.
modelload = BERTClassifier(bertmodel,  dr_rate=0.5).to(device)
modelload.load_state_dict(torch.load(os.path.join(path, "news.pt"), device))

<All keys matched successfully>

In [62]:
def testModel(model, seq):
    """Classify one text with *model* and print the predicted category.

    Relies on the module-level ``tok``, ``max_len`` and ``device``.

    Args:
        model: Classifier called as ``model(token_ids, valid_length, segment_ids)``.
        seq: Raw text to classify.
    """
    cate = ["인권/성평등","문화/예술/체육/언론","육아/교육"]
    transform = nlp.data.BERTSentenceTransform(tok, max_len, pad=True, pair=False)
    tokenized = transform([seq])

    # Switch the model that was passed in (not the global `modelload`) to
    # eval mode, and skip gradient tracking for inference.
    model.eval()
    with torch.no_grad():
        result = model(torch.tensor([tokenized[0]]).to(device), [tokenized[1]], torch.tensor(tokenized[2]).to(device))
    idx = result.argmax().cpu().item()

    print("뉴스의 카테고리는:", cate[idx])

In [65]:
testModel(modelload, "조국 전장관의 딸 입학당시의 입학생들 모두에 대한 전수조사를 요청합니다")
# 0 : human rights / gender equality
# 1 : culture / arts / sports / media
# 2 : childcare / education

뉴스의 카테고리는: 육아/교육
