In [None]:
# Build and install the khaiii morphological analyzer from source.
# Every path is absolute (Colab's /content) so the cell does not depend on the
# current working directory, and `mkdir -p` keeps the cell re-runnable.
!git clone https://github.com/kakao/khaiii.git /content/khaiii

# cmake is required to configure the build; %pip targets the kernel's environment.
%pip install cmake

!mkdir -p /content/build

!cd /content/build && cmake /content/khaiii

!cd /content/build && make all

!cd /content/build && make resource

!cd /content/build && make install

# Generate the Python package and install it into the running kernel.
!cd /content/build && make package_python

%pip install /content/build/package_python


In [None]:
from khaiii import KhaiiiApi
api = KhaiiiApi()
# Quick smoke test: print the `lex` field of every morpheme found in a sample sentence.
for word in api.analyze("이거 되냐? 되ㅐ냐고"):
    for morph in word.morphs:
        print(morph.lex)


이거
되
냐
?
되
ㅐ
냐
고


In [None]:
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
import torch.nn.functional as F
import torch.nn as nn
#주요 참고 PyTorch로 시작하는 딥 러닝 입문, 유원준
from torchtext import data  
import urllib.request
import pandas as pd
import random

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Blank out documents that contain nothing but spaces after cleaning,
# so they can be turned into NaN and dropped afterwards.
def isAll0(x):
    """Return ``''`` when ``x`` is a string made up only of space characters.

    Args:
        x: A cell value from the text column. Non-string values (for example
            the float NaN used for missing text, NumPy floats, or ``None``)
            are passed through untouched.

    Returns:
        ``''`` if ``x`` is a string whose every character is ``' '`` (the
        empty string included); otherwise ``x`` unchanged.
    """
    # Missing values are not strings and have no len()/count(); hand them back
    # as-is so a later dropna() can deal with them instead of crashing here.
    if not isinstance(x, str):
        return x
    if len(x) == x.count(' '):
        return ''
    return x


In [None]:
# Download the train/test rating files from the e9t/nsmc GitHub repository.
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt", filename="ratings_train.txt")
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt", filename="ratings_test.txt")

# Load the tab-separated files.
train_df = pd.read_table('ratings_train.txt')
test_df = pd.read_table('ratings_test.txt')


def _clean_reviews(df):
    """Return a cleaned copy of a ratings frame.

    Steps, applied to the ``document`` column:
      1. remove every character that is not Hangul (jamo or syllable) or a space,
      2. turn space-only documents into ``''`` and then into NaN,
      3. drop rows with a duplicated document,
      4. drop rows that contain any NaN.

    Args:
        df: DataFrame with a ``document`` text column.

    Returns:
        A new DataFrame; the input frame is not modified.
    """
    cleaned = df.copy()
    cleaned['document'] = (
        cleaned['document']
        # regex=True is required: without it the pattern is matched literally
        # on current pandas versions and nothing is removed.
        .str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", regex=True)
        .apply(isAll0)
        # Assign the result back instead of using inplace=True on a selected
        # column, which is not guaranteed to update the parent frame.
        .replace('', np.nan)
    )
    cleaned = cleaned.drop_duplicates(subset=['document'])
    return cleaned.dropna(how='any')


# Same preprocessing for both splits.
train_df = _clean_reviews(train_df)
test_df = _clean_reviews(test_df)

In [None]:
# Number of rows left in each split at this point in the notebook.
print(len(train_df))
print(len(test_df))

143660
48403


In [None]:
# Stopwords: morphemes that are left out of the token sequences built below.
stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다']

In [None]:
# Accumulator for the tokenized training documents (one list of morphemes per document).
train_x = []

In [None]:
# Split every training document into morphemes and wrap it in CLS/SEP markers.
# The list is (re)created here so that re-running this cell cannot append
# the whole corpus a second time.
train_x = []
total_train = len(train_df)
for count, val in enumerate(train_df['document'], start=1):
    # Report the running count, not the DataFrame index label: the index keeps
    # its original values after rows were dropped and can exceed the row count.
    if count % 10000 == 0:
        print(count, "/", total_train)

    temp = ['CLS']
    for word in api.analyze(val):
        for morph in word.morphs:
            if morph.lex not in stopwords:
                temp.append(morph.lex)
    temp.append('SEP')
    train_x.append(temp)


10180 / 143660
20506 / 143660
30877 / 143660
41284 / 143660
51710 / 143660
62159 / 143660
72591 / 143660
83093 / 143660
93557 / 143660
104038 / 143660
114525 / 143660
125029 / 143660
135592 / 143660
146158 / 143660


In [None]:
# Accumulator for the tokenized test documents (one list of morphemes per document).
test_x = []

In [None]:
# Split every test document into morphemes and wrap it in CLS/SEP markers.
# The list is (re)created here so that re-running this cell cannot append
# the whole corpus a second time.
test_x = []
total_test = len(test_df)
for count, val in enumerate(test_df['document'], start=1):
    # Drive the progress message from the running count; the DataFrame index
    # label is not contiguous after rows were dropped.
    if count % 10000 == 0:
        print(count, "/", total_test)

    temp = ['CLS']
    for word in api.analyze(val):
        for morph in word.morphs:
            if morph.lex not in stopwords:
                temp.append(morph.lex)
    temp.append('SEP')
    test_x.append(temp)


1 / 48000
9768 / 48000
19467 / 48000
29133 / 48000
38806 / 48000


In [None]:
# Vocabulary size handed to the tokenizer; the same value sizes the embedding table later on.
vocab_size = 6000

# Encode morphemes as integers. The tokenizer is fitted on the training
# sequences only and then applied to both splits; 'OOV' is the token
# configured for out-of-vocabulary entries.
tokenizer = Tokenizer(vocab_size,oov_token = 'OOV')
tokenizer.fit_on_texts(train_x)

# NOTE: these rebind train_x/test_x from token lists to integer sequences,
# so this cell is not meant to be run twice in a row.
train_x = tokenizer.texts_to_sequences(train_x)
test_x = tokenizer.texts_to_sequences(test_x)

In [None]:
# Labels as NumPy arrays, in the same row order as the frames the sequences were built from.
train_y = np.array(train_df['label'])
test_y = np.array(test_df['label'])

In [None]:
# Collect the positions of documents that have no content left after stopword
# removal. Every sequence carries the CLS and SEP markers, so "empty" means a
# length of 2 or less; the former `< 1` test could never match.
drop_train = [index for index, sentence in enumerate(train_x) if len(sentence) <= 2]
drop_test = [index for index, sentence in enumerate(test_x) if len(sentence) <= 2]

In [None]:
# Remove the flagged rows from sequences and labels with the same indices so
# both stay aligned.
# The sequences have different lengths, so a 1-D object array is filled element
# by element: np.delete() on the raw nested list would first try to build a
# rectangular array, which recent NumPy versions reject for ragged input.
train_seqs = np.empty(len(train_x), dtype=object)
for pos, seq in enumerate(train_x):
    train_seqs[pos] = seq
train_x = np.delete(train_seqs, drop_train, axis=0)
train_y = np.delete(train_y, drop_train, axis=0)
assert len(train_x) == len(train_y), "sequences and labels are out of sync"
print(len(train_x))
print(len(train_y))

143660
143660


In [None]:
# Remove the flagged rows from sequences and labels with the same indices so
# both stay aligned.
# The sequences have different lengths, so a 1-D object array is filled element
# by element: np.delete() on the raw nested list would first try to build a
# rectangular array, which recent NumPy versions reject for ragged input.
test_seqs = np.empty(len(test_x), dtype=object)
for pos, seq in enumerate(test_x):
    test_seqs[pos] = seq
test_x = np.delete(test_seqs, drop_test, axis=0)
test_y = np.delete(test_y, drop_test, axis=0)
assert len(test_x) == len(test_y), "sequences and labels are out of sync"
print(len(test_x))
print(len(test_y))

48403
48403


In [None]:
# Inspect the integer-encoded training sequences before padding.
print(train_x)

[list([2, 21, 705, 122, 124, 307, 1540, 74, 757, 3])
 list([2, 1, 9, 8, 718, 1, 773, 675, 12, 43, 384, 3])
 list([2, 30, 178, 1, 11, 6, 1320, 150, 9, 18, 10, 278, 4, 42, 3]) ...
 list([2, 78, 54, 74, 1363, 1, 8, 1, 1, 1058, 6, 3])
 list([2, 1019, 7, 79, 1, 1068, 4, 31, 56, 553, 1, 380, 3])
 list([2, 177, 7, 1708, 27, 1, 4, 94, 1288, 5, 7, 3])]


In [None]:
# Padding: bring every sequence to a fixed length of pad_len tokens.
# padding='post' appends the fill value after the real tokens; the truncation
# side for longer sequences is left at the library default.
pad_len = 30
train_x = pad_sequences(train_x, maxlen = pad_len,padding='post')
test_x = pad_sequences(test_x, maxlen = pad_len,padding='post')

In [None]:
# Inspect the padded training matrix and the label vector.
print(train_x)
print(train_y)

[[   2   21  705 ...    0    0    0]
 [   2    1    9 ...    0    0    0]
 [   2   30  178 ...    0    0    0]
 ...
 [   2   78   54 ...    0    0    0]
 [   2 1019    7 ...    0    0    0]
 [   2  177    7 ...    0    0    0]]
[0 1 0 ... 0 1 0]


In [None]:
# Minimal dataset wrapper
class nlp_dataset(Dataset):
    """Pairs each feature row with the label stored at the same position.

    Args:
        x: Indexable container of inputs; its length defines the dataset size.
        y: Indexable container of targets, indexed with the same positions as ``x``.
    """

    def __init__(self, x, y):
        self.x, self.y = x, y

    def __len__(self):
        """Number of samples, taken from the feature container."""
        return len(self.x)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` tuple stored at ``idx``."""
        return self.x[idx], self.y[idx]

In [None]:
# Baseline model
class grubase(nn.Module):
    """Baseline classifier: embedding -> bidirectional GRU -> GRU -> linear.

    Args:
        embed_dim: Size of each token embedding.
        vocab_size: Number of rows in the embedding table.
        hidden_dim: Hidden size of both GRU stacks.
        num_layers: Number of layers in each GRU stack.
        batch_size: Only used to size ``self.norm``; ``forward`` accepts any batch size.
        dropout: Drop probability applied after the embedding and before the linear head.
    """

    def __init__(self, embed_dim, vocab_size, hidden_dim, num_layers, batch_size, dropout):
        super(grubase, self).__init__()
        self.embed_dim = embed_dim
        self.vocab_size = vocab_size
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.batch_size = batch_size

        # relu, sigmoid and norm are not used by forward(). They are kept so the
        # attribute set and the state_dict keys stay what existing checkpoints expect.
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
        self.norm = nn.BatchNorm1d(self.batch_size)

        self.embed = nn.Embedding(self.vocab_size, self.embed_dim)
        self.dropout = nn.Dropout(dropout)

        # The first GRU is bidirectional, so the second one receives 2 * hidden_dim features.
        self.gru = nn.GRU(self.embed_dim, self.hidden_dim, self.num_layers, bidirectional=True, batch_first=True)
        self.gru2 = nn.GRU(self.hidden_dim * 2, self.hidden_dim, self.num_layers, batch_first=True)

        self.mlp1 = nn.Linear(self.hidden_dim, 1)

    def forward(self, x):
        """Compute one raw logit per sequence.

        Args:
            x: Integer token indices of shape ``(batch, seq_len)``.

        Returns:
            Tensor of shape ``(batch,)`` with unnormalised scores (no sigmoid applied).
        """
        x = self.embed(x)
        x = self.dropout(x)

        x, _ = self.gru(x)
        x, _ = self.gru2(x)
        # Use the output at the last time step as the sequence summary.
        x = x[:, -1, :]

        x = self.dropout(x)
        x = self.mlp1(x)
        # Squeeze only the feature axis: a bare squeeze() would also remove the
        # batch axis for a batch of one and return a 0-d tensor.
        return x.squeeze(-1)


In [None]:
# Tuned model
class grumodel(nn.Module):
    """Tuned classifier: embedding -> two bidirectional GRUs -> 3-layer MLP head.

    Args:
        embed_dim: Size of each token embedding.
        vocab_size: Number of rows in the embedding table.
        hidden_dim: Hidden size of both GRU stacks.
        num_layers: Number of layers in each GRU stack.
        batch_size: Only used to size ``self.norm``; ``forward`` accepts any batch size.
        dropout: Drop probability applied after the embedding and before the MLP head.
    """

    def __init__(self, embed_dim, vocab_size, hidden_dim, num_layers, batch_size, dropout):
        super(grumodel, self).__init__()
        self.embed_dim = embed_dim
        self.vocab_size = vocab_size
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.batch_size = batch_size

        self.relu = nn.ReLU()
        # sigmoid and norm are not used by forward(). They are kept so the
        # attribute set and the state_dict keys stay what existing checkpoints expect.
        self.sigmoid = nn.Sigmoid()
        self.norm = nn.BatchNorm1d(self.batch_size)

        self.embed = nn.Embedding(self.vocab_size, self.embed_dim)
        self.dropout = nn.Dropout(dropout)

        # Both GRUs are bidirectional, so each emits 2 * hidden_dim features per step.
        self.gru = nn.GRU(self.embed_dim, self.hidden_dim, self.num_layers, bidirectional=True, batch_first=True)
        self.gru2 = nn.GRU(self.hidden_dim * 2, self.hidden_dim, self.num_layers, bidirectional=True, batch_first=True)

        # The head consumes the first and last step outputs concatenated: 4 * hidden_dim.
        self.mlp1 = nn.Linear(self.hidden_dim * 4, self.hidden_dim)
        self.mlp2 = nn.Linear(self.hidden_dim, self.hidden_dim // 4)
        self.mlp3 = nn.Linear(self.hidden_dim // 4, 1)

    def forward(self, x):
        """Compute one raw logit per sequence.

        Args:
            x: Integer token indices of shape ``(batch, seq_len)``.

        Returns:
            Tensor of shape ``(batch,)`` with unnormalised scores (no sigmoid applied).
        """
        x = self.embed(x)
        x = self.dropout(x)

        x, _ = self.gru(x)
        x, _ = self.gru2(x)
        # Summarise the sequence with the outputs at the first and last time steps.
        x = torch.cat((x[:, 0, :], x[:, -1, :]), dim=-1)

        x = self.dropout(x)
        x = self.relu(self.mlp1(x))
        x = self.relu(self.mlp2(x))
        x = self.mlp3(x)
        # Squeeze only the feature axis: a bare squeeze() would also remove the
        # batch axis for a batch of one and return a 0-d tensor.
        return x.squeeze(-1)


In [None]:
# Training
def train(model, optimizer, loss_function, train_loader, DEVICE):
    """Run one optimisation pass over ``train_loader``.

    Args:
        model: Module called as ``model(x)`` on a batch of integer inputs.
        optimizer: Optimizer that updates ``model``'s parameters.
        loss_function: Callable ``loss_function(predictions, targets)`` returning a scalar tensor.
        train_loader: Iterable yielding ``(x, y)`` batches.
        DEVICE: Device the batches are moved to before the forward pass.

    Returns:
        float: Mean of the per-batch loss values, or ``0.0`` if the loader
        yielded no batches. Callers that ignore the return value are unaffected.
    """
    model.train()
    running_loss = 0.0
    n_batches = 0
    for x, y in train_loader:
        x = x.long().to(DEVICE)
        y = y.long().to(DEVICE)
        optimizer.zero_grad()

        y_pred = model(x)
        loss = loss_function(y_pred.to(DEVICE).float(), y.float())
        loss.backward()
        optimizer.step()

        # Accumulate the detached tensor and convert once at the end, so the
        # loop does not force a host synchronisation on every batch.
        running_loss = running_loss + loss.detach()
        n_batches += 1

    if n_batches == 0:
        return 0.0
    return float(running_loss / n_batches)

In [None]:
# Evaluation metrics: F1 score and accuracy
def getF1(y_pred, y, threshold=0.5):
    """Compute F1 and accuracy for binary predictions.

    Args:
        y_pred: Iterable of scores; a score strictly greater than ``threshold``
            counts as a positive prediction.
        y: Sized sequence of ground-truth labels, expected to be 0 or 1.
        threshold: Decision threshold applied to ``y_pred`` only.

    Returns:
        tuple: ``(F1, acc)`` as floats. A small constant (1e-5) is added to every
        denominator, so empty or degenerate inputs give 0.0 instead of raising.
    """
    tp = fp = fn = tn = 0
    for score, label in zip(y_pred, y):
        predicted_positive = bool(score > threshold)
        # Labels are compared against a fixed 0.5, not against `threshold`,
        # so thresholds outside [0, 1) do not corrupt the counts.
        actual_positive = bool(label > 0.5)
        if actual_positive and predicted_positive:
            tp += 1
        elif actual_positive:
            fn += 1
        elif predicted_positive:
            fp += 1
        else:
            tn += 1

    precision = tp / (tp + fp + 1e-5)
    # Recall is TP / (TP + FN); the denominator must use the missed positives,
    # not the true negatives.
    recall = tp / (tp + fn + 1e-5)
    F1 = 2 * precision * recall / (precision + recall + 1e-5)
    acc = (tp + tn) / (len(y) + 1e-5)
    return F1, acc

In [None]:
# Validation
def evaluate(model, val_loader, loss_function, DEVICE, batch_size, threshold):
    """Evaluate ``model`` on ``val_loader`` and return batch-averaged metrics.

    Args:
        model: Module called as ``model(x)``; its outputs are treated as raw
            logits (the notebook trains it with ``BCEWithLogitsLoss``).
        val_loader: Iterable yielding ``(x, y)`` batches.
        loss_function: Callable ``loss_function(logits, targets)`` returning a scalar tensor.
        DEVICE: Device the batches are moved to.
        batch_size: Kept for interface compatibility; averages are taken over
            the number of batches actually evaluated.
        threshold: Probability threshold for a positive prediction.

    Returns:
        tuple: ``(avg_loss, avg_f1, avg_acc)``, each the mean of the per-batch
        values; all 0.0 if the loader yields no batches.
    """
    model.eval()
    total_loss = 0.0
    total_f1 = 0.0
    total_acc = 0.0
    n_batches = 0
    # No gradients are needed for evaluation; skipping them saves memory and time.
    with torch.no_grad():
        for x, y in val_loader:
            x = x.long().to(DEVICE)
            y = y.long().to(DEVICE)
            # reshape(-1) keeps a batch of one usable even if the model squeezed
            # its output down to a 0-d tensor.
            logits = model(x).to(DEVICE).float().reshape(-1)
            loss = loss_function(logits, y.float().reshape(-1))
            # `threshold` is a probability, so map the logits through a sigmoid
            # before thresholding; plain Python lists avoid per-element device reads.
            probs = torch.sigmoid(logits).tolist()
            f1, acc = getF1(probs, y.reshape(-1).tolist(), threshold)
            total_f1 += f1
            total_acc += acc
            total_loss += loss.item()
            n_batches += 1

    if n_batches == 0:
        return 0.0, 0.0, 0.0
    # Divide by the real batch count: len(dataset) / batch_size is fractional
    # and does not match the number of batches when the last one is dropped.
    avg_loss = total_loss / n_batches
    avg_f1 = total_f1 / n_batches
    avg_acc = total_acc / n_batches
    return avg_loss, avg_f1, avg_acc

In [None]:
# Seed every RNG the notebook imports so that runs are repeatable.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Use the GPU when one is present, otherwise fall back to the CPU instead of failing.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hyperparameters
batch_size = 256
embed_dim = 368
hidden_dim = 512
dropout = 0.7
layers = 1

model = grumodel(embed_dim,vocab_size,hidden_dim,layers,batch_size,dropout)
model.to(device)
# The model returns raw logits, hence the logits-based BCE loss;
# pos_weight scales the loss contribution of positive targets by 1.1.
loss = nn.BCEWithLogitsLoss(pos_weight = 1.1 * torch.ones([1])).to(device)
lr = 0.001
threshold = 0.5

EPOCHS = 20
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [None]:
# Wrap the padded arrays and labels in datasets.
train_dataset = nlp_dataset(train_x,train_y)
test_dataset = nlp_dataset(test_x,test_y)

# Training batches are shuffled each epoch; the incomplete last batch is dropped.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
# Evaluation must see the same samples every epoch. With shuffling on,
# drop_last would discard a different random subset each time and make the
# per-epoch metrics not comparable, so the validation loader is not shuffled.
val_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, drop_last=True)

In [None]:
# Show the first (features, label) pair returned by the dataset.
print(train_dataset[0])

(array([   2,   21,  705,  122,  124,  307, 1540,   74,  757,    3,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0], dtype=int32), 0)


In [None]:

# Mount Google Drive at /content/gdrive (Colab only); the model checkpoint and
# the tokenizer are written under this mount point later on.
from google.colab import drive
drive.mount('/content/gdrive')


Mounted at /content/gdrive


In [None]:
import os

# Checkpoint location on the mounted drive. The folder is created up front so
# the first torch.save() cannot fail on a missing directory.
MODEL_PATH = '/content/gdrive/My Drive/GRUmodel/Khaiii_gru_model.pt'
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)

# NOTE(review): val_loader wraps the test split, so the "best" checkpoint is
# selected on test data — consider holding out a separate validation split.
best_val_f1 = 0
for e in range(1, EPOCHS+1):
    train(model, optimizer, loss,train_loader,device)
    val_loss,val_f1,val_acc = evaluate(model, val_loader, loss, device,batch_size,threshold)

    print("[Epoch: %d] val loss : %1.5f    val acc :%4.3f    F1 :%4.3f" % (e, val_loss, val_acc,val_f1))

    # Keep the weights of the epoch with the highest validation F1.
    if not best_val_f1 or val_f1 > best_val_f1:
        print("Best saved")
        torch.save(model.state_dict(), MODEL_PATH)
        best_val_f1 = val_f1

KeyboardInterrupt: ignored

In [None]:
import json

In [None]:
# Persist the fitted tokenizer on the mounted drive, next to the model checkpoint.
# NOTE(review): to_json() presumably already returns a JSON string, in which case
# json.dumps() encodes it a second time and the file holds a single JSON string
# literal. A loader would then need json.load() first and pass the resulting
# string on to the tokenizer's from-JSON helper — confirm against the loading code.
tokenizer_json = tokenizer.to_json()
with open('/content/gdrive/My Drive/GRUmodel/tokenizer.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(tokenizer_json, ensure_ascii=False))