In [1]:
!pip install torch torchvision




In [None]:
!git clone https://github.com/kakao/khaiii.git

!pip install cmake

!mkdir build

!cd build && cmake /content/khaiii

!cd /content/build/ && make all

!cd /content/build/ && make resource

!cd /content/build && make install

!cd /content/build && make package_python

!pip install /content/build/package_python


In [3]:

import datetime
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import numpy as np
import argparse
import time
from copy import deepcopy # Add Deepcopy for args

In [4]:
# Mount Google Drive at /content/gdrive; later cells read the model checkpoint
# and the tokenizer file from paths under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [5]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.text import tokenizer_from_json
from tensorflow.keras.preprocessing.sequence import pad_sequences
import json

In [6]:
# Minimal dataset wrapper
class nlp_dataset(Dataset):
    """Thin ``Dataset`` wrapper around an indexable collection of samples."""

    def __init__(self, x):
        # Keep a reference to the samples; no copy is made.
        self.x = x

    def __getitem__(self, idx):
        """Return the sample stored at position ``idx``."""
        return self.x[idx]

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.x)

In [7]:
# Tuned model
class grumodel(nn.Module):
    """Two stacked bidirectional GRUs followed by a three-layer MLP head.

    Args:
        embed_dim: size of each token embedding vector.
        vocab_size: number of rows in the embedding table.
        hidden_dim: hidden size of each GRU direction.
        num_layers: number of layers inside each GRU module.
        batch_size: only used to size the (unused) ``norm`` layer.
        dropout: dropout probability applied after the embedding and before
            the MLP head.
    """

    def __init__(self, embed_dim, vocab_size, hidden_dim, num_layers, batch_size, dropout):
        super().__init__()
        self.embed_dim = embed_dim
        self.vocab_size = vocab_size
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.relu = nn.ReLU()
        self.batch_size = batch_size
        # Not applied in forward(); kept so the module listing is unchanged.
        self.sigmoid = nn.Sigmoid()

        # Not applied in forward(), but it registers parameters and buffers in
        # the state dict. Removing it would change the state-dict keys and
        # could break strict loading of checkpoints saved with this layout.
        self.norm = nn.BatchNorm1d(self.batch_size)
        self.embed = nn.Embedding(self.vocab_size, self.embed_dim)
        self.dropout = nn.Dropout(dropout)

        self.gru = nn.GRU(self.embed_dim, self.hidden_dim, self.num_layers,
                          bidirectional=True, batch_first=True)
        # The first GRU is bidirectional, so its output width is 2 * hidden_dim.
        self.gru2 = nn.GRU(self.hidden_dim * 2, self.hidden_dim, self.num_layers,
                           bidirectional=True, batch_first=True)

        # The head receives the first and last time steps concatenated:
        # 2 * (2 * hidden_dim) features.
        self.mlp1 = nn.Linear(self.hidden_dim * 4, self.hidden_dim)
        self.mlp2 = nn.Linear(self.hidden_dim, self.hidden_dim // 4)
        self.mlp3 = nn.Linear(self.hidden_dim // 4, 1)

    def forward(self, x):
        """Score a batch of token-id sequences.

        Args:
            x: integer tensor of shape (batch, seq_len) holding token ids.

        Returns:
            Tensor of shape (batch,) with one raw score per sequence; no
            sigmoid is applied here.
        """
        x = self.embed(x)
        x = self.dropout(x)

        x, _ = self.gru(x)
        x, _ = self.gru2(x)
        # Summarise the sequence by its first and last time-step outputs.
        x = torch.cat((x[:, 0, :], x[:, -1, :]), dim=-1)

        x = self.dropout(x)
        x = self.mlp1(x)
        x = self.relu(x)
        x = self.mlp2(x)
        x = self.relu(x)
        x = self.mlp3(x)
        # Drop only the trailing feature axis. A bare squeeze() would also
        # remove the batch axis when the batch holds a single sequence.
        return x.squeeze(-1)


In [8]:
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell also runs on a CPU-only runtime.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

vocab_size = 6000   # embedding table size passed to grumodel
pad_len = 30        # sequence length used when padding model inputs

batch_size = 256
embed_dim = 368
hidden_dim = 512
dropout = 0.7
layers = 1

model = grumodel(embed_dim,vocab_size,hidden_dim,layers,batch_size,dropout)
modelPath = '/content/gdrive/My Drive/GRUmodel/Khaiii_gru_model.pt'
# map_location lets a checkpoint that was saved from a GPU be loaded on a
# CPU-only runtime. torch.load unpickles the file, so only load checkpoints
# from a trusted source.
model.load_state_dict(torch.load(modelPath, map_location=device))
model.to(device)

# Training-related objects; the inference cells visible in this notebook do
# not use them, but the names are kept in case other cells rely on them.
loss = nn.BCEWithLogitsLoss(pos_weight = 1.1 * torch.ones([1])).to(device)
lr = 0.001
threshold = 0.5

EPOCHS = 20
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# Switch to evaluation mode (disables dropout); as the last expression of the
# cell this also displays the module summary.
model.eval()

grumodel(
  (relu): ReLU()
  (sigmoid): Sigmoid()
  (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (embed): Embedding(6000, 368)
  (dropout): Dropout(p=0.7, inplace=False)
  (gru): GRU(368, 512, batch_first=True, bidirectional=True)
  (gru2): GRU(1024, 512, batch_first=True, bidirectional=True)
  (mlp1): Linear(in_features=2048, out_features=512, bias=True)
  (mlp2): Linear(in_features=512, out_features=128, bias=True)
  (mlp3): Linear(in_features=128, out_features=1, bias=True)
)

In [9]:
# Load the fitted tokenizer from Drive.
# No placeholder Tokenizer is created first: if reading the file fails, the
# name `tokenizer` must not end up bound to an unfitted instance.
# NOTE(review): tokenizer_from_json is given whatever json.load returns here;
# presumably the file stores the string produced by Tokenizer.to_json() -
# confirm against how the file was written.
with open('/content/gdrive/My Drive/GRUmodel/tokenizer.json', encoding='utf-8') as f:
    data = json.load(f)
tokenizer = tokenizer_from_json(data)

In [10]:
# Stopwords: morphemes that are dropped before tokenisation
stopwords = [
    '의',
    '가',
    '이',
    '은',
    '들',
    '는',
    '좀',
    '잘',
    '걍',
    '과',
    '도',
    '를',
    '으로',
    '자',
    '에',
    '와',
    '한',
    '하다',
]

In [11]:
from khaiii import KhaiiiApi

# Analyzer instance; predict() in a later cell uses this same object.
api = KhaiiiApi()

# Quick smoke test: collect the morphemes of a sample sentence, leaving out
# anything listed in `stopwords`.
temp = [
    morph.lex
    for word in api.analyze("너무 재밌어요")
    for morph in word.morphs
    if morph.lex not in stopwords
]


In [12]:
# Show the morphemes collected in `temp` by the smoke test.
print(temp)

['너무', '재밌', '어요']


In [22]:
def predict(original_sentence):
    """Classify one sentence with the loaded GRU model.

    The sentence is split into morphemes with khaiii, morphemes listed in
    ``stopwords`` are dropped, and the rest is wrapped in 'CLS' / 'SEP'
    markers. The result is converted to token ids with ``tokenizer`` and
    post-padded to ``pad_len`` before being passed to ``model``.

    Args:
        original_sentence: raw text to classify.

    Returns:
        'P' when the model output is greater than 0, otherwise 'B'.
    """
    sentence = ['CLS']
    sentence.extend(
        morph.lex
        for word in api.analyze(original_sentence)
        for morph in word.morphs
        if morph.lex not in stopwords
    )
    sentence.append('SEP')

    x = tokenizer.texts_to_sequences([sentence])
    x = pad_sequences(x, maxlen=pad_len, padding='post')  # pad at the end
    x = torch.tensor(x).to(device).long()

    # Inference only: no_grad avoids building an autograd graph for the call.
    with torch.no_grad():
        y = model(x)

    # .item() reads the single score whether the output is a 0-d tensor or a
    # one-element tensor.
    return 'P' if y.item() > 0 else 'B'

In [28]:
# Spot-check predict() on a single input string and print the returned label.
print(predict(" 씨발"))

B


In [15]:
!pip install pymysql
import pymysql

Collecting pymysql
[?25l  Downloading https://files.pythonhosted.org/packages/1a/ea/dd9c81e2d85efd03cfbf808736dd055bd9ea1a78aea9968888b1055c3263/PyMySQL-0.10.1-py2.py3-none-any.whl (47kB)
[K     |██████▉                         | 10kB 28.5MB/s eta 0:00:01[K     |█████████████▊                  | 20kB 3.1MB/s eta 0:00:01[K     |████████████████████▌           | 30kB 4.1MB/s eta 0:00:01[K     |███████████████████████████▍    | 40kB 4.4MB/s eta 0:00:01[K     |████████████████████████████████| 51kB 2.7MB/s 
[?25hInstalling collected packages: pymysql
Successfully installed pymysql-0.10.1


In [42]:
import os  # imported here so this cell stays runnable on its own

# Connection settings can be supplied through environment variables so that
# credentials do not have to be stored in the notebook. The fallbacks are the
# values that were hard-coded here before.
conn = pymysql.connect(
    host=os.environ.get('JUKER_DB_HOST',
                        'jukerdb.cwhsnjoqybdo.ap-northeast-2.rds.amazonaws.com'),
    user=os.environ.get('JUKER_DB_USER', 'admin'),
    password=os.environ.get('JUKER_DB_PASSWORD', ''),
    db='WEB', charset='utf8')

try:
    # The cursor is closed automatically when the with-block exits.
    with conn.cursor() as curs:
        # Fetch every row whose class column is 'N'
        # (presumably "not yet classified" - confirm against the schema).
        curs.execute("select * from reply where class = %s", ('N',))
        reply = curs.fetchall()

        curs.execute("select * from rereply where class = %s", ('N',))
        reply2 = curs.fetchall()

        # Classify everything before writing anything back.
        # reply columns used: 0 pageid / 1 reid / 5 text passed to predict()
        update_temp = [(line[0], line[1], predict(line[5])) for line in reply]
        # rereply columns used: 0 pageid / 1 reid / 2 rereid / 6 text passed to predict()
        # NOTE(review): an earlier comment gave index 5 for the rereply text,
        # but the code reads index 6 - confirm against the rereply schema.
        update_temp2 = [(line[0], line[1], line[2], predict(line[6])) for line in reply2]

        # Write the labels back, one commit per table instead of one per row.
        curs.executemany(
            """UPDATE reply SET class = %s WHERE pageid = %s AND reid = %s""",
            [(label, pageid, reid) for pageid, reid, label in update_temp])
        conn.commit()

        curs.executemany(
            """UPDATE rereply SET class = %s WHERE pageid = %s AND reid = %s AND rereid = %s""",
            [(label, pageid, reid, rereid)
             for pageid, reid, rereid, label in update_temp2])
        conn.commit()

finally:
    # Always release the connection, even when a query or predict() fails.
    conn.close()