In [27]:
!pip install konlpy
!pip install torchtext



In [28]:
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
import torch.nn.functional as F
import torch.nn as nn
from konlpy.tag import Okt
#주요 참고 PyTorch로 시작하는 딥 러닝 입문, 유원준
from torchtext import data  
import urllib.request
import pandas as pd
import random

In [29]:

from google.colab import drive
drive.mount('/content/gdrive')


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [30]:
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x7f281f74e1f8>

In [34]:
'''

urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt", filename="ratings_train.txt")
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt", filename="ratings_test.txt")


train_df = pd.read_table('ratings_train.txt')
test_df = pd.read_table('ratings_test.txt')
train_df.head(10)

train_df['document'] = train_df['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","")
train_df.drop_duplicates(subset=['document'], inplace=True)
test_df.drop_duplicates(subset=['document'], inplace=True)
train_df.dropna(inplace=True)
test_df.dropna(inplace=True)

train_df.to_csv("/content/gdrive/My Drive/datas/ratings_train.csv", mode='w',index=False)
test_df.to_csv("/content/gdrive/My Drive/datas/ratings_test.csv", mode='w',index=False)

'''

In [35]:
tokenizer = Okt()
# 필드 정의
ID = data.Field(sequential = False,
                use_vocab = False) 

TEXT = data.Field(sequential=True,
                  use_vocab=True,
                  tokenize=tokenizer.morphs, 
                  lower=True,
                  batch_first=True,
                  fix_length=128)

LABEL = data.Field(sequential=False,
                   use_vocab=False,
                   is_target=True)

In [36]:
from torchtext.data import TabularDataset
train_data = TabularDataset(path='/content/gdrive/My Drive/datas/ratings_train.csv', format='csv',fields=[('id', ID), ('document', TEXT), ('label', LABEL)], skip_header=True)
test_data = TabularDataset(path='/content/gdrive/My Drive/datas/ratings_test.csv', format='csv',fields=[('id', ID), ('document', TEXT), ('label', LABEL)], skip_header=True)

In [37]:
class grumodel(nn.Module):
    def __init__(self, embed_dim, vocab_size, hidden_dim, num_layers, batch_size, dropout):
        super(grumodel, self).__init__()
        self.embed_dim = embed_dim
        self.vocab_size = vocab_size 
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.relu = nn.ReLU()
        self.batch_size = batch_size
        self.sigmoid = nn.Sigmoid()


        self.norm = nn.BatchNorm1d(self.batch_size)
        self.embed = nn.Embedding(self.vocab_size,self.embed_dim)
        self.dropout = nn.Dropout(dropout)
        
        self.gru = nn.GRU(self.embed_dim, self.hidden_dim, self.num_layers,bidirectional=True,batch_first=True)
        self.gru2 = nn.GRU(self.hidden_dim*2, self.hidden_dim, self.num_layers,bidirectional=True,batch_first=True)

        self.mlp1 = nn.Linear(self.hidden_dim*4,self.hidden_dim)
        self.mlp2 = nn.Linear(self.hidden_dim,self.hidden_dim//4)
        self.mlp3 = nn.Linear(self.hidden_dim//4,1)

    def forward(self,x):
        x = self.embed(x)
        x = self.dropout(x)

        x, _ = self.gru(x)
        x, _ = self.gru2(x)
        x = torch.cat((x[:,0,:],x[:,-1,:]),dim=-1)

        x = self.dropout(x)
        x = self.mlp1(x)
        x = self.relu(x)
        x = self.mlp2(x)
        x = self.relu(x)
        x = self.mlp3(x)
        x = self.sigmoid(x).squeeze()
        return x


In [56]:
class grubase(nn.Module):
    def __init__(self, embed_dim, vocab_size, hidden_dim, num_layers, batch_size, dropout):
        super(grubase, self).__init__()
        self.embed_dim = embed_dim
        self.vocab_size = vocab_size 
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.relu = nn.ReLU()
        self.batch_size = batch_size
        self.sigmoid = nn.Sigmoid()


        self.norm = nn.BatchNorm1d(self.batch_size)
        self.embed = nn.Embedding(self.vocab_size,self.embed_dim)
        self.dropout = nn.Dropout(dropout)
        
        self.gru = nn.GRU(self.embed_dim, self.hidden_dim//2, self.num_layers,bidirectional=True,batch_first=True)
        #self.gru2 = nn.GRU(self.hidden_dim, self.hidden_dim, self.num_layers,bidirectional=True,batch_first=True)

        self.mlp1 = nn.Linear(self.hidden_dim*2,1)

    def forward(self,x):
        x = self.embed(x)
        x = self.dropout(x)

        x, _ = self.gru(x)
        x = torch.cat((x[:,0,:],x[:,-1,:]),dim=-1)

        x = self.dropout(x)
        x = self.mlp1(x)
        x = self.sigmoid(x).squeeze()
        return x


In [57]:
def train(model, optimizer, loss_function,train_iter,DEVICE):
    model.train()
    for b, batch in enumerate(train_iter):
        x, y = batch.document.to(DEVICE), batch.label.to(DEVICE)
        optimizer.zero_grad()

        y_pred = model(x)
        loss = loss_function(y_pred.to(DEVICE).float(), y.float())
        loss.backward()
        optimizer.step()

In [58]:
def getF1(y_pred,y,threshold=0.5):
    
    yp = [1 if x > threshold else 0 for x in y_pred]

    pp = 0
    pf = 0
    fp = 0
    ff = 0
    for i in range(len(y)):
        if y[i] > threshold:
            if yp[i] > threshold: pp += 1
            else: pf += 1
        else:
            if yp[i] < threshold: ff += 1
            else: fp += 1

    precision = pp / (pp + fp + 1e-5) 
    recall = pp / (pp + ff + 1e-5)
    F1 = 2 * precision * recall / (precision + recall + 1e-5)
    acc = (pp + ff) / (len(y) + 1e-5)
    return F1, acc

In [59]:
def evaluate(model, val_iter, loss_function, DEVICE, batch_size, threshold):
    """evaluate model"""
    model.eval()
    total_loss = 0
    total_f1 = 0
    total_acc = 0
    for batch in val_iter:
        x, y = batch.document.to(DEVICE), batch.label.to(DEVICE)
        y_pred = model(x)
        loss = loss_function(y_pred.to(DEVICE).float(), y.float())
        f1, acc = getF1(y_pred,y,threshold)
        total_f1 += f1
        total_acc += acc
        total_loss += loss.item()

    size = len(val_iter.dataset) / batch_size
    avg_loss = total_loss / size
    avg_f1 = total_f1 / size
    avg_acc = total_acc / size
    return avg_loss, avg_f1, avg_acc

In [60]:
print('훈련 샘플의 개수 : {}'.format(len(train_data)))
print('테스트 샘플의 개수 : {}'.format(len(test_data)))

훈련 샘플의 개수 : 143682
테스트 샘플의 개수 : 49157


In [61]:
print(vars(train_data[0]))

{'id': '9976970', 'document': ['아', '더빙', '진짜', '짜증나네요', '목소리'], 'label': '0'}


In [62]:
device = torch.device("cuda")
vocab_size = 10000
batch_size = 256
embed_dim = 128
hidden_dim = 256
dropout = 0.6
layers = 1

model = grubase(embed_dim,vocab_size+2,hidden_dim,layers,batch_size,dropout)
model.to(device)
loss = nn.BCELoss()
lr = 0.002
threshold = 0.4

EPOCHS = 20
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [63]:
TEXT.build_vocab(train_data, min_freq=5, max_size=vocab_size)
print('단어 집합의 크기 : {}'.format(len(TEXT.vocab)))

단어 집합의 크기 : 10002


In [64]:

'''
train_iter, val_iter = data.BucketIterator.splits(
        (train_data, test_data), batch_size=BATCH_SIZE,
        shuffle=True, repeat=False)
'''
from torchtext.data import Iterator
train_loader = Iterator(dataset=train_data, batch_size = batch_size)
val_loader = Iterator(dataset=test_data, batch_size = batch_size)

In [65]:
best_val_f1 = 0
for e in range(1, EPOCHS+1):
    train(model, optimizer, loss,train_loader,device)
    val_loss,val_f1,val_acc = evaluate(model, val_loader, loss, device,batch_size,threshold)

    print("[Epoch: %d] val loss : %1.5f    val acc :%4.3f    F1 :%4.3f" % (e, val_loss, val_acc,val_f1))

    # 검증 오차가 가장 적은 최적의 모델을 저장
    if not best_val_f1 or val_f1 > best_val_f1:
        print("Best saved")
        torch.save(model.state_dict(), '/content/gdrive/My Drive/GRUmodel/NLP_esemble_model.pt')
        best_val_f1 = val_f1

[Epoch: 1] val loss : 0.43359    val acc :0.803    F1 :0.624
Best saved
[Epoch: 2] val loss : 0.38740    val acc :0.826    F1 :0.633
Best saved
[Epoch: 3] val loss : 0.38005    val acc :0.838    F1 :0.625
[Epoch: 4] val loss : 0.36578    val acc :0.838    F1 :0.643
Best saved
[Epoch: 5] val loss : 0.36083    val acc :0.845    F1 :0.639
[Epoch: 6] val loss : 0.35506    val acc :0.844    F1 :0.643
[Epoch: 7] val loss : 0.35478    val acc :0.847    F1 :0.636
[Epoch: 8] val loss : 0.35239    val acc :0.848    F1 :0.641
[Epoch: 9] val loss : 0.35210    val acc :0.846    F1 :0.641
[Epoch: 10] val loss : 0.34766    val acc :0.846    F1 :0.645
Best saved
[Epoch: 11] val loss : 0.35294    val acc :0.848    F1 :0.644
[Epoch: 12] val loss : 0.34925    val acc :0.845    F1 :0.643
[Epoch: 13] val loss : 0.35409    val acc :0.848    F1 :0.643
[Epoch: 14] val loss : 0.35669    val acc :0.851    F1 :0.637
[Epoch: 15] val loss : 0.34809    val acc :0.849    F1 :0.641
[Epoch: 16] val loss : 0.35585    v