# Data Processing

## Intent classification 태스크를 위한 데이터 처리

In [2]:
import os
import sys
import json
import torch
import random

import numpy as np
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
from tqdm import tqdm
from tqdm import trange

from gensim.models import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec

#pip install sentencepiece
#pip install pytorch-crf
from src.dataset import Preprocessing
from src.model import EpochLogger, MakeEmbed

class MakeDataset:
    """Build train/test ``TensorDataset`` objects for intent classification.

    Attributes:
        intent_label_dir: path of the JSON file mapping intent name -> label.
        intent_data_dir: path of the CSV file with ``question``/``label`` columns.
        intent_label: the mapping loaded from ``intent_label_dir``.
        intents: list of intent names, in the key order of ``intent_label``.
        prep: ``Preprocessing`` helper used to pad index sequences.
    """

    def __init__(self):
        self.intent_label_dir = "./data/dataset/intent_label.json"
        self.intent_data_dir = "./data/dataset/intent_data.csv"

        self.intent_label = self.load_intent_label()
        self.prep = Preprocessing()

    def load_intent_label(self):
        """Load the predefined intent-label mapping from ``intent_label_dir``.

        Also stores the mapping's keys on ``self.intents``.

        Returns:
            The parsed JSON object (intent name -> label).
        """
        # Context manager so the file handle is closed even if parsing fails.
        with open(self.intent_label_dir, encoding="UTF-8") as f:
            intent_label = json.loads(f.read())
        self.intents = list(intent_label.keys())
        return intent_label

    def tokenize(self, sentence):
        """Tokenize a sentence on whitespace."""
        return sentence.split()

    def tokenize_dataset(self, dataset):
        """Apply :meth:`tokenize` to every sentence in ``dataset``."""
        return [self.tokenize(data) for data in dataset]

    def make_intent_dataset(self, embed):
        """Create the train/test datasets for intent classification.

        Args:
            embed: object exposing ``query2idx(tokens)``; it is forwarded to
                :meth:`word2idx_dataset`.

        Returns:
            ``(train_dataset, test_dataset)`` as returned by
            :meth:`word2idx_dataset`.
        """
        intent_dataset = pd.read_csv(self.intent_data_dir)  # load the raw data

        # Map each label name in the CSV to its value in the label mapping.
        labels = [self.intent_label[label] for label in intent_dataset["label"].to_list()]

        # Tokenize the user utterances.
        intent_querys = self.tokenize_dataset(intent_dataset["question"].tolist())

        # Pair up as (tokenized utterance, intent label).
        dataset = list(zip(intent_querys, labels))
        intent_train_dataset, intent_test_dataset = self.word2idx_dataset(dataset, embed)
        return intent_train_dataset, intent_test_dataset

    def word2idx_dataset(self, dataset, embed, train_ratio=0.8):
        """Convert (tokens, label) pairs to index tensors and split them.

        Args:
            dataset: iterable of ``(tokens, label)`` pairs. It is not modified.
            embed: object exposing ``query2idx(tokens)``.
            train_ratio: fraction of the shuffled samples used for training.

        Returns:
            ``(train_dataset, test_dataset)``, two ``TensorDataset`` objects
            that together contain every input sample exactly once.
        """
        # Shuffle a copy so the train/test split is not biased by intent order
        # and the caller's list is left untouched.
        samples = list(dataset)
        random.shuffle(samples)

        question_list, label_list = [], []
        for query, label in samples:
            q_vec = embed.query2idx(query)  # tokens -> vocabulary indices
            # Pad so that every row can be concatenated into one tensor.
            q_vec = self.prep.pad_idx_sequencing(q_vec)

            question_list.append(torch.tensor([q_vec]))
            label_list.append(torch.tensor([label]))

        x = torch.cat(question_list)
        y = torch.cat(label_list)

        # x and y are built in lockstep above, so they always have the same
        # number of rows; split both at the same position.
        train_size = int(x.size()[0] * train_ratio)

        # TensorDataset wraps the tensors and indexes them along the first
        # dimension, so each item is an (input, label) pair.
        train_dataset = TensorDataset(x[:train_size], y[:train_size])
        # Start the test split exactly at train_size so no sample is dropped.
        test_dataset = TensorDataset(x[train_size:], y[train_size:])

        return train_dataset, test_dataset
            

In [3]:
# Instantiate the dataset builder; its constructor loads the intent-label JSON.
dataset = MakeDataset()

In [4]:
# Load the raw intent CSV directly so it can be inspected below.
intent_dataset = pd.read_csv(dataset.intent_data_dir)

In [5]:
# Preview the first rows of the raw data.
intent_dataset.head()

Unnamed: 0,question,label
0,야 먼지 알려주겠니,dust
1,아니 먼지 정보 알려주세요,dust
2,그 때 미세먼지 어떨까,dust
3,그 때 먼지 좋으려나,dust
4,미세먼지 어떨 것 같은데,dust


In [6]:
# Count rows per intent label to check how balanced the classes are.
intent_dataset.groupby(['label']).count() 

Unnamed: 0_level_0,question
label,Unnamed: 1_level_1
dust,4997
restaurant,4997
travel,4999
weather,4999


In [8]:
# Load the pretrained word2vec embedding helper.
embed = MakeEmbed()
embed.load_word2vec()

batch_size = 128

intent_train_dataset, intent_test_dataset = dataset.make_intent_dataset(embed)

# Each iteration yields one batch of (x, y) of size batch_size.
# Training batches are reshuffled every epoch.
train_dataloader = DataLoader(intent_train_dataset, batch_size=batch_size, shuffle=True)

# Evaluation does not benefit from shuffling; a fixed order keeps the batch
# composition (and therefore the reported metrics) reproducible.
test_dataloader = DataLoader(intent_test_dataset, batch_size=batch_size, shuffle=False)

# Convolutional Neural Networks for Sentence Classification
## * Yoon Kim, New York University
### tensorflow code : https://github.com/SeonbeomKim/TensorFlow-TextCNN/blob/master/TextCNN.py

In [9]:
class textCNN(nn.Module):
    """CNN sentence classifier with several convolution window sizes.

    Args:
        w2v: 2-D tensor of pretrained embeddings; its first dimension is used
            as the vocabulary size and its second as the embedding size.
        dim: number of output channels of each convolution.
        kernels: iterable of window heights, one convolution per entry.
        dropout: dropout probability applied before the final linear layer.
        num_class: number of output classes.
    """

    def __init__(self, w2v, dim, kernels, dropout, num_class):
        super().__init__()
        vocab_size, emb_dim = w2v.size()[0], w2v.size()[1]

        # Two extra rows precede the pretrained vectors (indices 0 and 1);
        # the word2vec weights are copied into rows 2 and onwards.
        self.embed = nn.Embedding(vocab_size + 2, emb_dim)
        self.embed.weight.data[2:].copy_(w2v)
        # self.embed.weight.requires_grad = False # toggle training of the embedding layer

        # One Conv2d(in_channels, out_channels, kernel_size) per window size,
        # each spanning the full embedding width.
        conv_layers = []
        for window in kernels:
            conv_layers.append(nn.Conv2d(1, dim, (window, emb_dim)))
        self.convs = nn.ModuleList(conv_layers)

        # Dropout layer
        self.dropout = nn.Dropout(dropout)

        # Fully connected layer over the concatenated pooled features
        self.fc = nn.Linear(len(kernels) * dim, num_class)

    def forward(self, x):
        """Return class logits for a batch of index sequences.

        Args:
            x: integer index tensor, assumed to be (batch, seq_len) - TODO confirm.

        Returns:
            The output of the final linear layer (one row of ``num_class``
            logits per input sequence).
        """
        # Look up embeddings and insert a channel dimension for Conv2d.
        embedded = self.embed(x).unsqueeze(1)

        pooled = []
        for conv in self.convs:
            feature_map = conv(embedded)
            # Max-pool over the whole remaining sequence axis (dimension 2).
            span = feature_map.size()[2]
            pooled.append(F.max_pool1d(feature_map.squeeze(-1), span))

        # Concatenate per-window features along the channel axis, then drop
        # the pooled axis so the tensor matches the linear layer's input.
        features = torch.cat(pooled, dim=1).squeeze(-1)

        return self.fc(self.dropout(features))

# Helper for saving the model weights
def save(model, save_dir, save_prefix, epoch):
    """Write ``model.state_dict()`` to ``<save_dir>/<save_prefix>_steps_<epoch>.pt``.

    Args:
        model: object exposing ``state_dict()``.
        save_dir: target directory; created (including parents) if missing.
        save_prefix: file-name prefix.
        epoch: value inserted after ``_steps_`` in the file name.
    """
    # exist_ok avoids the check-then-create race of isdir() + makedirs().
    os.makedirs(save_dir, exist_ok=True)
    save_prefix = os.path.join(save_dir, save_prefix)
    save_path = '{}_steps_{}.pt'.format(save_prefix, epoch)
    torch.save(model.state_dict(), save_path)

In [10]:
# Pretrained word2vec vectors, converted to a float tensor for the embedding layer.
weights = torch.FloatTensor(embed.word2vec.wv.vectors)

# One output class per entry in the intent-label mapping.
num_class = len(dataset.intent_label)
model = textCNN(
    w2v=weights,
    dim=256,
    kernels=[3, 4, 5],
    dropout=0.5,
    num_class=num_class,
)
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01)

In [11]:
# Inspect the (inputs, labels) tensors wrapped by the training TensorDataset.
intent_train_dataset.tensors

(tensor([[ 53, 144,  43,  ...,   0,   0,   0],
         [  8, 186,   4,  ...,   0,   0,   0],
         [ 75,  11, 172,  ...,   0,   0,   0],
         ...,
         [ 93,  26, 647,  ...,   0,   0,   0],
         [326, 608,  17,  ...,   0,   0,   0],
         [698,   8, 170,  ...,   0,   0,   0]]),
 tensor([2, 3, 2,  ..., 2, 1, 3]))

In [12]:
# Display the module structure of the classifier.
model

textCNN(
  (embed): Embedding(1481, 300)
  (convs): ModuleList(
    (0): Conv2d(1, 256, kernel_size=(3, 300), stride=(1, 1))
    (1): Conv2d(1, 256, kernel_size=(4, 300), stride=(1, 1))
    (2): Conv2d(1, 256, kernel_size=(5, 300), stride=(1, 1))
  )
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=768, out_features=4, bias=True)
)

# Training

In [14]:
epoch = 10
prev_acc = 0  # best validation accuracy (percent) seen so far
save_dir = "./data/pretraining/1_intent_clsf_model/"
save_prefix = "intent_clsf"
for i in range(epoch):
    # ---- training pass ----
    model.train()  # training mode (dropout active)
    with tqdm(train_dataloader, unit="batch") as tepoch:
        for data in tepoch:
            tepoch.set_description(f"Epoch {i}")
            x = data[0]
            target = data[1]

            optimizer.zero_grad()
            # Call the module itself rather than .forward() so hooks run.
            logit = model(x)
            loss = F.cross_entropy(logit, target)  # loss function
            loss.backward()
            optimizer.step()

            corrects = (torch.max(logit, 1)[1].view(target.size()) == target).sum().item()
            accuracy = 100.0 * corrects / x.size()[0]
            tepoch.set_postfix(loss=loss.item(), accuracy=accuracy)

    # ---- validation pass ----
    # eval() only switches layers such as dropout to inference behaviour;
    # no_grad() is what disables gradient tracking for the forward passes.
    model.eval()
    n_correct = 0
    n_seen = 0
    with torch.no_grad(), tqdm(test_dataloader, unit="batch") as tepoch:
        for data in tepoch:
            tepoch.set_description(f"Epoch {i}")
            x = data[0]
            target = data[1]

            logit = model(x)
            loss = F.cross_entropy(logit, target)
            n_correct += (torch.max(logit, 1)[1].view(target.size()) == target).sum().item()
            n_seen += x.size()[0]

            tepoch.set_postfix(loss=loss.item(), accuracy=100.0 * n_correct / n_seen)

    # Accuracy over all validation samples: counting samples instead of
    # averaging per-batch percentages keeps a smaller final batch from
    # being over-weighted.
    acc = 100.0 * n_correct / n_seen if n_seen else 0.0
    # Save a checkpoint whenever the validation accuracy improves.
    if acc > prev_acc:
        prev_acc = acc
        save(model, save_dir, save_prefix + "_" + str(round(acc, 3)), i)

Epoch 0: 100%|██████████| 125/125 [01:32<00:00,  1.36batch/s, accuracy=99.17355, loss=0.393]  
Epoch 0: 100%|██████████| 32/32 [00:03<00:00,  9.20batch/s, accuracy=98.1, loss=6.76e-5]


# Load & Test

In [15]:
# NOTE(review): this path points under "./data/pretraining/save/..." and names
# epoch 33, whereas the training cell writes to "./data/pretraining/1_intent_clsf_model/"
# and runs 10 epochs - presumably a checkpoint from an earlier run; confirm it exists.
# map_location="cpu" lets the checkpoint load on a CPU-only machine even if its
# tensors were saved from a CUDA device.
model.load_state_dict(torch.load("./data/pretraining/save/1_intent_clsf_model/intent_clsf_97.217_steps_33.pt", map_location="cpu"))

# Switch to inference mode (disables dropout).
model.eval()

textCNN(
  (embed): Embedding(1481, 300)
  (convs): ModuleList(
    (0): Conv2d(1, 256, kernel_size=(3, 300), stride=(1, 1))
    (1): Conv2d(1, 256, kernel_size=(4, 300), stride=(1, 1))
    (2): Conv2d(1, 256, kernel_size=(5, 300), stride=(1, 1))
  )
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=768, out_features=4, bias=True)
)

In [16]:
q = "제주도 미세먼지 알려줘"

# Tokenize, convert tokens to vocabulary indices, then pad - the same
# preprocessing steps applied when the training data was built.
x = dataset.prep.pad_idx_sequencing(embed.query2idx(dataset.tokenize(q)))

x = torch.tensor(x)
# Inference only: skip gradient tracking. unsqueeze(0) adds the batch dimension.
with torch.no_grad():
    f = model(x.unsqueeze(0))

# Training targets are the values of intent_label, so map the predicted class
# index back through those values instead of relying on the dict's key order.
idx2intent = {idx: name for name, idx in dataset.intent_label.items()}
intent = idx2intent[torch.argmax(f).tolist()]

print("발화 : " + q)
print("의도 : " + intent)

발화 : 제주도 미세먼지 알려줘
의도 : dust
