# [튜토리얼2 - LSTM 모델을 활용한 IMDB 데이터 감성 예측]

### jupyter notebook 단축키

- ctrl+enter: 셀 실행   
- shift+enter: 셀 실행 및 다음 셀 이동   
- alt+enter: 셀 실행, 다음 셀 이동, 새로운 셀 생성
- a: 상단에 새로운 셀 만들기
- b: 하단에 새로운 셀 만들기
- dd: 셀 삭제 (x: 셀 잘라내기)
- y: Code로 변경
- m: Markdown으로 변경

## 1. 모듈 불러오기

#### import '불러올 패키지명' as '그 패키지를 파이썬에서 사용할 이름'

In [None]:
# Mount Google Drive in Colab and switch to the tutorial's working directory.
import os

from google.colab import drive

# The mount point must be an absolute path so that the chdir below, which is
# also absolute, points inside the mounted drive.
drive.mount('/content/gdrive/')
os.chdir('/content/gdrive/My Drive/Day3/hands-on/3일차_RNN2/')

In [None]:
#### computer vision을 다룰 때에는 torchvision제공
#### text를 다룰 때에는 torchtext제공
from torchtext import data,datasets
# pip install torchtext

from torchtext.vocab import GloVe,FastText,CharNGram
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
import torch
from torchtext.datasets.imdb import IMDB
import sys

# Prefer the GPU when CUDA is usable; otherwise run everything on the CPU.
is_cuda = torch.cuda.is_available()
if is_cuda:
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

## 2. 데이터

#### torchtext 사용: torchtext에서 제공하는 IMDB 데이터셋 사용

### 2.1. 데이터 불러오기

#### IMDB 영화 리뷰 데이터셋 내려받기 (torchtext가 Stanford aclImdb 데이터를 자동으로 내려받음)
#### 참고(다른 IMDB 데이터셋): https://www.kaggle.com/orgesleka/imdbmovies

- X: large movie review sentence dataset
- y: pos(긍정) / neg(부정)

In [None]:
# Field definitions describing how raw examples become tensors.
# TEXT (model input X): lower-cased tokens, every example forced to a fixed
# length of 200 tokens, batches laid out with the sequence dimension first.
TEXT = data.Field(
    lower=True,
    fix_length=200,
    batch_first=False,
)
# LABEL (target y): a single non-sequential value per example.
LABEL = data.Field(sequential=False)

In [None]:
# Load the IMDB train and test splits, using TEXT for the review text and
# LABEL for the sentiment label.
train, test = IMDB.splits(TEXT, LABEL)

In [None]:
# Build the vocabularies from the training split only.
# Names of the pretrained vector sets that can be requested:
'''
- charngram.100d
- fasttext.en.300d
- fasttext.simple.300d
- glove.42B.300d
- glove.840B.300d
- glove.twitter.27B.25d
- glove.twitter.27B.50d
- glove.twitter.27B.100d
- glove.twitter.27B.200d
- glove.6B.50d
- glove.6B.100d
- glove.6B.200d
- glove.6B.300d
'''

# Text vocabulary: at most 10000 entries, words seen fewer than 10 times are
# dropped, and 300-dimensional GloVe 6B vectors are attached.
TEXT.build_vocab(
    train,
    vectors=GloVe(name='6B', dim=300),
    max_size=10000,
    min_freq=10,
)
# Label vocabulary built from the same training split.
LABEL.build_vocab(train)

In [None]:
# Inspect the training split: its field mapping, its size, and the raw
# attributes of the first example.
print('train.fields', train.fields)
print('len(train)', len(train))
print('vars(train[0])', vars(train[0]))

In [None]:
# Show the frequency counts stored on the LABEL vocabulary (built from train).
LABEL.vocab.freqs

In [None]:
# List the attribute names stored on the TEXT vocabulary object.
d = vars(TEXT.vocab)
d.keys()

In [None]:
# Show the vectors attached to the TEXT vocabulary by build_vocab(vectors=...).
TEXT.vocab.vectors

In [None]:
# Shape of the vocabulary vectors; presumably (vocabulary size, 300) given the
# 300-dimensional GloVe vectors requested above -- confirm by running.
TEXT.vocab.vectors.shape

### 2.2. 벡터 배치 생성하기

#### 단어를 인덱스 번호로 대체

In [None]:
# Batch iterators over the train and test splits, 32 examples per batch.
# Pass the torch.device chosen earlier instead of the deprecated integer form
# (device=-1), so batches are created on the same device the model runs on.
train_iter, test_iter = data.BucketIterator.splits((train, test), batch_size=32, device=device)
# Stop after a single pass over the data instead of cycling forever.
train_iter.repeat = False
test_iter.repeat = False

In [None]:
# Pull one batch from the training iterator and show the shape of its text
# tensor; presumably (200, 32) given fix_length=200, batch_first=False and
# batch_size=32 -- confirm by running.
batch = next(iter(train_iter))
batch.text.shape

In [None]:
# Shape of the label tensor for the batch fetched in the previous cell.
batch.label.shape

## 3. 모델 정의

In [None]:
class IMDBRnn(nn.Module):
    """LSTM classifier: embedding -> stacked LSTM -> linear -> log-softmax.

    Args:
        vocab: Number of rows in the embedding table (vocabulary size).
        hidden_size: Width of the embedding vectors and of the LSTM hidden
            state (the same value is used for both).
        n_cat: Number of output classes.
        bs: Initial batch-size hint; replaced by the real batch size on every
            forward pass.
        nl: Number of stacked LSTM layers.
    """

    def __init__(self,vocab,hidden_size,n_cat,bs=1,nl=2):
        super().__init__()
        self.hidden_size = hidden_size
        self.bs = bs
        self.nl = nl
        # Size the embedding from the constructor argument rather than from a
        # notebook-level global, so the class works on its own.
        self.e = nn.Embedding(vocab,hidden_size)
        self.rnn = nn.LSTM(hidden_size,hidden_size,nl)
        self.fc2 = nn.Linear(hidden_size,n_cat)
        self.softmax = nn.LogSoftmax(dim=-1)

    def forward(self,inp):
        """Return per-class log-probabilities for a batch of token indices.

        Args:
            inp: Integer tensor of token indices laid out as
                (seq_len, batch); dimension 1 is read as the batch size.

        Returns:
            Tensor of shape (batch, n_cat) holding log-probabilities.
        """
        # Track the actual batch size; the last batch of an epoch may be smaller.
        self.bs = inp.size(1)
        e_out = self.e(inp)  # (seq_len, batch, hidden_size)

        # Zero initial hidden/cell states on the same device and dtype as the
        # embeddings.
        state_shape = (self.nl, self.bs, self.hidden_size)
        h0 = e_out.new_zeros(state_shape)
        c0 = e_out.new_zeros(state_shape)

        rnn_o, _ = self.rnn(e_out, (h0, c0))
        rnn_o = rnn_o[-1]  # top-layer output at the last time step

        # Tie dropout to the module's mode: F.dropout defaults to
        # training=True and would otherwise keep dropping units under eval().
        fc = F.dropout(self.fc2(rnn_o), p=0.8, training=self.training)
        return self.softmax(fc)

In [None]:
# Model sizes: the vocabulary size comes from the built TEXT vocabulary;
# n_hidden is the embedding / LSTM hidden width passed to the model.
n_vocab = len(TEXT.vocab)
n_hidden = 100

In [None]:
# Number of examples in the dataset behind the training iterator.
len(train_iter.dataset)

## 4. 모델 학습

In [None]:
# Build the classifier with 3 output classes.
# NOTE(review): presumably 3 because the LABEL vocabulary holds an <unk>
# entry besides the two sentiment labels -- confirm against LABEL.vocab.
model = IMDBRnn(n_vocab, n_hidden, 3, bs=32)
# Move to the device selected earlier; an unconditional .cuda() would crash
# on machines without a GPU.
model = model.to(device)

optimizer = optim.Adam(model.parameters(),lr=1e-3)

def fit(epoch,model,data_loader,phase='training',volatile=False):
    """Run one pass of ``model`` over ``data_loader`` and report loss/accuracy.

    In the 'training' phase the module-level ``optimizer`` is stepped after
    every batch. In any other phase no gradients are tracked and no
    parameters are updated.

    Args:
        epoch: Epoch number supplied by the caller; not used in the
            computation.
        model: Module that maps ``batch.text`` to log-probabilities of shape
            (batch, n_classes), as expected by ``F.nll_loss``.
        data_loader: Iterable of batches exposing ``.text`` and ``.label``,
            with a sized ``.dataset`` attribute.
        phase: 'training' puts the model in train mode; 'validation' puts it
            in eval mode.
        volatile: Ignored. Kept only so existing calls keep working; gradient
            tracking is controlled by ``phase``.

    Returns:
        Tuple ``(loss, accuracy)`` of Python floats: the mean per-example
        loss and the accuracy in percent over ``data_loader.dataset``.
    """
    training = phase == 'training'
    if training:
        model.train()
    if phase == 'validation':
        model.eval()

    # Put each batch wherever the model's parameters live, so data and
    # weights can never end up on different devices.
    device = next(model.parameters()).device
    n_samples = len(data_loader.dataset)
    running_loss = 0.0
    running_correct = 0

    # Only build the autograd graph when we are actually going to backprop.
    with torch.set_grad_enabled(training):
        for batch in data_loader:
            text = batch.text.to(device)
            target = batch.label.to(device)

            if training:
                optimizer.zero_grad()
            output = model(text)
            loss = F.nll_loss(output,target)

            # Mean loss times batch size gives the summed loss for the batch;
            # accumulate plain Python numbers rather than tensors.
            running_loss += loss.item() * target.size(0)
            preds = output.argmax(dim=1)
            running_correct += (preds == target).sum().item()

            if training:
                loss.backward()
                optimizer.step()

    loss = running_loss/n_samples
    accuracy = 100. * running_correct/n_samples

    print(f'{phase} loss is {loss:{5}.{2}} and {phase} accuracy is {running_correct}/{n_samples}{accuracy:{10}.{4}}')
    return loss,accuracy

In [None]:
# Per-epoch history of loss and accuracy for the training and validation runs.
train_losses , train_accuracy = [],[]
val_losses , val_accuracy = [],[]

# Train for epochs 1..4; after each training pass, evaluate on the test
# iterator and record both sets of metrics.
for epoch in range(1,5):

    epoch_loss, epoch_accuracy = fit(epoch,model,train_iter,phase='training')
    val_epoch_loss , val_epoch_accuracy = fit(epoch,model,test_iter,phase='validation')
    train_losses.append(epoch_loss)
    train_accuracy.append(epoch_accuracy)
    val_losses.append(val_epoch_loss)
    val_accuracy.append(val_epoch_accuracy)

In [None]:
%%time
# Same four-epoch train/validate loop, run under the cell's timing magic.
# NOTE(review): this keeps training the same model and appends to the same
# history lists as the previous cell -- confirm that is intended.
for epoch in range(1,5):

    epoch_loss, epoch_accuracy = fit(epoch,model,train_iter,phase='training')
    val_epoch_loss , val_epoch_accuracy = fit(epoch,model,test_iter,phase='validation')
    train_losses.append(epoch_loss)
    train_accuracy.append(epoch_accuracy)
    val_losses.append(val_epoch_loss)
    val_accuracy.append(val_epoch_accuracy)

## Validation loss가 training loss에 비해 크다면?

#### 학습 손실은 줄어드는데 검증 손실만 크다면 과대적합이 발생한 상황이다.
#### 1. 은닉층 차원 줄이기
#### 2. 시퀀스 길이 늘리기
#### 3. 더 작은 학습률 적용시키기