<a href="https://colab.research.google.com/github/yukyeongmin/NenepBigData/blob/master/Modeling/GloVeLSTMsimple.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive inside the Colab runtime so the dataset files are reachable.
from google.colab import drive
mount_point = '/content/drive'
drive.mount(mount_point)

Mounted at /content/drive


In [228]:
%matplotlib inline
import copy
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from tqdm import tqdm

SEED = 123
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchtext import data, datasets
from torchtext.vocab import GloVe
# Seed every RNG the notebook may draw from, not just torch: as far as I can
# tell, legacy torchtext's Dataset.split and iterator shuffling take their
# state from Python's `random` module, so seeding only torch leaves the
# train/validation split and batch order different on every run.
import random
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# manual_seed_all covers every visible GPU and is a no-op without CUDA.
torch.cuda.manual_seed_all(SEED)

from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedShuffleSplit
from gensim.models.word2vec import Word2Vec

In [211]:
# Load the lemmatized review corpus; the first CSV column becomes the row index.
lemmatized_csv = "/content/drive/MyDrive/Colab Notebooks/bigdata/lemmatizing.csv"
dataset = pd.read_csv(lemmatized_csv, index_col = 0)
dataset.head()

Unnamed: 0,review,label
0,act end atmosphere despair mixed happy moment ...,3
1,honest go watch film negative mind anyway hear...,7
2,year gibson braveheart scottish movie false ex...,8
3,change story dad really thing movie go decent ...,1
4,movie potential top line intelligent science f...,7


In [212]:
# Hold out 30% of the rows as a test set, stratified on the label column.
# StratifiedShuffleSplit yields *positional* row numbers, so rows must be
# selected with .iloc; .loc would treat the numbers as index labels and pick
# the wrong rows (or raise KeyError) whenever the index is not exactly 0..n-1.
split = StratifiedShuffleSplit(n_splits=1,test_size = 0.3, random_state=77)
# n_splits=1, so there is exactly one (train, test) pair to take.
train_idx, test_idx = next(split.split(dataset, dataset["label"]))
trainset = dataset.iloc[train_idx]
testset = dataset.iloc[test_idx]

In [213]:
# Renumber both splits from zero, then persist them as CSV (without the index
# column) so they can be re-read as torchtext datasets.
trainset = trainset.reset_index(drop = True)
testset = testset.reset_index(drop = True)

for frame, csv_path in (
    (trainset, "/content/drive/MyDrive/Colab Notebooks/bigdata/trainset.csv"),
    (testset, "/content/drive/MyDrive/Colab Notebooks/bigdata/testset.csv"),
):
  frame.to_csv(csv_path, index = False)

In [214]:
# Review text: tokenized with spaCy, mapped through a vocabulary, batch dimension first.
TEXT = data.Field(tokenize = "spacy",
                  sequential = True,
                  use_vocab = True,
                  batch_first = True)
# Label: a single value per example, used as the target and not passed through a vocabulary.
LABEL = data.Field(is_target = True,
                   sequential = False,
                   use_vocab = False,
                   batch_first = False)
fields = [('review', TEXT),('label',LABEL)]
# Read the saved train/test CSV files; the header row of each file is skipped.
train_data, test_data = data.TabularDataset.splits(
    path = "/content/drive/MyDrive/Colab Notebooks/bigdata/",
    train = 'trainset.csv',
    test = 'testset.csv',
    format = "csv",
    fields = fields,
    skip_header = True)

In [215]:
# Inspect the first parsed training example (token list and raw label).
first_example = train_data[0]
print(vars(first_example))

{'review': ['keep', 'hop', 'wake', 'see', 'one', 'top', 'three', 'movie', 'time', 'dream', 'dupe', 'payola', 'ridden', 'viral', 'fest', 'rating', 'entertainment', 'net', 'become', 'example', 'intelligent', 'comic', 'book', 'treatment', 'film', 'example', 'intelligent', 'film', 'dream', 'neither', 'currently', 'rat', 'mile', 'shill', 'figure', 'conceptual', 'shortcoming', 'good', 'movie', 'technical', 'perspective', 'overact', 'entire', 'cast', 'effort', 'bring', 'life', 'snooze', 'even', 'high', 'dollar', 'action', 'scene', 'cgi', 'liven', 'comic', 'relief', 'none', 'whatsoever', 'allay', 'squirm', 'theater', 'seat', 'wonder', 'ever', 'love', 'interest', 'subplot', 'head', 'scratchingly', 'extraneous', 'might', 'interest', 'corporate', 'espionage', 'thriller', 'carry', 'movie', 'save', 'life', 'fact', 'entire', 'cast', 'miscast', 'disagree', 'dream', 'cast', 'lead', 'tell', 'wrong'], 'label': '7'}


In [267]:
# Build the text vocabulary from the training data (at most 50000 tokens, each
# seen at least 3 times) and attach 300-dimensional GloVe 6B vectors to it.
glove_vectors = GloVe(name = '6B',dim = 300)
TEXT.build_vocab(train_data,
                 vectors = glove_vectors,
                 max_size = 50000,
                 min_freq = 3)
# Also build a label vocabulary so its frequency counter can be inspected later.
LABEL.build_vocab(train_data)

In [268]:
# Report the vocabulary size and the (fixed) number of target classes.
n_classes = 9
vocab_size = len(TEXT.vocab)
for template, value in (("단어 집합의 크기 : {}", vocab_size),
                        ("클래스의 개수 : {}", n_classes)):
  print(template.format(value))

단어 집합의 크기 : 28570
클래스의 개수 : 9


In [269]:
# Shape of the pretrained embedding matrix attached to the vocabulary.
embedding_shape = TEXT.vocab.vectors.shape
print("임베딩 벡터의 개수와 차원 : {}".format(embedding_shape))

임베딩 벡터의 개수와 차원 : torch.Size([28570, 300])


In [270]:
# Show only a small sample of the token -> index mapping: printing the whole
# vocabulary (tens of thousands of entries) floods the notebook output.
print(dict(list(TEXT.vocab.stoi.items())[:20]))
# Per-class example counts in the training data.
print(LABEL.vocab.freqs)

Counter({'5': 17887, '3': 17029, '7': 15872, '0': 12158, '2': 11189, '8': 10582, '1': 7912, '4': 6686, '6': 3375})


# **Split Train and Val Data**

In [271]:
# Carve a validation set (20%) out of the training data. An explicit RNG state
# derived from SEED makes the split identical on every run and independent of
# how many random numbers earlier cells consumed; random.Random(SEED) is a
# private generator, so the global `random` state is left untouched.
import random
trainset, valset = train_data.split(split_ratio=0.8,
                                    random_state=random.Random(SEED).getstate())

In [272]:
# Build the mini-batch iterators. BucketIterator only groups reviews of similar
# length when it is given a sort_key and asked to sort within batches; with
# sort=False alone it pads every batch to its longest review, wasting compute
# and leaving many padded time steps at the end of short reviews.
train_iter, val_iter, test_iter = data.BucketIterator.splits(
    (trainset,valset,test_data),
    batch_size = 64,
    shuffle = True,
    sort = False,
    sort_key = lambda example: len(example.review),
    sort_within_batch = True)

In [273]:
# Number of mini-batches per iterator (train, test, validation).
for template, iterator in (('훈련 데이터의 미니 배치의 개수 : {}', train_iter),
                           ('테스트 데이터의 미니 배치의 개수 : {}', test_iter),
                           ('검증 데이터의 미니 배치의 개수 : {}', val_iter)):
  print(template.format(len(iterator)))

훈련 데이터의 미니 배치의 개수 : 1284
테스트 데이터의 미니 배치의 개수 : 688
검증 데이터의 미니 배치의 개수 : 321


In [241]:
# Pull one mini-batch from the test iterator to check its tensor shapes.
test_batches = iter(test_iter)
batch = next(test_batches)
batch


[torchtext.data.batch.Batch of size 64]
	[.review]:[torch.LongTensor of size 64x391]
	[.label]:[torch.LongTensor of size 64]

In [242]:
# Pull one mini-batch from the validation iterator to check its tensor shapes.
val_batches = iter(val_iter)
batch = next(val_batches)
batch


[torchtext.data.batch.Batch of size 64]
	[.review]:[torch.LongTensor of size 64x343]
	[.label]:[torch.LongTensor of size 64]

In [244]:
# Pull one mini-batch from the training iterator to check its tensor shapes.
train_batches = iter(train_iter)
batch = next(train_batches)
batch


[torchtext.data.batch.Batch of size 64]
	[.review]:[torch.LongTensor of size 64x384]
	[.label]:[torch.LongTensor of size 64]

In [274]:
# Record and report whether a CUDA device is visible to PyTorch.
is_cuda = torch.cuda.is_available()
cuda_status_message = "Cuda Status on system is {}".format(is_cuda)
print(cuda_status_message)

Cuda Status on system is True


In [275]:
# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device('cuda')
else:
  device = torch.device('cpu')

In [280]:
# RNN model
class RNN(nn.Module):
  """Stacked-LSTM sequence classifier on top of fine-tuned pretrained embeddings.

  Pipeline: token ids -> embedding -> stacked LSTM -> one time step of the top
  LSTM layer -> Linear + ReLU -> Linear producing one logit per class.

  Args:
    embedding_vectors: 2-D float tensor of pretrained embeddings (one row per
      vocabulary entry). It is copied, so fine-tuning never modifies the
      caller's tensor.
    embedding_size: width of one embedding row; used as the LSTM input size.
    hidden_size: LSTM hidden size and width of the first dense layer.
    n_classes: number of output logits (default 9, the original fixed value).
    num_layers: number of stacked LSTM layers (default 2).
    dropout: dropout between LSTM layers (default 0.4); ignored when
      num_layers is 1 because there is no inter-layer connection to drop.
    padding_idx: optional id of the padding token. When given, the classifier
      reads the LSTM output at the last non-padding position of every
      sequence (right-padded input is assumed) and the padding embedding
      receives no gradient. When None (default) the output at the final time
      step is used, exactly as before.
  """
  def __init__(self, embedding_vectors, embedding_size, hidden_size,
               n_classes = 9, num_layers = 2, dropout = 0.4, padding_idx = None):
    super().__init__()
    self.embedding_vectors = embedding_vectors
    self.embedding_size = embedding_size
    self.hidden_size = hidden_size
    self.padding_idx = padding_idx

    # from_pretrained wraps the given tensor as the trainable weight; clone it
    # so that training does not silently overwrite the caller's vectors.
    self.embedding_layer = nn.Embedding.from_pretrained(
        embedding_vectors.clone(), freeze = False, padding_idx = padding_idx)
    # LSTM Layer (kept inside nn.Sequential so parameter names stay "LSTM.0.*")
    self.LSTM = nn.Sequential(
        nn.LSTM(
        input_size = embedding_size,
        hidden_size = hidden_size,
        num_layers = num_layers,              # stacked LSTM layers
        batch_first = True,
        dropout = dropout if num_layers > 1 else 0.0),
    )
    # fully-connected Layer
    self.fc1 = nn.Sequential(
        nn.Linear(hidden_size, hidden_size),
        nn.ReLU()
    )
    self.fc2 = nn.Sequential(
        nn.Linear(hidden_size, n_classes)
    )

  def forward(self, x):
    """Return class logits of shape (batch, n_classes) for token ids x of shape (batch, seq_len)."""
    embedded = self.embedding_layer(x)
    # No initial state is passed, so nn.LSTM starts from zero hidden/cell states.
    output, _ = self.LSTM(embedded)
    if self.padding_idx is None:
      h_t = output[:,-1,:]
    else:
      # Index of the last real token per row; clamp keeps all-padding rows valid.
      lengths = (x != self.padding_idx).sum(dim = 1).clamp(min = 1)
      rows = torch.arange(x.size(0), device = x.device)
      h_t = output[rows, lengths - 1]
    y = self.fc1(h_t)
    y = self.fc2(y)
    return y

  def _init_state(self, batch_size=1):
    """Return a zero tensor of shape (batch_size, hidden_size) with the dtype/device of the parameters."""
    weight = next(self.parameters())
    return weight.new_zeros(batch_size, self.hidden_size)

In [278]:
# Training hyperparameters: number of epochs and the optimizer learning rate.
epoch, learning_rate = 10, 0.003
# LSTM using Pytorch
def LSTM_Train(train_loader, test_loader, epoch, learning_rate, no_cuda = False, return_model = False):
  """Train a fresh RNN classifier and evaluate it after every epoch.

  A new RNN is built from the module-level TEXT.vocab.vectors (embedding and
  hidden size 300) and optimized with AdamW on cross-entropy loss.

  Args:
    train_loader: iterable of batches exposing .review and .label tensors,
      with a .dataset attribute supporting len().
    test_loader: same interface; evaluated without gradient updates after
      each epoch (the notebook passes the validation iterator here).
    epoch: number of passes over train_loader.
    learning_rate: AdamW learning rate.
    no_cuda: when True, stay on the CPU even if CUDA is available.
    return_model: when True, the trained model is appended to the returned
      tuple so it can be reused or saved; the default keeps the original
      four-element return value.

  Returns:
    (train_L_iters, train_acc_iters, test_L_iters, test_acc_iters): per-epoch
    mean loss and accuracy in percent for both loaders, plus the model as a
    fifth element when return_model is True.
  """
  use_cuda = not no_cuda and torch.cuda.is_available()
  device = torch.device("cuda" if use_cuda else "cpu")
  # get_device_name() raises when CUDA is unavailable, so only query it when
  # the GPU is actually in use; otherwise report "cpu".
  device_name = torch.cuda.get_device_name() if use_cuda else "cpu"
  Model = RNN(TEXT.vocab.vectors,300,300).to(device)
  optimizer = torch.optim.AdamW(Model.parameters(), lr = learning_rate)
  train_L_iters = []
  train_acc_iters = []
  test_L_iters = []
  test_acc_iters = []
  for i in range(1,epoch+1):
    # Train mode
    train_loss = 0
    train_correct = 0
    Model.train()
    for batch in train_loader:
      # Move inputs and targets to the training device
      inputs, target = batch.review.to(device), batch.label.to(device)
      # Gradients accumulate across backward() calls, so reset them per step
      optimizer.zero_grad()
      # Forward pass
      output = Model(inputs)
      # reduction = (default) "mean"
      loss = F.cross_entropy(output,target)
      # mean loss * batch size equals the summed loss, so the loss does not
      # have to be computed a second time just for bookkeeping
      train_loss += loss.item() * target.size(0)
      pred = output.argmax(dim = 1)
      train_correct += pred.eq(target).sum().item()
      # Backward pass and parameter update
      loss.backward()
      optimizer.step()
    # Mean training loss over the epoch
    train_loss /= len(train_loader.dataset)
    train_L_iters.append(train_loss)
    # Training accuracy (percent) over the epoch
    accuracy =  100 * train_correct / len(train_loader.dataset)
    train_acc_iters.append(accuracy)
    # Evaluation: no gradients are tracked and dropout is disabled
    Model.eval()
    with torch.no_grad():
      test_loss = 0
      test_correct = 0
      for batch in test_loader:
        inputs, target = batch.review.to(device), batch.label.to(device)
        output = Model(inputs)
        test_loss += F.cross_entropy(output, target, reduction = 'sum').item()
        pred = output.argmax(dim = 1)
        test_correct += pred.eq(target).sum().item()
    test_loss /= len(test_loader.dataset)
    test_L_iters.append(test_loss)
    accuracy = 100 * test_correct / len(test_loader.dataset)
    test_acc_iters.append(accuracy)
    print("progress = {}, current loss = {}, current device = {}".format(100 * i / epoch, train_loss, device_name))
    print("test loss = {} test accuracy = {}\n".format(test_loss, accuracy))

  if return_model:
    return train_L_iters, train_acc_iters, test_L_iters, test_acc_iters, Model
  return train_L_iters, train_acc_iters, test_L_iters, test_acc_iters
    

In [279]:
# Train on the training iterator and evaluate on the validation iterator each
# epoch; the "test_*" lists therefore hold validation metrics.
history = LSTM_Train(train_iter, val_iter, epoch, learning_rate)
train_L_iters, train_acc_iters, test_L_iters, test_acc_iters = history

progress = 10.0, current loss = 2.078195162067419, current device = Tesla T4
test loss = 1.8871750303642325 test accuracy = 22.20761515240043

progress = 20.0, current loss = 1.5248393848072601, current device = Tesla T4
test loss = 1.0742087134345386 test accuracy = 62.601032232934074

progress = 30.0, current loss = 0.8815274230228194, current device = Tesla T4
test loss = 0.8650292220000968 test accuracy = 70.45963579705911

progress = 40.0, current loss = 0.6739643170801316, current device = Tesla T4
test loss = 0.8670203884732863 test accuracy = 70.93193105463044

progress = 50.0, current loss = 0.5281204405447585, current device = Tesla T4
test loss = 0.9177634723695649 test accuracy = 70.7371701236732

progress = 60.0, current loss = 0.41668284047322074, current device = Tesla T4
test loss = 0.9969180349784047 test accuracy = 70.32330314538903

progress = 70.0, current loss = 0.34158494317179844, current device = Tesla T4
test loss = 1.0595412020767336 test accuracy = 69.9386503