## 5-4. 데이터 전처리 및 Pre-Trained Embedding Vector를 이용한 Vocabulary 생성

In [2]:
import re
import sys
import random

import torch
import torch.nn as nn
import torch.nn.functional as F

from torchtext import data
from torchtext import datasets

현재 환경(python 3.7, pytorch 1.4.0)에 맞는 torchtext 버전은 0.5.0이지만,
이 실습을 실행하려면 torchtext==0.3.1이 필요하다.

pip install torchtext==0.3.1

In [3]:
import torchtext
# Display the installed torchtext version as the cell output.
torchtext.__version__

'0.3.1'

## Reading Data(IMDb data)

In [4]:
# Data Setting
# TEXT describes how raw review text is tokenized, padded and turned into tensors.
TEXT = data.Field(batch_first = True, # put the batch dimension first in the data shape
                  fix_length = 500, # fix the sentence length in advance (500 tokens)
                  tokenize=str.split, # tokenizer option; here whitespace splitting via Python's str.split
                  pad_first=True,# sentences shorter than fix_length must be padded; this pads at the front
                  pad_token='[PAD]',# special token used for padding
                  unk_token='[UNK]')# special token representing tokens that are not in the vocabulary

LABEL = data.LabelField(dtype=torch.float)# dtype option for the label data that will be loaded

# Load the IMDb train/test splits, wiring the two fields defined above.
train_data, test_data = datasets.IMDB.splits(text_field = TEXT, 
                                             label_field = LABEL)

In [5]:
# Data Length
# `.examples` holds the individual samples, so its length is the sample count.
print(f'Train Data Length : {len(train_data.examples)}') # number of training examples
print(f'Test Data Length : {len(test_data.examples)}')

Train Data Length : 25000
Test Data Length : 25000


In [6]:
# Data Fields
# Display the field objects attached to the dataset (the recorded output shows 'text' and 'label').
train_data.fields

{'text': <torchtext.data.field.Field at 0x25ab6061748>,
 'label': <torchtext.data.field.LabelField at 0x25ab6061708>}

In [7]:
# Data Sample
# Show the text AND the label of the same example. The index is named once so
# the two prints cannot drift apart (the text used to come from index 1 while
# the label came from index 0).
sample_idx = 1
sample = vars(train_data.examples[sample_idx])  # vars() exposes the example's fields as a dict

print('---- Data Sample ----')
print('Input : ')
print(' '.join(sample['text']),'\n')
print('Label : ')
print(sample['label'])

---- Data Sample ----
Input : 
Homelessness (or Houselessness as George Carlin stated) has been an issue for years but never a plan to help those on the street that were once considered human who did everything from going to school, work, or vote for the matter. Most people think of the homeless as just a lost cause while worrying about things such as racism, the war on Iraq, pressuring kids to succeed, technology, the elections, inflation, or worrying if they'll be next to end up on the streets.<br /><br />But what if you were given a bet to live on the streets for a month without the luxuries you once had from a home, the entertainment sets, a bathroom, pictures on the wall, a computer, and everything you once treasure to see what it's like to be homeless? That is Goddard Bolt's lesson.<br /><br />Mel Brooks (who directs) who stars as Bolt plays a rich man who has everything in the world until deciding to make a bet with a sissy rival (Jeffery Tambor) to see if he can live in the str

In [8]:
# Print the raw text of the example at index 24999 of the train split and of the test split.
print(' '.join(vars(train_data.examples[24999])['text']),'\n')
print(' '.join(vars(test_data.examples[24999])['text']),'\n')

This is one of the dumbest films, I've ever seen. It rips off nearly ever type of thriller and manages to make a mess of them all.<br /><br />There's not a single good line or character in the whole mess. If there was a plot, it was an afterthought and as far as acting goes, there's nothing good to say so Ill say nothing. I honestly cant understand how this type of nonsense gets produced and actually released, does somebody somewhere not at some stage think, 'Oh my god this really is a load of shite' and call it a day. Its crap like this that has people downloading illegally, the trailer looks like a completely different film, at least if you have download it, you haven't wasted your time or money Don't waste your time, this is painful. 

David Bryce's comments nearby are exceptionally well written and informative as almost say everything I feel about DARLING LILI. This massive musical is so peculiar and over blown, over produced and must have caused ruptures at Paramount in 1970. It c

## Pre-processing Data

In [9]:
import re
# Pre-processing: lowercasing, HTML-tag removal (e.g. "<br />") and punctuation removal.
def PreProcessingText(input_sentence):
    """Normalize a raw review string before it is tokenized.

    Steps, in order:
      1. lowercase the text;
      2. replace every HTML-like tag (``<...>``, e.g. ``<br />``) with a space;
      3. replace punctuation with a space -- the apostrophe is deliberately
         kept so tokens such as "bryce's" survive;
      4. collapse each run of whitespace into a single space.

    Args:
        input_sentence (str): raw text.

    Returns:
        str: the cleaned text. A string is always returned: an empty input
        yields ``''`` (previously the function fell through and returned
        ``None``, which made the callers' ``.split()`` raise). A single
        leading/trailing space may remain; callers split on whitespace.
    """
    input_sentence = input_sentence.lower()
    # Raw strings avoid the invalid-escape warnings of the former patterns.
    input_sentence = re.sub(r'<[^>]*>', ' ', input_sentence)
    # Same character set as before, written with explicit escapes. Note that
    # the backslash character itself is not part of the set.
    input_sentence = re.sub(r'[!"#$%&()*+,\-./:;<=>?@\[\]^_`{|}~]', ' ', input_sentence)
    input_sentence = re.sub(r'\s+', ' ', input_sentence)
    return input_sentence
    
# Clean every example of both splits in place: re-join the tokens, normalize
# the text with PreProcessingText, then split on whitespace again.
# The train split is processed first and the test split last, so `example`
# ends up bound to the final test example once the loops finish.
for split_data in (train_data, test_data):
    for example in split_data.examples:
        attrs = vars(example)
        attrs['text'] = PreProcessingText(' '.join(attrs['text'])).split()

In [10]:
# `example` is the loop variable left over from the pre-processing loops,
# i.e. the last element of test_data.examples.
print(vars(example)['text'])

['david', "bryce's", 'comments', 'nearby', 'are', 'exceptionally', 'well', 'written', 'and', 'informative', 'as', 'almost', 'say', 'everything', 'i', 'feel', 'about', 'darling', 'lili', 'this', 'massive', 'musical', 'is', 'so', 'peculiar', 'and', 'over', 'blown', 'over', 'produced', 'and', 'must', 'have', 'caused', 'ruptures', 'at', 'paramount', 'in', '1970', 'it', 'cost', '22', 'million', 'dollars', 'that', 'is', 'simply', 'irresponsible', 'darling', 'lili', 'must', 'have', 'been', 'greenlit', 'from', 'a', 'board', 'meeting', 'that', 'said', 'hey', 'we', 'got', 'that', 'pink', 'panther', 'guy', 'and', 'that', 'sound', 'of', 'music', 'gal', 'lets', 'get', 'this', 'too', 'and', 'handed', 'over', 'a', 'blank', 'cheque', 'the', 'result', 'is', 'a', 'hybrid', 'of', 'gigi', 'zeppelin', 'half', 'a', 'sixpence', 'some', 'mgm', '40s', 'song', 'and', 'dance', 'numbers', 'of', 'a', 'style', 'daisies', 'and', 'boaters', 'so', 'hopelessly', 'old', 'fashioned', 'as', 'to', 'be', 'like', 'musical', 

David Bryce's comments nearby are exceptionally well written and informative as almost say everything I feel about DARLING LILI.

=>'david', "bryce's", 'comments', 'nearby', 'are', 'exceptionally', 'well', 'written', 'and', 'informative', 'as', 'almost', 'say', 'everything', 'i', 'feel', 'about', 'darling', 'lili'

## making vocab & setting embedding

In [11]:
# Embedding settings: which pre-trained vectors are used and their dimensionality.
model_config = {'emb_type' : 'fasttext', 'emb_dim' : 300}

In [12]:
# Disabled alternative, kept as a string literal so it never executes:
# build the text and label vocabularies from the Fields with build_vocab,
# without attaching fastText vectors (a GloVe `vectors=` option is shown
# commented out inside the literal).
'''TEXT.build_vocab(train_data,
                 min_freq = 2,#vocab에 해당하는 token에 최소한으로 등장하는 횟수 제한
                 max_size = None,#전체 vocab size제한
                 #vectors = f"glove.6B.{model_config['emb_dim']}d"
                 #pre-trained vector를 가져와 vocab에 세팅. 원하는 임베딩을 정해 string형태로 설정
                )

LABEL.build_vocab(train_data)

model_config['vocab_size'] = len(TEXT.vocab)'''

# Shell command (commented out) that downloads the fastText wiki.en vectors:
#! wget 'https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.en.vec' 

In [None]:
# fasttext pre-trained 
# Load the vectors from the local file ./wiki.en.vec and attach them to the vocab.
from torchtext.vocab import Vectors
fasttext_vectors = Vectors('./wiki.en.vec')

# Build the text vocabulary from the training split only.
TEXT.build_vocab(train_data,
                 min_freq = 2, 
                 max_size = None,
                 vectors = fasttext_vectors)

LABEL.build_vocab(train_data)

# The model's embedding table is sized from this value.
model_config['vocab_size'] = len(TEXT.vocab)

In [13]:
# Vocabulary Info
print(f'Vocab Size : {len(TEXT.vocab)}')

print('Vocab Examples : ')
# Only the first 10 (token, index) pairs of the string-to-index map are shown.
for token, token_id in list(TEXT.vocab.stoi.items())[:10]:
    print('\t', token, token_id)

print('---------------------------------')

# Label Info
print(f'Label Size : {len(LABEL.vocab)}')

print('Lable Examples : ')
for label_name, label_id in LABEL.vocab.stoi.items():
    print('\t', label_name, label_id)

# To check the embedding vectors, inspect TEXT.vocab.vectors.shape

Vocab Size : 51956
Vocab Examples : 
	 [UNK] 0
	 [PAD] 1
	 the 2
	 and 3
	 a 4
	 of 5
	 to 6
	 is 7
	 in 8
	 it 9
---------------------------------
Label Size : 2
Lable Examples : 
	 neg 0
	 pos 1


## splitting validation data & making data iterator

In [14]:
import random
# Splitting off a validation set (split_ratio=0.8).
# random.seed() returns None, so passing its result as `random_state` only
# worked through the seeding side effect. Seed explicitly, then hand the
# resulting generator state to split().
# NOTE: this cell rebinds `train_data`; re-running it without re-running the
# data-loading cells splits the already-reduced training set again.
random.seed(0)
train_data, valid_data = train_data.split(split_ratio=0.8,
                                          random_state=random.getstate())

In [15]:
model_config['batch_size'] = 30

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# BucketIterator makes it easy to produce batched data for the three splits.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(datasets=(train_data, valid_data, test_data), 
                                                                           batch_size=model_config['batch_size'],
                                                                           device=device)

In [16]:
# Check batch data
# Pull one batch from the training iterator; it is reused by the feed-forward check cell.
sample_for_check = next(iter(train_iterator))
print(sample_for_check)
print(sample_for_check.text)
print(sample_for_check.label)


[torchtext.data.batch.Batch of size 30]
	[.text]:[torch.LongTensor of size 30x500]
	[.label]:[torch.FloatTensor of size 30]
tensor([[    1,     1,     1,  ...,  1262,    22,   119],
        [    1,     1,     1,  ...,  5769,     3,  4838],
        [    1,     1,     1,  ...,  3035,    76,  4462],
        ...,
        [    1,     1,     1,  ..., 15469,  5258,     0],
        [    1,     1,     1,  ...,    16,   916,   467],
        [    1,     1,     1,  ...,    24,   233,  2630]])
tensor([0., 1., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 1., 1., 0., 0.,
        0., 1., 1., 1., 0., 0., 1., 1., 1., 1., 1., 1.])


## PyTorch에서 RNN, LSTM, GRU를 이용한 모델 만들기

## Making Model

In [17]:
class SentenceClassification(nn.Module):
    """Sentence classifier: embedding -> RNN/LSTM/GRU -> dropout -> linear.

    Keyword arguments (``model_config``) read by the constructor:
        emb_type (str): 'glove' or 'fasttext' initialises the embedding from
            the pre-trained vectors in the notebook-level ``TEXT.vocab.vectors``;
            any other value gives a randomly initialised embedding.
        vocab_size (int): number of rows of the embedding table.
        emb_dim (int): embedding dimension (also the recurrent input size).
        model_type (str): 'RNN', 'LSTM' or 'GRU' -- selects the recurrent
            layer used in ``forward``.
        hidden_dim (int): hidden size of the recurrent layers.
        dropout (float): dropout rate for the recurrent layers and the
            dropout applied before the final linear layer.
        bidirectional (bool): whether the recurrent layers are bidirectional.
        batch_first (bool): whether inputs are (batch, seq, feature).
        output_dim (int): size of the final linear output.

    Other keys in ``model_config`` are ignored. The linear layer emits raw
    scores (no sigmoid); the sigmoid is expected to be part of the loss.
    """

    def __init__(self, **model_config):
        super(SentenceClassification, self).__init__()

        # Fix: the former test `== 'glove' or 'fasttext'` was always truthy,
        # so the randomly initialised branch could never be reached.
        if model_config['emb_type'] in ('glove', 'fasttext'):
            # Input: a tensor of token indices, looked up in a
            # (vocab_size x emb_dim) matrix that is trained further.
            pretrained = TEXT.vocab.vectors
            # Clone the vectors: nn.Embedding wraps `_weight` without copying,
            # so training would otherwise modify TEXT.vocab.vectors in place
            # and every model built afterwards would start from already
            # trained embeddings. When the vocab has no vectors (None),
            # nn.Embedding falls back to its default random initialisation.
            self.emb = nn.Embedding(num_embeddings = model_config['vocab_size'],
                                    embedding_dim = model_config['emb_dim'],
                                    _weight = pretrained.clone() if pretrained is not None else None)
        else:
            self.emb = nn.Embedding(num_embeddings = model_config['vocab_size'],
                                    embedding_dim = model_config['emb_dim'])

        self.bidirectional = model_config['bidirectional']
        self.num_direction = 2 if model_config['bidirectional'] else 1
        self.model_type = model_config['model_type']

        # All three recurrent layers are always created (only the one named
        # by model_type is used in forward). This is kept as-is so the
        # state_dict keys stay compatible with existing checkpoints.
        recurrent_kwargs = dict(input_size = model_config['emb_dim'],      # size of each input vector
                                hidden_size = model_config['hidden_dim'],  # hidden layer dimension
                                dropout = model_config['dropout'],
                                bidirectional = model_config['bidirectional'],
                                batch_first = model_config['batch_first']) # batch axis comes first
        self.RNN = nn.RNN(**recurrent_kwargs)
        self.LSTM = nn.LSTM(**recurrent_kwargs)
        self.GRU = nn.GRU(**recurrent_kwargs)

        # Classification task: one fully connected layer produces the class
        # score(s) from the recurrent output.
        self.fc = nn.Linear(model_config['hidden_dim'] * self.num_direction,
                            model_config['output_dim'])

        self.drop = nn.Dropout(model_config['dropout'])

    def forward(self, x):
        """Compute class scores for a batch of token-index sequences.

        Args:
            x: integer tensor of token indices, (Batch_Size, Max_Seq_Length).

        Returns:
            Tensor of raw scores, (Batch_Size, output_dim).

        Raises:
            NameError: if ``model_type`` is not 'RNN', 'LSTM' or 'GRU'.
        """
        emb = self.emb(x)
        # emb : (Batch_Size, Max_Seq_Length, Emb_dim)

        if self.model_type == 'RNN':
            output, hidden = self.RNN(emb)
        elif self.model_type == 'LSTM':
            output, (hidden, cell) = self.LSTM(emb)
        elif self.model_type == 'GRU':
            output, hidden = self.GRU(emb)
        else:
            raise NameError('Select model_type in [RNN, LSTM, GRU]')

        # output : (Batch_Size, Max_Seq_Length, Hidden_dim * num_direction)
        # hidden : (num_direction, Batch_Size, Hidden_dim)
        # `hidden` does not follow the batch_first option, so `output` is used.

        # Take the value at the last position of the sequence axis.
        # NOTE(review): for a bidirectional layer the backward half of this
        # slice has only seen the final token -- confirm this is intended.
        last_output = output[:,-1,:]

        # last_output : (Batch_Size, Hidden_dim * num_direction)
        return self.fc(self.drop(last_output))

## checking feed-forward

In [18]:
# Complete the configuration with the architecture settings, then build one
# model and run the sample batch through it as a feed-forward sanity check.
model_config.update(dict(batch_first = True,
                         model_type = 'RNN',# choose one of RNN, LSTM, GRU
                         bidirectional = True, # use a bidirectional recurrent layer
                         hidden_dim = 128,
                         output_dim = 1, # IMDb classification is a binary classification task
                         dropout = 0))

model = SentenceClassification(**model_config).to(device)

# squeeze() turns the (batch, 1) scores into a 1-D tensor (see the recorded output).
predictions = model.forward(sample_for_check.text).squeeze()

# Loss that takes raw scores (logits) as input.
loss_fn = nn.BCEWithLogitsLoss().to(device)

def binary_accuracy(preds, y):
    """Fraction of predictions that agree with the binary targets.

    Args:
        preds: tensor of raw scores; a sigmoid is applied and the result is
            rounded to 0 or 1.
        y: tensor of targets, compared element-wise with the rounded values.

    Returns:
        Tensor holding the number of matches divided by the length of the
        comparison's first dimension.
    """
    hits = (torch.sigmoid(preds).round() == y).float()
    return hits.sum() / len(hits)

# Compute loss and accuracy of the untrained model on the sample batch.
loss = loss_fn(predictions, sample_for_check.label)
acc = binary_accuracy(predictions, sample_for_check.label)

print(predictions)
print(loss, acc)

tensor([-0.3307,  0.4489,  0.0787, -0.1998, -0.2811, -0.2645, -0.5108, -0.3324,
        -0.2175,  0.4181, -0.2781,  0.0740, -0.4978,  0.7202, -0.2802, -0.5279,
        -0.2119, -0.1287, -1.1439, -0.3933,  0.0530, -0.3471, -0.8656, -0.0407,
         0.1841, -0.4621, -0.1746, -0.1644, -0.1857, -0.1422],
       grad_fn=<SqueezeBackward0>)
tensor(0.7204, grad_fn=<BinaryCrossEntropyWithLogitsBackward>) tensor(0.4667)


## training

In [19]:
def train(model, iterator, optimizer, loss_fn, idx_epoch, **model_params):
    """Run one training epoch.

    Args:
        model: module called as ``model(batch.text)``.
        iterator: iterable of batches exposing ``.text`` and ``.label``;
            must support ``len()``.
        optimizer: optimizer updating the model's parameters.
        loss_fn: callable ``loss_fn(predictions, labels)``.
        idx_epoch: epoch number, used only in the progress line.
        **model_params: must contain ``'batch_size'`` (used only in the
            progress line); other keys are ignored.

    Returns:
        tuple: (mean per-batch loss, mean per-batch accuracy).
    """
    epoch_loss = 0
    epoch_acc = 0

    model.train()
    batch_size = model_params['batch_size']
    num_batches = len(iterator)

    for idx, batch in enumerate(iterator):

        # Reset the gradients accumulated by the previous step.
        optimizer.zero_grad()

        # Forward. squeeze(1) removes only the score axis, exactly as
        # evaluate() does; a bare squeeze() would also drop the batch axis
        # when a batch holds a single example and break the loss call.
        predictions = model(batch.text).squeeze(1)
        loss = loss_fn(predictions, batch.label)
        acc = binary_accuracy(predictions, batch.label)

        # "\r" rewrites the same console line on every batch.
        sys.stdout.write(
                    "\r" + f"[Train] Epoch : {idx_epoch:^3}"\
                    f"[{(idx + 1) * batch_size} / {num_batches * batch_size} ({100. * (idx + 1) / num_batches :.4}%)]"\
                    f"  Loss: {loss.item():.4}"\
                    f"  Acc : {acc.item():.4}"\
                    )

        # Backward
        loss.backward()
        optimizer.step()

        # Update epoch performance
        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss/num_batches , epoch_acc/num_batches

In [20]:
def evaluate(model, iterator, loss_fn):
    """Score the model on every batch of ``iterator`` without gradient tracking.

    Args:
        model: module called as ``model(batch.text)``.
        iterator: iterable of batches exposing ``.text`` and ``.label``;
            must support ``len()``.
        loss_fn: callable ``loss_fn(predictions, labels)``.

    Returns:
        tuple: (mean per-batch loss, mean per-batch accuracy).
    """
    total_loss = 0
    total_acc = 0

    # Switch to evaluation mode and disable autograd for the whole pass.
    model.eval()
    with torch.no_grad():
        for batch in iterator:
            scores = model(batch.text).squeeze(1)
            total_loss += loss_fn(scores, batch.label).item()
            total_acc += binary_accuracy(scores, batch.label).item()

    return total_loss / len(iterator), total_acc / len(iterator)

## bi-RNN

In [21]:
# Build a fresh RNN-based model with its own optimizer and loss.
model_config['model_type'] = 'RNN'
model = SentenceClassification(**model_config).to(device)
optimizer = torch.optim.Adam(model.parameters())
loss_fn = nn.BCEWithLogitsLoss().to(device)

In [None]:
# Number of passes over the training split.
N_EPOCH = 5

# Lowest validation loss seen so far; starts at infinity so the first epoch always saves.
best_valid_loss = float('inf')
# Checkpoint name built from the configuration; also read by the test cell.
model_name = f"{'bi-' if model_config['bidirectional'] else ''}{model_config['model_type']}_{model_config['emb_type']}"

print('---------------------------------')
print(f'Model name : {model_name}')
print('---------------------------------')

for epoch in range(N_EPOCH):
    train_loss, train_acc = train(model, train_iterator, optimizer, loss_fn, epoch, **model_config)
    valid_loss, valid_acc = evaluate(model, valid_iterator, loss_fn)
    # Move past the progress line that train() rewrites in place.
    print('')
    # Save the weights only when the validation loss improves.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), f'./{model_name}.pt')
        print(f'\t Saved at {epoch}-epoch')

    print(f'\t Epoch : {epoch} | Train Loss : {train_loss:.4} | Train Acc : {train_acc:.4}')
    print(f'\t Epoch : {epoch} | Valid Loss : {valid_loss:.4} | Valid Acc : {valid_acc:.4}')

---------------------------------
Model name : bi-RNN_
---------------------------------
	 Saved at 0-epoch
	 Epoch : 0 | Train Loss : 0.6333 | Train Acc : 0.6351
	 Epoch : 0 | Valid Loss : 0.6239 | Valid Acc : 0.6356
	 Epoch : 1 | Train Loss : 0.6192 | Train Acc : 0.6449
	 Epoch : 1 | Valid Loss : 0.6446 | Valid Acc : 0.6201
	 Epoch : 2 | Train Loss : 0.6027 | Train Acc : 0.6623
	 Epoch : 2 | Valid Loss : 0.6562 | Valid Acc : 0.6108

In [None]:
# Test set
# Reload the checkpoint saved at the best validation loss before scoring.
# map_location=device lets a checkpoint written on a GPU be restored on a
# CPU-only machine (and the other way round).
model.load_state_dict(torch.load(f'./{model_name}.pt', map_location=device))
test_loss, test_acc = evaluate(model, test_iterator, loss_fn)
print(f'Test Loss : {test_loss:.4} | Test Acc : {test_acc:.4}')

## bi-LSTM

In [None]:
# Build a fresh LSTM-based model with its own optimizer and loss.
model_config['model_type'] = 'LSTM'
model = SentenceClassification(**model_config).to(device)
optimizer = torch.optim.Adam(model.parameters())
loss_fn = nn.BCEWithLogitsLoss().to(device)

In [None]:
# Number of passes over the training split.
N_EPOCH = 5

# Lowest validation loss seen so far; starts at infinity so the first epoch always saves.
best_valid_loss = float('inf')
# Checkpoint name built from the configuration; also read by the test cell.
model_name = f"{'bi-' if model_config['bidirectional'] else ''}{model_config['model_type']}_{model_config['emb_type']}"

print('---------------------------------')
print(f'Model name : {model_name}')
print('---------------------------------')

for epoch in range(N_EPOCH):
    train_loss, train_acc = train(model, train_iterator, optimizer, loss_fn, epoch, **model_config)
    valid_loss, valid_acc = evaluate(model, valid_iterator, loss_fn)
    # Move past the progress line that train() rewrites in place.
    print('')
    # Save the weights only when the validation loss improves.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), f'./{model_name}.pt')
        print(f'\t Saved at {epoch}-epoch')

    print(f'\t Epoch : {epoch} | Train Loss : {train_loss:.4} | Train Acc : {train_acc:.4}')
    print(f'\t Epoch : {epoch} | Valid Loss : {valid_loss:.4} | Valid Acc : {valid_acc:.4}')

In [None]:
# Test set
# Reload the checkpoint saved at the best validation loss before scoring.
# map_location=device lets a checkpoint written on a GPU be restored on a
# CPU-only machine (and the other way round).
model.load_state_dict(torch.load(f'./{model_name}.pt', map_location=device))
test_loss, test_acc = evaluate(model, test_iterator, loss_fn)
print(f'Test Loss : {test_loss:.4} | Test Acc : {test_acc:.4}')

## bi-GRU

In [None]:
# Build a fresh GRU-based model with its own optimizer and loss.
model_config['model_type'] = 'GRU'
model = SentenceClassification(**model_config).to(device)
optimizer = torch.optim.Adam(model.parameters())
loss_fn = nn.BCEWithLogitsLoss().to(device)

In [None]:
# Number of passes over the training split.
N_EPOCH = 5

# Lowest validation loss seen so far; starts at infinity so the first epoch always saves.
best_valid_loss = float('inf')
# Checkpoint name built from the configuration; also read by the test cell.
model_name = f"{'bi-' if model_config['bidirectional'] else ''}{model_config['model_type']}_{model_config['emb_type']}"

print('---------------------------------')
print(f'Model name : {model_name}')
print('---------------------------------')

for epoch in range(N_EPOCH):
    train_loss, train_acc = train(model, train_iterator, optimizer, loss_fn, epoch, **model_config)
    valid_loss, valid_acc = evaluate(model, valid_iterator, loss_fn)
    # Move past the progress line that train() rewrites in place.
    print('')
    # Save the weights only when the validation loss improves.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), f'./{model_name}.pt')
        print(f'\t Saved at {epoch}-epoch')

    print(f'\t Epoch : {epoch} | Train Loss : {train_loss:.4} | Train Acc : {train_acc:.4}')
    print(f'\t Epoch : {epoch} | Valid Loss : {valid_loss:.4} | Valid Acc : {valid_acc:.4}')

In [None]:
# Test set
# Reload the checkpoint saved at the best validation loss before scoring.
# map_location=device lets a checkpoint written on a GPU be restored on a
# CPU-only machine (and the other way round).
model.load_state_dict(torch.load(f'./{model_name}.pt', map_location=device))
test_loss, test_acc = evaluate(model, test_iterator, loss_fn)
print(f'Test Loss : {test_loss:.4} | Test Acc : {test_acc:.4}')

## inference

In [None]:
# Rebuild the GRU model and restore its saved weights for inference.
model_config['model_type'] = 'GRU'
model = SentenceClassification(**model_config).to(device)
# map_location=device lets a checkpoint written on a GPU be restored on a
# CPU-only machine (and the other way round).
model.load_state_dict(torch.load(f"./{'bi-' if model_config['bidirectional'] else ''}{model_config['model_type']}_{model_config['emb_type']}.pt",
                                 map_location=device))

In [None]:
def predict_sentiment(model, sentence):
    """Return the model's sigmoid score for one raw sentence.

    The sentence goes through the same pipeline as the training data:
    PreProcessingText -> TEXT.tokenize -> TEXT.pad -> TEXT.numericalize.

    Args:
        model: trained classifier returning one raw score per input.
        sentence (str): raw, un-normalized text.

    Returns:
        float: sigmoid of the model output, in [0, 1]. With the label
        vocabulary shown in this notebook (neg=0, pos=1), values near 1
        indicate a positive review.
    """
    model.eval()
    # Guard against a None result from PreProcessingText on empty input,
    # which would make the tokenizer fail.
    cleaned = PreProcessingText(sentence) or ''
    tokens = TEXT.tokenize(cleaned)
    indexed = TEXT.numericalize(TEXT.pad([tokens]))
    input_data = torch.LongTensor(indexed).to(device)
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        prediction = torch.sigmoid(model(input_data))
    return prediction.item()

In [None]:
# Score a hand-written sentence with the restored model; the value is shown as the cell output.
test_sentence = 'this movie is FUN'
predict_sentiment(model = model, sentence = test_sentence)