## 1. 데이터 로드

In [1]:
import pandas as pd

df = pd.read_csv("reviews_with_splits_lite.csv")
df

Unnamed: 0,rating,review,split
0,negative,all i can say is that a i had no other option ...,train
1,negative,i went here once when my long time stylist mov...,train
2,negative,i don t know why i stopped here for lunch this...,train
3,negative,did i order the wrong thing ? or maybe it was ...,train
4,negative,i went here for restaurant week . the restaura...,train
...,...,...,...
55995,positive,i liked the small restaurants in downtown gilb...,test
55996,positive,"my husband and i went twice last week , and in...",test
55997,positive,there s an amazing selection of authentic japa...,test
55998,positive,absolutely fun place to hang out . good beer ....,test


In [49]:
df['rating'].value_counts()

rating
negative    28000
positive    28000
Name: count, dtype: int64

데이터 10000개로만 잘라서 train진행

In [2]:
# # 데이터 1000개로만 잘라서 train진행

# # train, val, test 각각의 개수를 지정
# train_size = 8000
# val_size = 1000
# test_size = 1000

# # split 컬럼 값이 'train', 'val', 'test'인 행들만 선택하여 새로운 DataFrame 생성
# train_df = df[df['split'] == 'train'].head(train_size)
# val_df = df[df['split'] == 'val'].head(val_size)
# test_df = df[df['split'] == 'test'].head(test_size)

# # train, val, test DataFrame을 합쳐 최종 DataFrame 생성
# df = pd.concat([train_df, val_df, test_df], ignore_index=True)
# df


In [3]:
df['split'].value_counts()

split
train    39200
val       8400
test      8400
Name: count, dtype: int64

### 데이터 split(train/valid/test)

In [4]:
# 데이터를 다시 train/valid/test로 나눠줌

# train 데이터 
train_df = df[df.split=='train']
train_size = len(train_df)

# valid 데이터 
val_df = df[df.split=='val']
val_size = len(val_df)

# test 데이터 
test_df = df[df.split=='test']
test_size = len(test_df)

In [50]:
# _lookup_dict = {'train': (train_df, train_size),
#                      'val': (val_df, val_size),
#                      'test': (test_df, test_size)}
# _lookup_dict

## 2. Vocabulary

- 정수-토큰 매핑을 수행
- 텍스트 토큰과 클래스 레이블을 정수로 매핑

### 리뷰 Vocabulary

In [6]:
from collections import Counter
import string

# Counter()를 통해 어떤 단어가 얼만큼의 횟수로 들어있는지를 알 수 있다.
word_counts = Counter()
for review in df.review:
#     print(review)
    for word in review.split(" "):
        # word가 .(구두점,punctuation)이 아닐 경우 word에 추가
        if word not in string.punctuation:
            word_counts[word] += 1

word_counts

Counter({'all': 24160,
         'i': 225732,
         'can': 18410,
         'say': 7077,
         'is': 82250,
         'that': 73885,
         'a': 179478,
         'had': 41365,
         'no': 18455,
         'other': 12238,
         'option': 798,
         'then': 10157,
         'to': 185031,
         'go': 19068,
         'with': 53761,
         'time': 21566,
         'warner': 77,
         'cable': 129,
         'for': 83622,
         'my': 70406,
         'internet': 410,
         'service': 20286,
         'n': 75090,
         'nand': 676,
         'b': 806,
         'nif': 1309,
         'you': 57180,
         'don': 13653,
         't': 52863,
         'have': 49296,
         'local': 1963,
         'number': 1224,
         'it': 106288,
         'will': 15840,
         'be': 32217,
         'an': 18342,
         'absolute': 369,
         'nightmare': 206,
         'pay': 3432,
         'your': 14789,
         'bill': 1858,
         'through': 3643,
         'the': 339990,


In [7]:
# add_unk=True를 하면 '<UNK>': 0 토큰을 추가해줌 !

class Vocabulary:
    def __init__(self, add_unk=False):
        self.token_to_idx = {}
        self.idx_to_token = {}
    
#         "UNK" 토큰이 추가되지 않는 경우에는 -1로 설정,
        self.unk_index = -1
        if add_unk:
#         "UNK" 토큰이 추가될 경우에는 UNK에 해당하는 인덱스로 설정,
            self.unk_index = self.add_token('<UNK>') 

    def add_token(self, token):
        
#       만약 해당 토큰이 있으면 토큰 idx만 return
        if token in self.token_to_idx:
            index = self.token_to_idx[token]
            
#       만약 해당 토큰이 없으면 새로운 토큰 만들어줌
        else:
            index = len(self.token_to_idx)
            self.token_to_idx[token] = index
            self.idx_to_token[index] = token
        return index

In [8]:
# cutoff 보다 수가 많은 단어만 vocab에 추가
cutoff = 25

# Vocabulary 객체 생성
# cutoff보다 작으면 unk토큰으로 지정해줄 것이기 때문에 True
review_vocab = Vocabulary(add_unk=True)

# word_counts.items() -> ex) ('all', 24160)
for word, count in word_counts.items():
    if count > cutoff:
        review_vocab.add_token(word)

In [9]:
word_counts.items()



### Vocabulary로 만든 리뷰 데이터 살펴보기

In [10]:
review_vocab.token_to_idx

{'<UNK>': 0,
 'all': 1,
 'i': 2,
 'can': 3,
 'say': 4,
 'is': 5,
 'that': 6,
 'a': 7,
 'had': 8,
 'no': 9,
 'other': 10,
 'option': 11,
 'then': 12,
 'to': 13,
 'go': 14,
 'with': 15,
 'time': 16,
 'warner': 17,
 'cable': 18,
 'for': 19,
 'my': 20,
 'internet': 21,
 'service': 22,
 'n': 23,
 'nand': 24,
 'b': 25,
 'nif': 26,
 'you': 27,
 'don': 28,
 't': 29,
 'have': 30,
 'local': 31,
 'number': 32,
 'it': 33,
 'will': 34,
 'be': 35,
 'an': 36,
 'absolute': 37,
 'nightmare': 38,
 'pay': 39,
 'your': 40,
 'bill': 41,
 'through': 42,
 'the': 43,
 'ring': 44,
 'of': 45,
 'automated': 46,
 'messages': 47,
 'only': 48,
 'get': 49,
 'live': 50,
 'person': 51,
 'after': 52,
 'min': 53,
 'told': 54,
 'they': 55,
 'charge': 56,
 'so': 57,
 'must': 58,
 'repeat': 59,
 'when': 60,
 'comes': 61,
 'each': 62,
 'month': 63,
 'are': 64,
 'lucky': 65,
 'enough': 66,
 'avoid': 67,
 'using': 68,
 'them': 69,
 'suggest': 70,
 'do': 71,
 'went': 72,
 'here': 73,
 'once': 74,
 'long': 75,
 'stylist': 76,
 

In [11]:
review_vocab.idx_to_token

{0: '<UNK>',
 1: 'all',
 2: 'i',
 3: 'can',
 4: 'say',
 5: 'is',
 6: 'that',
 7: 'a',
 8: 'had',
 9: 'no',
 10: 'other',
 11: 'option',
 12: 'then',
 13: 'to',
 14: 'go',
 15: 'with',
 16: 'time',
 17: 'warner',
 18: 'cable',
 19: 'for',
 20: 'my',
 21: 'internet',
 22: 'service',
 23: 'n',
 24: 'nand',
 25: 'b',
 26: 'nif',
 27: 'you',
 28: 'don',
 29: 't',
 30: 'have',
 31: 'local',
 32: 'number',
 33: 'it',
 34: 'will',
 35: 'be',
 36: 'an',
 37: 'absolute',
 38: 'nightmare',
 39: 'pay',
 40: 'your',
 41: 'bill',
 42: 'through',
 43: 'the',
 44: 'ring',
 45: 'of',
 46: 'automated',
 47: 'messages',
 48: 'only',
 49: 'get',
 50: 'live',
 51: 'person',
 52: 'after',
 53: 'min',
 54: 'told',
 55: 'they',
 56: 'charge',
 57: 'so',
 58: 'must',
 59: 'repeat',
 60: 'when',
 61: 'comes',
 62: 'each',
 63: 'month',
 64: 'are',
 65: 'lucky',
 66: 'enough',
 67: 'avoid',
 68: 'using',
 69: 'them',
 70: 'suggest',
 71: 'do',
 72: 'went',
 73: 'here',
 74: 'once',
 75: 'long',
 76: 'stylist',
 

### 평점(rating) Vocabulary

In [12]:
rating_vocab = Vocabulary(add_unk=False)

In [13]:
# 평점 Vocabulary 추가 

# df.rating에는 어차피 negative, positive 밖에 없음 
#  rating -> negative, positive Vocabulary에 추가

for rating in sorted(set(df.rating)):
    rating_vocab.add_token(rating)
    print(rating)

negative
positive


### Vocabulary로 만든 평점 데이터 살펴보기

In [14]:
rating_vocab

<__main__.Vocabulary at 0x7fca6064a100>

In [15]:
rating_vocab.token_to_idx

{'negative': 0, 'positive': 1}

In [16]:
rating_vocab.idx_to_token

{0: 'negative', 1: 'positive'}

## 3. Vectorizer

* 입력 데이터 포인트의 토큰을 순회하면서 각 토큰을 정수로 바꿈
* 반복 과정의 결과는 벡터
* vectorizer에서 만든 벡터는 항상 길이가 같아야 함

In [17]:
# 주어진 토큰에 대응하는 인덱스 반환

def lookup_token(vocabulary_class,token):

# UNK 토큰이 있을 경우
    if vocabulary_class.unk_index >= 0:
#           토큰을 찾아보고 없으면 unk_index 반환, 있으면 해당 토큰의 idx를 반환
        return vocabulary_class.token_to_idx.get(token, vocabulary_class.unk_index)
    else:
        return vocabulary_class.token_to_idx[token]

### 리뷰에 대한 원 핫 인코딩

In [18]:
import numpy as np 

def vectorize(review):

#     전체 리뷰 사이즈만큼을 미리 0으로 채워둠
    one_hot = np.zeros(len(review_vocab.token_to_idx), dtype=np.float32)
    one_hot
    
    for token in review.split(" "):
        
#         토큰이 .(구두점)이 아닐 경우 
#         토큰에 해당되는 인덱스에 1를 부여한 one_hot encoding 만듦
        if token not in string.punctuation:
            one_hot[lookup_token(review_vocab,token)] = 1

    return one_hot

print(vectorize("all i can say is that a i had no other option"))

[0. 1. 1. ... 0. 0. 0.]


In [19]:
import torch
from torch.utils.data import Dataset

class ReviewDataset(Dataset):
    def __init__(self, reviews, labels):
        self.reviews = reviews
        self.labels = labels

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, index):
        review = self.reviews[index]
        label = self.labels[index]
        
#         여기다가 vectorize함수 사용해서 review return
        vectorized_review = vectorize(review)
#       label도 숫자로 return
#         vectorized_review = vectorize(review)
        vectorized_label = lookup_token(rating_vocab,label)
    
        return {
            'review': vectorized_review,
            'label': vectorized_label
        }

### 데이터셋 클래스

In [20]:
train_df

Unnamed: 0,rating,review,split
0,negative,all i can say is that a i had no other option ...,train
1,negative,i went here once when my long time stylist mov...,train
2,negative,i don t know why i stopped here for lunch this...,train
3,negative,did i order the wrong thing ? or maybe it was ...,train
4,negative,i went here for restaurant week . the restaura...,train
...,...,...,...
47595,positive,i love this place ! i had them cater my weddin...,train
47596,positive,there s nothing really to say that hasn t alre...,train
47597,positive,i like the food here . it s very flavorful . i...,train
47598,positive,very friendly ! did an awesome job on my elder...,train


In [21]:
# 데이터셋을 인스턴스화 해주어야 로더에 넣어줄 수 있다. 

train_dataset = ReviewDataset(train_df["review"].values, train_df["rating"].values)
train_dataset

valid_dataset = ReviewDataset(val_df["review"].values, val_df["rating"].values)
valid_dataset

test_dataset = ReviewDataset(test_df["review"].values, test_df["rating"].values)
test_dataset


<__main__.ReviewDataset at 0x7fca4404e820>

In [22]:
# 데이터 로더 설정
from torch.utils.data import DataLoader

# drop_last=True -> 배치 사이즈보다 over하면 drop

Traindataloader = DataLoader(dataset=train_dataset, batch_size=512,
                            shuffle=True, drop_last=True)

Validdataloader = DataLoader(dataset=valid_dataset, batch_size=512,
                            shuffle=True, drop_last=True)

Testdataloader = DataLoader(dataset=test_dataset, batch_size=512,
                            shuffle=True, drop_last=True)


In [52]:
print(len(train_dataset),len(Traindataloader))

39200 76


In [23]:
for batch_index, batch_dict in enumerate(Traindataloader):
    print(batch_index)
    print(batch_dict)
    
    break
    

0
{'review': tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.],
        [1., 0., 1.,  ..., 0., 0., 0.],
        ...,
        [1., 0., 0.,  ..., 0., 0., 0.],
        [1., 0., 1.,  ..., 0., 0., 0.],
        [0., 1., 1.,  ..., 0., 0., 0.]]), 'label': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1,
        1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0,
        1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1,
        1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0,
        1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
        0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1,
        0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0,
        1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1,
   

### 모델 정의 및 옵티마이저, loss func 설정

### 모델정의 ReviewClassifier

In [24]:
# nn은 neural network로 torch의 신경망 모듈이다.
import torch.nn as nn

class ReviewClassifier(nn.Module):
    """     num_features (int): 입력 특성 벡트의 크기 """
    
    def __init__(self,num_features):
        
#       torch.nn.Module의 초기화 메서드를 실행하여 해당 클래스의 기능을 상속받음
        super(ReviewClassifier, self).__init__()
    
#       간단한 퍼셉트론 구현
        self.fc1 = nn.Linear(in_features=num_features,
                            out_features=1)
    
    def forward(self, x_in, apply_sigmoid=False):
        # x_in : 입력 데이터
        
#       squeeze는 크기가 1인 차원을 제거한다. 
#       squeeze를 통해서 [batch_size, 1] -> [batch_size]로 단일 스칼라 값으로 만들어줌
            
#       우리는 BCEWithLogitsLoss()를 사용하기 때문에 시그모이드를 적용하지 않는다. 
    
        y_out = self.fc1(x_in).squeeze()
        if apply_sigmoid:
            y_out = torch.sigmoid(y_out)
        
        return y_out
        

In [25]:
len(review_vocab.token_to_idx)

8945

In [26]:
classifier = ReviewClassifier(num_features=len(review_vocab.token_to_idx))
classifier

ReviewClassifier(
  (fc1): Linear(in_features=8945, out_features=1, bias=True)
)

### 옵티마이저, loss function

In [27]:
lr = 0.001
num_epochs = 100

In [28]:
# 옵티마이저
import torch.optim as optim

optimizer = optim.Adam(classifier.parameters(), lr = lr)
optimizer


Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    eps: 1e-08
    foreach: None
    lr: 0.001
    maximize: False
    weight_decay: 0
)

In [29]:
# loss function

loss_func = nn.BCEWithLogitsLoss()
loss_func


BCEWithLogitsLoss()

## Train 

In [30]:
def compute_accuracy(y_pred, y_target):
    y_target = y_target.cpu()
    y_pred_indices = (torch.sigmoid(y_pred)>0.5).cpu().long()
    
#      예측값과 타겟값을 비교하여 일치하는 개수를 계산
    n_correct = torch.eq(y_pred_indices, y_target).sum().item()
    return n_correct / len(y_pred_indices) * 100

### 모델 진행상황 저장 및 early stopping

In [31]:
# Train state 초기화 
def make_train_state():
    return {
        'stop_early':False,
        'early_stopping_step':0,
        'early_stopping_best_val':1e8,
        'early_stopping_criteria' : 10,
        'epoch_index' : 0,
        'train_loss': [], 
        'train_acc' :[], 
        'val_loss' : [],
        'val_acc' : [], 
        'test_loss' : [],
        'test_acc' : [],
         
#       모델 저장파일
        'model_filename' : 'model.pth'
    } 


# Train update 
def update_train_state(model, train_state):
    
#   학습시작하면 초기에 모델 저장하기 
    
    if train_state['epoch_index'] == 0:
        torch.save(model.state_dict(),train_state['model_filename'])
        
#   모델 성능이 향상되면 모델 저장(valid loss가 더 낮아지면)
    elif train_state['epoch_index'] >=1 :
        loss_t = train_state['val_loss'][-1]
#        loss가 나빠지면 early stop step 업데이트
        if loss_t >= train_state['early_stopping_best_val']:
            train_state['early_stopping_step']+=1
            
#        loss가 좋아지면   
        else:
#            early stop step 0으로 다시 초기화        
            train_state['early_stopping_step']=0
    
#           최저 loss이면 모델 저장 
            if loss_t < train_state['early_stopping_best_val']:
                train_state['early_stopping_best_val'] = loss_t
                torch.save(model.state_dict(),train_state['model_filename'])

#       기준점 넘으면 early stop 
        if train_state['early_stopping_step'] >= train_state['early_stopping_criteria']:
            train_state['stop_early'] = True
        
        return train_state


In [32]:
# 모델 진행 상황 함수 초기화
train_state = make_train_state()
train_state

{'stop_early': False,
 'early_stopping_step': 0,
 'early_stopping_best_val': 100000000.0,
 'early_stopping_criteria': 10,
 'epoch_index': 0,
 'train_loss': [],
 'train_acc': [],
 'val_loss': [],
 'val_acc': [],
 'test_loss': [],
 'test_acc': [],
 'model_filename': 'model.pth'}

In [33]:
train_state

{'stop_early': False,
 'early_stopping_step': 0,
 'early_stopping_best_val': 100000000.0,
 'early_stopping_criteria': 10,
 'epoch_index': 0,
 'train_loss': [],
 'train_acc': [],
 'val_loss': [],
 'val_acc': [],
 'test_loss': [],
 'test_acc': [],
 'model_filename': 'model.pth'}

In [34]:
# train_state['epoch_index'] = 1
# train_state

In [35]:
import tqdm

# 에포크만큼
for epoch in tqdm.tqdm(range(num_epochs)):

#     print('epoch',epoch)
#     print(train_state['epoch_index']) 
    train_state['epoch_index'] +=1 

    running_loss = 0.0
    running_acc = 0.0


#     모델을 학습 모드로 설정 -> 드롭아웃 및 배치 정규화와 같은 학습 중에만 적용되는 기법들이 활성화
#     모델을 평가 모드로 전환하려면 classifier.eval()을 사용
    classifier.train()
# 배치 만큼
    for batch_idx, batch_data in enumerate(Traindataloader):



#       1. 옵티마이저 그레디언트 0으로 초기화
        optimizer.zero_grad()
#       2. 모델에 데이터 넣어서 출력받기
        y_pred = classifier(x_in=batch_data['review'].float())
#       3. loss 계산하기
        loss = loss_func(y_pred, batch_dict['label'].float())

#       tensor(0.3190) -> 0.3190, item()으로 스칼라 값만 추출
        loss_t = loss.item()

#       배치에서의 평균 loss 구하기
        running_loss += (loss_t - running_loss) / (batch_idx + 1)

#       4. gradient 계산하기
        loss.backward()

#       5. 옵티마이저 가중치 업데이트
        optimizer.step()

#       Accuracy 계산
        acc_t = compute_accuracy(y_pred, batch_data['label'])
        running_acc += (acc_t - running_acc) / (batch_idx + 1)



    train_state['train_loss'].append(running_loss)
    train_state['train_acc'].append(running_acc)


#   valid에 대한 계산

    running_loss = 0.0
    running_acc = 0.0

    classifier.eval() # 모델 파라미터를 수정하지 못 하게 비활성화

    for batch_idx, batch_data in enumerate(Validdataloader):

#       1. 모델의 출력값(y_pred)계산
        y_pred = classifier(x_in=batch_data['review'].float())

#       2. loss 계산
        loss = loss_func(y_pred,batch_data['label'].float())
        loss_t = loss.item()
        running_loss += (loss_t - running_loss) / (batch_index + 1)

#       3. Accuracy 계산
        acc_t = compute_accuracy(y_pred,batch_data['label'])
        running_acc += (acc_t - running_acc) / (batch_index + 1)
    
#     print("val_loss",running_loss)
#     print("val_acc",running_acc)

    train_state['val_loss'].append(running_loss)
    train_state['val_acc'].append(running_acc)

#   전체 loss, acc 저장
    train_state = update_train_state(model=classifier,
                                     train_state=train_state)
#   early stop해라고 했으면 학습 멈추기    
    if train_state['stop_early']:
        break



 32%|█████████████▍                            | 32/100 [01:43<03:39,  3.23s/it]


### Test 진행

In [36]:
# 가장 좋은 모델을 사용해 테스트 세트의 손실과 정확도를 계산합니다

classifier.load_state_dict(torch.load(train_state['model_filename']))
# classifier = classifier.to(args.device)

running_loss = 0.0
running_acc = 0.0

# 가중치 업데이트 하지 못 하게
classifier.eval()

for batch_idx, batch_data in enumerate(Testdataloader):
    
    y_pred = classifier(x_in=batch_data['review'].float())
    loss = loss_func(y_pred,batch_data['label'].float())
    loss_t = loss.item()
    running_loss += (loss_t - running_loss) / (batch_idx + 1)
    
    acc_t = compute_accuracy(y_pred, batch_data['label'])
    running_acc += (acc_t - running_acc) / (batch_idx + 1)

train_state['test_loss'] = running_loss
train_state['test_acc'] = running_acc

In [37]:
print("테스트 손실: {:.3f}".format(train_state['test_loss']))
print("테스트 정확도: {:.2f}".format(train_state['test_acc']))

테스트 손실: 0.697
테스트 정확도: 49.63


In [38]:
train_state

{'stop_early': True,
 'early_stopping_step': 10,
 'early_stopping_best_val': 0.6853833198547363,
 'early_stopping_criteria': 10,
 'epoch_index': 33,
 'train_loss': [0.6907567781837363,
  0.6909549095128712,
  0.6912096260409606,
  0.6916890167876294,
  0.6919197895024952,
  0.6914446636250143,
  0.6912682628945299,
  0.6916247378838692,
  0.6920785245142483,
  0.6917316772435841,
  0.6920874691323229,
  0.6921353089182,
  0.6921149115813406,
  0.6919270675433309,
  0.6919358278575697,
  0.6929554186369246,
  0.6924106196353308,
  0.6919505102069752,
  0.692575763714941,
  0.6922046855876322,
  0.6921748416988471,
  0.6921838459215668,
  0.6921651449642683,
  0.6924192866212441,
  0.6924900349817777,
  0.692262555423536,
  0.692268860183264,
  0.6929877384712821,
  0.6921016809187438,
  0.6923233537297501,
  0.6927217409798976,
  0.6927890314867621,
  0.6928418553189227],
 'train_acc': [50.21844161184208,
  49.378083881578945,
  48.66622121710527,
  48.920641447368425,
  48.717619243421

### 추론

In [39]:
test_review = "The food was delicious, but it's the worst. I'm never going back ;;"

In [40]:
import re 

# 데이터 정제
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"([.,!?])", r" \1 ", text)
    text = re.sub(r"[^a-zA-Z.,!?]", r" ", text)
    return text

test_review2 = preprocess_text(test_review)
test_review2

'the food was delicious ,  but it s the worst .  i m never going back   '

In [41]:
# 벡터화 + 텐서화

vectorized_review = torch.tensor(vectorize(test_review2))
vectorized_review

tensor([0., 0., 1.,  ..., 0., 0., 0.])

In [42]:
print(vectorized_review.shape)

torch.Size([8945])


In [53]:
# 첫번째 차원을 1로 만들고, 나머지는 다른 차원으로 알아서 되도록 !
x_data = vectorized_review.view(1, -1)
print(x_data.shape)

torch.Size([1, 8945])


In [44]:
# view 예시

data = torch.randn(3, 4)
print(data.shape)          # torch.Size([3, 4])

data = data.view(1, -1)
print(data.shape)   

torch.Size([3, 4])
torch.Size([1, 12])


In [45]:
# 모델에 test 데이터 넣어주기
result = classifier(x_data)
print(result)

tensor(-0.2070, grad_fn=<SqueezeBackward0>)


In [46]:
# sigmoid 사용
probability_value = torch.sigmoid(result).item()
probability_value

0.4484443962574005

In [47]:
# threshold 적용 후 rating(평점) vocabulary에서 해당 인덱스 가져와서 출력해주기

index = 1
if probability_value < 0.5:
    index = 0
    
prediction = rating_vocab.idx_to_token[index]
print("{} -> {}".format(test_review, prediction))

The food was delicious, but it's the worst. I'm never going back ;; -> negative


In [54]:
print("데이터",test_review)
print("추론 결과",prediction)

데이터 The food was delicious, but it's the worst. I'm never going back ;;
추론 결과 negative
