In [113]:
import os
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

import re
import string
from collections import Counter
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

from sklearn.model_selection import train_test_split

import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torchutils as tu
from torchmetrics.classification import BinaryAccuracy
from rnn_preprocessing import data_preprocessing, get_words_by_freq, padding, preprocess_single_string

In [114]:
from joblib import load
vocab_to_int = load('./vocab.joblib')

In [115]:
sentence = 'this film could be better but still ok'
sentence = preprocess_single_string(sentence, 256, vocab_to_int)

In [117]:
sentence.shape

torch.Size([256])

In [118]:
EMBEDDING_DIM = 64
HIDDEN_DIM = 64
N_LAYERS = 2
SEQ_LEN = 256
VOCAB_SIZE = len(vocab_to_int)+1 

In [119]:
device= 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [120]:
class RNNNet(nn.Module):    
    '''
    vocab_size: int, размер словаря (аргумент embedding-слоя)
    emb_size:   int, размер вектора для описания каждого элемента последовательности
    hidden_dim: int, размер вектора скрытого состояния
    batch_size: int, размер batch'а

    '''
    
    def __init__(self, 
                 vocab_size: int, 
                 emb_size: int, 
                 hidden_dim: int, 
                 seq_len: int, 
                 n_layers: int = 1) -> None:
        super().__init__()
        
        self.seq_len  = seq_len 
        self.emb_size = emb_size 
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers

        self.embedding = nn.Embedding(vocab_size, self.emb_size)
        self.rnn_cell  = nn.RNN(self.emb_size, self.hidden_dim, batch_first=True, num_layers=n_layers, bidirectional=True)
        self.linear    = nn.Sequential(
            nn.Dropout(),
            nn.Linear(self.hidden_dim * self.seq_len*2, 256),
            nn.Dropout(),
            nn.Linear(256, 1)
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:

        x = self.embedding(x.to(device))
        output, _ = self.rnn_cell(x)

        output = output.contiguous().view(output.size(0), -1)
        out = self.linear(output.squeeze(0))
        return out

In [121]:
model = RNNNet(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, SEQ_LEN, N_LAYERS)

model.load_state_dict(torch.load('./rnn_model_epoch_2.pt'))

<All keys matched successfully>

In [123]:
text = """This movie is a real gem. It is hard to find fault with it. Hanks is excellent in a role that clearly calls for him to suppress his natural slant toward humor. He is Paul Edgecomb; Tom Hanks is nowhere to be found. Yet he gives Edgecomb just the right flavor. One cannot find a single weak cast member! Michael Jeter should have got an Oscar. Michael Clark Duncan put just the right shading on his huge character to make him vulnerable and sympathetic.

Flawlessly shot on perfect period sets, the whole production binds together to bring the extraordinary story into the realm of a believable and compelling study of human injustice and charity."""
reviews_int = [vocab_to_int[word] for word in text.split() if vocab_to_int.get(word)]
features = padding([reviews_int], SEQ_LEN)
features = torch.tensor(features)

In [124]:
model.eval()
model.to(device)
m = nn.Sigmoid()
m(model(features.to(device)))

tensor([0.9783], device='cuda:0', grad_fn=<SigmoidBackward0>)