# Sentiment analysis prediction - Binary classification with a pretrained model
## **Load a previously saved model and make predictions**

By Dina Garnier-Oeliarisoa

The sentiment classification model was trained in [this notebook](https://colab.research.google.com/drive/1b_KWldrGqPeH2IpO35s-76q2spCoIcFM?usp=sharing).

## **Import libraries**


In [None]:
# Mount Google Drive inside the Colab runtime; the saved model is later read
# from a path under /content/drive.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
import torch
from torchtext.legacy import data
from torchtext.legacy import datasets
import random
import numpy as np

import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [None]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/2c/d8/5144b0712f7f82229a8da5983a8fbb8d30cec5fbd5f8d12ffe1854dcea67/transformers-4.4.1-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.1MB 21.4MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 55.0MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/71/23/2ddc317b2121117bf34dd00f5b0de194158f2a44ee2bf5e47c7166878a97/tokenizers-0.10.1-cp37-cp37m-manylinux2010_x86_64.whl (3.2MB)
[K     |████████████████████████████████| 3.2MB 44.8MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp37-none-any.whl size=893262 sha256=59ff6f9db6

In [None]:
from transformers import BertTokenizer

# Load the tokenizer published with the 'bert-base-uncased' checkpoint.
# NOTE(review): the example sentences further down are French - confirm this
# is the same tokenizer the saved model was trained with.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




In [None]:
class BERTGRUSentiment(nn.Module):
    """Sentiment classifier: BERT embeddings -> GRU -> linear head.

    Args:
        bert: Encoder module. It must expose ``config.to_dict()['hidden_size']``
            and, when called on a batch of token ids, return a sequence whose
            first element holds the per-token embeddings.
        hidden_dim: Size of the GRU hidden state.
        output_dim: Number of values produced per example.
        n_layers: Number of stacked GRU layers.
        bidirectional: Whether the GRU reads the sequence in both directions.
        dropout: Dropout probability, applied to the final hidden state and,
            when there are at least two GRU layers, inside the GRU as well.
    """

    def __init__(self, bert, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()

        self.bert = bert
        bert_width = bert.config.to_dict()['hidden_size']

        # The GRU's own dropout is switched off for a single-layer GRU.
        gru_dropout = dropout if n_layers >= 2 else 0
        self.rnn = nn.GRU(
            bert_width,
            hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectional,
            batch_first=True,
            dropout=gru_dropout,
        )

        # A bidirectional GRU contributes one final state per direction.
        head_in = hidden_dim * 2 if bidirectional else hidden_dim
        self.out = nn.Linear(head_in, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Return the raw (pre-sigmoid) output of the linear head.

        Args:
            text: Token ids, ``[batch size, sent len]``.

        Returns:
            Tensor of shape ``[batch size, out dim]``.
        """
        # BERT is evaluated without gradient tracking, so no gradient reaches
        # its weights through this forward pass.
        with torch.no_grad():
            token_states = self.bert(text)[0]  # [batch size, sent len, emb dim]

        # Keep only the final hidden states; the per-step outputs are unused.
        final_states = self.rnn(token_states)[1]  # [n layers * n directions, batch size, hid dim]

        if self.rnn.bidirectional:
            # Concatenate the last two entries of the final hidden states.
            summary = torch.cat((final_states[-2], final_states[-1]), dim=1)
        else:
            summary = final_states[-1]

        # summary = [batch size, hid dim] (twice as wide when bidirectional)
        return self.out(self.dropout(summary))

In [None]:
# LOAD THE ENTIRE MODEL
# NOTE(review): torch.load unpickles the file, so only load checkpoints you trust.
# map_location places the stored tensors on the GPU when one is available and on
# the CPU otherwise, so a checkpoint saved on a GPU also loads on a CPU-only
# runtime. To force CPU execution, pass map_location='cpu' instead.
model = torch.load('/content/drive/MyDrive/Colab Notebooks/colab_nlp/output/fr_perso_tut6-perso-all_model-small.pt',
                   map_location=torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [None]:
# Fix the seeds of Python's random module, NumPy and PyTorch so that runs are repeatable.
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# Ask cuDNN to use deterministic algorithms.
torch.backends.cudnn.deterministic = True

In [None]:
# Number of entries in the tokenizer's vocabulary.
len(tokenizer.vocab)

30522

In [None]:
# Tokenize a sample sentence to inspect how the tokenizer splits it.
# English alternative: tokenizer.tokenize('Hello WORLD how ARE yoU?')
tokens = tokenizer.tokenize('Bonjour, comment vas-tu?')

In [None]:
# Map the sample tokens to their vocabulary ids and show them.
indexes = tokenizer.convert_tokens_to_ids(tokens)

print(indexes)

[14753, 23099, 2099, 1010, 7615, 12436, 2015, 1011, 10722, 1029]


In [None]:
# String forms of the tokenizer's special tokens: sequence start (CLS),
# sequence end (SEP), padding and unknown.
init_token = tokenizer.cls_token
eos_token = tokenizer.sep_token
pad_token = tokenizer.pad_token
unk_token = tokenizer.unk_token

In [None]:
# Vocabulary ids of the special tokens, obtained by converting the token strings.
init_token_idx = tokenizer.convert_tokens_to_ids(init_token)
eos_token_idx = tokenizer.convert_tokens_to_ids(eos_token)
pad_token_idx = tokenizer.convert_tokens_to_ids(pad_token)
unk_token_idx = tokenizer.convert_tokens_to_ids(unk_token)

In [None]:
# Read the special-token ids straight from the tokenizer attributes. This
# reassigns the four *_idx names set in the previous cell (the values are
# expected to be the same).
init_token_idx = tokenizer.cls_token_id
eos_token_idx = tokenizer.sep_token_id
pad_token_idx = tokenizer.pad_token_id
unk_token_idx = tokenizer.unk_token_id

In [None]:
# Maximum input length the tokenizer records for the 'bert-base-uncased' checkpoint.
max_input_length = tokenizer.max_model_input_sizes['bert-base-uncased']

In [None]:
def tokenize_and_cut(sentence):
    """Tokenize ``sentence`` and keep at most ``max_input_length - 2`` tokens.

    Relies on the notebook-level ``tokenizer`` and ``max_input_length``. The
    two positions held back leave room for the start and end tokens that are
    added around the sequence elsewhere in the notebook.
    """
    pieces = tokenizer.tokenize(sentence)
    limit = max_input_length - 2
    return pieces[:limit]

In [None]:
# Text field that carries BERT ids directly: no torchtext vocabulary is built
# (use_vocab = False); sentences are tokenized and truncated by tokenize_and_cut,
# converted to ids by the tokenizer, and the special tokens are given as ids.
# NOTE(review): neither MYTEXT nor MYSENTIMENT is referenced by the prediction
# cells visible in this notebook - confirm whether they are still needed here.
MYTEXT = data.Field(batch_first = True,
                  use_vocab = False,
                  tokenize = tokenize_and_cut,
                  preprocessing = tokenizer.convert_tokens_to_ids,
                  init_token = init_token_idx,
                  eos_token = eos_token_idx,
                  pad_token = pad_token_idx,
                  unk_token = unk_token_idx)
# Label field stored as float.
MYSENTIMENT = data.LabelField(dtype = torch.float)

In [None]:
def predict_sentiment(model, tokenizer, sentence, max_length=None):
    """Return the model's sigmoid score for a single sentence.

    Args:
        model: Module mapping a ``[1, sent len]`` tensor of token ids to a
            single raw score.
        tokenizer: Tokenizer providing ``tokenize``, ``convert_tokens_to_ids``,
            ``cls_token_id`` and ``sep_token_id``.
        sentence: Text to score.
        max_length: Maximum number of ids fed to the model, the two special
            tokens included. Defaults to the notebook-level
            ``max_input_length``.

    Returns:
        The sigmoid of the model output as a Python float.
    """
    if max_length is None:
        max_length = max_input_length

    model.eval()

    # Hold back two positions for the start and end tokens added below.
    tokens = tokenizer.tokenize(sentence)[:max_length - 2]

    # Take the special-token ids from the tokenizer that was passed in, so the
    # ids always belong to the same vocabulary as the converted tokens.
    indexed = [tokenizer.cls_token_id] + tokenizer.convert_tokens_to_ids(tokens) + [tokenizer.sep_token_id]

    # Send the input to the device holding the model's weights; this avoids a
    # device mismatch when the model was moved (e.g. with .cpu()) independently
    # of the notebook-level ``device``, which is only the fallback for a model
    # without parameters.
    first_param = next(model.parameters(), None)
    target_device = device if first_param is None else first_param.device
    tensor = torch.LongTensor(indexed).unsqueeze(0).to(target_device)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        prediction = torch.sigmoid(model(tensor))
    return prediction.item()

In [None]:
# Example: a negative review ("This film was horrible").
# English alternative: predict_sentiment(model, tokenizer, "This film is terrible")
predict_sentiment(model, tokenizer, "Ce film était horrible")

0.3710169792175293

In [None]:
# Example: a positive review ("Great film, I really enjoyed it!").
# English alternative: predict_sentiment(model, tokenizer, "This film is great")
predict_sentiment(model, tokenizer, "Super film, je me suis régalée!")

0.7007127404212952

In [None]:
# Example: a mixed review ("Interesting script, but I was left wanting more.").
# English alternative: predict_sentiment(model, tokenizer, "Very good movie")
predict_sentiment(model, tokenizer, "Scénario intéressant mais j'étais restée sur ma faim.")

0.24773943424224854

In [None]:
# Example: the positive half of the previous sentence alone ("Interesting script.").
predict_sentiment(model, tokenizer, "Scénario intéressant.")

0.8721023797988892