# Sentiment prediction

Notebook that loads the spanish sentiment model and makes the predictions on new data.


<img src="https://encrypted-tbn0.gstatic.com/images?q=tbn%3AANd9GcQsC2nLGRcUppfA5JayAijhsixjFsvl_ZxXOQ&usqp=CAU">

In [1]:
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers 
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig
from sklearn.preprocessing import LabelEncoder

In [2]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [3]:
device

'cuda'

In [28]:
frases = [
    "holaaaa mundo! hoy me siento super feliz!!!",
    "hace las cosas normal",
    "solo quiero llorar y morir"
]
df_test = pd.DataFrame({"comment_text":frases})

In [29]:
df_test

Unnamed: 0,comment_text
0,holaaaa mundo! hoy me siento super feliz!!!
1,hace las cosas normal
2,solo quiero llorar y morir


In [30]:
model = torch.load("spanish_model.pkl")

In [31]:
MAX_LEN = 200
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 3
LEARNING_RATE = 1e-05
tokenizer = BertTokenizer.from_pretrained('dccuchile/bert-base-spanish-wwm-cased')

In [32]:
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.comment_text
        self.max_len = max_len
        
    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'text':text, 
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
        }

In [33]:
predict_set = CustomDataset(df_test, tokenizer, MAX_LEN)

In [34]:
test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }
predict_loader = DataLoader(predict_set, **test_params)

In [35]:
def predict():
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    texts = []
    with torch.no_grad():
        for _, data in enumerate(predict_loader, 0):
            texts.append(data["text"])
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            outputs = model(ids, mask, token_type_ids)[0]
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return texts, fin_outputs

In [36]:
texts, outputs = predict()
predictions = np.array(outputs).argmax(axis=1)



In [37]:
predictions.shape

(3,)

In [38]:
texts

[['holaaaa mundo! hoy me siento super feliz!!!',
  'hace las cosas normal',
  'solo quiero llorar y morir']]

In [39]:
predictions

array([2, 1, 0])

In [29]:
outputs

[[0.2156260758638382, 0.06996522843837738, 0.9915733933448792],
 [0.1439402550458908, 0.9802049994468689, 0.046880047768354416],
 [0.962117612361908, 0.07049450278282166, 0.4309319853782654]]