#Modelo de reconocimiento de emociones usando tokenizacion y redes neuronales LSTM

#carga de datos:

In [8]:
#importamos
import json
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset,TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer

#como es jsonl necesitamos sacar cada objeto y ponerlo en un array
data = []
with open('data.json', 'r') as file:
    for line in file:
        data.append(json.loads(line))

# pasamos el array a dataframe
df = pd.DataFrame(data)



In [None]:
df

Unnamed: 0,text,label
0,i feel awful about it too because it s my job ...,0
1,im alone i feel awful,0
2,ive probably mentioned this before but i reall...,1
3,i was feeling a little low few days back,0
4,i beleive that i am much more sensitive to oth...,2
...,...,...
416804,that was what i felt when i was finally accept...,1
416805,i take every day as it comes i m just focussin...,4
416806,i just suddenly feel that everything was fake,0
416807,im feeling more eager than ever to claw back w...,1


#Procesamiento y tokenizacion

In [9]:
# separamos el texto y las etiquetas
texts = [item['text'] for item in data]
labels = [item['label'] for item in data]

# cargamos el tokenizador:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
#definimos el largo maximo
max_length = 128

# tokenizamos usando BertTokenizer
encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_length, return_tensors='pt')
# representacion numerica del token
input_ids = encodings['input_ids']
# que token les prestamos atencion y que tokens no
attention_masks = encodings['attention_mask']

# convertimos las labels en tensores
label_encoder = LabelEncoder()
labels = torch.tensor(label_encoder.fit_transform(labels))

# Dividimos los set de datos
train_inputs, test_inputs, train_labels, test_labels = train_test_split(input_ids, labels, test_size=0.2, random_state=42)
train_masks, test_masks = train_test_split(attention_masks, test_size=0.2, random_state=42)

#numero de batches
batch_size = 32

# cargamos los datos
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
labels

tensor([0, 0, 1,  ..., 0, 1, 0])

#Definimos la arquitectura

In [11]:
import torch.nn as nn
import torch.nn.functional as F


class LSTM(nn.Module):
    def __init__(self, n_classes):
        #inicializamos la nn
        super(LSTM, self).__init__()
        # Creamos una capa de enbedding con vectores dimension 100
        self.embedding = nn.Embedding(len(tokenizer.vocab), 100)
        #configuraciones basicas (input size, hidden size, layers)
        self.lstm = nn.LSTM(100, 128, num_layers=2, batch_first=True, dropout=0.2)
        #creamos una capa densamente conectada
        self.fc = nn.Linear(128, n_classes)

    def forward(self, input_ids, attention_mask):
      #backforward (sacado de internet)
        x = self.embedding(input_ids)
        x, (hn, cn) = self.lstm(x)
        x = self.fc(x[:, -1, :])
        return x

model = LSTM(len(label_encoder.classes_))

#NO EJECUTAR, ARQUITECTURA USADA COMO MODELO

In [None]:
class MyModel(nn.Module):
	def __init__(self, ...):
	...
	self.lstm = nn.LSTM(embedding_length, hidden_size)
	self.label = nn.Linear(hidden_size, output_size)


	def forward(self):


	h_0 = Variable(torch.zeros(1, batch_size, self.hidden_size).cuda())
	c_0 = Variable(torch.zeros(1, batch_size, self.hidden_size).cuda())


	output, (final_hidden_state, final_cell_state) = self.lstm(input, (h_0, c_0))


	return self.label(final_hidden_state[-1])


#Entrenamiento

In [24]:
import torch.optim as optim

# usamos adam y crossentropy
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

# loop de entrenamiento
def train_model(model, data_loader, criterion, optimizer, device):
    model = model.train()
    total_loss = 0
    for batch in data_loader:
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(data_loader)

# loop de evaluacion
def eval_model(model, data_loader, criterion, device):
    model = model.eval()
    total_loss = 0
    correct_predictions = 0

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            labels = batch[2].to(device)

            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            total_loss += loss.item()

            _, preds = torch.max(outputs, dim=1)
            correct_predictions += torch.sum(preds == labels)

    return total_loss / len(data_loader), correct_predictions.double() / len(data_loader.dataset)

# entrenamos y mandamos al dispositivo
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

epochs = 10
for epoch in range(epochs):
    train_loss = train_model(model, train_loader, criterion, optimizer, device)
    val_loss, val_acc = eval_model(model, test_loader, criterion, device)

    print(f'Epoch {epoch + 1}/{epochs}')
    print(f'Train loss: {train_loss:.4f}, Val loss: {val_loss:.4f}, Val accuracy: {val_acc:.4f}')


Epoch 1/10
Train loss: 0.8616, Val loss: 0.1819, Val accuracy: 0.9256
Epoch 2/10
Train loss: 0.1291, Val loss: 0.1088, Val accuracy: 0.9365
Epoch 3/10
Train loss: 0.1028, Val loss: 0.1003, Val accuracy: 0.9375
Epoch 4/10
Train loss: 0.0960, Val loss: 0.0987, Val accuracy: 0.9384
Epoch 5/10
Train loss: 0.0927, Val loss: 0.0987, Val accuracy: 0.9391
Epoch 6/10
Train loss: 0.0900, Val loss: 0.0957, Val accuracy: 0.9389
Epoch 7/10
Train loss: 0.0881, Val loss: 0.0940, Val accuracy: 0.9401
Epoch 8/10
Train loss: 0.0858, Val loss: 0.0966, Val accuracy: 0.9398
Epoch 9/10
Train loss: 0.0845, Val loss: 0.0943, Val accuracy: 0.9396
Epoch 10/10
Train loss: 0.0834, Val loss: 0.0962, Val accuracy: 0.9388


#Testeo manual

In [31]:
# Funcion que hace encoding con la frase de entrada y predice en base a lo aprendido
def predict_emotion(model, tokenizer, text, max_length, device):
    model = model.eval()
    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask)
        _, preds = torch.max(outputs, dim=1)

    return label_encoder.inverse_transform(preds.cpu().numpy())

# Predict on new data
test_text = "He is not dead, he is living in our hearts. "
prediction = predict_emotion(model, tokenizer, test_text, max_length, device)
print(test_text)
if prediction[0] == 0:
  emotion = 'sadness'
elif prediction[0] == 1:
  emotion = 'joy'
elif prediction[0] == 2:
  emotion = 'love'
elif prediction[0] == 3:
  emotion = 'anger'
elif prediction[0] == 4:
  emotion = 'scare'
elif prediction[0] == 5:
  emotion = 'surprise'

print(f'Predicted emotion: {emotion}')

He is not dead, he is living in our hearts. 
Predicted emotion: sadness


In [14]:
# guardar el modelo
model_save_path = 'model.pth'
torch.save(model.state_dict(), model_save_path)

In [21]:
path = 'model.pth'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = LSTM(len(label_encoder.classes_))
model.load_state_dict(torch.load(path))
model = model.to(device)