In [1]:
!pip install torch transformers

Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m59.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.2-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.5/268.5 kB[0m [31m32.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m105.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m88.0 MB/s[0m eta [36m0:00:

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [4]:
import csv
import unicodedata
import re

def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s


preguntas = []
respuestas = []

with open('/content/drive/MyDrive/SIS421/EXAMENFINAL/dialogs_expanded.csv', 'r') as file:
    reader = csv.reader(file)
    next(reader)  # Ignorar la primera fila (encabezados)
    for row in reader:
        pregunta = normalizeString(row[1])
        respuesta = normalizeString(row[2])
        preguntas.append(pregunta)
        respuestas.append(respuesta)

data = [{'question': q, 'answer': a} for q, a in zip(preguntas, respuestas)]

# Imprimir el primer diálogo
print(data[2])
print(len(data))
dialogues = data[:8000]
print(len(dialogues))

{'question': 'you re asking me out . that s so cute . what s your name again ?', 'answer': 'forget it .'}
139409
8000


In [5]:
max_words = 0

for pair in dialogues:
    input_words = pair['question'].split()  # Dividir la entrada en palabras individuales
    output_words = pair['answer'].split()  # Dividir la salida en palabras individuales
    num_words = max(len(input_words), len(output_words))

    if num_words > max_words:
        max_words = num_words

print("Máximo número de palabras:", max_words)


Máximo número de palabras: 32


In [6]:
import torch
from transformers import BlenderbotSmallTokenizer
tokenizer = BlenderbotSmallTokenizer.from_pretrained("facebook/blenderbot-90M")

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/964k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/345k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.45k [00:00<?, ?B/s]

In [7]:
import torch
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, train_data, tokenizer, max_length):
        self.train_data = train_data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.train_data)

    def __getitem__(self, idx):
        question = self.train_data[idx]['question']
        answer = self.train_data[idx]['answer']

        # Codificación de la pregunta
        question_encoding = self.tokenizer.encode_plus(
            question,
            add_special_tokens=True,
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_tensors='pt'
        )

        input_ids = question_encoding['input_ids'].squeeze()
        attention_mask = question_encoding['attention_mask'].squeeze()

        # Codificación de la respuesta
        answer_encoding = self.tokenizer.encode_plus(
            answer,
            add_special_tokens=True,
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_tensors='pt'
        )

        target_ids = answer_encoding['input_ids'].squeeze()
        target_ids_mak = answer_encoding['attention_mask'].squeeze()

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'target_ids': target_ids,
            'attention_target': target_ids_mak
        }


In [8]:
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split

train_size = len(dialogues) * 80 // 100
train_pairs = dialogues[:train_size]
test_pairs = dialogues[train_size:]

# Crear una instancia del conjunto de datos personalizado
max_length = 32
train_dataset = CustomDataset(train_pairs, tokenizer, max_length)
test_dataset = CustomDataset(test_pairs, tokenizer, max_length)

# Crear un DataLoader
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)


In [9]:
batch = next(iter(train_dataloader))
input_ids = batch['input_ids']
attention_inputs = batch['attention_mask']
target_ids = batch['target_ids']
attention_target = batch['attention_target']

print(len(train_dataloader))
print(input_ids[2])
print(attention_inputs[2])
print(target_ids.shape,target_ids[2])


200
tensor([  15,   68, 1722,   41,   63,    5,   22,   17,   64, 4133,    5,   44,
          17,   66,  193,  164,   20,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0])
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])
torch.Size([32, 32]) tensor([1322,   19,    5,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0])


In [10]:
# Obtén los datos relevantes del dataloader
import torch
train_data = {
    "pregunta_input_ids": input_ids,
    "attention_input": attention_inputs,
    "respuesta_ids": target_ids,
    "attention_respuesta": attention_target
}

# Guarda los datos en un archivo separado
torch.save(train_data, "train_data.pth")


In [11]:
# Cargar los datos desde el archivo
train_data = torch.load("train_data.pth")

# Acceder a los datos
pregunta_input_ids = train_data["pregunta_input_ids"]
attention_inputs = train_data["attention_input"]
respuesta_input_ids = train_data["respuesta_ids"]
attention_respuesta = train_data["attention_respuesta"]
print(attention_respuesta.shape)

torch.Size([32, 32])


In [12]:
# Imprimir las preguntas y respuestas del primer ejemplo en el lote
print("Pregunta:")
print(tokenizer.decode(pregunta_input_ids[0], skip_special_tokens=True))

print("Respuesta:")
print(tokenizer.decode(respuesta_input_ids[0], skip_special_tokens=True))

Pregunta:
well i thought we d start with pronunciation if that s okay with you.
Respuesta:
not the hacking and gagging and spitting part. please.


In [13]:
import torch
from transformers import BlenderbotSmallForConditionalGeneration

model= BlenderbotSmallForConditionalGeneration.from_pretrained("facebook/blenderbot-90M")
model

Downloading pytorch_model.bin:   0%|          | 0.00/350M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/311 [00:00<?, ?B/s]

BlenderbotSmallForConditionalGeneration(
  (model): BlenderbotSmallModel(
    (shared): Embedding(54944, 512, padding_idx=0)
    (encoder): BlenderbotSmallEncoder(
      (embed_tokens): Embedding(54944, 512, padding_idx=0)
      (embed_positions): BlenderbotSmallLearnedPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0-7): 8 x BlenderbotSmallEncoderLayer(
          (self_attn): BlenderbotSmallAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512

In [14]:
import torch
import torch.nn as nn
class EncoderDecoder(torch.nn.Module):
    def __init__(self):
        super(EncoderDecoder, self).__init__()
        self.blenderbot_model = BlenderbotSmallForConditionalGeneration.from_pretrained("facebook/blenderbot-90M")
        # Congelar todos los parámetros del modelo preentrenado
        for param in self.blenderbot_model.parameters():
            param.requires_grad = False
        # encoder del modelo
        self.encoder = self.blenderbot_model.model.encoder
        #decoder delo modelo
        self.decoder = self.blenderbot_model.model.decoder
        self.generate = self.blenderbot_model.model.decoder.layernorm_embedding

        self.lm_head = self.blenderbot_model.lm_head

        # Descongelar los parámetros de la última capa del encoder para fine-tuning
        for layer in self.encoder.layers[-1:]:
            for param in layer.parameters():
                param.requires_grad = True

        # Descongelar los parámetros de la última capa del decoder para fine-tuning
        for layer in self.decoder.layers[:-2]:
            for param in layer.parameters():
                param.requires_grad = False

        # Descongelar los parámetros de la última capa (lm_head) para fine-tuning
        #for param in self.lm_head.parameters():
         #   param.requires_grad = True


    def forward(self, input_ids, attention_mask, decoder_input_ids, decoder_attention_mask):
        with torch.no_grad():

            encoder_outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
            #print(encoder_outputs.shape)
            decoder_outputs = self.decoder(
                    input_ids=decoder_input_ids,
                    attention_mask=decoder_attention_mask,
                    encoder_hidden_states=encoder_outputs
                ).last_hidden_state


            lm_output = self.lm_head(decoder_outputs)

        return  lm_output


In [15]:
modelo = EncoderDecoder()
decoder_outputs = modelo(input_ids,attention_inputs,target_ids,attention_target)
print(decoder_outputs.shape,decoder_outputs[2])

torch.Size([32, 32, 54944]) tensor([[ -4.0233,  -9.3195,   1.8882,  ...,  -4.0219,  -4.0286,  -4.0173],
        [ -5.9650,  -7.3311,   5.6113,  ...,  -5.9656,  -5.9678,  -5.9638],
        [ -8.0063, -12.6604,   4.8649,  ...,  -8.0042,  -8.0076,  -8.0039],
        ...,
        [ -5.3321,  -9.7528,   0.4743,  ...,  -5.3303,  -5.3352,  -5.3275],
        [ -5.3321,  -9.7529,   0.4743,  ...,  -5.3304,  -5.3353,  -5.3276],
        [ -5.3321,  -9.7529,   0.4743,  ...,  -5.3304,  -5.3353,  -5.3276]])


In [16]:
from tqdm import tqdm
import numpy as np
import torch.nn.functional as F

def fit(model, dataloader,dataloader2 ,epochs=10):
    encoder_optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
    criterio=torch.nn.CrossEntropyLoss()

    for epoch in range(1, epochs+1):
        model.train()
        train_loss = []
        bar = tqdm(dataloader)
        for batch in bar:
            input_ids = batch['input_ids']
            attention_inputs = batch['attention_mask']
            target_ids = batch['target_ids']
            attention_target = batch['attention_target']

            encoder_optimizer.zero_grad()
            decoder_outputs = model(input_ids, attention_inputs, target_ids, attention_target)
            # Convertir el target en un tensor de probabilidades
            target_probs = F.one_hot(target_ids, num_classes=54944).float()  # ahora tiene forma (62, 32, 54944)
            target_ids=target_probs.requires_grad_()

            # Calcular la pérdida de entropía cruzada
            loss = F.binary_cross_entropy_with_logits(decoder_outputs, target_ids)

            loss.backward()
            encoder_optimizer.step()

            train_loss.append(loss.item())

        avg_loss = np.mean(train_loss)
        # Evaluar el modelo en el conjunto de datos de prueba
        model.eval()
        test_loss = []
        with torch.no_grad():
            for batch in test_dataloader:
                input_ids = batch['input_ids']
                attention_inputs = batch['attention_mask']
                target_ids = batch['target_ids']
                attention_target = batch['attention_target']

                decoder_outputs = model(input_ids, attention_inputs, target_ids, attention_target)
                target_probs = F.one_hot(target_ids, num_classes=54944).float()
                target_ids=target_probs.requires_grad_()
                loss = F.binary_cross_entropy_with_logits(decoder_outputs, target_ids)

                test_loss.append(loss.item())

        # Calcular la pérdida promedio de la evaluación
        avg_test_loss = np.mean(test_loss)

        # Imprimir los resultados de la época actual
        print('Epoch:', epoch, 'Train Loss:', avg_loss, 'Test Loss:', avg_test_loss)


In [17]:
fit(modelo,train_dataloader,test_dataloader,epochs=5)

100%|██████████| 200/200 [16:42<00:00,  5.01s/it]


Epoch: 1 Train Loss: 0.6193762646615505 Test Loss: 0.6637074136734009


100%|██████████| 200/200 [16:37<00:00,  4.99s/it]


Epoch: 2 Train Loss: 0.6216085463762283 Test Loss: 0.6637074136734009


100%|██████████| 200/200 [16:43<00:00,  5.02s/it]


Epoch: 3 Train Loss: 0.623759168535471 Test Loss: 0.6637074136734009


100%|██████████| 200/200 [16:37<00:00,  4.99s/it]


Epoch: 4 Train Loss: 0.6129862958192825 Test Loss: 0.6637074136734009


100%|██████████| 200/200 [16:40<00:00,  5.00s/it]


Epoch: 5 Train Loss: 0.6281099133193493 Test Loss: 0.6637074136734009


In [18]:
import torch
import torch.nn as nn
checkpoint_path = "/content/drive/MyDrive/SIS421/EXAMENFINAL/Checkpoint/modelo_entrenado1.pt"

# Guardar el modelo entrenado
torch.save(modelo.state_dict(), checkpoint_path)

In [19]:
import torch
import torch.nn as nn
# Cargar el modelo entrenado en el futuro
loaded_model = EncoderDecoder()  # Reemplaza "MyModel" con la clase de tu modelo
loaded_model.load_state_dict(torch.load(checkpoint_path))

<All keys matched successfully>

In [20]:
def predict(model, dataloader, tokenizer,input_ids,attention):
    model.eval()
    with torch.no_grad():
            #batch = next(iter(dataloader))
            #input_ids = batch['input_ids']
            #pregunta = input_ids[0]
            #attention_mask = batch['attention_mask']
            #mask =attention_mask[0]

            # Ajustar la forma de los input_ids y attention_mask para el modelo
            #input_ids = pregunta.unsqueeze(0)
            #attention_mask = mask.unsqueeze(0)

            # Generar la entrada para el decodificador
            decoder_input_ids = torch.ones_like(input_ids)
            decoder_attention_mask = attention

            # Pasar los datos al modelo
            with torch.no_grad():
              decoder_outputs = model(input_ids, attention, decoder_input_ids, decoder_attention_mask)
              # Aplicar muestreo multinomial a lm_logits
            temperature = 0.8  # Ajusta este valor según tus preferencias

            # Aplicar muestreo estocástico con temperatura
            softmax_logits = torch.softmax(decoder_outputs / temperature, dim=-1)
            lm_probs_2d = softmax_logits.view(-1, softmax_logits.size(-1))
            sampled_ids = torch.multinomial(lm_probs_2d, num_samples=10)

            # Decodificar las predicciones a texto utilizando el tokenizador
            decoded_outputs = tokenizer.batch_decode(sampled_ids, skip_special_tokens=True)

    return decoded_outputs



In [21]:
    # Tokenizar la pregunta y generar la máscara de atención
    question="well i thought we d start with pronunciation if that s okay with you."
    encoded_inputs = tokenizer(question, return_attention_mask=True,return_tensors="pt")
    input_ids = encoded_inputs["input_ids"]
    attention = encoded_inputs["attention_mask"]

In [22]:
salida = predict(loaded_model,test_dataloader,tokenizer,input_ids,attention)
print(salida[0])

i yeah that you no well correction nope yes how


In [23]:
# Guardar el punto de control al final de cada época
#       torch.save({
#            'epoch': epoch,
#            'model_state_dict': model.state_dict(),
 #           'optimizer_state_dict': encoder_optimizer.state_dict(),
  #          'loss': avg_loss,
   #     }, checkpoint_path)

In [24]:
#checkpoint = torch.load("modelo_checkpoint.pt")
#model.load_state_dict(checkpoint['model_state_dict'])
#encoder_optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
#start_epoch = checkpoint['epoch'] + 1
