# Install

In [None]:
!pip install transformers
!pip install Datasets
!pip install torch
!pip install datasets evaluate




# Load Data

In [None]:
import pandas as pd
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset


In [None]:
import pandas as pd

# Cargar el nuevo dataset proporcionado
file_path = "amazon_reviews.csv"
df = pd.read_csv(file_path)

# Mostrar las primeras filas para entender la estructura del nuevo dataset
df.head(), df.columns


(                                              review  label  \
 0  Really disappointed with this Samsung Galaxy S...      0   
 1  After a week of using this Sony WH-1000XM4, I ...      1   
 2  Had this Dell XPS 13 for a month and already h...      0   
 3  Do not waste your money on this AirPods Pro. C...      0   
 4  Just received my Levi's 501 and I'm absolutely...      1   
 
                                             reversed  new_label  
 0  After 2 weeks of using this Dyson V11, I can c...          1  
 1  Really disappointed with this Nike Air Max. Po...          0  
 2  Finally found the perfect Levi's 501! Amazing ...          1  
 3  Just received my Ray-Ban Wayfarer and I'm abso...          1  
 4  Had this Levi's 501 for a month and already ha...          0  ,
 Index(['review', 'label', 'reversed', 'new_label'], dtype='object'))

# Prepare data

In [None]:
from sklearn.model_selection import train_test_split

# Prepare the data for fine-tuning T5
# Format: Input (original review) -> Output (reversed review)
data = df[['review', 'reversed']].drop_duplicates()

# Split the dataset into training and testing sets (80% train, 20% test)
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)



train_data, test_data



(                                                 review  \
 1730  After 2 weeks of using this Project Hail Mary,...   
 3277  After 2 weeks of using this Timberland boots, ...   
 4912  Really disappointed with this Timberland boots...   
 2419  Really disappointed with this Ninja Air Fryer....   
 1173  Had this Dyson V11 for a month and already hav...   
 ...                                                 ...   
 3782  Had this Timberland boots for a month and alre...   
 5211  Really disappointed with this Ninja Air Fryer....   
 5246  Finally found the perfect Nike Air Max! Amazin...   
 5410  After a week of using this Ray-Ban Wayfarer, I...   
 860   Really disappointed with this Nike Air Max. No...   
 
                                                reversed  
 1730  Really disappointed with this Sony WH-1000XM4....  
 3277  Do not waste your money on this Ray-Ban Wayfar...  
 4912  After 2 weeks of using this AirPods Pro, I can...  
 2419  After a month of using this Levi's 

In [None]:
train_dataset = Dataset.from_pandas(train_data)

test_dataset = Dataset.from_pandas(test_data)


# Modeling

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
import torch

# Load the T5 tokenizer and model
tokenizer = T5Tokenizer.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base")

# Tokenize the data for training
def preprocess_data(data, tokenizer, max_length=512):
    inputs = ["invert sentiment: " + text for text in data["review"]]
    targets = data["reversed"].tolist()

    model_inputs = tokenizer(inputs, max_length=max_length, truncation=True, padding="max_length", return_tensors="pt")
    labels = tokenizer(targets, max_length=max_length, truncation=True, padding="max_length", return_tensors="pt").input_ids

    # Replace padding token id's of the labels by -100 to ignore them during loss calculation
    labels[labels == tokenizer.pad_token_id] = -100
    model_inputs["labels"] = labels
    return model_inputs

# Preprocess the train and test data
train_dataset = preprocess_data(train_data, tokenizer)
test_dataset = preprocess_data(test_data, tokenizer)

# Prepare the PyTorch Dataset class
class SentimentDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        return {key: val[idx] for key, val in self.encodings.items()}

train_dataset = SentimentDataset(train_dataset)
test_dataset = SentimentDataset(test_dataset)

# Define the training arguments
training_args = TrainingArguments(
    output_dir="./results",           # output directory
    evaluation_strategy="epoch",     # evaluate each epoch
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=1,
    logging_dir="./logs",            # directory for storing logs
    logging_steps=50,
    save_steps=500,
    save_strategy="epoch",
    fp16=torch.cuda.is_available(),  # Enable mixed precision if using GPU
)

# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
)

# Start training the model
trainer.train()


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.4591,0.337642
2,0.4408,0.336891
3,0.4408,0.336874


TrainOutput(global_step=1794, training_loss=0.5567882510199063, metrics={'train_runtime': 409.6742, 'train_samples_per_second': 35.011, 'train_steps_per_second': 4.379, 'total_flos': 8734283024302080.0, 'train_loss': 0.5567882510199063, 'epoch': 3.0})

# Perplexity

In [None]:
import math
from torch.utils.data import DataLoader

# Función para calcular la pérdida promedio en un conjunto de datos
def calculate_avg_loss(model, dataset, batch_size=8, max_length=512):
    model.eval()  # Poner el modelo en modo evaluación
    dataloader = DataLoader(dataset, batch_size=batch_size)
    total_loss = 0
    total_batches = 0

    with torch.no_grad():
        for batch in dataloader:
            # Enviar los datos al dispositivo adecuado (GPU si está disponible)
            batch = {key: val.to(model.device) for key, val in batch.items()}

            # Calcular la pérdida
            outputs = model(**batch)
            loss = outputs.loss
            total_loss += loss.item()
            total_batches += 1

    # Calcular la pérdida promedio
    avg_loss = total_loss / total_batches
    return avg_loss

# Función para calcular perplejidad
def calculate_perplexity(avg_loss):
    return math.exp(avg_loss)

# Calcular la pérdida promedio en el conjunto de prueba
avg_loss = calculate_avg_loss(model, test_dataset)

# Calcular la perplejidad
perplexity = calculate_perplexity(avg_loss)

print(f"Pérdida promedio en el conjunto de prueba: {avg_loss}")
print(f"Perplejidad: {perplexity}")


Pérdida promedio en el conjunto de prueba: 0.3370023409525553
Perplejidad: 1.400742342806064


In [None]:
def generate_text(model, tokenizer, text, max_length=512):
    model.eval()
    input_text = "invert sentiment: " + text
    input_ids = tokenizer(input_text, return_tensors="pt", max_length=max_length, truncation=True).input_ids

    # Enviar a GPU si está disponible
    if torch.cuda.is_available():
        input_ids = input_ids.to(model.device)

    # Generar texto
    outputs = model.generate(input_ids, max_length=max_length, num_beams=5, early_stopping=True)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Ejemplo de uso
example_text = "This phone is amazing! The battery lasts all day and the camera is outstanding."
generated_text = generate_text(model, tokenizer, example_text)
print("Original:", example_text)
print("Generated:", generated_text)


Original: This phone is amazing! The battery lasts all day and the camera is outstanding.
Generated: Really disappointed with this Sony WH-1000XM4. Poor quality. Customer service was unhelpful.


In [None]:
from random import sample

def generate_examples_from_test_data(model, tokenizer, dataset, num_examples=5, max_length=512):
    model.eval()
    examples = []

    # Seleccionar aleatoriamente índices del dataframe
    random_indices = sample(range(len(dataset)), num_examples)

    for idx in random_indices:
        # Obtener la reseña original y su equivalente invertida
        original_review = dataset.iloc[idx]["review"]
        original_inverted = dataset.iloc[idx]["reversed"]

        # Generar la reseña invertida usando el modelo
        input_text = "invert sentiment: " + original_review
        input_ids = tokenizer(input_text, return_tensors="pt", max_length=max_length, truncation=True).input_ids

        # Enviar a GPU si está disponible
        if torch.cuda.is_available():
            input_ids = input_ids.to(model.device)

        # Generar texto
        outputs = model.generate(input_ids, max_length=max_length, num_beams=5, early_stopping=True)
        generated_inverted = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Almacenar los resultados
        examples.append({
            "Original Review": original_review,
            "Original Inverted": original_inverted,
            "Generated Inverted": generated_inverted
        })

    return examples

num_examples = 5
generated_examples = generate_examples_from_test_data(model, tokenizer, test_data, num_examples=num_examples)


for i, example in enumerate(generated_examples):
    print(f"Example {i+1}:")
    print(f"Original Review: {example['Original Review']}")
    print(f"Original Inverted: {example['Original Inverted']}")
    print(f"Generated Inverted: {example['Generated Inverted']}")
    print("-" * 50)


Example 1:
Original Review: Just received my iRobot Roomba and I'm absolutely loving it! Everything works perfectly. Shipping was fast too!
Original Inverted: Do not waste your money on this Dell XPS 13. Cheaply made. Definitely not as described.
Generated Inverted: Do not waste your money on this Nike Air Max. Overpriced for what you get. Return process is a nightmare.
--------------------------------------------------
Example 2:
Original Review: After 2 weeks of using this Atomic Habits, I can confidently say it's worth every penny. Performance is stellar. Exactly what I needed.
Original Inverted: Do not waste your money on this iPad Pro. Overpriced for what you get. Return process is a nightmare.
Generated Inverted: Do not waste your money on this Atomic Habits. Overpriced for what you get. Return process is a nightmare.
--------------------------------------------------
Example 3:
Original Review: After a week of using this Ray-Ban Wayfarer, I can confidently say it's worth every p

In [None]:

model.save_pretrained("./api_model")
tokenizer.save_pretrained("./api_model")



('./api_model/tokenizer_config.json',
 './api_model/special_tokens_map.json',
 './api_model/spiece.model',
 './api_model/added_tokens.json')

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration


model = T5ForConditionalGeneration.from_pretrained("./api_model")
tokenizer = T5Tokenizer.from_pretrained("./api_model")


def test_model(review):
    input_text = f"invert sentiment: {review}"
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding="max_length", max_length=256)
    outputs = model.generate(inputs.input_ids)
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text


original_review = "This product is amazing! The battery lasts all day."
generated_review = test_model(original_review)

print("Original Review:", original_review)
print("Generated Review:", generated_review)




Original Review: This product is amazing! The battery lasts all day.
Generated Review: Do not waste your money on this Atomic Habits. Overpriced for what you


In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Cargar el modelo y el tokenizador desde la carpeta api_model
model = T5ForConditionalGeneration.from_pretrained("./api_model")
tokenizer = T5Tokenizer.from_pretrained("./api_model")

# Función para determinar la polaridad usando palabras clave
def determine_polarity(text):
    positive_keywords = ["amazing", "outstanding", "great", "excellent", "love", "fantastic", "good", "positive", "happy", "enjoy"]
    negative_keywords = ["disappointed", "poor", "bad", "terrible", "hate", "awful", "negative", "unhappy", "sad", "angry"]

    # Contar palabras clave positivas y negativas en el texto
    positive_count = sum(word in text.lower() for word in positive_keywords)
    negative_count = sum(word in text.lower() for word in negative_keywords)

    # Clasificar según la cantidad de palabras clave encontradas
    if positive_count > negative_count:
        return "positive"
    elif negative_count > positive_count:
        return "negative"
    else:
        return "neutral"

# Función para invertir la polaridad
def invert_polarity(original_polarity):
    if original_polarity == "positive":
        return "negative"
    elif original_polarity == "negative":
        return "positive"
    else:
        return "neutral"

# Función para probar el modelo y clasificar polaridades
def generate_text_with_polarity(review):
    input_text = f"invert sentiment: {review}"
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding="max_length", max_length=256)
    outputs = model.generate(inputs.input_ids)
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Determinar polaridades
    original_polarity = determine_polarity(review)
    generated_polarity = invert_polarity(original_polarity)

    return {
        "original_text": review,
        "original_polarity": original_polarity,
        "generated_text": generated_text,
        "generated_polarity": generated_polarity
    }

# Probar con ejemplos
review = "This product is terrible! The battery lasts all day."
result = generate_text_with_polarity(review)

# Mostrar los resultados
print("Resultados de la prueba:")
print(result)


Resultados de la prueba:
{'original_text': 'This product is terrible! The battery lasts all day.', 'original_polarity': 'negative', 'generated_text': "After a month of using this iPad Pro, I can confidently say it's worth", 'generated_polarity': 'positive'}


# API


In [None]:
! pip install fastapi uvicorn transformers





In [120]:
from pyngrok import ngrok
import nest_asyncio
import uvicorn


nest_asyncio.apply()


public_url = ngrok.connect(8000)
print(f"Servidor público disponible en: {public_url}")


uvicorn.run("main:app", host="0.0.0.0", port=8000)



Servidor público disponible en: NgrokTunnel: "https://a72a-35-240-141-232.ngrok-free.app" -> "http://localhost:8000"


INFO:     Started server process [1142]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)


INFO:     2806:268:4403:1eb:c85:379f:7171:5000:0 - "GET / HTTP/1.1" 404 Not Found
INFO:     2806:268:4403:1eb:c85:379f:7171:5000:0 - "GET / HTTP/1.1" 404 Not Found
INFO:     2806:268:4403:1eb:c85:379f:7171:5000:0 - "GET /docs HTTP/1.1" 200 OK
INFO:     2806:268:4403:1eb:c85:379f:7171:5000:0 - "GET /openapi.json HTTP/1.1" 200 OK




INFO:     2806:268:4403:1eb:c85:379f:7171:5000:0 - "POST /generate HTTP/1.1" 200 OK


INFO:     Shutting down
INFO:     Waiting for application shutdown.
INFO:     Application shutdown complete.
INFO:     Finished server process [1142]
