<a href="https://colab.research.google.com/github/frank-morales2020/Cloud_curious/blob/master/poc.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import random
import math
import json
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset

def complex_function(x, y, z):
    """
    Evaluate a noisy, piecewise function of three numbers.

    For ``z > 0`` the value is ``(sin(x) + cos(y)) * z`` plus uniform noise
    drawn from [-1, 1]. Otherwise it is
    ``(log(|x| + 1) - sqrt(|y| + 1)) * |z|`` plus Gaussian noise with mean 0
    and standard deviation 0.5. Noise comes from the module-level ``random``
    generator, so results are only reproducible after ``random.seed``.
    """
    if z > 0:
        trig_sum = math.sin(x) + math.cos(y)
        return trig_sum * z + random.uniform(-1, 1)

    # Non-positive z: only the magnitudes of the inputs matter here.
    spread = math.log(abs(x) + 1) - math.sqrt(abs(y) + 1)
    return spread * abs(z) + random.gauss(0, 0.5)

def generate_complex_dataset(num_samples=1000, filename="complex_dataset.json"):
    """Write a JSON file of random inputs paired with complex_function outputs.

    Each sample draws x and y uniformly from [-10, 10] and z uniformly from
    [-5, 5]. The samples are stored as a list under the top-level "data" key,
    and ``filename`` is overwritten if it already exists.
    """

    def draw_sample():
        # Draw order (x, y, z, then the function's own noise) is kept fixed so
        # a seeded run yields the same sequence of samples.
        x = random.uniform(-10, 10)
        y = random.uniform(-10, 10)
        z = random.uniform(-5, 5)
        return {"x": x, "y": y, "z": z, "output": complex_function(x, y, z)}

    samples = [draw_sample() for _ in range(num_samples)]

    with open(filename, "w") as out_file:
        json.dump({"data": samples}, out_file, indent=4)
    print(f"{filename} created.")

def load_dataset(filename):
    """Load the list of samples stored under the "data" key of a JSON file.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The value stored under the top-level "data" key, or None (after
        printing an error message) when the file is missing, is not valid
        JSON, or has no top-level "data" entry.
    """
    try:
        with open(filename, 'r') as f:
            payload = json.load(f)
    except FileNotFoundError:
        print(f"Error: {filename} not found.")
        return None
    except json.JSONDecodeError:
        print(f"Error: {filename} is not a valid JSON file.")
        return None

    # Valid JSON can still have the wrong shape (e.g. a bare list, or an
    # object without "data"); report that the same way as the other failures
    # instead of letting a KeyError/TypeError escape.
    if not isinstance(payload, dict) or "data" not in payload:
        print(f"Error: {filename} does not contain a 'data' entry.")
        return None
    return payload["data"]

def remove_outliers(data, threshold=3):
    """Drop samples whose "output" z-score magnitude is at or above a threshold.

    Args:
        data: Sequence of dicts, each with a numeric "output" entry.
        threshold: Samples with ``|z-score| < threshold`` are kept.

    Returns:
        A new list holding the retained samples in their original order.
        Empty input yields an empty list. When every output is identical
        (zero standard deviation) nothing can be an outlier, so all samples
        are returned.
    """
    if len(data) == 0:
        return []

    outputs = np.array([item["output"] for item in data], dtype=float)
    std = outputs.std()
    if std == 0:
        # Dividing by a zero spread would turn every z-score into NaN and
        # silently discard the whole dataset.
        return list(data)

    z_scores = np.abs((outputs - outputs.mean()) / std)
    return [item for item, score in zip(data, z_scores) if score < threshold]

def fine_tune_and_evaluate_regression(dataset_file, use_outlier_removal=True):
    """Fit a linear regression on (x, y, z) -> output and print its test MSE.

    The dataset is loaded from ``dataset_file``, optionally filtered with
    ``remove_outliers``, and split 80/20 with a fixed ``random_state`` of 42.
    Returns the fitted model, or None when the dataset could not be loaded.
    """
    records = load_dataset(dataset_file)
    if records is None:
        return None

    if use_outlier_removal:
        records = remove_outliers(records)

    features = [[rec["x"], rec["y"], rec["z"]] for rec in records]
    targets = [rec["output"] for rec in records]

    split = train_test_split(features, targets, test_size=0.2, random_state=42)
    train_features, test_features, train_targets, test_targets = split

    regressor = LinearRegression()
    regressor.fit(train_features, train_targets)

    # Score on the held-out 20% only.
    predictions = regressor.predict(test_features)
    mse = mean_squared_error(test_targets, predictions)

    print(f"Regression Mean Squared Error: {mse}")
    return regressor

def augment_dataset(dataset_file, num_new_samples=100):
    """Append freshly generated samples to an existing dataset file.

    New samples use the same ranges as the generator: x and y uniform in
    [-10, 10], z uniform in [-5, 5]. The file is rewritten in place with the
    enlarged list under "data". Does nothing if the dataset cannot be loaded.
    """
    samples = load_dataset(dataset_file)
    if samples is None:
        return

    def draw_sample():
        # Same draw order as the original dataset generation: x, y, z, output.
        x = random.uniform(-10, 10)
        y = random.uniform(-10, 10)
        z = random.uniform(-5, 5)
        return {"x": x, "y": y, "z": z, "output": complex_function(x, y, z)}

    for _ in range(num_new_samples):
        samples.append(draw_sample())

    with open(dataset_file, "w") as out_file:
        json.dump({"data": samples}, out_file, indent=4)
    print("Dataset augmented.")

class ComplexDataset(Dataset):
    """Torch dataset that turns numeric samples into text-to-text pairs.

    Each item is a dict with "x", "y", "z" and "output". The input text is a
    fixed description of the function followed by the sample's x/y/z values;
    the target text is ``str(output)``.
    """

    def __init__(self, data, tokenizer, max_length=128):
        """Store the samples, the tokenizer and the padded sequence length.

        Args:
            data: Indexable collection of sample dicts.
            tokenizer: Callable tokenizer returning "input_ids" and
                "attention_mask" tensors; its ``pad_token_id`` (if any) is
                used to mask padding in the labels.
            max_length: Length both input and target are padded/truncated to.
        """
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` into input_ids, attention_mask and labels."""
        item = self.data[idx]
        # NOTE(review): the x/y/z values sit at the very end of a long prompt;
        # confirm the tokenized prompt fits in max_length so truncation does
        # not cut them off.
        input_text = f"complex_function(x, y, z): if z > 0: return (sin(x) + cos(y)) * z + random.uniform(-1, 1); else: return (log(abs(x) + 1) - sqrt(abs(y) + 1)) * abs(z) + random.gauss(0, 0.5) x={item['x']} y={item['y']} z={item['z']}"
        output_text = str(item['output'])

        input_encoding = self.tokenizer(input_text, max_length=self.max_length, padding='max_length', truncation=True, return_tensors='pt')
        output_encoding = self.tokenizer(output_text, max_length=self.max_length, padding='max_length', truncation=True, return_tensors='pt')

        labels = output_encoding['input_ids'].flatten()
        # Targets are padded to max_length; replace the padding ids with -100
        # so the loss ignores those positions instead of training the model
        # to emit pad tokens.
        pad_token_id = getattr(self.tokenizer, 'pad_token_id', None)
        if pad_token_id is not None:
            labels = labels.masked_fill(labels == pad_token_id, -100)

        return {
            'input_ids': input_encoding['input_ids'].flatten(),
            'attention_mask': input_encoding['attention_mask'].flatten(),
            'labels': labels
        }

def fine_tune_and_evaluate_t5(dataset_file):
    """Fine-tune t5-small on the dataset and print one sample prediction.

    Loads the samples from ``dataset_file``, wraps them in ``ComplexDataset``,
    trains with the Hugging Face ``Trainer`` and then generates an output for
    a single hard-coded prompt (x=1.0, y=-2.0, z=3.0).

    Args:
        dataset_file: Path of the JSON dataset to train on.

    Returns:
        None. Returns early, without training, if the dataset cannot be loaded.
    """
    data = load_dataset(dataset_file)
    if data is None:
        return

    tokenizer = T5Tokenizer.from_pretrained('t5-small')
    dataset = ComplexDataset(data, tokenizer)

    model = T5ForConditionalGeneration.from_pretrained('t5-small')

    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=1,
        per_device_train_batch_size=8,
        logging_dir='./logs',
        logging_steps=20,
        report_to="none",
        # A positive max_steps takes precedence over num_train_epochs, so
        # training stops after 100 optimizer steps.
        max_steps=100,
    )

    print("Starting training...")
    print(f"Dataset size: {len(dataset)}")

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
    )

    trainer.train()

    # Inference example
    input_text = "complex_function(x, y, z): if z > 0: return (sin(x) + cos(y)) * z + random.uniform(-1, 1); else: return (log(abs(x) + 1) - sqrt(abs(y) + 1)) * abs(z) + random.gauss(0, 0.5) x=1.0 y=-2.0 z=3.0"
    # The Trainer may have moved the model to an accelerator; the prompt
    # tensors must live on the same device or generate() fails.
    input_encoding = tokenizer(input_text, return_tensors='pt').to(model.device)

    model.eval()
    with torch.no_grad():
        output = model.generate(input_encoding.input_ids, attention_mask=input_encoding.attention_mask)

    output_text = tokenizer.decode(output[0], skip_special_tokens=True)
    print(f"T5 Predicted Output: {output_text}")

# Main execution: build the dataset, grow it, then run both models on it.
DATASET_FILE = "complex_dataset.json"

generate_complex_dataset(filename=DATASET_FILE)
augment_dataset(DATASET_FILE, num_new_samples=500)
fine_tune_and_evaluate_regression(DATASET_FILE)
fine_tune_and_evaluate_t5(DATASET_FILE)

complex_dataset.json created.
Dataset augmented.
Regression Mean Squared Error: 6.564745936770839
Starting training...
Dataset size: 1500


Step,Training Loss
