# Librerías

In [20]:
import numpy
import pandas as pd
import openai
from openai import OpenAI
import csv
import json

# Formateo de csv a jsonl y guardado de los datos

## Formateo de los datos a forma correcta a partir de csv

In [5]:
def csv_to_txt(csv_file, txt_file):
    with open(csv_file, 'r', newline='', encoding='utf-8') as csvfile:
        csvreader = csv.DictReader(csvfile)
        
        with open(txt_file, 'w', encoding='utf-8') as txtfile:
            for row in csvreader:
                # Write prompt and completion to the text file in the desired format
                txtfile.write(f"user:: {row['prompt']}\n")
                txtfile.write(f"assistant:: {row['completion']}\n")
                txtfile.write("-\n")

# Example usage:
csv_file_path = 'Modelos/archivos_jsonl/prompts_example.csv'
txt_file_path = 'Modelos/archivos_jsonl/prompts_example.txt'
csv_to_txt(csv_file_path, txt_file_path)


In [6]:
with open('Modelos/archivos_jsonl/prompts_example.txt') as f:
    text = [line for line in f]

In [7]:
text[:10]

['user:: Generate a scene in Street with the characters: Homer Simpson, Voice.\n',
 "assistant:: (Street: ext. street - establishing - night) (Street: ext. street - night) Voice: Hey, you. What do you think you're doing? Homer Simpson: Uh-oh. Voice: Hey, hey! Come back here.\n",
 '-\n',
 'user:: Generate a scene in Car with the characters: Marge Simpson, Homer Simpson.\n',
 "assistant:: (Car: int. car - night) Marge Simpson: Ooo, careful, Homer. Homer Simpson: There's no time to be careful. Homer Simpson: We're late.\n",
 '-\n',
 'user:: Generate a scene in Springfield Elementary School.\n',
 'assistant:: (Springfield Elementary School: Ext. springfield elementary school - establishing - night)\n',
 '-\n',
 'user:: Generate a scene in Auditorium with the characters: Homer Simpson, Dewey Largo, Todd Flanders, Seymour Skinner, Children, Bart Simpson, JANEY, Marge Simpson.\n']

#### Cambio de formato al que debe ser:
{
  
  "messages": [

    { "role": "system", "content": "You are an assistant that occasionally misspells words" },

    { "role": "user", "content": "Tell me a story." },

    { "role": "assistant", "content": "One day a student went to schoool." }

  ]

}

In [8]:
def formatear_ejemplo(lista_mensajes, system_message=None):
    messages = []

    # Incluir primero el mensaje de sistema
    if system_message:
        messages.append({
            "role": "system",
            "content": system_message
        })

    # Iterar por la lista de mensajes
    for mensaje in lista_mensajes:
        # Separar los mensajes por los dos puntos y el espacio
        partes = mensaje.split(':: ', maxsplit=1)

        #Controlar si alguna línea no cumple el patrón
        if len(partes) < 2:
            continue

        # Identificar el rol y content
        role = partes[0].strip()
        content = partes[1].strip()

        # Formatear el mensaje
        message = {
            "role": role,
            "content": content
        }

        #Agregar el mensaje a la lista
        messages.append(message)

    # Crear diccionario final
    dict_final = {
        "messages": messages
    }

    return dict_final

In [9]:
system_message = "You are a skilled scriptwriter for TV shows, much like The Simpsons."

dataset = []

ejemplo = []
for line in text:
    if line == '-\n':
        ejemplo_formateado = formatear_ejemplo(lista_mensajes=ejemplo,
                                                system_message=system_message)

        dataset.append(ejemplo_formateado)
        ejemplo = []
        continue

    ejemplo.append(line)

### Validar formato y errores

In [10]:
# Format error checks
from collections import defaultdict
format_errors = defaultdict(int)

for ex in dataset:
    if not isinstance(ex, dict):
        format_errors["data_type"] += 1
        continue

    messages = ex.get("messages", None)
    if not messages:
        format_errors["missing_messages_list"] += 1
        continue

    for message in messages:
        if "role" not in message or "content" not in message:
            format_errors["message_missing_key"] += 1

        if any(k not in ("role", "content", "name") for k in message):
            format_errors["message_unrecognized_key"] += 1

        if message.get("role", None) not in ("system", "user", "assistant"):
            format_errors["unrecognized_role"] += 1

        content = message.get("content", None)
        if not content or not isinstance(content, str):
            format_errors["missing_content"] += 1

    if not any(message.get("role", None) == "assistant" for message in messages):
        format_errors["example_missing_assistant_message"] += 1

if format_errors:
    print("Found errors:")
    for k, v in format_errors.items():
        print(f"{k}: {v}")
else:
    print("No errors found")

No errors found


## Conteo de Tokens y estimación de precio

In [11]:
import tiktoken
import numpy as np
encoding = tiktoken.get_encoding("cl100k_base")

# not exact!
# simplified from https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3
    return num_tokens

def num_assistant_tokens_from_messages(messages):
    num_tokens = 0
    for message in messages:
        if message["role"] == "assistant":
            num_tokens += len(encoding.encode(message["content"]))
    return num_tokens

def print_distribution(values, name):
    print(f"\n#### Distribución de {name}:")
    print(f"min / max: {min(values)}, {max(values)}")
    print(f"media / mediana: {np.mean(values)}, {np.median(values)}")
    print(f"p5 / p95: {np.quantile(values, 0.1)}, {np.quantile(values, 0.9)}")


In [12]:
# Last, we can look at the results of the different formatting operations before proceeding with creating a fine-tuning job:

# Warnings and tokens counts
n_missing_system = 0
n_missing_user = 0
n_messages = []
convo_lens = []
assistant_message_lens = []

for ex in dataset:
    messages = ex["messages"]
    if not any(message["role"] == "system" for message in messages):
        n_missing_system += 1
    if not any(message["role"] == "user" for message in messages):
        n_missing_user += 1
    n_messages.append(len(messages))
    convo_lens.append(num_tokens_from_messages(messages))
    assistant_message_lens.append(num_assistant_tokens_from_messages(messages))

print("Num de ejemplos sin el system message:", n_missing_system)
print("Num de ejemplos sin el user message:", n_missing_user)
print_distribution(n_messages, "num_mensajes_por_ejemplo")
print_distribution(convo_lens, "num_total_tokens_por_ejemplo")
print_distribution(assistant_message_lens, "num_assistant_tokens_por_ejemplo")
n_too_long = sum(l > 4096 for l in convo_lens)
print(f"\n{n_too_long} ejemplos que excedan el límite de tokenes de 4096, ellos serán truncados durante el fine-tuning")


Num de ejemplos sin el system message: 0
Num de ejemplos sin el user message: 0

#### Distribución de num_mensajes_por_ejemplo:
min / max: 3, 3
media / mediana: 3.0, 3.0
p5 / p95: 3.0, 3.0

#### Distribución de num_total_tokens_por_ejemplo:
min / max: 42, 4092
media / mediana: 327.85231841329403, 201.0
p5 / p95: 67.0, 692.0

#### Distribución de num_assistant_tokens_por_ejemplo:
min / max: 6, 3985
media / mediana: 271.28276601447334, 147.0
p5 / p95: 24.0, 621.8000000000011

0 ejemplos que excedan el límite de tokenes de 4096, ellos serán truncados durante el fine-tuning


In [13]:
# Pricing and default n_epochs estimate
MAX_TOKENS_PER_EXAMPLE = 4096

MIN_TARGET_EXAMPLES = 100
MAX_TARGET_EXAMPLES = 25000
TARGET_EPOCHS = 4
MIN_EPOCHS = 1
MAX_EPOCHS = 25

n_epochs = TARGET_EPOCHS
n_train_examples = len(dataset)
if n_train_examples * TARGET_EPOCHS < MIN_TARGET_EXAMPLES:
    n_epochs = min(MAX_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)
elif n_train_examples * TARGET_EPOCHS > MAX_TARGET_EXAMPLES:
    n_epochs = max(MIN_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)

n_billing_tokens_in_dataset = sum(min(MAX_TOKENS_PER_EXAMPLE, length) for length in convo_lens)
print(f"El conjunto de datos tiene ~{n_billing_tokens_in_dataset} tokens que serán cargados durante el entrenamiento")
print(f"Por defecto, entrenarás para {n_epochs} epochs en este conjunto de datos")
print(f"Por defecto, serás cargado con ~{n_epochs * n_billing_tokens_in_dataset} tokens")
print("Revisa la página para estimar el costo total")

El conjunto de datos tiene ~3669651 tokens que serán cargados durante el entrenamiento
Por defecto, entrenarás para 2 epochs en este conjunto de datos
Por defecto, serás cargado con ~7339302 tokens
Revisa la página para estimar el costo total


Como los precios de GPT-3.5-turbo son para entreno 8 USD$ por cada millón de tokens:

In [14]:
precio_por_token = 8/1e6
print(f"Estimado de costo: {n_epochs * n_billing_tokens_in_dataset * precio_por_token}$")

Estimado de costo: 58.714416$


## Guardar datos formateados

In [16]:
def save_to_jsonl(dataset, file_path):
    with open(file_path, 'w') as file:
        for ejemplo in dataset:
            json_line = json.dumps(ejemplo, ensure_ascii=False)
            file.write(json_line + '\n')

In [17]:
# Guardar train full
save_to_jsonl(dataset, 'Modelos/archivos_jsonl/prompts_example.jsonl')

# Cargar datos a API de Open AI

In [25]:
# client = OpenAI(api_key='...')

# client.files.create(
#     file=open("Modelos/archivos_jsonl/prompts_example.jsonl", "rb"),
#     purpose="fine-tune"
# )

# Comenzar proceso de Fine Tuning

Esto se ha realizado en la web de Open AI.