In [3]:
# !pip install tiktoken

In [49]:
import os
import openai
from getpass import getpass

os.environ["OPENAI_API_KEY"] = getpass("Insert your OpenAI API Key: ")

In [2]:
import json
import subprocess

import tiktoken
import pandas as pd

# Funções

In [38]:
def num_tokens_from_string(string, model):
    encoding = tiktoken.model.encoding_for_model("ada")
    num_tokens = len(encoding.encode(string))
    return num_tokens


def format_data(title, test, category, max_size=100):
    article = title + ". " + test
    words = article.split(" ")
    n_words = len(words)

    prompt = " ".join(words[:max_size]) + "\n\n###\n\n"
    completion = " " + category
    return {"prompt": prompt, "completion": completion}


def update_json(title, test, category, file, max_size=100):
    try:
        data = format_data(title, test, category, max_size=max_size)
        json_data = json.dumps(data, ensure_ascii=False)
        file.write(json_data)
        file.write("\n")
    except:
        pass


def system(args):
    x = subprocess.run(args.split(" "), capture_output=True)
    out = x.stdout.decode()
    err = x.stderr.decode()
    if err != "":
        raise Exception(err)
    return out


def estimate_price(lines, model,  mode="use"):

    training_price = {
        "ada": 0.0004,
        "babbage": 0.0006,
        "curie": 0.0030,
        "davinci": 0.0300,
    }

    usage_price = {
        "ada": 0.0016,
        "babbage": 0.0024,
        "curie": 0.0120,
        "davinci": 0.1200,
    }

    token_counter = 0
    for element in lines:
        for key, value in element.items():
            token_counter+=num_tokens_from_string(value,model)
    
    price = usage_price[model] if mode == "use" else training_price[model]

    total_price = token_counter * price / 1000

    print(f"Fine tuning '{mode}' using {model} costs ${price} per 1000 tokens")

    print(f"Estimated price: ${total_price}")

    return total_price


# Data

#### Preparing json to fine tuning

In [43]:
df = pd.read_csv("data/news-sample.csv")

with open("data/news-sample.json", "w+") as file:
    for title, text, category in df.values:
        update_json(title, text, category, file, max_size=100)

#### Estimating Training Price

In [44]:
with open("data/news-sample.json", "r") as file:
    _lines = [json.loads(_line.replace("\n","")) for _line in file.readlines()]

model = "ada"
estimate_price(_lines, model,  mode="train")

Fine tuning 'train' using ada costs $0.0004 per 1000 tokens
Estimated price: $0.062174400000000005


0.062174400000000005

<<Antes do treino, meu billing estava em $8.47>>

<<Depois do treino, meu billing estava em $8.67>>

# Fine Tuning

---
### Instale a lib da OpenAI
```bash
pip install --upgrade openai
```

---
### Exporte sua chave

```bash
export OPENAI_API_KEY="<OPENAI_API_KEY>"
```

---
### Gere o JSON
Rode os passos descritos acima até a geração do arquivo `data/news-sample.json`

---
### Preparação da base de dados
vamos usar a ferramenta da OpenAI para validar os dados. Esse processo irá gerar um novo dataset no formato jsonl
```bash
$ openai tools fine_tunes.prepare_data -f data/news-sample.json 
```



```
Analyzing...

- Your JSON file appears to be in a JSONL format. Your file will be converted to JSONL format
- Your file contains 699 prompt-completion pairs
- Based on your data it seems like you're trying to fine-tune a model for classification
- For classification, we recommend you try one of the faster and cheaper models, such as `ada`
- For classification, you can estimate the expected model performance by keeping a held out dataset, which is not used for training
- All prompts end with suffix `\n\n###\n\n`

Based on the analysis we will perform the following actions:
- [Necessary] Your format `JSON` will be converted to `JSONL`
- [Recommended] Would you like to split into training and validation set? [Y/n]: y


Your data will be written to a new JSONL file. Proceed [Y/n]: y

Wrote modified files to `data/news-sample_prepared_train.jsonl` and `data/news-sample_prepared_valid.jsonl`
Feel free to take a look!

Now use that file when fine-tuning:
> openai api fine_tunes.create -t "data/news-sample_prepared_train.jsonl" -v "data/news-sample_prepared_valid.jsonl" --compute_classification_metrics --classification_n_classes 7

After you’ve fine-tuned a model, remember that your prompt has to end with the indicator string `\n\n###\n\n` for the model to start generating completions, rather than continuing with the prompt.
Once your model starts training, it'll approximately take 19.11 minutes to train a `curie` model, and less for `ada` and `babbage`. Queue will approximately take half an hour per job ahead of you.

```

---


### Crie o Modelo

```bash
openai api fine_tunes.create \
    -t "data/news-sample_prepared_train.jsonl" \
    -v "data/news-sample_prepared_valid.jsonl" \
    -m ada \
    --suffix "classificador-de-noticias" \
    --classification_n_classes 7 \
    --compute_classification_metrics
```

```bash
Upload progress: 100%|███████████████████████████████████████████████████████████████████████████████| 369k/369k [00:00<00:00, 266Mit/s]
Uploaded file from data/news-sample_prepared_train.jsonl: file-9B2SVlFmewHRmnggo2a6lmHy
Upload progress: 100%|███████████████████████████████████████████████████████████████████████████████| 92.6k/92.6k [00:00<00:00, 85.9Mit/s]
Uploaded file from data/news-sample_prepared_valid.jsonl: file-83dVYnyrrclNRgFyaphuxENy
Created fine-tune: ft-VQyisXETWQCVKfZP0yxQzoTv
Streaming events until fine-tuning is complete...

(Ctrl-C will interrupt the stream, but not cancel the fine-tune)
[2023-06-06 22:54:20] Created fine-tune: ft-VQyisXETWQCVKfZP0yxQzoTv
```

---
### Lista de Modelos Existentes
Podemos consultar a lista de modelos que já criamos com o comando

```bash
openai api fine_tunes.list
```

```json
{
  "data": [
    {
      "created_at": 1686102860,
      "fine_tuned_model": "ada:ft-personal:classificador-de-noticias-2023-06-07-02-04-38",
      "hyperparams": {
        "batch_size": 1,
        "classification_n_classes": 7,
        "compute_classification_metrics": true,
        "learning_rate_multiplier": 0.1,
        "n_epochs": 4,
        "prompt_loss_weight": 0.01
      },
      "id": "ft-VQyisXETWQCVKfZP0yxQzoTv",
      "model": "ada",
      "object": "fine-tune",
      "organization_id": "org-yzPWxEUSZLqRz4MXVmn603J3",
      "result_files": [
        {
          "bytes": 120271,
          "created_at": 1686103480,
          "filename": "compiled_results.csv",
          "id": "file-svx0KScsjZmK4Fi8eqx7rXi5",
          "object": "file",
          "purpose": "fine-tune-results",
          "status": "processed",
          "status_details": null
        }
      ],
      "status": "succeeded",
      "training_files": [
        {
          "bytes": 369338,
          "created_at": 1686102856,
          "filename": "data/news-sample_prepared_train.jsonl",
          "id": "file-9B2SVlFmewHRmnggo2a6lmHy",
          "object": "file",
          "purpose": "fine-tune",
          "status": "processed",
          "status_details": null
        }
      ],
      "updated_at": 1686103480,
      "validation_files": [
        {
          "bytes": 92551,
          "created_at": 1686102859,
          "filename": "data/news-sample_prepared_valid.jsonl",
          "id": "file-83dVYnyrrclNRgFyaphuxENy",
          "object": "file",
          "purpose": "fine-tune",
          "status": "processed",
          "status_details": null
        }
      ]
    }
  ],
  "object": "list"
}
```

Vamos precisar do id do modelo; no caso do exemplo acima, seria `ft-VQyisXETWQCVKfZP0yxQzoTv`

* **STATUS**: Para consultar o status, use o comando
```bash
openai api fine_tunes.get -i <YOUR_FINE_TUNE_JOB_ID>
```


* **CANCEL**: Para cancelar o treinamento, use o comando
```bash
openai api fine_tunes.cancel -i <YOUR_FINE_TUNE_JOB_ID>
```

* **RESULTS**: obtenha o ID do results_file; no exemplo é  `file-svx0KScsjZmK4Fi8eqx7rXi5`. Baixe os resultados do treinamento com

In [51]:
import os


In [53]:
import requests

RESULTS_FILE_ID = "file-svx0KScsjZmK4Fi8eqx7rXi5"

url = 'https://api.openai.com/v1/files/{}/content'.format(RESULTS_FILE_ID)
headers = {
    'Authorization': 'Bearer {}'.format(os.environ['OPENAI_API_KEY'])
}
response = requests.get(url, headers=headers)

if response.status_code == 200:
    with open('data/results.csv', 'wb') as file:
        file.write(response.content)
        print('Arquivo "data/results.csv" salvo com sucesso.')
else:
    print('Falha ao fazer a solicitação. Código de status:', response.status_code)


Arquivo "results.csv" salvo com sucesso.


# Usando o Modelo Treinado

In [76]:
import os
import numpy as np
import openai
from getpass import getpass

model_name = "ada:ft-personal:classificador-de-noticias-2023-06-07-02-04-38"

os.environ["OPENAI_API_KEY"] = getpass("Insert your OpenAI API Key: ")
openai.api_key = os.environ["OPENAI_API_KEY"] 

In [86]:
test = {
    'prompt': 'São Paulo confirma Rogério Ceni fora do clássico contra o Corinthians. Rogério Ceni não vai enfrentar o Corinthians neste domingo (22), na Arena Corinthians, às 17h, pelo Campeonato Brasileiro. O goleiro ainda se recupera de ruptura no ligamento do tornozelo direito e não foi relacionado pelo técnico Milton Cruz para clássico paulista.  A última vez que Ceni entrou em campo foi na segunda partida das semifinais da Copa do Brasil, contra o Santos, no dia 28 de setembro, quando o São Paulo perdeu por 3 a 1 e foi eliminado. Naquele jogo, o goleiro se machucou após dividida com\n\n###\n\n',
    'completion': ' esporte'
}

result = openai.Completion.create(
    model=model_name,
    prompt=test['prompt'],
    max_tokens=10,
    stop="\n",
)

predicted = result["choices"][0]["text"]
predicted = list(filter(lambda x: x != "",predicted.split(" ")))[0]
predicted

'esporte'

In [91]:

size = 100

# Importando uma amostra para teste
df_test = pd.read_csv("data/news-test-sample.csv")

results = []
for i in range(size):
    print(i)
    try:
        title, test, category = df_test.sample(1).iloc[0].values

        formated_data = format_data(title, test, category, max_size=100)

        prompt = formated_data["prompt"]

        real_category = formated_data['completion']

        fine_tuned_model = model_name

        result = openai.Completion.create(
            model=fine_tuned_model,
            prompt=prompt,
            max_tokens=10,
            stop="\n",
        )

        predicted = result["choices"][0]["text"]

        predicted = list(filter(lambda x: x != "",predicted.split(" ")))[0]


        results.append(
            {
                "real_category": real_category.replace(" ", ""),
                "predicted_category": predicted.replace(",", "").replace(".", ""),
            }
        )

        print(100*"-")
        print(prompt)
        print(f"predicted = {predicted}")
        print(f"real_category = {real_category}")
    
    except:
        pass

0
1
----------------------------------------------------------------------------------------------------
Shell diz que seus acionistas votaram a favor da compra da britânica BG. Os acionistas da Shell votaram a favor da compra da britânica BG em reunião realizada nesta quarta-feira (27), com a aprovação de 83,03% dos participantes, informou a companhia anglo-holandesa.  O próximo passo será a votação dos acionistas da BG, em reunião marcada para quinta-feira (28), explicou a empresa em nota. Caso também seja aprovado pela britânica, o negócio deverá ser concluído em 15 de fevereiro, segundo a Shell.  Com a conclusão do negócio, a Shell terá grande importância no setor de petróleo brasileiro. Isso porque

###


predicted = mercado
real_category =  mercado
2
----------------------------------------------------------------------------------------------------
Irritado com o apoio do PDT ao Planalto, Cristovam migrará para PPS. Após 11 anos no PDT, o senador Cristovam Buarque (DF) resolveu 

In [92]:

df_results = pd.DataFrame(results)
print("Accuracy = {:.0f}%".format(100*np.mean(df_results["real_category"] == df_results["predicted_category"])))

Accuracy = 83%
