# Fine-tuning an OpenAI model using Telegram chat data

Reference: [OpenAI fine-tuning guide – preparing your dataset](https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset)

In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import openai
import requests
import re
import urllib.request
from collections import deque
from html.parser import HTMLParser
from urllib.parse import urlparse
import os
from dotenv import dotenv_values
import time
import json

In [5]:
# Create a file called .env.local in the parent of the working directory.
# It should contain:
# OPENAI_API_KEY=<your_key>
env_vars = dotenv_values('../.env.local')

# Prefer the key from .env.local; fall back to the process environment so the
# notebook also runs where the key is exported rather than stored in a file.
openai.api_key = env_vars.get('OPENAI_API_KEY') or os.environ.get('OPENAI_API_KEY')
if not openai.api_key:
    # Same exception type the plain dict lookup raised, but with guidance.
    raise KeyError("OPENAI_API_KEY not found in ../.env.local or the environment")

## Loading the JSON from the Telegram export

In [6]:
# Path to the JSON file produced by the Telegram export
json_file_path = "telegram_result.json"

# Decode explicitly as UTF-8 (the standard JSON encoding). Without `encoding`,
# open() falls back to the platform's locale encoding, which can fail on or
# garble non-ASCII chat text.
with open(json_file_path, encoding="utf-8") as json_file:
    data = json.load(json_file)

# Show the top-level sections of the export
print(data.keys())

dict_keys(['about', 'personal_information', 'profile_pictures', 'contacts', 'frequent_contacts', 'chats', 'left_chats'])


In [7]:
# Take the message list of a single conversation -- the entry at position 4 of
# data['chats']['list'] -- and load it into a DataFrame (one row per message).
chat_messages = data['chats']['list'][4]['messages']
df = pd.DataFrame(chat_messages)

In [8]:
# (rows, columns) of the freshly loaded message table
df.shape

(19497, 29)

## Cleaning data

In [9]:
# Columns needed downstream: timestamp, sender and message body
cols_keep = ['date', 'from', 'text']

# One pipeline: keep the needed columns, drop messages whose text is the empty
# string, renumber rows from 0, and parse the timestamps into datetimes.
df = (
    df.loc[:, cols_keep]
    .loc[lambda frame: frame['text'] != '']
    .reset_index(drop=True)
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)

In [10]:
# Two example records in the prompt/completion shape used for the training file
# (presumably taken from the fine-tuning guide linked at the top -- verify).
# Only the last literal is echoed as the cell's output.
{"prompt":"Overjoyed with the new iPhone! ->", "completion":" positive"}
{"prompt":"@lakers disappoint for a third straight night https://t.co/38EFe43 ->", "completion":" negative"}

{'prompt': '@lakers disappoint for a third straight night https://t.co/38EFe43 ->',
 'completion': ' negative'}

In [11]:
# Senders whose back-to-back messages become one training example.
PROMPT_SENDER = 'Schilling ;Mathias'
COMPLETION_SENDER = 'David Furrer'


def _as_plain_text(text):
    """Return a message's `text` field as a single string.

    NOTE(review): Telegram exports appear to store formatted messages as a
    list mixing plain strings and dicts carrying a 'text' key -- confirm
    against your export. Such lists are joined into one string so that the
    prompt/completion written to the training file is a string; any other
    value is returned unchanged.
    """
    if not isinstance(text, list):
        return text
    return ''.join(
        part if isinstance(part, str)
        else str(part.get('text', '')) if isinstance(part, dict)
        else str(part)
        for part in text
    )


# Sort chronologically. A stable sort keeps messages that share a timestamp in
# their existing order, so prompt/reply adjacency is not shuffled by the sort.
df = df.sort_values('date', kind='mergesort')

# Plain lists avoid building a pandas row object for every df.iloc[i] lookup.
senders = df['from'].tolist()
texts = df['text'].tolist()

# A pair is a PROMPT_SENDER message immediately followed by a COMPLETION_SENDER
# message. The result is named `training_pairs` rather than `data` so the
# export dict loaded earlier in the notebook is not overwritten.
training_pairs = [
    {"prompt": _as_plain_text(prompt), "completion": _as_plain_text(reply)}
    for sender, next_sender, prompt, reply in zip(senders, senders[1:], texts, texts[1:])
    if sender == PROMPT_SENDER and next_sender == COMPLETION_SENDER
]

# Write one JSON object per line (JSONL).
with open('your_training_data.jsonl', 'w') as f:
    for entry in training_pairs:
        f.write(json.dumps(entry) + '\n')

## Fine-tune

### Upload training data

In [101]:
# Upload the JSONL training file. The `with` block closes the local file
# handle once the upload call returns (previously it was left open).
with open("your_training_data.jsonl", "rb") as training_file:
    uploaded_file = openai.File.create(
        file=training_file,
        purpose='fine-tune'
    )

# Display the API response; its "id" is needed to start the fine-tune.
uploaded_file

<File file id=file-i6kn1Dc7sdUxQFlwPU4fMwt2 at 0x7fd3e9f1a4d0> JSON: {
  "bytes": 421044,
  "created_at": 1686243455,
  "filename": "file",
  "id": "file-i6kn1Dc7sdUxQFlwPU4fMwt2",
  "object": "file",
  "purpose": "fine-tune",
  "status": "uploaded",
  "status_details": null
}

### Start the fine-tune

In [102]:
# Id of the uploaded training file -- copy it from the response of the upload step.
training_file_id = "file-i6kn1Dc7sdUxQFlwPU4fMwt2"

# Create the fine-tune job and display the API response.
fine_tune_job = openai.FineTune.create(training_file=training_file_id)
fine_tune_job

<FineTune fine-tune id=ft-DrWIfprMMOxHdJNoQK9BqTbc at 0x7fd41a3dddf0> JSON: {
  "created_at": 1686243494,
  "events": [
    {
      "created_at": 1686243494,
      "level": "info",
      "message": "Created fine-tune: ft-DrWIfprMMOxHdJNoQK9BqTbc",
      "object": "fine-tune-event"
    }
  ],
  "fine_tuned_model": null,
  "hyperparams": {
    "batch_size": null,
    "learning_rate_multiplier": null,
    "n_epochs": 4,
    "prompt_loss_weight": 0.01
  },
  "id": "ft-DrWIfprMMOxHdJNoQK9BqTbc",
  "model": "curie",
  "object": "fine-tune",
  "organization_id": "org-6hJji0JKfgBPo7z8LLTibyDx",
  "result_files": [],
  "status": "pending",
  "training_files": [
    {
      "bytes": 421044,
      "created_at": 1686243455,
      "filename": "file",
      "id": "file-i6kn1Dc7sdUxQFlwPU4fMwt2",
      "object": "file",
      "purpose": "fine-tune",
      "status": "processed",
      "status_details": null
    }
  ],
  "updated_at": 1686243494,
  "validation_files": []
}

In [13]:
# example with davinci, model is curie by default
# openai.FineTune.create(training_file="file-i6kn1Dc7sdUxQFlwPU4fMwt2", model='davinci', suffix='davemaedde') #davemaedde.jsonl curie

## Check progress

In [14]:
# Id of the fine-tune job to inspect (taken from a creation step). Note that
# this id is the davinci job shown in the job listing further down, not the
# curie job created in the cell above.
fine_tune_id = "ft-xsZ6aNhkAmesOx99IuWYUzpD"

# Fetch the job's event log and add a readable timestamp next to the raw
# epoch seconds in `created_at`.
events = openai.FineTune.list_events(id=fine_tune_id)['data']
df_ = pd.DataFrame(events).assign(
    date_column=lambda frame: pd.to_datetime(frame['created_at'], unit='s')
)
df_

Unnamed: 0,object,level,message,created_at,date_column
0,fine-tune-event,info,Created fine-tune: ft-xsZ6aNhkAmesOx99IuWYUzpD,1686297965,2023-06-09 08:06:05
1,fine-tune-event,info,Fine-tune costs $10.10,1686298043,2023-06-09 08:07:23
2,fine-tune-event,info,Fine-tune enqueued. Queue number: 0,1686298043,2023-06-09 08:07:23
3,fine-tune-event,info,Fine-tune started,1686298189,2023-06-09 08:09:49
4,fine-tune-event,info,Completed epoch 1/4,1686298925,2023-06-09 08:22:05
5,fine-tune-event,info,Completed epoch 3/4,1686299981,2023-06-09 08:39:41
6,fine-tune-event,info,Uploaded model: davinci:ft-personal:davemaedde...,1686300545,2023-06-09 08:49:05
7,fine-tune-event,info,Uploaded result file: file-0LhixyjSiWvj9OrG8gY...,1686300546,2023-06-09 08:49:06
8,fine-tune-event,info,Fine-tune succeeded,1686300546,2023-06-09 08:49:06


## All models

In [142]:
# Fetch the model list from the API and preview the first five rows.
available_models = openai.Model.list()['data']
pd.DataFrame(available_models).head()

Unnamed: 0,id,object,created,owned_by,permission,root,parent
0,whisper-1,model,1677532384,openai-internal,"[{'id': 'modelperm-KlsZlfft3Gma8pI6A8rTnyjs', ...",whisper-1,
1,babbage,model,1649358449,openai,"[{'id': 'modelperm-49FUp5v084tBB49tC4z8LPH5', ...",babbage,
2,davinci,model,1649359874,openai,"[{'id': 'modelperm-U6ZwlyAd0LyMk4rcMdz33Yc3', ...",davinci,
3,text-davinci-edit-001,model,1649809179,openai,"[{'id': 'modelperm-otmQSS0hmabtVGHI9QB3bct3', ...",text-davinci-edit-001,
4,babbage-code-search-code,model,1651172509,openai-dev,"[{'id': 'modelperm-4qRnA3Hj8HIJbgo0cGbcmErn', ...",babbage-code-search-code,


## See all your fine-tunes and whether they are ready

In [146]:
# One row per fine-tune job; the `status` and `fine_tuned_model` columns show
# whether a job has finished and under which model name it can be queried.
fine_tune_jobs = openai.FineTune.list()['data']
pd.DataFrame(fine_tune_jobs)

Unnamed: 0,object,id,hyperparams,organization_id,model,training_files,validation_files,result_files,created_at,updated_at,status,fine_tuned_model
0,fine-tune,ft-9n5Yinj6XxtuMH3Gq6wH3o6e,"{'n_epochs': 4, 'batch_size': None, 'prompt_lo...",org-6hJji0JKfgBPo7z8LLTibyDx,curie,"[{'object': 'file', 'id': 'file-5FSQhnuZvtWvrM...",[],[],1686243094,1686243162,failed,
1,fine-tune,ft-DrWIfprMMOxHdJNoQK9BqTbc,"{'n_epochs': 4, 'batch_size': 8, 'prompt_loss_...",org-6hJji0JKfgBPo7z8LLTibyDx,curie,"[{'object': 'file', 'id': 'file-i6kn1Dc7sdUxQF...",[],"[{'object': 'file', 'id': 'file-1UIYUHIHyViX9V...",1686243494,1686245931,succeeded,curie:ft-personal-2023-06-08-17-38-50
2,fine-tune,ft-xsZ6aNhkAmesOx99IuWYUzpD,"{'n_epochs': 4, 'batch_size': 8, 'prompt_loss_...",org-6hJji0JKfgBPo7z8LLTibyDx,davinci,"[{'object': 'file', 'id': 'file-i6kn1Dc7sdUxQF...",[],"[{'object': 'file', 'id': 'file-0LhixyjSiWvj9O...",1686297965,1686300546,succeeded,davinci:ft-personal:davemaedde-2023-06-09-08-4...


In [None]:
#openai.FineTune.cancel(id="ft-AF1WoRqd3aJAHsqc9NY7iL8F")

## Query your fine-tuned model

This is also possible in the OpenAI Playground.

In [135]:
# Name of the fine-tuned curie model, as reported in the `fine_tuned_model`
# column of the job listing.
fine_tuned_model = "curie:ft-personal-2023-06-08-17-38-50"

# Short smoke test: request at most 7 tokens at temperature 0.
response = openai.Completion.create(
    model=fine_tuned_model,
    prompt="Say this is a test",
    max_tokens=7,
    temperature=0,
)

In [136]:
# Text of the first returned completion choice
response.choices[0].text

'true, just a test to see'