In [96]:
import json
import warnings
from collections import defaultdict
from itertools import chain

import nltk
import pandas as pd

Prepare a dataset in the format expected by GPT fine-tuning.

# Data

## Raw Molière Q and A

In [None]:
# NOTE(review): absolute, machine-specific path -- point it at your own copy of the CSV.
df = pd.read_csv("/Users/frweber/tmp/dialogues_moliere.csv")

# Flatten every utterance into one continuous stream of sentences.
# Non-breaking spaces (\xa0) are normalised to plain spaces first; rows whose
# `text` is missing are dropped because NaN has no `.replace`.
texts = list(chain.from_iterable(
    nltk.sent_tokenize(t.replace(u'\xa0', u' ')) for t in df.text.dropna().to_list()
))

# Each sentence is tokenised a second time, giving one list of fragments per sentence.
texts_last_sentences = [nltk.sent_tokenize(text) for text in texts]

# A "question" is the last fragment ending with "?" in a sentence; its "answer"
# is the sentence that immediately follows it in the stream.
# NOTE(review): speaker turns are not taken into account here, so the answer
# may come from the same speaker as the question -- confirm this is intended.
ids = []        # position of each retained question in `texts` (not stored in qa_df)
questions = []
answers = []
# Pair every sentence with its successor; the very last sentence has none and is
# skipped, which avoids the IndexError the `i + 1` lookup raised when the stream
# ended on a question.
for i, (sentences, next_sentences) in enumerate(zip(texts_last_sentences, texts_last_sentences[1:])):
    the_question = [s for s in sentences if s.endswith("?")]
    if the_question:
        questions.append(the_question[-1])
        # Join with a space so fragments split by the tokenizer are not glued together.
        answers.append(" ".join(next_sentences))
        ids.append(i)

qa_df = pd.DataFrame(data={"question": questions, "answer": answers})

In [99]:
# Show three random rows of the raw dialogue table (no seed is passed, so the rows differ between runs).
df.sample(3)

Unnamed: 0.1,Unnamed: 0,speaker,text
3999,3999,Argan,Je n'ai point encore vu la personne : mais on ...
4259,4259,Argan,"Parle bas, te dis-je."
4544,4544,Argan,Oui.


## Classic version of questions

In [13]:
import asyncio

In [14]:
# Inspect the event loop exposed to this kernel; the recorded output shows it is already running.
asyncio.get_event_loop()

<_UnixSelectorEventLoop running=True closed=False debug=False>

In [73]:
import asyncio
import aiohttp

class RateLimitedHTTPClient:
    """Small aiohttp wrapper that sleeps for a fixed delay before every request.

    The instance owns one ``aiohttp.ClientSession``; release it with
    ``await client.close()`` or by using the instance as an async context
    manager (``async with RateLimitedHTTPClient(1.0) as client: ...``).

    NOTE(review): the delay is applied per call. Requests started concurrently
    all sleep at the same time, so this spaces out sequential calls but does
    not cap the throughput of concurrent ones.
    """

    def __init__(self, delay):
        """Create the client.

        Args:
            delay: Seconds to wait (via ``asyncio.sleep``) before each request.
        """
        self.delay = delay
        self.session = aiohttp.ClientSession()

    async def request(self, method, url, **kwargs):
        """Wait ``self.delay`` seconds, send the request and return the response.

        Args:
            method: HTTP method name forwarded to ``session.request``.
            url: Target URL forwarded to ``session.request``.
            **kwargs: Extra keyword arguments forwarded to ``session.request``.

        Returns:
            The response object, with its body already read.
        """
        await asyncio.sleep(self.delay)
        async with self.session.request(method, url, **kwargs) as response:
            # Leaving the ``async with`` block releases the connection, after
            # which the body can no longer be fetched. Read it now so the
            # caller can still use the payload on the returned response.
            await response.read()
            return response

    async def close(self):
        """Close the underlying session."""
        await self.session.close()

    async def __aenter__(self):
        """Enter the async context manager and return the client itself."""
        return self

    async def __aexit__(self, exc_type, exc, tb):
        """Close the session when the ``async with`` block exits."""
        await self.close()

In [79]:
import os
import asyncio
import openai

class TextRewriter:
    """Rewrite texts with an OpenAI chat model, a few requests at a time.

    Every text is sent as a user message made of ``prompt + " " + text``.
    """

    def __init__(self, prompt, model="gpt-3.5-turbo", temperature=0.7, max_concurrency=2):
        """Configure the rewriter.

        Args:
            prompt: Instruction prepended to every text.
            model: Chat model name passed to the API.
            temperature: Sampling temperature passed to the API.
            max_concurrency: Maximum number of requests in flight at once in
                ``rewrite_texts``.

        Raises:
            KeyError: If the ``OPENAI_API_KEY`` environment variable is not set.
        """
        openai.api_key = os.environ["OPENAI_API_KEY"]
        self.prompt = prompt
        self.model = model
        self.temperature = temperature
        self.max_concurrency = max_concurrency
        self.client = openai.AsyncOpenAI()

    async def rewrite_text(self, text, semaphore):
        """Rewrite a single text while holding ``semaphore``.

        Args:
            text: The text to rewrite.
            semaphore: ``asyncio.Semaphore`` bounding concurrent requests.

        Returns:
            The rewritten text, or ``None`` when the API call fails or the
            response carries no usable choice.
        """
        async with semaphore:
            # The API call is the step that can really fail (rate limits,
            # network errors, ...), so it sits inside the try. One failed item
            # then yields None instead of aborting the whole batch.
            try:
                response = await self.client.chat.completions.create(
                    model=self.model,
                    messages=[
                        {"role": "system", "content": "You are a helpful assistant."},
                        {"role": "user", "content": self.prompt + " " + text},
                    ],
                    temperature=self.temperature,
                    n=1
                )
            except openai.OpenAIError as exc:
                warnings.warn(f"Rewrite failed for {text!r}: {exc}", RuntimeWarning, stacklevel=2)
                return None

        # Malformed or empty responses also map to None, but only the expected
        # lookup errors are caught -- no bare ``except``.
        try:
            return response.choices[0].message.content
        except (IndexError, AttributeError, TypeError):
            return None

    async def rewrite_texts(self, texts):
        """Rewrite all ``texts`` concurrently.

        Args:
            texts: Iterable of texts to rewrite.

        Returns:
            A list with one entry per input text, in input order; an entry is
            ``None`` when that rewrite failed.
        """
        semaphore = asyncio.Semaphore(self.max_concurrency)
        tasks = [self.rewrite_text(text, semaphore) for text in texts]
        rewritten_texts = await asyncio.gather(*tasks)
        return rewritten_texts

In [80]:
# The prompt (in French) asks the model to rephrase the question in simple,
# classic French, trying not to exceed 15 words.
rewriter = TextRewriter("Reformule cette question dans un français simple et classique. Essaye de ne pas dépasser les 15 mots:")

In [None]:
# Rewrite only the first 100 questions (top-level `await` relies on the notebook's running event loop).
# NOTE(review): `questions_fr` is not referenced by any later cell visible in this notebook.
questions_fr = await rewriter.rewrite_texts(qa_df.question.to_list()[:100])

# Target output

In [83]:
# Look at the first question/answer pair before converting it to the fine-tuning format.
qa_df.iloc[0]

question    charmante Élise, vous devenez mélancolique, ap...
answer                         Je vous vois soupirer, hélas !
Name: 0, dtype: object

In [84]:
def craft_ft_openai(entry: pd.Series):
    """Turn one question/answer row into a chat-format fine-tuning example.

    Args:
        entry: Row exposing ``question`` and ``answer`` attributes.

    Returns:
        A dict with a single ``"messages"`` key holding three messages in
        order: the fixed system persona, the question as the user turn and
        the answer as the assistant turn.
    """
    persona = "JB is a poet chatbot who answers everything in French just like Molière."
    turns = (
        ("system", persona),
        ("user", entry.question),
        ("assistant", entry.answer),
    )
    return {"messages": [{"role": role, "content": text} for role, text in turns]}

In [87]:
# Convert every row of qa_df into one chat-format example (a dict with a "messages" list).
qa_ft_dataset = qa_df.apply(craft_ft_openai, axis=1).to_list()

In [89]:
from sklearn.model_selection import train_test_split

In [90]:
# Fixed seed so the split -- and therefore the JSONL files written from it --
# is reproducible across kernel restarts. test_size=0.25 is scikit-learn's
# default, spelled out here for the reader.
qa_ft_train, qa_ft_test = train_test_split(qa_ft_dataset, test_size=0.25, random_state=42)

In [93]:
def write_jsonl(path, records):
    """Write ``records`` to ``path`` as JSON Lines.

    Each record is serialised with ``json.dumps`` on its own line, and every
    line (including the last) ends with a newline. The file is opened as
    UTF-8, matching the encoding used when the training file is read back.

    Args:
        path: Destination file path; an existing file is overwritten.
        records: Iterable of JSON-serialisable objects.
    """
    with open(path, "w", encoding="utf-8") as f:
        for record in records:
            f.write(json.dumps(record) + "\n")


# Training split
write_jsonl("qa_moliere_train.jsonl", qa_ft_train)
# Held-out split
write_jsonl("qa_moliere_test.jsonl", qa_ft_test)

## Check data

In [98]:
# Read the training file back: one JSON document per line.
data_path = "./qa_moliere_train.jsonl"
with open(data_path, 'r', encoding='utf-8') as f:
    dataset = [json.loads(line) for line in f]

# Initial dataset stats
# NOTE(review): `dataset[0]` raises IndexError if the file is empty.
print("Num examples:", len(dataset))
print("First example:")
for message in dataset[0]["messages"]:
    print(message)

# Format error checks: every violation is counted per category rather than
# stopping at the first one, so the summary shows all problems at once.
format_errors = defaultdict(int)

for ex in dataset:
    # Each example must be a dict (a JSON object).
    if not isinstance(ex, dict):
        format_errors["data_type"] += 1
        continue
        
    # A missing or empty "messages" value is an error; skip the per-message checks.
    messages = ex.get("messages", None)
    if not messages:
        format_errors["missing_messages_list"] += 1
        continue
        
    for message in messages:
        # Both "role" and "content" keys are required on every message.
        if "role" not in message or "content" not in message:
            format_errors["message_missing_key"] += 1
        
        # Any key outside this allow-list is counted as unrecognized.
        if any(k not in ("role", "content", "name", "function_call", "weight") for k in message):
            format_errors["message_unrecognized_key"] += 1
        
        # The role must be one of the four accepted values.
        if message.get("role", None) not in ("system", "user", "assistant", "function"):
            format_errors["unrecognized_role"] += 1
            
        content = message.get("content", None)
        function_call = message.get("function_call", None)
        
        # Counted when neither content nor function_call is truthy, and also
        # whenever content is not a str (even if a function_call is present).
        if (not content and not function_call) or not isinstance(content, str):
            format_errors["missing_content"] += 1
    
    # Each example needs at least one assistant message.
    if not any(message.get("role", None) == "assistant" for message in messages):
        format_errors["example_missing_assistant_message"] += 1

# Summary: list the per-category counts, or confirm that nothing was flagged.
if format_errors:
    print("Found errors:")
    for k, v in format_errors.items():
        print(f"{k}: {v}")
else:
    print("No errors found")

Num examples: 1811
First example:
{'role': 'system', 'content': 'JB is a poet chatbot who answers everything in French just like Molière.'}
{'role': 'user', 'content': 'Quoi ?'}
{'role': 'assistant', 'content': "Ne vous ai-je pas recommandé de me venir dire d'abord tout ce que vous voyez ?"}
No errors found


# Fine tune

In [101]:
from openai import OpenAI
# Synchronous client used by the completion calls in the following cells; no credentials are passed explicitly here.
client = OpenAI()

In [106]:
# Query the fine-tuned model (the "ft:" model id).
# NOTE(review): this system prompt differs from the one baked into the training
# examples by craft_ft_openai ("JB is a poet chatbot ...") -- confirm that is intended.
response = client.chat.completions.create(
  model="ft:gpt-3.5-turbo-1106:soprema:moliere:972h7fGK",
  messages=[
    {"role": "system", "content": "You are a helpful assistant speaking French"},
    {"role": "user", "content": "Comment décrirais-tu le système solaire ?"},
  ]
)
# Display the text of the first choice.
response.choices[0].message.content

'Le soleil visiblement arrêté, et les autres planètes obéissant à sa loi, tournant en cercles égaux autour de lui, dans une ordonnance étonnante, et à différentes vitesses : dans cette confusion si belle, chaque globe a son essence, sa forme, sa surface, sa durée, sa distance, ses cieux, sa latitude, ses zones, et ses capes, ses mers, ses golfes, ses promontoires, aussi bien que notre terre, que la nature a si délicieusement variée.'

In [107]:
# Same system and user messages sent to the base gpt-3.5-turbo-1106 model,
# as a baseline against the fine-tuned model's answer.
response = client.chat.completions.create(
  model="gpt-3.5-turbo-1106",
  messages=[
    {"role": "system", "content": "You are a helpful assistant speaking French"},
    {"role": "user", "content": "Comment décrirais-tu le système solaire ?"},
  ]
)
# Display the text of the first choice.
response.choices[0].message.content

"Le système solaire est composé d'une étoile centrale, le Soleil, autour de laquelle gravitent huit planètes : Mercure, Vénus, la Terre, Mars, Jupiter, Saturne, Uranus, et Neptune. En plus des planètes, il contient des astéroïdes, des comètes, des satellites naturels, et d'autres corps célestes. Le système solaire est également caractérisé par une ceinture d'astéroïdes entre Mars et Jupiter, et par la région au-delà de Neptune appelée le nuage d'Oort, où résident les comètes."