In [1]:
import json
import tiktoken # for token counting
import numpy as np
from collections import defaultdict

In [None]:
import csv
import os
from prompt import qa_system_prompt

csv_file = "/home/yosakoi/Work/chatbot/model/LLM/data/sample_data.csv"
jsonl_file = "/home/yosakoi/Work/chatbot/model/LLM/data/sample_data.jsonl"

if not os.path.exists(jsonl_file):
    with open(csv_file, 'r', encoding='utf-8') as f, open(jsonl_file, 'w', encoding='utf-8') as out_f:
        reader = csv.DictReader(f)
        for row in reader:
            conversation = {
                "messages": [
                    {"role": "system", "content": qa_system_prompt},
                    {"role": "user", "content": row["question"]},
                    {"role": "assistant", "content": row["answer"]}
                ]
            }
            out_f.write(json.dumps(conversation, ensure_ascii=False) + "\n")

    print(f"Conversion complete! JSONL saved to: {jsonl_file}")

else:
    print(f"Already have jsonl file.")

In [2]:
data_path = "/home/yosakoi/Work/chatbot/model/LLM/data/sample_data.jsonl"

# Load the dataset
with open(data_path, 'r', encoding='utf-8') as f:
    dataset = [json.loads(line) for line in f]

# Initial dataset stats
print("Num examples:", len(dataset))
print("First example:")
for message in dataset[0]["messages"]:
    print(message)

Num examples: 99
First example:
{'role': 'system', 'content': "You are an assistant for question-answering tasks.Use the following pieces of retrieved context to answer the question.If there's no related context, just answer with your base knowledge.Use three sentences maximum and keep the answer concise."}
{'role': 'user', 'content': 'Có thể sử dụng số điện thoại khác để đăng nhập không?'}
{'role': 'assistant', 'content': 'Có, nhưng số điện thoại đó phải được liên kết với tài khoản của bạn trước đó. Nếu số chưa được liên kết, bạn sẽ không thể sử dụng nó để đăng nhập.'}


In [3]:
# Format error checks
format_errors = defaultdict(int)

for ex in dataset:
    if not isinstance(ex, dict):
        format_errors["data_type"] += 1
        continue
        
    messages = ex.get("messages", None)
    if not messages:
        format_errors["missing_messages_list"] += 1
        continue
        
    for message in messages:
        if "role" not in message or "content" not in message:
            format_errors["message_missing_key"] += 1
        
        if any(k not in ("role", "content", "name", "function_call", "weight") for k in message):
            format_errors["message_unrecognized_key"] += 1
        
        if message.get("role", None) not in ("system", "user", "assistant", "function"):
            format_errors["unrecognized_role"] += 1
            
        content = message.get("content", None)
        function_call = message.get("function_call", None)
        
        if (not content and not function_call) or not isinstance(content, str):
            format_errors["missing_content"] += 1
    
    if not any(message.get("role", None) == "assistant" for message in messages):
        format_errors["example_missing_assistant_message"] += 1

if format_errors:
    print("Found errors:")
    for k, v in format_errors.items():
        print(f"{k}: {v}")
else:
    print("No errors found")

No errors found


In [4]:
encoding = tiktoken.get_encoding("cl100k_base")

# not exact!
# simplified from https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3
    return num_tokens

def num_assistant_tokens_from_messages(messages):
    num_tokens = 0
    for message in messages:
        if message["role"] == "assistant":
            num_tokens += len(encoding.encode(message["content"]))
    return num_tokens

def print_distribution(values, name):
    print(f"\n#### Distribution of {name}:")
    print(f"min / max: {min(values)}, {max(values)}")
    print(f"mean / median: {np.mean(values)}, {np.median(values)}")
    print(f"p5 / p95: {np.quantile(values, 0.1)}, {np.quantile(values, 0.9)}")

In [5]:
# Warnings and tokens counts
n_missing_system = 0
n_missing_user = 0
n_messages = []
convo_lens = []
assistant_message_lens = []

for ex in dataset:
    messages = ex["messages"]
    if not any(message["role"] == "system" for message in messages):
        n_missing_system += 1
    if not any(message["role"] == "user" for message in messages):
        n_missing_user += 1
    n_messages.append(len(messages))
    convo_lens.append(num_tokens_from_messages(messages))
    assistant_message_lens.append(num_assistant_tokens_from_messages(messages))
    
print("Num examples missing system message:", n_missing_system)
print("Num examples missing user message:", n_missing_user)
print_distribution(n_messages, "num_messages_per_example")
print_distribution(convo_lens, "num_total_tokens_per_example")
print_distribution(assistant_message_lens, "num_assistant_tokens_per_example")
n_too_long = sum(l > 16385 for l in convo_lens)
print(f"\n{n_too_long} examples may be over the 16,385 token limit, they will be truncated during fine-tuning")

Num examples missing system message: 0
Num examples missing user message: 0

#### Distribution of num_messages_per_example:
min / max: 3, 3
mean / median: 3.0, 3.0
p5 / p95: 3.0, 3.0

#### Distribution of num_total_tokens_per_example:
min / max: 104, 186
mean / median: 145.0909090909091, 146.0
p5 / p95: 128.8, 160.2

#### Distribution of num_assistant_tokens_per_example:
min / max: 26, 101
mean / median: 61.686868686868685, 63.0
p5 / p95: 45.0, 75.2

0 examples may be over the 16,385 token limit, they will be truncated during fine-tuning
