<!-- Codes by HTMLcodes.ws -->
<h1 style="color:Blue;font-family:'Times New Roman',serif;font-size:250%;text-align:center;border-radius:15px 50px;">Notebook for generating mental-health prompt-response pairs in JSONL format, ready for fine-tuning</h1>

In [1]:
import json
import tiktoken

In [2]:
# Optional smoke test: run this cell to check that tiktoken is correctly
# installed and that it can resolve a tokenizer for the "gpt-3.5-turbo" model name.
# The resulting `encoding` is only a sanity check in the cells shown here;
# num_tokens_from_messages() looks up its own tokenizer on every call.
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")

In [3]:
# Function available at https://cookbook.openai.com/examples/how_to_count_tokens_with_tiktoken
def num_tokens_from_messages(messages, model="gpt-3.5-turbo-0613"):
    """Estimate the number of prompt tokens consumed by a list of chat messages.

    Args:
        messages: Iterable of message mappings (for example
            ``{"role": "user", "content": "..."}``). Every value of every
            mapping is tokenized and counted.
        model: Model name used to pick the tokenizer and the fixed
            per-message / per-name overhead.

    Returns:
        The summed token count of all message values, plus the per-message
        overhead, the per-name adjustment for each ``"name"`` key, and a
        constant 3 tokens for the primed assistant reply.

    Raises:
        NotImplementedError: If ``model`` is neither one of the pinned
            snapshots nor contains "gpt-3.5-turbo" or "gpt-4".
    """
    # The tokenizer is resolved first, so the "model not found" warning (if
    # any) is printed before the model-family handling below.
    try:
        tokenizer = tiktoken.encoding_for_model(model)
    except KeyError:
        print("Warning: model not found. Using cl100k_base encoding.")
        tokenizer = tiktoken.get_encoding("cl100k_base")

    pinned_snapshots = {
        "gpt-3.5-turbo-0613",
        "gpt-3.5-turbo-16k-0613",
        "gpt-4-0314",
        "gpt-4-32k-0314",
        "gpt-4-0613",
        "gpt-4-32k-0613",
    }

    if model in pinned_snapshots:
        per_message, per_name = 3, 1
    elif model == "gpt-3.5-turbo-0301":
        # every message follows <|start|>{role/name}\n{content}<|end|>\n
        # if there's a name, the role is omitted (hence the negative adjustment)
        per_message, per_name = 4, -1
    else:
        # Unpinned aliases are counted as if they were a fixed snapshot.
        if "gpt-3.5-turbo" in model:
            print("Warning: gpt-3.5-turbo may update over time. Returning num tokens assuming gpt-3.5-turbo-0613.")
            return num_tokens_from_messages(messages, model="gpt-3.5-turbo-0613")
        if "gpt-4" in model:
            print("Warning: gpt-4 may update over time. Returning num tokens assuming gpt-4-0613.")
            return num_tokens_from_messages(messages, model="gpt-4-0613")
        raise NotImplementedError(
            f"""num_tokens_from_messages() is not implemented for model {model}. See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens."""
        )

    # every reply is primed with <|start|>assistant<|message|>
    running_total = 3
    for message in messages:
        running_total += per_message
        for field, text in message.items():
            if field == "name":
                running_total += per_name
            running_total += len(tokenizer.encode(text))
    return running_total

In [4]:
# Convert a plain-text transcript into chat fine-tuning examples, written as
# JSON Lines (one JSON object per line). Expected input, one utterance per line:
# User: I'm feeling really anxious lately
# Chatbot: It's understandable to feel anxious...

SYSTEM_PROMPT = "You are an AI assistant trying to help a young user that may have mental health concerns and does not have direct access to a professional psychologist."
USER_PREFIX = "User:"
ASSISTANT_PREFIX = "Chatbot:"


def build_training_examples(lines, system_prompt=SYSTEM_PROMPT):
    """Pair each "Chatbot:" line with the most recent "User:" line.

    Args:
        lines: Iterable of raw transcript lines (an open text file works).
        system_prompt: Content of the system message added to every example.

    Returns:
        A list of ``{"messages": [system, user, assistant]}`` dicts, in
        transcript order.

    Notes:
        * Surrounding whitespace, including the trailing newline left by file
          iteration, is stripped from each utterance so it does not end up
          inside the training text.
        * Blank lines and lines without a recognised prefix are ignored.
        * A "Chatbot:" line seen before any "User:" line is skipped, because
          there is no prompt to pair it with.
    """
    examples = []
    user_line = None
    for raw_line in lines:
        line = raw_line.strip()
        if line.startswith(USER_PREFIX):
            user_line = line[len(USER_PREFIX):].strip()
        elif line.startswith(ASSISTANT_PREFIX) and user_line is not None:
            assi_line = line[len(ASSISTANT_PREFIX):].strip()
            examples.append(
                {"messages": [{"role": "system",    "content": system_prompt},
                              {"role": "user",      "content": user_line},
                              {"role": "assistant", "content": assi_line}
                             ]
                }
            )
    return examples


# Both names are read after this cell, so bind them even if the conversion fails.
lista_jsonlines = []
entries_count = 0

try:
    # `with` closes both files on every path, including when an error occurs mid-way.
    with open('conv-ft1.txt', 'r') as infile, open('conv-ft1.jsonl', 'w') as outfile:
        lista_jsonlines = build_training_examples(infile)
        for json_line in lista_jsonlines:
            print(json.dumps(json_line), file=outfile)
    entries_count = len(lista_jsonlines)
except IOError as err:
    print("Something went wrong while opening files")
    print(err)  # surface the underlying cause (missing file, permissions, ...)
else:
    print("Process completed")
    print(f"Entries dumped to JSONL file: {entries_count}")
    print()

Process completed
Entries dumped to JSONL file: 100



In [5]:
# Report the estimated prompt-token total of the whole dataset for each GPT
# model snapshot, so the figures can be fed into the pricing calculator at
# https://azure.microsoft.com/es-es/pricing/calculator/
snapshot_models = (
    "gpt-3.5-turbo-0301",
    "gpt-3.5-turbo-0613",
    "gpt-4-0314",
    "gpt-4-0613",
)

for model in snapshot_models:
    print(model)
    # Sum the per-example estimate over every entry produced by the conversion cell.
    total_tokens = sum(
        num_tokens_from_messages(entry['messages'], model)
        for entry in lista_jsonlines
    )
    print(f"{total_tokens} prompt tokens counted by num_tokens_from_messages().")
print()

gpt-3.5-turbo-0301
17542 prompt tokens counted by num_tokens_from_messages().
gpt-3.5-turbo-0613
17242 prompt tokens counted by num_tokens_from_messages().
gpt-4-0314
17242 prompt tokens counted by num_tokens_from_messages().
gpt-4-0613
17242 prompt tokens counted by num_tokens_from_messages().

