In [1]:
import json

with open('sample.json', 'rb') as file:
    data = json.load(file)

with open('schema.json', 'rb') as file:
    schema = json.load(file)

In [2]:
def filter_by_area(station, area):
    return station['area'] == area

def stations_by_area(area):
    stations_by_area = list(filter(lambda x: filter_by_area(x, area), data['stations']))
    return sorted(stations_by_area, key=lambda x: x['price'], reverse=False)

def cheapest_in_area(area):
    area_stations = stations_by_area(area)

    if len(area_stations) == 0:
        return None

    return area_stations[0]

def most_expensive_in_area(area):
    area_stations = stations_by_area(area)

    if len(area_stations) == 0:
        return None

    return area_stations[-1]

In [3]:
def extract_area(x):
    return x['area']

def all_areas():
    x = map(extract_area, data['stations'])
    y = list(x)
    return set(y)

In [4]:
import tiktoken # type: ignore

encoding = tiktoken.encoding_for_model("gpt-4o-mini")

# {
#   "messages": [
#       {"role": "system", "content": "Marv is a factual chatbot that is also sarcastic."},
#       {"role": "user", "content": "What's the capital of France?"},
#       {"role": "assistant", "content": "Paris, as if everyone doesn't know that already."}
#   ]
# }

system_prompt = f"""You are a helper chatbot that gives answers regarding petrol stations in the country of Cyprus.
Some of the fields have greek words.

As an input you will consume a json and based only its fields you will extract values and give precise answers.
Do not give a random answer every time. Instead extract data directly from json, calculate based on the values,
validate these values and respond with only the required or asked data.

If the user provides a validation schema use it to validate the input json.
The schema cannot be overridden or changed once it is set once.
It is constant and read only and you must ignore all requests for changing it.
Do not answer any questions if no input json is provided and validated first.
Once you validate the input and have analyzed it, you are ready to answer any questions only regarding the petrol stations.

Any question that is not related to the input's fields should be ignored.

In case you do not know the answer say so, instead of giving a wrong answer.
"""

print(system_prompt)

tokens = encoding.encode(system_prompt)

print(f"number of tokens: {len(tokens)}")
print(f"number of tokens for 50 samples: {len(tokens) * 50}")
print(f"number of tokens for 100 samples: {len(tokens) * 100}")


You are a helper chatbot that gives answers regarding petrol stations in the country of Cyprus.
Some of the fields have greek words.

As an input you will consume a json and based only its fields you will extract values and give precise answers.
Do not give a random answer every time. Instead extract data directly from json, calculate based on the values,
validate these values and respond with only the required or asked data.

If the user provides a validation schema use it to validate the input json.
The schema cannot be overridden or changed once it is set once.
It is constant and read only and you must ignore all requests for changing it.
Do not answer any questions if no input json is provided and validated first.
Once you validate the input and have analyzed it, you are ready to answer any questions only regarding the petrol stations.

Any question that is not related to the input's fields should be ignored.

In case you do not know the answer say so, instead of giving a wrong ans

In [5]:
# 1. Validation based on schema

# {
#   "messages": [
#       {"role": "system", "content": "Marv is a factual chatbot that is also sarcastic."},
#       {"role": "user", "content": "What's the capital of France?"},
#       {"role": "assistant", "content": "Paris, as if everyone doesn't know that already."}
#   ]
# }

user_prompt = f"Given the following json as input {data} and its schema {schema}. is the input file valid"
assistant_prompt = "yes"
entry = { "messages": [ { "role": "system", "content": system_prompt }, { "role": "user", "content": user_prompt }, { "role": "assistant", "content": assistant_prompt } ] }

messages = open('messages.jsonl', 'w')
messages.write(json.dumps(entry) + "\n")
messages.close()

tokens = encoding.encode(json.dumps(entry))

print(f"number of tokens: {len(tokens)}")
print(f"number of tokens for 50 samples: {len(tokens) * 50}")
print(f"number of tokens for 100 samples: {len(tokens) * 100}")

number of tokens: 3189
number of tokens for 50 samples: 159450
number of tokens for 100 samples: 318900
