In [7]:
from datasets import load_dataset, Dataset, load_from_disk

# Load the "camel-ai/physics" dataset and keep only its "train" split.
# The bare name on the last line makes the notebook display a summary of it.
dataset = load_dataset("camel-ai/physics")["train"]
dataset

Dataset({
    features: ['role_1', 'topic;', 'sub_topic', 'message_1', 'message_2'],
    num_rows: 20000
})

In [2]:
# Look at a single record (index 100) to see what each column holds.
dataset[100]

{'role_1': 'Physicist_RoleType.ASSISTANT',
 'topic;': 'Quantum mechanics',
 'sub_topic': 'The hydrogen atom and its energy levels',
 'message_1': 'What is the energy difference between the ground state and the first excited state of a hydrogen atom?',
 'message_2': "The energy difference between the ground state (n=1) and the first excited state (n=2) of a hydrogen atom can be calculated using the Rydberg formula for energy levels:\n\nE_n = -13.6 eV / n^2\n\nwhere E_n is the energy of the electron in the nth energy level, and n is the principal quantum number.\n\nFirst, let's find the energy of the ground state (n=1):\n\nE_1 = -13.6 eV / 1^2 = -13.6 eV\n\nNow, let's find the energy of the first excited state (n=2):\n\nE_2 = -13.6 eV / 2^2 = -13.6 eV / 4 = -3.4 eV\n\nFinally, let's find the energy difference between the two states:\n\nΔE = E_2 - E_1 = -3.4 eV - (-13.6 eV) = 10.2 eV\n\nSo, the energy difference between the ground state and the first excited state of a hydrogen atom is 10

In [3]:
# Upload the train split to the dataset hub under the repo id below.
# NOTE(review): this is a network side effect that runs on every "Run All";
# it presumably needs a logged-in token with write access -- confirm.
hub_repo_id = "dim/camel_ai_physics"
dataset.push_to_hub(hub_repo_id)

Creating parquet from Arrow format: 100%|██████████| 20/20 [00:00<00:00, 24.00ba/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:24<00:00, 24.79s/it]


In [9]:
import tiktoken

# Resolve the tiktoken encoding associated with a specific OpenAI model name.
model_name = "gpt-3.5-turbo"
tokenizer = tiktoken.encoding_for_model(model_name)

In [6]:
# Sanity-check the tokenizer on a short English string.
tokenizer.encode("hello world")

[15339, 1917]

In [10]:
# The same call on a Russian phrase ("Hello world"); it encodes to more
# token ids than the English string does.
tokenizer.encode("Привет мир")

[54745, 28089, 8341, 11562, 78746]

In [11]:
def _with_token_count(example):
    """Return a copy of ``example`` with an added ``len_en`` field.

    ``len_en`` is the number of token ids that ``tokenizer`` produces for
    ``message_1`` concatenated directly with ``message_2`` (no separator
    is inserted between the two strings).
    """
    combined_text = example["message_1"] + example["message_2"]
    token_ids = tokenizer.encode(combined_text)
    return {**example, "len_en": len(token_ids)}


# Annotate every row with its token count, using 20 worker processes.
dataset = dataset.map(_with_token_count, num_proc=20)

Map (num_proc=20): 100%|██████████| 20000/20000 [00:01<00:00, 12099.77 examples/s]


In [13]:
# Convert the dataset to a pandas DataFrame for the column sum further down.
# NOTE(review): this rebinds `dataset` to a different type, so earlier cells
# that call Dataset methods (e.g. push_to_hub) will not work if re-run after
# this cell without reloading the data first.
dataset = dataset.to_pandas()

In [14]:
# Display the DataFrame, which now includes the len_en column.
dataset

Unnamed: 0,role_1,topic;,sub_topic,message_1,message_2,len_en
0,Physicist_RoleType.ASSISTANT,Quantum mechanics,The Schrödinger equation and its solutions,What is the probability of finding a particle ...,To find the probability of finding a particle ...,635
1,Physicist_RoleType.ASSISTANT,Quantum mechanics,The Schrödinger equation and its solutions,What is the time-independent Schrödinger equat...,The time-independent Schrödinger equation is a...,586
2,Physicist_RoleType.ASSISTANT,Quantum mechanics,The Schrödinger equation and its solutions,Determine the wave function and energy eigenva...,To determine the wave function and energy eige...,526
3,Physicist_RoleType.ASSISTANT,Quantum mechanics,The Schrödinger equation and its solutions,What are the possible energy levels and wave f...,"To solve this problem, we need to apply the ti...",871
4,Physicist_RoleType.ASSISTANT,Quantum mechanics,The Schrödinger equation and its solutions,If a particle is located in a one-dimensional ...,"Yes, I can help you find the possible energy l...",569
...,...,...,...,...,...,...
19995,Physicist_RoleType.ASSISTANT,Biophysics,Modeling the dynamics of protein-protein aggre...,"""How can we use mathematical modeling to deter...",To use mathematical modeling to determine the ...,502
19996,Physicist_RoleType.ASSISTANT,Biophysics,Modeling the dynamics of protein-protein aggre...,What are the physical mechanisms that govern t...,The physical mechanisms that govern the dynami...,450
19997,Physicist_RoleType.ASSISTANT,Biophysics,Modeling the dynamics of protein-protein aggre...,What is the impact of external physical factor...,"The impact of external physical factors, such ...",575
19998,Physicist_RoleType.ASSISTANT,Biophysics,Modeling the dynamics of protein-protein aggre...,How can we use mathematical modeling to predic...,Mathematical modeling can be a powerful tool f...,545


In [16]:
# Rough cost estimate: total token count, expressed in thousands of tokens,
# multiplied by a per-1K-token rate.
# NOTE(review): 0.0015 is presumably the USD price per 1K tokens for the
# model chosen above at the time of writing -- confirm against current pricing.
total_tokens = dataset['len_en'].sum()
tokens_per_pricing_unit = 1000
rate_per_unit = 0.0015
total_tokens / tokens_per_pricing_unit * rate_per_unit

15.807661500000002