# ModelArena WalkThrough Section 1 - Meta

In [1]:
import string
import warnings
import numpy as np
import pandas as pd

from model_arena import ModelArena

# Ignoring warnings is never best practice!
# However, bytedmysql emits warnings that it never resolves, so for a
# cleaner presentation we silence all warnings in this notebook.
warnings.filterwarnings("ignore")
# Single ModelArena instance used by every cell below.
ma = ModelArena()

## Definition:

- raw dataset: dataset containing raw information
  
- instruction_template: template that combines all information into one single instruction

- dataset: dataset containing final instruction

- model: model with critical information of model_path, prompt_template, framework

## Prepare Raw Dataset

Suppose you have a raw dataset, with 100 samples.

The raw dataset contains two major components: *resolver_1*, *resolver_2*

Each sample also has a tag and an expected output: *tag*, *output*

In [2]:
# Build a synthetic raw dataset: every sample carries a tag, two "resolver"
# fields and an expected output derived from resolver_1.
n_samples = 100
tags = ["tag_a", "tag_b"]

# A dedicated, seeded generator makes the sample data identical on every
# "Restart & Run All" while leaving the global np.random state untouched
# (other cells draw their random dataset/model names from that global state).
rng = np.random.default_rng(42)
letters = list(string.ascii_lowercase)

df = pd.DataFrame(
    {
        "tag": rng.choice(tags, n_samples),
        # integers in [0, 100), same range as np.random.randint(0, 100, ...)
        "resolver_1": rng.integers(0, 100, n_samples),
        # five random lowercase letters per sample
        "resolver_2": ["".join(rng.choice(letters, 5)) for _ in range(n_samples)],
    }
)
# The expected output is resolver_1 shifted by 10000.
df["output"] = df["resolver_1"] + 10000

df

Unnamed: 0,tag,resolver_1,resolver_2,output
0,tag_a,71,hdzag,10071
1,tag_a,13,rwqzc,10013
2,tag_a,95,rouyq,10095
3,tag_b,6,qkqce,10006
4,tag_a,67,hbrdc,10067
...,...,...,...,...
95,tag_a,94,ypzlz,10094
96,tag_b,19,fnwpm,10019
97,tag_b,82,artze,10082
98,tag_b,97,ooooe,10097


You can combine the relevant fields into a single **information** column using:

In [3]:
# Pack the two resolver fields of every row into one dict per sample ...
resolver_columns = ["resolver_1", "resolver_2"]
information_records = df[resolver_columns].to_dict(orient="records")

# ... and keep only the columns used by the raw dataset cells below.
df = df.assign(information=information_records)[["tag", "information", "output"]]
df

Unnamed: 0,tag,information,output
0,tag_a,"{'resolver_1': 71, 'resolver_2': 'hdzag'}",10071
1,tag_a,"{'resolver_1': 13, 'resolver_2': 'rwqzc'}",10013
2,tag_a,"{'resolver_1': 95, 'resolver_2': 'rouyq'}",10095
3,tag_b,"{'resolver_1': 6, 'resolver_2': 'qkqce'}",10006
4,tag_a,"{'resolver_1': 67, 'resolver_2': 'hbrdc'}",10067
...,...,...,...
95,tag_a,"{'resolver_1': 94, 'resolver_2': 'ypzlz'}",10094
96,tag_b,"{'resolver_1': 19, 'resolver_2': 'fnwpm'}",10019
97,tag_b,"{'resolver_1': 82, 'resolver_2': 'artze'}",10082
98,tag_b,"{'resolver_1': 97, 'resolver_2': 'ooooe'}",10097


## Raw Dataset Operation

You can simply use the *add* function to add a new raw dataset, and the *get* function to retrieve its data.

In [4]:
# Random 10-letter lowercase name for the first raw dataset.
lowercase_letters = list(string.ascii_lowercase)
raw_dataset_name_1 = "".join(np.random.choice(lowercase_letters, 10))

# Register the full frame (tag, information, output), then read it back.
ma.datasets.raw_datasets.add(
    dataset=raw_dataset_name_1,
    df=df,
)
ma.datasets.raw_datasets.get(raw_dataset_name_1)

Unnamed: 0,dataset_name,dataset_id,tag,information,output
0,dqguyljiqe,40a11df8a3414a8a882f11c7997b06a5,tag_a,"{""resolver_1"": 71, ""resolver_2"": ""hdzag""}",10071
1,dqguyljiqe,d963f25bd141425cbd1d75675b9974f3,tag_a,"{""resolver_1"": 13, ""resolver_2"": ""rwqzc""}",10013
2,dqguyljiqe,9f3c215723dd4bd9b417abdcd51bc67b,tag_a,"{""resolver_1"": 95, ""resolver_2"": ""rouyq""}",10095
3,dqguyljiqe,ac85ed4a4ee34d0a8d17361be78d0a64,tag_a,"{""resolver_1"": 67, ""resolver_2"": ""hbrdc""}",10067
4,dqguyljiqe,a59a27e5b500468bb372ac47edcb3508,tag_a,"{""resolver_1"": 93, ""resolver_2"": ""rjqxa""}",10093
...,...,...,...,...,...
95,dqguyljiqe,95ca7a4594a3424bb794c5973eb29e1a,tag_b,"{""resolver_1"": 76, ""resolver_2"": ""haldo""}",10076
96,dqguyljiqe,bbf4bd1ce5c14596a122b3877c38325b,tag_b,"{""resolver_1"": 15, ""resolver_2"": ""qacbl""}",10015
97,dqguyljiqe,e2f0c8731a2d47b6b2a6bd893cf1f715,tag_b,"{""resolver_1"": 19, ""resolver_2"": ""fnwpm""}",10019
98,dqguyljiqe,0e405f7933dd408b931971918b192942,tag_b,"{""resolver_1"": 82, ""resolver_2"": ""artze""}",10082


**NOTE**: The *output* column is optional in dataset preparation.

In [5]:
# Random 10-letter lowercase name for the second raw dataset.
lowercase_letters = list(string.ascii_lowercase)
raw_dataset_name_2 = "".join(np.random.choice(lowercase_letters, 10))

# This time the frame is registered without its "output" column.
df_without_output = df[["tag", "information"]]
ma.datasets.raw_datasets.add(
    dataset=raw_dataset_name_2,
    df=df_without_output,
)
ma.datasets.raw_datasets.get(raw_dataset_name_2)

Unnamed: 0,dataset_name,dataset_id,tag,information,output
0,kkuahrvsqv,64b0a3da169542aba0bc6b995b476694,tag_a,"{""resolver_1"": 71, ""resolver_2"": ""hdzag""}",
1,kkuahrvsqv,b754efad56754ab3b20eae1569300881,tag_a,"{""resolver_1"": 13, ""resolver_2"": ""rwqzc""}",
2,kkuahrvsqv,9b52fdde2661423393527e4bb00c7374,tag_a,"{""resolver_1"": 95, ""resolver_2"": ""rouyq""}",
3,kkuahrvsqv,8b20eef17ed0468b92c67fd10097c372,tag_a,"{""resolver_1"": 67, ""resolver_2"": ""hbrdc""}",
4,kkuahrvsqv,00014e93e76a457c89b2207aa35a836b,tag_a,"{""resolver_1"": 93, ""resolver_2"": ""rjqxa""}",
...,...,...,...,...,...
95,kkuahrvsqv,04a967d8f7d5428791d58c24c117a9d3,tag_b,"{""resolver_1"": 76, ""resolver_2"": ""haldo""}",
96,kkuahrvsqv,56941a6a1ff3499ab5359d68c2956c9b,tag_b,"{""resolver_1"": 15, ""resolver_2"": ""qacbl""}",
97,kkuahrvsqv,c0b06ff3c1e74d3084ff04db9f753468,tag_b,"{""resolver_1"": 19, ""resolver_2"": ""fnwpm""}",
98,kkuahrvsqv,d0e0f9366e1943aabccec8bd25565e15,tag_b,"{""resolver_1"": 82, ""resolver_2"": ""artze""}",


## Dataset Operation

Once you have added the raw dataset, you can dive into the dataset operations.

Here too, you can use the *add* function to add a new dataset, and the *get* function to retrieve its data.

**NOTE**: a dataset uses an instruction template to combine the information inside a raw dataset into a single instruction.

In [6]:
# This template references both components stored in "information".
instruction_template = "The file {resolver_2} has a fake number {resolver_1}."

# Random 10-letter lowercase name for the new dataset.
lowercase_letters = list(string.ascii_lowercase)
dataset_name_1 = "".join(np.random.choice(lowercase_letters, 10))

# The dataset is described by its source raw dataset and the template.
dataset_records_1 = {
    "raw_dataset_name": raw_dataset_name_1,
    "instruction_template": instruction_template,
}
ma.datasets.add(dataset=dataset_name_1, records=dataset_records_1)

ma.datasets.get(dataset_name_1)

Unnamed: 0,dataset_name,dataset_id,raw_dataset_id,tag,instruction,output
0,eqlrbbvdfl,09d6f620bd6a4f22b783f78d7ee647d0,40a11df8a3414a8a882f11c7997b06a5,tag_a,The file hdzag has a fake number 71.,10071
1,eqlrbbvdfl,01fc5d917f0e4acd9375c7298f0cf9cb,d963f25bd141425cbd1d75675b9974f3,tag_a,The file rwqzc has a fake number 13.,10013
2,eqlrbbvdfl,e2d46f916cdd438d861892ea1a7a4a30,9f3c215723dd4bd9b417abdcd51bc67b,tag_a,The file rouyq has a fake number 95.,10095
3,eqlrbbvdfl,96b55b871b624319a6db20613aafcf9a,ac85ed4a4ee34d0a8d17361be78d0a64,tag_a,The file hbrdc has a fake number 67.,10067
4,eqlrbbvdfl,c27e8af24e5140508d978ea981c48029,a59a27e5b500468bb372ac47edcb3508,tag_a,The file rjqxa has a fake number 93.,10093
...,...,...,...,...,...,...
95,eqlrbbvdfl,3e3ccc2eff0d4aba9e26ac375b61f235,95ca7a4594a3424bb794c5973eb29e1a,tag_b,The file haldo has a fake number 76.,10076
96,eqlrbbvdfl,0032eba0b58e460d85439dc0a52dbd3f,bbf4bd1ce5c14596a122b3877c38325b,tag_b,The file qacbl has a fake number 15.,10015
97,eqlrbbvdfl,6ed7bb5deac14343a066dec840d8abe8,e2f0c8731a2d47b6b2a6bd893cf1f715,tag_b,The file fnwpm has a fake number 19.,10019
98,eqlrbbvdfl,2e6c0dadbb33453eacdea71d72d6b5be,0e405f7933dd408b931971918b192942,tag_b,The file artze has a fake number 82.,10082


After some digging, we find out that *resolver_2* is irrelevant to our task, so we decide to remove it and construct a new dataset.

In [7]:
# This template references only resolver_1 from "information".
instruction_template = "The fake number is {resolver_1}."

# Random 10-letter lowercase name for the second dataset.
lowercase_letters = list(string.ascii_lowercase)
dataset_name_2 = "".join(np.random.choice(lowercase_letters, 10))

# Same raw dataset as before, combined with the new template.
dataset_records_2 = {
    "raw_dataset_name": raw_dataset_name_1,
    "instruction_template": instruction_template,
}
ma.datasets.add(dataset=dataset_name_2, records=dataset_records_2)

ma.datasets.get(dataset_name_2)

Unnamed: 0,dataset_name,dataset_id,raw_dataset_id,tag,instruction,output
0,fiigezjpbp,dc16aaf9e61a4d828dde480a1f1657e0,40a11df8a3414a8a882f11c7997b06a5,tag_a,The fake number is 71.,10071
1,fiigezjpbp,520c8c1963e2441985576d73194d3272,d963f25bd141425cbd1d75675b9974f3,tag_a,The fake number is 13.,10013
2,fiigezjpbp,2670369b61a640f6948f573b032b01d7,9f3c215723dd4bd9b417abdcd51bc67b,tag_a,The fake number is 95.,10095
3,fiigezjpbp,bb1217bbc63641d19d12cf28447303e3,ac85ed4a4ee34d0a8d17361be78d0a64,tag_a,The fake number is 67.,10067
4,fiigezjpbp,a027e95901ff4e12bc6c65f5db19cff7,a59a27e5b500468bb372ac47edcb3508,tag_a,The fake number is 93.,10093
...,...,...,...,...,...,...
95,fiigezjpbp,c2492efabe314a70bfa8eec923e542e3,95ca7a4594a3424bb794c5973eb29e1a,tag_b,The fake number is 76.,10076
96,fiigezjpbp,377c86261e4e407888b97d750075b2ff,bbf4bd1ce5c14596a122b3877c38325b,tag_b,The fake number is 15.,10015
97,fiigezjpbp,e58d0a76dce644ce982dd9acef62fdf3,e2f0c8731a2d47b6b2a6bd893cf1f715,tag_b,The fake number is 19.,10019
98,fiigezjpbp,9315d2c4a67047e280613992484f6075,0e405f7933dd408b931971918b192942,tag_b,The fake number is 82.,10082


## Model Operation

Likewise, you can use the *add* function to add a new model, and the *get* function to retrieve the model information.

**NOTE**: the prompt template **MUST** have *{instruction}* inside, which will fill the dataset instruction accordingly.

In [8]:
# Model name ending in a random 3-letter lowercase version suffix.
version_suffix = "".join(np.random.choice(list(string.ascii_lowercase), 3))
model = f"a new calculator model, version {version_suffix}"

model_path = "/mnt/bn/..."
# The prompt template carries the {instruction} placeholder.
prompt_template = "Given a number, you should return the addition of 10000 and that number.\n{instruction}"
framework = "huggingface"  # the inference framework

model_records = {
    "model_path": model_path,
    "prompt_template": prompt_template,
    "framework": framework,
    # optional
    "author": "agi god",
    "remark": "this is an agi model specialized in calculation, blah",
}
ma.models.add(model=model, records=model_records)

ma.models.get(model)

Unnamed: 0,model_name,model_id,model_path,framework,author,prompt_template,remark
0,"a new calculator model, version vse",519bffa7d2304a559f68831098051752,/mnt/bn/...,huggingface,agi god,"Given a number, you should return the addition...",this is an agi model specialized in calculatio...


**ALL SET** !!

Let's take a peek at what inference will look like in the following section.

In [9]:
# Build the inference frame for dataset_name_1 paired with `model`.
# According to the notice this call prints, the frame gets `prompt` and
# `output` columns that must be filled before calling `add_inferences`.
inferences = ma.generate_inferences(dataset=dataset_name_1, model=model)
inferences

You have directly call `generate_inferences` to acquire a dataframe, this will build [('prompt', <class 'str'>), ('output', <class 'str'>)] columns in the dataframe. Please remeber to fill them, before call `add_inferences`.


Unnamed: 0,dataset_name,dataset_id,model_id,prompt,output
0,eqlrbbvdfl,09d6f620bd6a4f22b783f78d7ee647d0,519bffa7d2304a559f68831098051752,"Given a number, you should return the addition...",
1,eqlrbbvdfl,01fc5d917f0e4acd9375c7298f0cf9cb,519bffa7d2304a559f68831098051752,"Given a number, you should return the addition...",
2,eqlrbbvdfl,e2d46f916cdd438d861892ea1a7a4a30,519bffa7d2304a559f68831098051752,"Given a number, you should return the addition...",
3,eqlrbbvdfl,96b55b871b624319a6db20613aafcf9a,519bffa7d2304a559f68831098051752,"Given a number, you should return the addition...",
4,eqlrbbvdfl,c27e8af24e5140508d978ea981c48029,519bffa7d2304a559f68831098051752,"Given a number, you should return the addition...",
...,...,...,...,...,...
95,eqlrbbvdfl,3e3ccc2eff0d4aba9e26ac375b61f235,519bffa7d2304a559f68831098051752,"Given a number, you should return the addition...",
96,eqlrbbvdfl,0032eba0b58e460d85439dc0a52dbd3f,519bffa7d2304a559f68831098051752,"Given a number, you should return the addition...",
97,eqlrbbvdfl,6ed7bb5deac14343a066dec840d8abe8,519bffa7d2304a559f68831098051752,"Given a number, you should return the addition...",
98,eqlrbbvdfl,2e6c0dadbb33453eacdea71d72d6b5be,519bffa7d2304a559f68831098051752,"Given a number, you should return the addition...",


## **HOWTO** drop these meta?

You can simply use the *drop* function to drop any of this metadata.
Always remember to use this function with caution.

Sadly, we have to remove all fake datasets and models in this section.

In [10]:
# Clean up everything this walkthrough created, in the same order as it
# was registered: raw datasets, then datasets, then the model.
for raw_dataset_name in (raw_dataset_name_1, raw_dataset_name_2):
    ma.datasets.raw_datasets.drop(raw_dataset_name)

for dataset_name in (dataset_name_1, dataset_name_2):
    ma.datasets.drop(dataset_name)

ma.models.drop(model)