In [1]:
import pandas as pd
import json

In [2]:
# Pin the encoding: without it, open() uses the platform's locale codec, and
# the interactions contain non-ASCII (e.g. French) text that would then be
# decoded differently from one machine to another.
with open("interactions_v1.json", encoding="utf-8") as json_file:
    data = json.load(json_file)

In [5]:
# Build the working frame from the parsed JSON and preview its first rows.
df = pd.DataFrame(data)
df.head()

Unnamed: 0,confidence,interaction,sol_id,interaction_id,choices,question,display_name,name,created_at,model_type,instruction_prefix,user_prefix,assistant_prefix,is_open,prediction,prompt,instruction
0,4,"[{'role': 'system', 'content': 'Give a best ap...",1693719,4399371,,,,,,,,,,,,,
1,5,"[{'role': 'system', 'content': 'The context is...",2001870,4768577,,,,,,,,,,,,,
2,5,"[{'role': 'system', 'content': 'Explain in one...",1477066,4510626,,,,,,,,,,,,,
3,5,"[{'role': 'system', 'content': 'Explain in one...",2578719,4085955,,,,,,,,,,,,,
4,5,"[{'role': 'system', 'content': 'The context is...",1095453,4146802,,,,,,,,,,,,,


## Cleaning


Valid scores

In [4]:
# Keep only the rows whose confidence lies in the half-open range [0, 6).
df = df.loc[df["confidence"].ge(0) & df["confidence"].lt(6)]

Remove duplicate rows

In [5]:
# Keep one row per distinct "interaction" value.
# NOTE(review): later cells treat "interaction" as a Python list, and lists are
# unhashable; confirm that the installed pandas deduplicates this column as
# intended.
df = df.drop_duplicates(subset=["interaction"])

# Analysis

- Amount of NaN values in each column

In [6]:
# Missing values per column, then the number of rows left after cleaning.
nan_values_per_column = df.isnull().sum()
print(nan_values_per_column)
df.shape[0]

confidence                0
interaction               0
sol_id                    0
interaction_id            0
choices                9962
question               9804
display_name          10431
name                  10396
created_at            10477
model_type            10477
instruction_prefix    10477
user_prefix           10477
assistant_prefix      10477
is_open               10469
prediction            10469
prompt                10468
instruction           10468
dtype: int64


10560

### df_new

Intermediate dataframe to fix wrong rows


1. We remove `created_at`, `display_name`, `name` and `model_type` because, after analysis, we found that these four columns do not provide useful information.
2. We generate a new DataFrame containing only the rows that have at least one non-NaN value in the remaining auxiliary columns.

In [7]:
# Drop the metadata columns that are not used in the rest of the notebook.
df_new = df.drop(columns=["display_name", "name", "created_at", "model_type"])
# Auxiliary columns: a row is kept when at least one of them holds a value.
columns = [
    "choices",
    "question",
    "instruction_prefix",
    "user_prefix",
    "is_open",
    "assistant_prefix",
    "prediction",
    "prompt",
    "instruction",
]
df_new = df_new[df_new[columns].notna().any(axis=1)]
print(len(df_new))
df_new.head()

887


Unnamed: 0,confidence,interaction,sol_id,interaction_id,choices,question,instruction_prefix,user_prefix,assistant_prefix,is_open,prediction,prompt,instruction
1434,4,"[{'role': 'system', 'content': 'You are an AI ...",1400772,4634059,[Because the neurotransmitter receptors are se...,Why is the synaptic conductance depending on t...,,,,,,,
1435,2,"[{'role': 'system', 'content': 'You are an AI ...",2263436,4939525,"[IR-DIC requires stained tissue, IR-DIC allow ...",Which of the following statements are correct?,,,,,,,
1436,5,"[{'role': 'system', 'content': 'You are an AI ...",2378047,4265641,[Because literature contains valuable yet unst...,Why do projects like the Blue Brain Project us...,,,,,,,
1437,4,"[{'role': 'system', 'content': 'You are an AI ...",2317550,4308877,[Infer the parameters from other (related) and...,Which of the following options (4) can you cho...,,,,,,,
1438,4,"[{'role': 'system', 'content': 'You are an AI ...",1673629,4489638,[Voltage difference between the inside of the ...,What did Hodgkin and Huxley record in the firs...,,,,,,,


### df_initial

We obtain the rows of the initial data that are not in `df_new`, i.e. the rows whose auxiliary columns are all NaN.

In [8]:
# match the ones that are on df but not on df_new
indexes_to_delete = df_new.index.tolist()
df_initial = df.drop(indexes_to_delete, axis=0)
print("len of :", len(df_initial))
df_initial.head()

len of : 9673


Unnamed: 0,confidence,interaction,sol_id,interaction_id,choices,question,display_name,name,created_at,model_type,instruction_prefix,user_prefix,assistant_prefix,is_open,prediction,prompt,instruction
0,4,"[{'role': 'system', 'content': 'Give a best ap...",1693719,4399371,,,,,,,,,,,,,
1,5,"[{'role': 'system', 'content': 'The context is...",2001870,4768577,,,,,,,,,,,,,
2,5,"[{'role': 'system', 'content': 'Explain in one...",1477066,4510626,,,,,,,,,,,,,
3,5,"[{'role': 'system', 'content': 'Explain in one...",2578719,4085955,,,,,,,,,,,,,
4,5,"[{'role': 'system', 'content': 'The context is...",1095453,4146802,,,,,,,,,,,,,


## NaN values analysis

### 1. Choices Column

In [9]:
# Profile the rows of df_new that have a value in "choices".
df_without_nan = df_new[df_new["choices"].notna()]
nan_values_per_column = df_without_nan.isnull().sum()
print(nan_values_per_column)
df_without_nan.shape[0]

confidence              0
interaction             0
sol_id                  0
interaction_id          0
choices                 0
question               48
instruction_prefix    598
user_prefix           598
assistant_prefix      598
is_open               507
prediction            507
prompt                531
instruction           531
dtype: int64


598

#### 2. Question Column

In [10]:
# Profile the rows of df_new that have a value in "question".
df_without_nan = df_new[df_new["question"].notna()]
nan_values_per_column = df_without_nan.isnull().sum()
print(nan_values_per_column)
df_without_nan.shape[0]

confidence              0
interaction             0
sol_id                  0
interaction_id          0
choices               206
question                0
instruction_prefix    756
user_prefix           756
assistant_prefix      756
is_open               665
prediction            665
prompt                664
instruction           664
dtype: int64


756

#### 3. Instruction prefix Column

In [11]:
# Profile the rows of df_new that have a value in "instruction_prefix".
df_without_nan = df_new[df_new["instruction_prefix"].notna()]
nan_values_per_column = df_without_nan.isnull().sum()
print(nan_values_per_column)
df_without_nan.shape[0]

confidence             0
interaction            0
sol_id                 0
interaction_id         0
choices               83
question              83
instruction_prefix     0
user_prefix            0
assistant_prefix       0
is_open               83
prediction            83
prompt                83
instruction           83
dtype: int64


83

#### 4. User prefix Column

In [12]:
# Profile the rows of df_new that have a value in "user_prefix".
df_without_nan = df_new[df_new["user_prefix"].notna()]
nan_values_per_column = df_without_nan.isnull().sum()
print(nan_values_per_column)
df_without_nan.shape[0]

confidence             0
interaction            0
sol_id                 0
interaction_id         0
choices               83
question              83
instruction_prefix     0
user_prefix            0
assistant_prefix       0
is_open               83
prediction            83
prompt                83
instruction           83
dtype: int64


83

#### 5. Assistant Prefix Column

In [13]:
# Profile the rows of df_new that have a value in "assistant_prefix".
df_without_nan = df_new[df_new["assistant_prefix"].notna()]
nan_values_per_column = df_without_nan.isnull().sum()
print(nan_values_per_column)
df_without_nan.shape[0]

confidence             0
interaction            0
sol_id                 0
interaction_id         0
choices               83
question              83
instruction_prefix     0
user_prefix            0
assistant_prefix       0
is_open               83
prediction            83
prompt                83
instruction           83
dtype: int64


83

#### 6. Prompt column

In [14]:
# Profile the rows of df_new that have a value in "prompt".
df_without_nan = df_new[df_new["prompt"].notna()]
nan_values_per_column = df_without_nan.isnull().sum()
print(nan_values_per_column)
df_without_nan.shape[0]

confidence             0
interaction            0
sol_id                 0
interaction_id         0
choices               25
question               0
instruction_prefix    92
user_prefix           92
assistant_prefix      92
is_open               92
prediction            92
prompt                 0
instruction            0
dtype: int64


92

#### 7. Instruction Column

In [15]:
# Profile the rows of df_new that have a value in "instruction".
df_without_nan = df_new[df_new["instruction"].notna()]
nan_values_per_column = df_without_nan.isnull().sum()
print(nan_values_per_column)
df_without_nan.shape[0]

confidence             0
interaction            0
sol_id                 0
interaction_id         0
choices               25
question               0
instruction_prefix    92
user_prefix           92
assistant_prefix      92
is_open               92
prediction            92
prompt                 0
instruction            0
dtype: int64


92

## Conclusion

After an extensive analysis, we notice that there exist two main groups:

1. First group -> 83 rows
   - name
   - created_at
   - model_type
   - instruction_prefix
   - user_prefix
   - assistant_prefix

2. Second group ->  804 rows
   -  choices
   -  question
   -  display_name
   -  is_open
   -  prediction
   -  prompt
   -  instruction   


The second group has three subgroups:

2.1 **Instruction prompt group** -> 92 Rows

In this group the system instruction and the prompt were stored separately from the chat interactions.

2.2 **Prediction group** -> 91 Rows

In this group the first interaction (the question), the choices, and the last response (the prediction) were stored separately.

2.3 **Question**

# Division and new dataset

In [16]:
# Number of rows in df_new before it is split into groups.
len(df_new)

887

In [17]:
# Column families used to split df_new into the two groups.
first_group = ["instruction_prefix", "user_prefix", "assistant_prefix"]
second_group = ["choices", "question", "prediction", "prompt", "instruction", "is_open"]

# First group: rows with at least one prefix column set; the second-group
# columns are removed from it.
has_prefix = df_new[first_group].notna().any(axis=1)
df_first = df_new[has_prefix].drop(second_group, axis=1)
print(len(df_first))
df_first.head()

83


Unnamed: 0,confidence,interaction,sol_id,interaction_id,instruction_prefix,user_prefix,assistant_prefix
4354,4,"[{'role': 'user', 'content': ' Prove the given...",2414321,4304955,You are a math assistant whose main area of ex...,My request:,Let's think step by step
4355,3,"[{'role': 'user', 'content': ' Your task is to...",2370873,4545561,You are a math assistant whose main area of ex...,My request:,Let's think step by step
4356,1,"[{'role': 'user', 'content': ' Your task is to...",1501810,4454730,You are a math assistant whose main area of ex...,My request:,Let's think step by step
4357,5,"[{'role': 'user', 'content': ' Answer the give...",1770795,4113365,You are a math assistant whose main area of ex...,My request:,Let's think step by step
4358,1,"[{'role': 'user', 'content': ' Your task is to...",1568606,4121414,You are a math assistant whose main area of ex...,My request:,Let's think step by step


## 1. First group

In [18]:
# Missing values per column in the first group, then its row count.
nan_values_per_column = df_first.isnull().sum()
print(nan_values_per_column)
df_first.shape[0]

confidence            0
interaction           0
sol_id                0
interaction_id        0
instruction_prefix    0
user_prefix           0
assistant_prefix      0
dtype: int64


83

In [19]:
def addSystem(example):
    """Prepend the row's ``instruction_prefix`` as a system message.

    Parameters
    ----------
    example : mapping (e.g. a DataFrame row)
        Must provide ``"interaction"`` (a list of ``{"role", "content"}``
        dicts) and ``"instruction_prefix"``.

    Returns
    -------
    The same ``example``, whose ``"interaction"`` is a new list starting with
    the system message.  If the interaction already starts with that exact
    message, the row is returned unchanged so the cell can be re-run safely.
    """
    interaction = example["interaction"]
    system = {"role": "system", "content": example["instruction_prefix"]}
    # Re-running the cell must not stack a second copy of the same message.
    if interaction and interaction[0] == system:
        return example
    # Build a new list instead of calling ``insert``: the original list object
    # may be shared with the frames this row was derived from, and mutating it
    # in place would silently change those frames too.
    example["interaction"] = [system] + list(interaction)
    return example


# Apply addSystem row by row (axis=1) to rebuild every interaction in df_first.
df_first = df_first.apply(addSystem, axis=1)

In [20]:
# Preview the first group after the system message has been added.
df_first.head()

Unnamed: 0,confidence,interaction,sol_id,interaction_id,instruction_prefix,user_prefix,assistant_prefix
4354,4,"[{'role': 'system', 'content': 'You are a math...",2414321,4304955,You are a math assistant whose main area of ex...,My request:,Let's think step by step
4355,3,"[{'role': 'system', 'content': 'You are a math...",2370873,4545561,You are a math assistant whose main area of ex...,My request:,Let's think step by step
4356,1,"[{'role': 'system', 'content': 'You are a math...",1501810,4454730,You are a math assistant whose main area of ex...,My request:,Let's think step by step
4357,5,"[{'role': 'system', 'content': 'You are a math...",1770795,4113365,You are a math assistant whose main area of ex...,My request:,Let's think step by step
4358,1,"[{'role': 'system', 'content': 'You are a math...",1568606,4121414,You are a math assistant whose main area of ex...,My request:,Let's think step by step


## 2. second group

In [21]:
# Second group: rows of df_new with at least one second-group column set.
df_second = df_new[df_new[second_group].notna().any(axis=1)]
print(len(df_second))

804


In [22]:
# Missing values per column in the second group, then its row count.
nan_values_per_column = df_second.isnull().sum()
print(nan_values_per_column)
df_second.shape[0]

confidence              0
interaction             0
sol_id                  0
interaction_id          0
choices               206
question               48
instruction_prefix    804
user_prefix           804
assistant_prefix      804
is_open               713
prediction            713
prompt                712
instruction           712
dtype: int64


804

### 2.1 group third 

Here we divide the second group into subgroups, starting with groups three and four.

In [23]:
# Subgroup 2.1: rows that carry an instruction and/or a prompt.
third = ["instruction", "prompt"]
df_second_1 = df_second[df_second[third].notna().any(axis=1)]

In [24]:
# Missing values per column in subgroup 2.1, then its row count.
nan_values_per_column = df_second_1.isnull().sum()
print(nan_values_per_column)
df_second_1.shape[0]

confidence             0
interaction            0
sol_id                 0
interaction_id         0
choices               25
question               0
instruction_prefix    92
user_prefix           92
assistant_prefix      92
is_open               92
prediction            92
prompt                 0
instruction            0
dtype: int64


92

Fix the interactions so that they include the instruction and the prompt.

In [25]:
def create_sub(example):
    """Prefix the interaction with the row's instruction and prompt.

    A system message built from ``example["instruction"]`` and a user message
    built from ``example["prompt"]`` are placed, in that order, ahead of the
    messages already stored in ``example["interaction"]``.

    Returns the updated ``example``.
    """
    existing = example["interaction"]
    preamble = [
        {"role": "system", "content": example["instruction"]},
        {"role": "user", "content": example["prompt"]},
    ]
    example["interaction"] = preamble + existing
    return example


# Apply create_sub row by row (axis=1) to rebuild every interaction in df_second_1.
df_second_1 = df_second_1.apply(create_sub, axis=1)

Add the columns to mix with df_initial

### Redefine df_second

In [26]:
# Remove the rows already handled in df_second_1 from df_second.
indexes_to_delete = list(df_second_1.index)
df_second = df_second.drop(index=indexes_to_delete)

In [27]:
# Missing values per column in what remains of df_second, then its row count.
nan_values_per_column = df_second.isnull().sum()
print(nan_values_per_column)
df_second.shape[0]

confidence              0
interaction             0
sol_id                  0
interaction_id          0
choices               181
question               48
instruction_prefix    712
user_prefix           712
assistant_prefix      712
is_open               621
prediction            621
prompt                712
instruction           712
dtype: int64


712

### 2.2 group df fourth

In [28]:
# Subgroup 2.2: rows that carry a prediction.
forth = ["prediction"]
df_second_2 = df_second[df_second[forth].notna().any(axis=1)]

In [29]:
# Profile the rows of df_second_2 that have a value in "prediction".
df_without_nan = df_second_2[df_second_2["prediction"].notna()]
nan_values_per_column = df_without_nan.isnull().sum()
print(nan_values_per_column)
df_without_nan.shape[0]

confidence             0
interaction            0
sol_id                 0
interaction_id         0
choices                0
question               0
instruction_prefix    91
user_prefix           91
assistant_prefix      91
is_open                0
prediction             0
prompt                91
instruction           91
dtype: int64


91

In [30]:
def _strip_apology(text, marker="I apologize"):
    """Return ``text`` without the sentence that contains ``marker``."""
    at = text.find(marker)
    if at == -1:
        return text
    # Sentence boundaries: just after the previous period (or the start of the
    # text) up to and including the next period (or the end of the text).
    start = text.rfind(".", 0, at) + 1
    stop = text.find(".", at)
    stop = len(text) if stop == -1 else stop + 1
    # Removing a leading sentence leaves the separating blank behind.
    return (text[:start] + text[stop:]).lstrip()


def create_sub(example):
    """Rebuild the interaction from the question, choices and prediction.

    NOTE: this redefines ``create_sub``; an earlier cell of the notebook
    defines a different function under the same name.

    The user message is ``example["question"]``, followed, when
    ``example["is_open"]`` is falsy, by one line per entry of
    ``example["choices"]``, and closed with "Let's think step by step".
    The assistant message is ``example["prediction"]`` with the sentence
    containing "I apologize" removed, wherever that sentence occurs.

    The previous value of ``example["interaction"]`` is discarded.
    NOTE(review): confirm that dropping the stored interaction is intended.

    Returns the updated ``example``.
    """
    content = example["question"] + "\n"
    if not example["is_open"]:
        content += "Here are the choices:\n"
        for choice in example["choices"]:
            # str() keeps non-string choices (e.g. numbers) from raising.
            content += " " + str(choice) + "\n"
    content += "Let's think step by step\n"

    predict = _strip_apology(example["prediction"])

    example["interaction"] = [
        {"role": "user", "content": content},
        {"role": "assistant", "content": predict},
    ]
    return example


# Apply create_sub row by row (axis=1) to rebuild every interaction in df_second_2.
df_second_2 = df_second_2.apply(create_sub, axis=1)

### Redefine df_second a second time

In [31]:
# Remove the rows already handled in df_second_2 from df_second.
indexes_to_delete = list(df_second_2.index)
df_second = df_second.drop(index=indexes_to_delete)

In [32]:
# Missing values per column in what remains of df_second, then its row count.
nan_values_per_column = df_second.isnull().sum()
print(nan_values_per_column)
df_second.shape[0]

confidence              0
interaction             0
sol_id                  0
interaction_id          0
choices               181
question               48
instruction_prefix    621
user_prefix           621
assistant_prefix      621
is_open               621
prediction            621
prompt                621
instruction           621
dtype: int64


621

### Redefine df_second again

In [33]:
# Profile the remaining df_second rows that have a value in "choices".
df_without_nan = df_second[df_second["choices"].notna()]
nan_values_per_column = df_without_nan.isnull().sum()
print(nan_values_per_column)
df_without_nan.shape[0]

confidence              0
interaction             0
sol_id                  0
interaction_id          0
choices                 0
question               48
instruction_prefix    440
user_prefix           440
assistant_prefix      440
is_open               440
prediction            440
prompt                440
instruction           440
dtype: int64


440

In [34]:
# Profile the remaining df_second rows that have a value in "question".
df_without_nan = df_second[df_second["question"].notna()]
nan_values_per_column = df_without_nan.isnull().sum()
print(nan_values_per_column)
df_without_nan.shape[0]

confidence              0
interaction             0
sol_id                  0
interaction_id          0
choices               181
question                0
instruction_prefix    573
user_prefix           573
assistant_prefix      573
is_open               573
prediction            573
prompt                573
instruction           573
dtype: int64


573

In [35]:
# Profile the remaining df_second rows that have both a question and choices.
df_without_nan = df_second[df_second[["question", "choices"]].notna().all(axis=1)]

nan_values_per_column = df_without_nan.isnull().sum()
print(nan_values_per_column)
df_without_nan.shape[0]

confidence              0
interaction             0
sol_id                  0
interaction_id          0
choices                 0
question                0
instruction_prefix    392
user_prefix           392
assistant_prefix      392
is_open               392
prediction            392
prompt                392
instruction           392
dtype: int64


392

### 2.3 question and choices

In [36]:
# Subgroup 2.3: rows of df_second that have both a question and its choices.
df_second_3 = df_second.dropna(subset=["question", "choices"])

nan_values_per_column = df_second_3.isna().sum()
print(nan_values_per_column)
# Size of the frame built in this cell, not of a variable left over from an
# earlier cell.
len(df_second_3)

confidence              0
interaction             0
sol_id                  0
interaction_id          0
choices                 0
question                0
instruction_prefix    392
user_prefix           392
assistant_prefix      392
is_open               392
prediction            392
prompt                392
instruction           392
dtype: int64


392

In [37]:
def findUser(example):
    """Ensure the interaction has a user turn built from question and choices.

    If no message in ``example["interaction"]`` has role ``"user"``, a user
    message is added whose content is ``example["question"]``, the line
    "Here are the choices:" and the space-joined ``example["choices"]``.
    The message is placed after any leading system messages so that the
    system prompt stays first.  Rows that already contain a user message are
    returned unchanged.

    Returns the (possibly updated) ``example``.
    """
    interactions = example["interaction"]
    if any(turn["role"] == "user" for turn in interactions):
        return example

    # str() keeps non-string choices (e.g. numbers) from breaking the join.
    choices = " ".join(str(choice) for choice in example["choices"])
    user_turn = {
        "role": "user",
        "content": example["question"] + "\n" + "Here are the choices:\n" + choices,
    }
    # Count the system messages at the head of the conversation.
    n_system = 0
    for turn in interactions:
        if turn["role"] != "system":
            break
        n_system += 1
    example["interaction"] = (
        interactions[:n_system] + [user_turn] + interactions[n_system:]
    )
    return example


# Apply findUser row by row (axis=1) to every row of df_second_3.
df_second_3 = df_second_3.apply(findUser, axis=1)

### redefine df second again

In [38]:
# Remove the rows already handled in df_second_3 from df_second.
indexes_to_delete = list(df_second_3.index)
df_second = df_second.drop(index=indexes_to_delete)
print(len(df_second))
df_second.head()

229


Unnamed: 0,confidence,interaction,sol_id,interaction_id,choices,question,instruction_prefix,user_prefix,assistant_prefix,is_open,prediction,prompt,instruction
1454,4,"[{'role': 'system', 'content': 'You are an AI ...",2898230,4850941,,What is the probability of a successful transm...,,,,,,,
1492,1,"[{'role': 'system', 'content': 'You are an AI ...",1738824,4833162,,"Reorder (separated by commas, e.g. A,B,C,D,E,F...",,,,,,,
2183,1,"[{'role': 'system', 'content': 'Tu es un profe...",1241889,4369221,,Considérez deux plans infinis uniformément cha...,,,,,,,
2184,4,"[{'role': 'user', 'content': 'Les électrodes d...",1556723,4341974,,"Les électrodes d'un condensateur plan, de surf...",,,,,,,
2185,4,"[{'role': 'user', 'content': 'Deux disques A e...",2971874,4080685,,"Deux disques A et B de rayon R, alignés, peuve...",,,,,,,


### 2.4 define Question column

In [39]:
# Subgroup 2.4: remaining rows of df_second that have a question.
df_second_4 = df_second[df_second["question"].notna()]
nan_values_per_column = df_second_4.isnull().sum()
print(nan_values_per_column)
df_second_4.shape[0]

confidence              0
interaction             0
sol_id                  0
interaction_id          0
choices               181
question                0
instruction_prefix    181
user_prefix           181
assistant_prefix      181
is_open               181
prediction            181
prompt                181
instruction           181
dtype: int64


181

In [40]:
def findQuestion(example):
    """Ensure the interaction has a user turn built from the question.

    If no message in ``example["interaction"]`` has role ``"user"``, a user
    message containing ``example["question"]`` plus a trailing newline is
    added.  It is placed after any leading system messages so that the system
    prompt stays first.  Rows that already contain a user message are returned
    unchanged, whether or not that message contains the question.

    Returns the (possibly updated) ``example``.
    """
    interactions = example["interaction"]
    if any(turn["role"] == "user" for turn in interactions):
        return example

    user_turn = {"role": "user", "content": example["question"] + "\n"}
    # Count the system messages at the head of the conversation.
    n_system = 0
    for turn in interactions:
        if turn["role"] != "system":
            break
        n_system += 1
    example["interaction"] = (
        interactions[:n_system] + [user_turn] + interactions[n_system:]
    )
    return example


# Apply findQuestion row by row (axis=1) to every row of df_second_4.
df_second_4 = df_second_4.apply(findQuestion, axis=1)

### Redefine df second again

In [41]:
# Remove the rows already handled in df_second_4 from df_second.
indexes_to_delete = list(df_second_4.index)
df_second = df_second.drop(index=indexes_to_delete)
print(len(df_second))
df_second.head()

48


Unnamed: 0,confidence,interaction,sol_id,interaction_id,choices,question,instruction_prefix,user_prefix,assistant_prefix,is_open,prediction,prompt,instruction
2316,2,"[{'role': 'system', 'content': 'c'est une ques...",2517410,4754452,"[6, 10, 12, 16]",,,,,,,,
2335,5,"[{'role': 'system', 'content': 'Question sur l...",2436227,4194655,"[a + 0 = 0, a + 0 = 1, a + 0 = a, a + 1 = a]",,,,,,,,
2345,5,"[{'role': 'system', 'content': 'Question à cho...",2551041,4690318,"[26^7, 7^{26}, 52^7, 7^{52}]",,,,,,,,
2354,4,"[{'role': 'system', 'content': 'question à cho...",2337413,4456018,[a) XI (t) = X(t) pour tout t ∈ R lorsque la f...,,,,,,,,
2359,3,"[{'role': 'system', 'content': 'question d'ent...",2188632,4419492,,,,,,,,,


In [42]:
# Profile the leftover df_second rows that have a value in "choices".
df_without_nan = df_second[df_second["choices"].notna()]
nan_values_per_column = df_without_nan.isnull().sum()
print(nan_values_per_column)
df_without_nan.shape[0]

confidence             0
interaction            0
sol_id                 0
interaction_id         0
choices                0
question              48
instruction_prefix    48
user_prefix           48
assistant_prefix      48
is_open               48
prediction            48
prompt                48
instruction           48
dtype: int64


48

# Final conversion and concatenation

In [43]:
# The helper columns are no longer needed once the interactions are rebuilt.
# The list is written once instead of being repeated for every frame, and
# errors="ignore" lets the cell be re-run after the columns are already gone.
helper_columns = [
    "choices",
    "question",
    "instruction_prefix",
    "user_prefix",
    "assistant_prefix",
    "is_open",
    "prediction",
    "prompt",
    "instruction",
]
df_second_1 = df_second_1.drop(columns=helper_columns, errors="ignore")
df_second_2 = df_second_2.drop(columns=helper_columns, errors="ignore")
df_second_3 = df_second_3.drop(columns=helper_columns, errors="ignore")
df_second_4 = df_second_4.drop(columns=helper_columns, errors="ignore")
df_second = df_second.drop(columns=helper_columns, errors="ignore")

In [44]:
# Drop the prefix columns from the first group; errors="ignore" lets the cell
# be re-run after the columns are already gone.
df_first = df_first.drop(
    columns=["instruction_prefix", "user_prefix", "assistant_prefix"],
    errors="ignore",
)

In [45]:
# Reduce df_initial to the same four columns as the other groups;
# errors="ignore" lets the cell be re-run after the columns are already gone.
df_initial = df_initial.drop(
    columns=[
        "choices",
        "question",
        "display_name",
        "name",
        "created_at",
        "model_type",
        "instruction_prefix",
        "user_prefix",
        "assistant_prefix",
        "is_open",
        "prediction",
        "prompt",
        "instruction",
    ],
    errors="ignore",
)

In [46]:
# Stack every group back into a single frame; each row keeps its original
# index label.
df_combined = pd.concat(
    [
        df_first,
        df_second_1,
        df_second_2,
        df_second_3,
        df_second_4,
        df_second,
        df_initial,
    ],
    axis=0,
)
# The groups are meant to partition the cleaned frame: every row of df must
# appear exactly once in the result.
assert df_combined.index.is_unique, "a row was assigned to more than one group"
assert len(df_combined) == len(df), "rows were lost or duplicated while regrouping"

In [47]:
# Display the combined frame.
df_combined

Unnamed: 0,confidence,interaction,sol_id,interaction_id
4354,4,"[{'role': 'system', 'content': 'You are a math...",2414321,4304955
4355,3,"[{'role': 'system', 'content': 'You are a math...",2370873,4545561
4356,1,"[{'role': 'system', 'content': 'You are a math...",1501810,4454730
4357,5,"[{'role': 'system', 'content': 'You are a math...",1770795,4113365
4358,1,"[{'role': 'system', 'content': 'You are a math...",1568606,4121414
...,...,...,...,...
10830,5,"[{'role': 'system', 'content': ''}, {'role': '...",2313193,4154532
10831,5,"[{'role': 'system', 'content': ''}, {'role': '...",1640699,4805305
10832,4,"[{'role': 'system', 'content': ''}, {'role': '...",1834070,4476719
10833,4,"[{'role': 'system', 'content': ''}, {'role': '...",2822231,4128125


In [48]:
# Missing values per column in the combined frame, then its row count.
nan_values_per_column = df_combined.isnull().sum()
print(nan_values_per_column)
df_combined.shape[0]

confidence        0
interaction       0
sol_id            0
interaction_id    0
dtype: int64


10560

In [49]:
# Order the combined rows by their sol_id.
df_combined = df_combined.sort_values(by="sol_id")
df_combined

Unnamed: 0,confidence,interaction,sol_id,interaction_id
2632,2,"[{'role': 'system', 'content': 'Answer in engl...",1000851,4747078
2725,4,"[{'role': 'system', 'content': 'You are a phys...",1000851,4390722
2395,4,"[{'role': 'system', 'content': 'QCM'}, {'role'...",1001199,4967894
2575,5,"[{'role': 'user', 'content': 'Pour encoder la ...",1001199,4944321
2485,4,"[{'role': 'system', 'content': 'QCM'}, {'role'...",1001199,4039807
...,...,...,...,...
6734,1,"[{'role': 'system', 'content': 'En supposant q...",2999633,4967912
6819,5,"[{'role': 'system', 'content': 'Act as a compu...",2999633,4198754
469,5,"[{'role': 'system', 'content': 'Could you plea...",2999828,4834259
403,4,"[{'role': 'system', 'content': 'From now on, y...",2999828,4823002


## create new json

In [50]:
# Write the cleaned data as a JSON array with one object per row
# (orient="records"), indented by four spaces.
df_combined.to_json("interactions_v2.json", orient="records", indent=4)