## This file implements a splitter for the training set, to avoid hitting OpenAI rate limits.

### Install Libraries

In [1]:
%pip install pandas

Note: you may need to restart the kernel to use updated packages.


### Imports

In [2]:
import os
import pandas as pd

### Read training dataset (EN)

In [3]:
# Load the English training split and discard the 'Unnamed: 0' column in one
# step (presumably a row index that was written out when the CSV was saved).
df_train = pd.read_csv('data/train.csv').drop(columns=['Unnamed: 0'])

In [4]:
# Show the loaded training frame as this cell's output.
df_train

Unnamed: 0,context,question,answerA,answerB,answerC,correct
0,"Even though she had homework to do that night,...",What will Jesse want to do next?,read homework to Skylar,help Skylar finish,skip her studying,B
1,"After school, Casey met the friend at a bar so...",Why did Casey do this?,have a good idea of the material,goof around with a friend,have a few drinks and leave,A
2,Jesse went quickly to their mother and their m...,How would Jesse feel afterwards?,wasting their time,that they are a good child,that their mother always calls them,B
3,Robin knew that Kai really wanted her to the l...,Why did Robin do this?,paid her to say that she liked it,she never really liked Kai or her fashion,she knew Kai wanted Robin to like the outfit,C
4,Addison slept well last night after playing ba...,Why did Addison do this?,regain her energy,hit a home run,run the bases,A
...,...,...,...,...,...,...
33405,Quinn had a research paper due at midnight. Qu...,What will Quinn want to do next?,wait till the elast minute,start the project at midnight,try to finish their paper,C
33406,Ash made some new friends on the first day in ...,Why did Ash do this?,grab a coke with his new friends,do HW with his new friend,be sociable,C
33407,Carson decided to try online dating so he foun...,What does Carson need to do before this?,get married before this,decide to date before this,finish school before this,B
33408,Jesse put the turkey in the fridge for it to t...,How would Others feel as a result?,hungry,thirsty,anticipatory,C


### Split the source-language training set into chunks of 5k rows

In [5]:
def df_splitter(df, chunk_size=5000):
    """Split a DataFrame into consecutive row chunks of at most ``chunk_size`` rows.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to split. Rows are taken by position, so each chunk keeps the
        index labels its rows had in ``df``.
    chunk_size : int, default 5000
        Maximum number of rows per chunk. Must be a positive integer.

    Returns
    -------
    list of pd.DataFrame
        Chunks in the original row order; the last one may be shorter than
        ``chunk_size``. An empty input yields an empty list. Each chunk is
        labelled ``df_chunk_<n>`` (1-based) in ``chunk.attrs["name"]`` and,
        when the frame has no column called ``name``, also as a plain
        ``chunk.name`` attribute.

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    df_chunks = []
    for chunk_no, start in enumerate(range(0, len(df), chunk_size), start=1):
        # Wrap the slice in its own DataFrame object so the label set below
        # is attached to the chunk and not to a temporary.
        chunk = pd.DataFrame(df.iloc[start:start + chunk_size])

        label = f"df_chunk_{chunk_no}"
        chunk.attrs["name"] = label
        # Attribute assignment on a DataFrame writes into the column of the
        # same name when one exists, which would destroy a real "name" column.
        # Only set the convenience attribute when there is no such column.
        if "name" not in chunk.columns:
            chunk.name = label

        df_chunks.append(chunk)

    return df_chunks

# Split the English training frame into a list of chunk DataFrames.
df_train_sets = df_splitter(df_train)

In [13]:
# Write every English chunk to its own numbered CSV file.
train_sets_dir = 'data/train_sets'
# DataFrame.to_csv does not create missing folders, so make sure the target
# directory exists before writing (no-op when it is already there).
os.makedirs(train_sets_dir, exist_ok=True)

for i, df_chunk in enumerate(df_train_sets, start=1):
    print(f"train set {i} split rows = {len(df_chunk)}")
    # Save to filesystem
    df_chunk.to_csv(f"{train_sets_dir}/train_{i}.csv")

train set 1 split rows = 5000
train set 2 split rows = 5000
train set 3 split rows = 5000
train set 4 split rows = 5000
train set 5 split rows = 5000
train set 6 split rows = 5000
train set 7 split rows = 3410


### Join the target language training sets and split

In [14]:
# Text columns present in every translated file; each gets a per-model suffix.
PT_TEXT_COLUMNS = ['context', 'question', 'answerA', 'answerB', 'answerC']


def load_pt_translation(path, suffix, keep_correct):
    """Read one translated training CSV and prepare it for the side-by-side merge.

    Parameters
    ----------
    path : str
        Location of the translated CSV file.
    suffix : int or str
        Appended as ``_<suffix>`` to every text column name.
    keep_correct : bool
        Whether to keep the ``correct`` column. Only one of the merged frames
        should keep it so the merged result has a single ``correct`` column.

    Returns
    -------
    pd.DataFrame
        The frame without the 'Unnamed: 0.1' / 'Unnamed: 0' columns, with the
        text columns renamed and ``correct`` kept or dropped as requested.
    """
    unwanted = ['Unnamed: 0.1', 'Unnamed: 0']
    if not keep_correct:
        unwanted.append('correct')
    renamed = {col: f'{col}_{suffix}' for col in PT_TEXT_COLUMNS}
    return pd.read_csv(path).drop(columns=unwanted).rename(columns=renamed)


# One frame per translation model; only the first keeps the 'correct' column.
df_train1_pt = load_pt_translation('translated/marian_mt/train_pt.csv', 1, keep_correct=True)
df_train2_pt = load_pt_translation('translated/unicamp_t5/train_pt.csv', 2, keep_correct=False)
df_train3_pt = load_pt_translation('translated/nllb_1.3b/train_pt.csv', 3, keep_correct=False)

# pd.concat(axis=1) aligns rows on the index: files with different row counts
# would be padded with NaN instead of failing, pairing translations with the
# wrong 'correct' value. Stop early if the three files disagree.
pt_row_counts = [len(df_train1_pt), len(df_train2_pt), len(df_train3_pt)]
if len(set(pt_row_counts)) != 1:
    raise ValueError(f"translated training files differ in row count: {pt_row_counts}")

# Merge the three dataframes side by side
df_train_pt = pd.concat((df_train1_pt, df_train2_pt, df_train3_pt), axis=1)

# Group the columns by field (context_1, context_2, context_3, question_1, ...)
# and put the 'correct' column last.
pt_column_order = [f'{col}_{n}' for col in PT_TEXT_COLUMNS for n in (1, 2, 3)]
df_train_pt = df_train_pt[pt_column_order + ['correct']]

In [15]:
# Split the merged Portuguese training frame into a list of chunk DataFrames.
df_train_pt_sets = df_splitter(df_train_pt)

In [16]:
# Write every Portuguese chunk to its own numbered CSV file.
pt_train_sets_dir = 'translated/train_sets'
# DataFrame.to_csv does not create missing folders, so make sure the target
# directory exists before writing (no-op when it is already there).
os.makedirs(pt_train_sets_dir, exist_ok=True)

for i, df_chunk in enumerate(df_train_pt_sets, start=1):
    print(f"PT train set {i} split rows = {len(df_chunk)}")
    # Save to filesystem
    df_chunk.to_csv(f"{pt_train_sets_dir}/train_pt_{i}.csv")

PT train set 1 split rows = 5000
PT train set 2 split rows = 5000
PT train set 3 split rows = 5000
PT train set 4 split rows = 5000
PT train set 5 split rows = 5000
PT train set 6 split rows = 5000
PT train set 7 split rows = 3410
