# Resizing Files
- This notebook splits each category of generated data into four smaller files so that they can be processed more easily by GPT
- It also includes the train-dev-test split process

In [6]:
import glob
import pandas as pd
import re
import os
import random

## 1. Resizing Files

The resizing step mainly uses code from the following GitHub repository: [ICF-activities-classifier](https://github.com/cltl-students/ICF-activities-classifier)

In [7]:
def cut_size(lst, sep = "-"*20):
    """Split a flat list of utterance lines into four consecutive parts.

    Conversations are delimited by ``sep``: a separator line closes the
    conversation that precedes it and stays in the same part as that
    conversation. Conversations are handed out in order, so each part
    receives roughly a quarter of them.

    Args:
        lst: Flat list of lines, with ``sep`` after each conversation.
        sep: The line that marks the end of a conversation.

    Returns:
        A tuple ``(lst_1, lst_2, lst_3, lst_4)`` of lists. Concatenated in
        order they reproduce ``lst``. When ``lst`` contains no separator at
        all, every line ends up in ``lst_4``.

    Side effects:
        Prints whether the number of separators seen while splitting equals
        the number counted up front (a sanity check).
    """
    num = sum(1 for x in lst if x == sep)
    count = 0  # number of conversations closed so far
    lst_1, lst_2, lst_3, lst_4 = [], [], [], []
    for e in lst:
        # Pick the part from how many conversations are already closed.
        if count < num / 4:
            target = lst_1
        elif count < num / 2:
            target = lst_2
        elif count < (num / 4) * 3:
            target = lst_3
        else:
            target = lst_4
        target.append(e)
        # Compare against `sep` (not a hard-coded literal) so that a custom
        # separator actually advances the split.
        if e == sep:
            count += 1
    print(num == count)
    return lst_1, lst_2, lst_3, lst_4

In [8]:
# Split every cleaned conversation file into four smaller TSV files
# (<name>_1.tsv ... <name>_4.tsv) under <dir>resized_files/.
# NOTE: `dir` shadows the builtin of the same name; the name is kept so the
# notebook's global names stay unchanged.
dir = './response_data/'
files = glob.glob(f"{dir}clean_conversations/*.tsv")

# exist_ok avoids the separate existence check
os.makedirs(f"{dir}resized_files", exist_ok=True)

for f in files:
    df = pd.read_csv(f, sep="\t")
    lst = df["utterances"].tolist()

    # str.rstrip(".tsv") strips any trailing '.', 't', 's' or 'v' characters
    # (e.g. "cats.tsv" -> "ca"); splitext removes only the extension, and
    # basename works with either path separator.
    name = os.path.splitext(os.path.basename(f))[0]

    for i, part in enumerate(cut_size(lst), start=1):
        part_path = f"{dir}resized_files/{name}_{i}.tsv"
        # index=False keeps the output to the single "utterances" column,
        # matching what the train-dev-test cell writes to the same paths.
        pd.DataFrame({"utterances": part}).to_csv(part_path, sep="\t", index=False)


True
True
True


In [13]:
def cut_size(lst, sep="-"*20):
    """Distribute the lines of ``lst`` over four consecutive chunks.

    ``sep`` marks the end of a conversation. The chunk a line goes to is
    decided by how many separators have been passed before it, so a
    separator always lands in the same chunk as the conversation it closes.

    Returns a tuple of four lists; if ``lst`` holds no separator, all lines
    are placed in the fourth list.
    """
    total = sum(1 for item in lst if item == sep)
    # Exclusive upper limits (in passed separators) for chunks 1, 2 and 3;
    # anything at or beyond the last limit belongs to chunk 4.
    limits = (total / 4, total / 2, (total / 4) * 3)
    quarters = ([], [], [], [])

    passed = 0
    for item in lst:
        slot = next((i for i, limit in enumerate(limits) if passed < limit), 3)
        quarters[slot].append(item)
        if item == sep:
            passed += 1
    return quarters

def extract_dev_test(lst, sep="-"*20, num_conversations=18, seed=None):
    '''
    Randomly extract a number of conversations from a flat list of lines, to be used as dev or test data.

    The input is cut into conversations at every line equal to `sep` (the separator stays at the end of
    its conversation); trailing lines without a closing separator form one last conversation.

    Parameters:
        lst: flat list of lines.
        sep: line that marks the end of a conversation.
        num_conversations: how many conversations to extract.
        seed: optional seed for a private random generator; when None the global `random` state is used.

    Returns:
        (selected, remaining): two flat lists of lines. `selected` holds the extracted conversations in the
        order they were drawn, `remaining` holds all other conversations in their original order.

    Raises:
        ValueError: if `num_conversations` is negative or larger than the number of conversations in `lst`.
    '''
    conversations = []
    current_conversation = []
    for line in lst:
        current_conversation.append(line)
        if line == sep:
            conversations.append(current_conversation)
            current_conversation = []
    if current_conversation:
        conversations.append(current_conversation)

    if not 0 <= num_conversations <= len(conversations):
        raise ValueError(
            f"Cannot extract {num_conversations} conversations from {len(conversations)} available"
        )

    rng = random if seed is None else random.Random(seed)
    # Sample positions rather than the conversations themselves: filtering by
    # value (`c not in selected`) would also drop every duplicate of a
    # selected conversation from the remaining data.
    picked = rng.sample(range(len(conversations)), num_conversations)
    picked_positions = set(picked)

    selected = [line for i in picked for line in conversations[i]]
    remaining = [
        line
        for i, conversation in enumerate(conversations)
        if i not in picked_positions
        for line in conversation
    ]

    return selected, remaining

## 2. Train-dev-test Split

In [14]:
# Randomly pick 18 conversations from each category to use as test data, 18 as dev data, and the rest as training data.
# The training data of each category is then split into four smaller files.
# NOTE: `dir` shadows the builtin of the same name; the name is kept so the
# notebook's global names stay unchanged.
dir = './response_data/'
# Sorted so the files are visited (and random numbers consumed) in a stable order
files = sorted(glob.glob(f"{dir}clean_conversations/*.tsv"))

os.makedirs(f"{dir}resized_files", exist_ok=True)

# Fixed (arbitrary) seed so the split is identical on every run
random.seed(42)

test_conversations = []
dev_conversations = []

for f in files:
    df = pd.read_csv(f, sep="\t")
    lst = df["utterances"].tolist()

    # extract_dev_test returns (extracted, remaining)
    testset, trainingset = extract_dev_test(lst)
    test_conversations.extend(testset)

    devset, trainingset = extract_dev_test(trainingset)
    dev_conversations.extend(devset)

    # str.rstrip(".tsv") strips any trailing '.', 't', 's' or 'v' characters
    # (e.g. "cats.tsv" -> "ca"); splitext removes only the extension, and
    # basename works with either path separator.
    name = os.path.splitext(os.path.basename(f))[0]

    for i, part in enumerate(cut_size(trainingset), start=1):
        part_path = f"{dir}resized_files/{name}_{i}.tsv"
        pd.DataFrame({"utterances": part}).to_csv(part_path, sep="\t", index=False)

# Save the selected conversations to test.tsv
test_df = pd.DataFrame({"utterances": test_conversations})
test_df.to_csv(f"{dir}resized_files/test.tsv", sep="\t", index=False)

# Save the selected conversations to dev.tsv
dev_df = pd.DataFrame({"utterances": dev_conversations})
dev_df.to_csv(f"{dir}resized_files/dev.tsv", sep="\t", index=False)
