# The purpose of this utility is to gather general-purpose data transformations in one single place.

## Proposed utilities:

### Dataset shrink

Takes an input dataset and creates subsets from the data it contains.


#### Libraries:

In [None]:
import pandas as pd
from math import floor
import os

# Directory the generated csv files are written to.
processed_data_path = "../processed_data"
# Directory the raw input csv files are read from.
data_path = "../data"

#### Functions:

In [None]:
def create_sub_files(file_name: str, partition_list, partition_mode: str="all", data_path: str="../data", output_path=None):
    """
    Read a csv file and write random subsets of it as new csv files.

    file_name: name of the csv file inside data_path. It must not contain a
        directory part and must have exactly one extension ("name.csv").
    partition_list: iterable of fractions, each between 0 and 1.
        In "all" mode every fraction must also be greater than 0.
    partition_mode:
        "one": for each fraction, write ONE file holding a random sample of
            that fraction of the rows.
        "all": for each fraction, shuffle the rows and write floor(1/fraction)
            files of floor(fraction * rows) rows each, plus one extra file
            with the remaining rows when some rows are left over.
    data_path: directory the input file is read from.
    output_path: directory the files are written to. When None (default) the
        module-level processed_data_path is used.

    Naming convention (the label is fraction * 10000, rounded and zero padded
    to 5 digits, so 0.2 -> "02000" and 0.45 -> "04500"):
        "one" mode: file_name_02000.csv
        "all" mode: file_name_<label>_<index>_of_<total>.csv

        example: partition_list = [0.2, 0.45], partition_mode = "all" and a
        row count that is divisible by 5 creates 8 files
        1. 20% of the data in each file (5 files):
                file_name_02000_1_of_5.csv
                file_name_02000_2_of_5.csv
                file_name_02000_3_of_5.csv
                file_name_02000_4_of_5.csv
                file_name_02000_5_of_5.csv
        2. 45% of the data in each of 2 files + the remaining 10% in a last file (3 files):
                file_name_04500_1_of_3.csv
                file_name_04500_2_of_3.csv
                file_name_04500_3_of_3.csv

    Raises AssertionError when any argument is invalid.
    """
    print(f"Creating sub files for {file_name}, partition_list: {partition_list}, partition_mode: {partition_mode}")
    assert file_name is not None and file_name != "", "File name should not be empty"
    assert "/" not in file_name and "\\" not in file_name, "File name should not contain path"
    file_split = file_name.split(".")
    assert len(file_split) == 2, "File name should have one extension only"
    # Materialize once so a generator argument survives validation + processing.
    fractions = list(partition_list)
    for fraction in fractions:
        assert fraction >= 0 and fraction <= 1, "Partition value should be between 0 and 1"
    assert partition_mode in ["all", "one"], "Partition mode should be either all or one"
    if partition_mode == "all":
        # "all" mode divides by the fraction, so 0 must be rejected up front
        # instead of failing later with ZeroDivisionError.
        for fraction in fractions:
            assert fraction > 0, "Partition value should be greater than 0 in all mode"

    if output_path is None:
        # Looked up at call time so the module-level setting stays the default.
        output_path = processed_data_path
    stem, extension = file_split

    df = pd.read_csv(f"{data_path}/{file_name}")
    for fraction in fractions:
        # round() instead of int(): 0.57 * 10000 evaluates to 5699.999999999999
        # and plain truncation would label the file 05699.
        partition_str = f"{int(round(fraction * 10000)):05}"

        if partition_mode == "one":
            sample = df.sample(frac=fraction)
            sample.to_csv(f"{output_path}/{stem}_{partition_str}.{extension}", index=False)
            continue

        # "all" mode: reshuffle, then cut consecutive equally sized slices.
        df = df.sample(frac=1).reset_index(drop=True)
        n_rows = floor(fraction * len(df))
        full_files = floor(1 / fraction)
        covered_rows = full_files * n_rows
        has_remainder = covered_rows < len(df)
        num_files = full_files + 1 if has_remainder else full_files

        for j in range(full_files):
            chunk = df.iloc[j * n_rows:(j + 1) * n_rows]
            chunk.to_csv(f"{output_path}/{stem}_{partition_str}_{j+1}_of_{num_files}.{extension}", index=False)
        if has_remainder:
            # Rows that did not fit in the equally sized files go to a last file.
            leftover = df.iloc[covered_rows:]
            leftover.to_csv(f"{output_path}/{stem}_{partition_str}_{num_files}_of_{num_files}.{extension}", index=False)


def kfold_split(file_name: str, k: int, data_path:str="../data", output_path=None):
    """
    Read a csv file, shuffle its rows and write them into k fold files.

    Rows are dealt to the folds by their position AFTER shuffling, so fold
    membership is random and fold sizes differ by at most one row.

    file_name: name of the csv file inside data_path. It must not contain a
        directory part and must have exactly one extension ("name.csv").
    k: number of folds, must be greater than 1.
    data_path: directory the input file is read from.
    output_path: directory the files are written to. When None (default) the
        module-level processed_data_path is used.

        File format:
            file_name_1_of_5.csv

    Raises AssertionError when any argument is invalid.
    """
    print(f"Creating kfold split for {file_name}, k: {k}")
    assert file_name is not None and file_name != "", "File name should not be empty"
    assert "/" not in file_name and "\\" not in file_name, "File name should not contain path"
    file_split = file_name.split(".")
    assert len(file_split) == 2, "File name should have one extension only"
    assert k > 1, "k should be greater than 1"

    if output_path is None:
        # Looked up at call time so the module-level setting stays the default.
        output_path = processed_data_path
    stem, extension = file_split

    shuffled = pd.read_csv(f"{data_path}/{file_name}").sample(frac=1)
    for fold in range(k):
        # Positional stride slicing: rows fold, fold+k, fold+2k, ... of the
        # shuffled frame. Using positions (not the pre-shuffle index labels)
        # is what makes the folds random, and it needs no helper column that
        # could collide with a real column of the data.
        fold_rows = shuffled.iloc[fold::k]
        fold_rows.to_csv(f"{output_path}/{stem}_{fold+1}_of_{k}.{extension}", index=False)



#### Test cases:

In [None]:
# Input files used by the test cases below (looked up inside data_path).
test_file = "sales_test.csv"
train_file = "sales_train.csv"


def test_create_sub_files_one():
    """Check that "one" mode writes a single file per requested partition."""
    create_sub_files(test_file, [0.2, 0.3], "one", data_path)
    expected_files = (
        f"{processed_data_path}/sales_test_02000.csv",
        f"{processed_data_path}/sales_test_03000.csv",
    )
    for expected in expected_files:
        assert os.path.exists(expected)

def test_create_sub_files_all():
    """Check that "all" mode with a 0.45 partition writes two full files plus a remainder file."""
    create_sub_files(test_file, [0.45], "all", data_path)
    expected_files = (
        f"{processed_data_path}/sales_test_04500_1_of_3.csv",
        f"{processed_data_path}/sales_test_04500_2_of_3.csv",
        f"{processed_data_path}/sales_test_04500_3_of_3.csv",
    )
    for expected in expected_files:
        assert os.path.exists(expected)

def test_kfold_split():
    """Check that a 5-fold split writes one file per fold."""
    kfold_split(test_file, 5, data_path)
    expected_files = (
        f"{processed_data_path}/sales_test_1_of_5.csv",
        f"{processed_data_path}/sales_test_2_of_5.csv",
        f"{processed_data_path}/sales_test_3_of_5.csv",
        f"{processed_data_path}/sales_test_4_of_5.csv",
        f"{processed_data_path}/sales_test_5_of_5.csv",
    )
    for expected in expected_files:
        assert os.path.exists(expected)

def test_clean_up():
    """
    Delete every csv file the test cases write into processed_data_path.

    A file that does not exist is skipped, so a test case that failed before
    writing its output does not stop the remaining files from being removed.
    """
    generated_files = [
        "sales_test_02000.csv",
        "sales_test_03000.csv",
        "sales_test_04500_1_of_3.csv",
        "sales_test_04500_2_of_3.csv",
        "sales_test_04500_3_of_3.csv",
        "sales_test_1_of_5.csv",
        "sales_test_2_of_5.csv",
        "sales_test_3_of_5.csv",
        "sales_test_4_of_5.csv",
        "sales_test_5_of_5.csv",
    ]
    for generated in generated_files:
        try:
            os.remove(f"{processed_data_path}/{generated}")
        except FileNotFoundError:
            # Already absent: nothing to clean for this file, keep going.
            continue


#### Running test cases:

In [None]:
# Test cases to execute, in order; a failing assert stops the run.
test_handler = [
    test_create_sub_files_one,
    test_create_sub_files_all,
    test_kfold_split,
]
for test_case in test_handler:
    test_case()
    # Only reached when the test case raised nothing.
    print(f"{test_case.__name__} passed")

#### Test cases clean up:

In [None]:
# Delete the csv files generated by the test cases from processed_data_path.
test_clean_up()

#### Project processing:

In [None]:
# Kaggle rohlik competition: write 1%, 10% and 30% random samples of the
# train and test sets (one file per fraction).
create_sub_files(file_name="sales_train.csv", partition_list=[0.01, 0.1, 0.3], partition_mode="one")
create_sub_files(file_name="sales_test.csv", partition_list=[0.01, 0.1, 0.3], partition_mode="one")