In [51]:
from os import listdir
from re import compile, match
from random import sample
from pandas import DataFrame, read_csv, concat

In [42]:
# Base directory of the csv files: it is listed to discover the files and
# used as the location prefix when each file is read.
DATA_PATH = "./data/train/defog/"

In [34]:
def split_data_paths(path: str, split_ratio: float):
    """
    List the csv files in a directory and randomly split them into a
    train and a test set

    params:
        path (str): directory that contains the csv files
        split_ratio (float): fraction of the csv files assigned to the
            training set (the count is rounded with the builtin round)

    returns:
        train_set_paths (list): file names (not full paths) of the training set
        test_set_paths (list): file names (not full paths) of the test set

    raises:
        ValueError: propagated from random.sample when the rounded training
            size is negative or larger than the number of csv files
    """

    # Raw string so that "\w" and "\." reach the regex engine unchanged
    # instead of being treated as (invalid) string escape sequences.
    csv_regex = compile(r"\w+\.csv")

    # fullmatch rejects names that merely start like a csv file, such as
    # "x.csv.bak". Sorting makes the split reproducible for a seeded random
    # generator, because listdir returns entries in arbitrary order.
    csv_paths = sorted(
        name for name in listdir(path) if csv_regex.fullmatch(name)
    )

    train_set_paths = sample(csv_paths, round(split_ratio * len(csv_paths)))

    # Keep the remaining files in listing order rather than the arbitrary
    # order a set difference would produce.
    train_lookup = set(train_set_paths)
    test_set_paths = [name for name in csv_paths if name not in train_lookup]

    return train_set_paths, test_set_paths

In [56]:
def read_data(paths: list):
    """
    Read the csv file of each patient and combine them into one data frame

    Every file is looked up inside DATA_PATH. A 'patient_id' column holding
    the file name without its ".csv" extension is added to each frame before
    the frames are combined.

    params:
        paths (list): list of csv file names relative to DATA_PATH

    return:
        results (pd.DataFrame): combined data frame of all patients in set;
            the row labels of the individual files are kept as they are, and
            an empty data frame is returned when paths is empty
    """

    # Function-scope import keeps this block self-contained.
    from os.path import join

    extension = ".csv"
    frames = []

    for path in paths:
        # join works whether or not DATA_PATH ends with a path separator.
        tmp_df = read_csv(join(DATA_PATH, path))
        # Strip only the trailing extension; str.replace would also remove
        # a ".csv" that occurs in the middle of the file name.
        if path.endswith(extension):
            tmp_df['patient_id'] = path[:-len(extension)]
        else:
            tmp_df['patient_id'] = path
        frames.append(tmp_df)

    if not frames:
        return DataFrame()

    # A single concat at the end avoids recopying all accumulated rows on
    # every iteration, and avoids concatenating with an empty frame.
    return concat(frames)

In [57]:
# Randomly assign 80% of the csv files found in DATA_PATH to the training set
# and the rest to the test set.
# NOTE(review): no random seed is set in the visible cells, so the split
# presumably differs between runs - confirm whether that is intended.
train_paths, test_paths = split_data_paths(path=DATA_PATH, split_ratio=0.8)

In [58]:
# Load every file of each split into one combined data frame per split.
train_set = read_data(paths=train_paths)
test_set = read_data(paths=test_paths)