# In [7]:
# Author : Daniel Parada

# Import necessary packages
import os

import pandas as pd
from sklearn.model_selection import train_test_split

def df_split(csv_file, out_path, dep_var, test_size=0.33, shuffle=False, random_state=0,
             id_col='Id'):
    """Reads in a csv file, splits it into train/test datasets with corresponding ground
       truths and saves them to csv files.

    Four files are written: 'df_Train.csv' (train features), 'df_TrainGt.csv' (train
    ground truth), 'df_Test.csv' (test features) and 'df_TestGt.csv' (test ground truth).
    Each file is written with a header row and with the DataFrame index (pandas default).

    Args:
        csv_file : String ; path to csv file to split into train/test data
        out_path : String ; output location for the csv files. If it is an existing
                   directory a path separator is appended when missing; otherwise it is
                   used verbatim as a prefix for the output file names
        dep_var : String ; dependent variable we are trying to estimate
        test_size : float ; proportion of the data (between 0 and 1) to be used for testing
        shuffle : bool ; determines if the data should be shuffled before splitting
        random_state : int ; seed forwarded to train_test_split, used for reproducibility
                       (per scikit-learn docs it controls the shuffling)
        id_col : String or None ; identifier column removed from the explanatory
                 variables when present in the csv; None keeps every column

    Returns:
        None

    Raises:
        KeyError : if dep_var is not a column of the csv file
    """

    # Read in the data into a pandas DataFrame
    dat = pd.read_csv(csv_file)

    # Split the data between explanatory variables and dependent variable
    y = dat[dep_var]

    # The identifier column is optional: only drop it when the csv actually has one,
    # so datasets without it no longer fail with a KeyError
    drop_cols = [dep_var]
    if id_col is not None and id_col != dep_var and id_col in dat.columns:
        drop_cols.append(id_col)
    X = dat.drop(columns=drop_cols)

    # Split the data into train/test feature sets and corresponding ground truths
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, shuffle=shuffle, random_state=random_state)

    # A directory given without a trailing separator would otherwise produce sibling
    # files such as 'outdf_Train.csv' instead of 'out/df_Train.csv'
    out_prefix = os.fspath(out_path)
    if os.path.isdir(out_prefix) and not out_prefix.endswith(('/', os.sep)):
        out_prefix += os.sep

    # Save train/test data and their ground truths to csv
    outputs = (
        ('df_Train.csv', X_train),
        ('df_TrainGt.csv', y_train),
        ('df_Test.csv', X_test),
        ('df_TestGt.csv', y_test),
    )
    for file_name, frame in outputs:
        frame.to_csv(out_prefix + file_name, header=True)
