# Imports

In [41]:
import pandas as pd
import os
from scripts.utils import *
from sklearn.model_selection import train_test_split

# Folder (relative to the working directory) holding the input CSV files;
# it is listed to discover datasets and used as the prefix when reading them.
PREPROCESSED_DATASETS_PATH = 'data/preprocessed_datasets'

# Preprocessing

In [42]:
# Collect the names (not full paths) of every entry in the preprocessed
# datasets folder whose name ends with '.csv', then show the list.
csv_files = list(
    filter(
        lambda file_name: file_name.endswith('.csv'),
        os.listdir(PREPROCESSED_DATASETS_PATH),
    )
)
print(csv_files)

['insurance.csv', 'melb_data.csv', 'SeoulBikeData.csv', 'Sleep_Efficiency.csv', 'uci_1.csv', 'winequalityN.csv']


In [43]:
# Preprocess every CSV listed in `csv_files` and write the result, under the
# same file name, to data/processed_datasets. For each file:
#   1. read it (UTF-8 first, ISO-8859-1 as fallback),
#   2. drop every row that contains a NaN,
#   3. encode known binary feature columns as 0/1 and rename them,
#   4. one-hot encode the remaining non-numeric feature columns,
#   5. drop feature columns that hold a single unique value,
#   6. keep the target (last column of the input) as the last column.

# Lookup tables are identical for every file, so they are built once.
custom_value_mappings = {
    'True': 1, 'False': 0,  # string spellings of booleans
    True: 1, False: 0,  # real boolean values
    'Yes': 1, 'No': 0,
    'yes': 1, 'no': 0,
    'Holiday': 1, 'No Holiday': 0,
    'male': 1, 'female': 0,
    'Male': 1, 'Female': 0,
    'red': 1, 'white': 0,
}

# New names for binary columns so that the name states what the value 1 means.
custom_column_mappings = {
    'sex': 'is_male',
    'smoker': 'is_smoker',
    'Holiday': 'is_holiday',
    'Functioning Day': 'is_functioning_day',
    'Smoking status': 'is_smoking',
    'Gender': 'is_male',
    'type': 'is_red'
}

# NOTE: True == 1 and False == 0 in Python, so a column that only holds the
# numbers 0/1 also counts as "mappable"; mapping leaves it numerically unchanged.
mappable_values = set(custom_value_mappings)

# to_csv does not create missing folders, so make sure the output folder exists.
processed_datasets_path = 'data/processed_datasets'
os.makedirs(processed_datasets_path, exist_ok=True)

for csv_file in csv_files:
    print(f'Preprocessing {csv_file}...')

    source_path = f'{PREPROCESSED_DATASETS_PATH}/{csv_file}'
    try:
        df = pd.read_csv(source_path, encoding='utf-8')
    except UnicodeDecodeError:
        # ISO-8859-1 assigns a character to every possible byte, so this read
        # cannot raise UnicodeDecodeError; a further cp1252 fallback would be
        # unreachable and is therefore not attempted.
        df = pd.read_csv(source_path, encoding='ISO-8859-1')

    # get all rows with nan values
    nan_rows = df[df.isnull().any(axis=1)]
    print(f'{csv_file} has {nan_rows.shape[0]} rows with nan values')

    # drop rows with nan values
    df = df.dropna()

    # target column is the last column
    target_column = df.columns[-1]

    # Apply custom mappings for binary feature columns (target excluded).
    # Renames are collected and applied once after the loop so the loop can
    # keep addressing columns by their original names.
    renamed_columns = {}
    for column in df.columns[:-1]:
        if set(df[column].unique()).issubset(mappable_values):
            df[column] = df[column].map(custom_value_mappings)
            print(f'Applying custom mapping for column {column} in {csv_file}')
            renamed_columns[column] = custom_column_mappings.get(column, column)
    df = df.rename(columns=renamed_columns)

    # One-hot encode the non-numeric FEATURE columns only. Encoding the whole
    # frame would also split a non-numeric target into several columns, after
    # which the target could no longer be found by name. dtype=int yields 0/1
    # indicator columns directly. Concatenating the target last also restores
    # the "target is the last column" layout.
    features = pd.get_dummies(df.drop(columns=[target_column]), dtype=int)
    df = pd.concat([features, df[[target_column]]], axis=1)

    # make all TRUE 1 and FALSE 0 in any column that is still boolean
    bool_columns = df.select_dtypes(include='bool').columns
    df = df.astype({column: int for column in bool_columns})

    # Remove feature columns with only 1 unique value. The target is never
    # removed: dropping it would silently turn the last feature into the
    # target for every consumer that relies on the column position.
    constant_columns = [
        column for column in df.columns[:-1]
        if df[column].nunique(dropna=False) == 1
    ]
    for column in constant_columns:
        print(f'Removing column {column} from {csv_file} because it has only 1 unique value')
    df = df.drop(columns=constant_columns)

    # Save the processed dataset to data/processed_datasets
    df.to_csv(f'{processed_datasets_path}/{csv_file}', index=False)
    print('\n')


Preprocessing insurance.csv...
insurance.csv has 0 rows with nan values
Applying custom mapping for column sex in insurance.csv
Applying custom mapping for column smoker in insurance.csv


Preprocessing melb_data.csv...
melb_data.csv has 6750 rows with nan values


Preprocessing SeoulBikeData.csv...
SeoulBikeData.csv has 0 rows with nan values
Applying custom mapping for column Holiday in SeoulBikeData.csv
Applying custom mapping for column Functioning Day in SeoulBikeData.csv


Preprocessing Sleep_Efficiency.csv...
Sleep_Efficiency.csv has 64 rows with nan values
Applying custom mapping for column Gender in Sleep_Efficiency.csv
Applying custom mapping for column Smoking status in Sleep_Efficiency.csv


Preprocessing uci_1.csv...
uci_1.csv has 0 rows with nan values


Preprocessing winequalityN.csv...
winequalityN.csv has 34 rows with nan values
Applying custom mapping for column type in winequalityN.csv




# Train-Test Split

In [44]:
# Ensure the root folder for the train/test splits exists. exist_ok=True makes
# the call a no-op when the folder is already there and avoids the race
# between a separate existence check and the creation.
os.makedirs('data/split_datasets', exist_ok=True)
    

In [45]:
# Split every processed dataset into an 80 % train / 20 % test partition and
# store both parts as data/split_datasets/<name>/train.csv and test.csv.
for csv_file in csv_files:
    # Strip only the final extension: splitting on the first '.' would cut
    # names that contain dots and could make two files share one folder.
    name = os.path.splitext(csv_file)[0]
    # name_to_pretty_name is not defined in this notebook (presumably it comes
    # from scripts.utils); its result is used as the folder name.
    name = name_to_pretty_name(name)

    # create a folder for each dataset (parents included, re-runs are a no-op)
    split_dir = f'data/split_datasets/{name}'
    os.makedirs(split_dir, exist_ok=True)

    df = pd.read_csv(f'data/processed_datasets/{csv_file}')

    # split into 20 % test and 80 % train; the fixed random_state keeps the
    # partition identical across runs
    train, test = train_test_split(df, test_size=0.2, random_state=42)
    train.to_csv(f'{split_dir}/train.csv', index=False)
    test.to_csv(f'{split_dir}/test.csv', index=False)
