In [44]:
import pandas as pd
import numpy as np
import importlib
import config
from pathlib import Path

try:
    importlib.reload(config) # reload module
except NameError:
    pass

# Load DF

In [45]:
df = pd.read_csv(config.ORIGINAL_FILEPATH)
df.head()

Unnamed: 0,id,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-0.260648,-0.469648,2.496266,-0.083724,0.129681,0.732898,0.519014,-0.130006,0.727159,...,-0.110552,0.217606,-0.134794,0.165959,0.12628,-0.434824,-0.08123,-0.151045,17982.1,0
1,1,0.9851,-0.356045,0.558056,-0.429654,0.27714,0.428605,0.406466,-0.133118,0.347452,...,-0.194936,-0.605761,0.079469,-0.577395,0.19009,0.296503,-0.248052,-0.064512,6531.37,0
2,2,-0.260272,-0.949385,1.728538,-0.457986,0.074062,1.419481,0.743511,-0.095576,-0.261297,...,-0.00502,0.702906,0.945045,-1.154666,-0.605564,-0.312895,-0.300258,-0.244718,2513.54,0
3,3,-0.152152,-0.508959,1.74684,-1.090178,0.249486,1.143312,0.518269,-0.06513,-0.205698,...,-0.146927,-0.038212,-0.214048,-1.893131,1.003963,-0.51595,-0.165316,0.048424,5384.44,0
4,4,-0.20682,-0.16528,1.527053,-0.448293,0.106125,0.530549,0.658849,-0.21266,1.049921,...,-0.106984,0.729727,-0.161666,0.312561,-0.414116,1.071126,0.023712,0.419117,14278.97,0


# Clean DF

In [46]:
def clean_df(df):
    return (df
        .rename(columns=lambda c:c.lower().replace(" ", '_').replace("class", "target"))
        .drop(columns="id")
    )

df_clean =clean_df(df)

# Split to Train and Test Set

In [47]:
def create_train_test_split(df, train_frac):
    df_shuffled = (df
        .sample(frac=1, replace=False, random_state=8, ignore_index=True)
    )
    num_train = int(np.floor(df_shuffled.shape[0] * train_frac))
    train_set = df.iloc[:num_train]
    test_set = df.iloc[num_train:]
    assert df_shuffled.shape[0] == (train_set.shape[0] + test_set.shape[0])
    return train_set, test_set

train_set, test_set = create_train_test_split(df_clean, 0.9)
print(f"Train set shape:{train_set.shape}")
print(f"Train set shape:{test_set.shape}")

Train set shape:(511767, 30)
Train set shape:(56863, 30)


In [48]:
train_set.head()

Unnamed: 0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,...,v21,v22,v23,v24,v25,v26,v27,v28,amount,target
0,-0.260648,-0.469648,2.496266,-0.083724,0.129681,0.732898,0.519014,-0.130006,0.727159,0.637735,...,-0.110552,0.217606,-0.134794,0.165959,0.12628,-0.434824,-0.08123,-0.151045,17982.1,0
1,0.9851,-0.356045,0.558056,-0.429654,0.27714,0.428605,0.406466,-0.133118,0.347452,0.529808,...,-0.194936,-0.605761,0.079469,-0.577395,0.19009,0.296503,-0.248052,-0.064512,6531.37,0
2,-0.260272,-0.949385,1.728538,-0.457986,0.074062,1.419481,0.743511,-0.095576,-0.261297,0.690708,...,-0.00502,0.702906,0.945045,-1.154666,-0.605564,-0.312895,-0.300258,-0.244718,2513.54,0
3,-0.152152,-0.508959,1.74684,-1.090178,0.249486,1.143312,0.518269,-0.06513,-0.205698,0.575231,...,-0.146927,-0.038212,-0.214048,-1.893131,1.003963,-0.51595,-0.165316,0.048424,5384.44,0
4,-0.20682,-0.16528,1.527053,-0.448293,0.106125,0.530549,0.658849,-0.21266,1.049921,0.968046,...,-0.106984,0.729727,-0.161666,0.312561,-0.414116,1.071126,0.023712,0.419117,14278.97,0


# Save DF

In [49]:
if not (train_path := Path(config.TRAIN_FILEPATH)).exists():
    df_clean.to_parquet(train_path)
    print("Saved.")

if not (test_path := Path(config.TEST_FILEPATH)).exists():
    df_clean.to_parquet(test_path)
    print("Saved.")

Saved.
Saved.
