In [16]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from joblib import dump
import pandas as pd

def preprocess_data(data, target_column, save_path, file_path, test_size=0.2, random_state=42):
    """Fit a preprocessing pipeline on a training split and persist it.

    The function splits ``data`` into features and target, writes a
    header-only CSV listing the feature columns, fits a ``ColumnTransformer``
    on the training split only, transforms both splits, and dumps the fitted
    transformer with joblib.

    Parameters
    ----------
    data : pandas.DataFrame
        Full dataset, including the target column.
    target_column : str
        Name of the column to predict; it is excluded from the features.
    save_path : str or path-like
        Destination of the fitted preprocessor (joblib dump).
    file_path : str or path-like
        Destination of the header-only CSV holding the feature column names.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int or None, default 42
        Forwarded to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` where the two ``X`` items are
        the transformed feature arrays and the two ``y`` items are the
        untouched target values.

    Raises
    ------
    KeyError
        If ``target_column`` is not a column of ``data``.
    """
    # Split off the target first so it can never end up in a feature group.
    X = data.drop(columns=[target_column])
    y = data[target_column]

    # Object-dtype columns are treated as categorical, everything else as numeric.
    categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
    numerical_cols = X.select_dtypes(exclude=['object']).columns.tolist()

    # Store only the feature names (no rows) so new samples can later be
    # built with the same column order that the pipeline was fitted on.
    pd.DataFrame(columns=X.columns).to_csv(file_path, index=False)
    print(f"Header dataframe berhasil disimpan ke: {file_path}")

    # Numeric features: fill gaps with the column mean, then scale to [0, 1].
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', MinMaxScaler())
    ])

    # Categorical features: fill gaps with the literal 'missing', then encode
    # as ordinal codes; categories unseen during fit are encoded as -1.
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
    ])

    # Output columns are ordered numeric block first, categorical block second.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)
        ]
    )

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Fit on the training split only so no test-set statistics leak in.
    X_train = preprocessor.fit_transform(X_train)
    print("Preprocessor telah di-fit pada data pelatihan.")

    # The test split is transformed with the parameters learned above.
    X_test = preprocessor.transform(X_test)
    print("Data pengujian telah ditransformasikan.")

    # Persist the fitted preprocessor for later inference.
    dump(preprocessor, save_path)
    print(f"Preprocessor berhasil disimpan ke: {save_path}")

    return X_train, X_test, y_train, y_test

In [17]:
# Name of the column to predict
target_column = 'HeartDisease'

# Load the raw dataset from CSV
data = pd.read_csv('../heart_raw.csv')

# Preview the first rows
data.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [18]:
# Example usage: fit and save the preprocessor, then get the transformed splits
X_train, X_test, y_train, y_test = preprocess_data(
    data=data,
    target_column=target_column,
    save_path='preprocessor_pipeline.joblib',
    file_path='data.csv',
)

Header dataframe berhasil disimpan ke: data.csv
Preprocessor telah di-fit pada data pelatihan.
Data pengujian telah ditransformasikan.
Preprocessor berhasil disimpan ke: preprocessor_pipeline.joblib


In [19]:
from joblib import dump, load

def inference(new_data, load_path):
    """Transform new data with a previously saved preprocessing pipeline.

    Parameters
    ----------
    new_data : object
        Data passed unchanged to the loaded preprocessor's ``transform``.
    load_path : str or path-like
        Location of the saved preprocessor.

    Returns
    -------
    object
        Whatever the loaded preprocessor's ``transform`` returns.
    """
    # Restore the fitted preprocessor from disk
    fitted_preprocessor = load(load_path)
    print(f"Pipeline preprocessing dimuat dari: {load_path}")

    # Apply the already-fitted transformation; nothing is re-fitted here
    return fitted_preprocessor.transform(new_data)

In [24]:
def inverse_transform_data(transformed_data, load_path, new_data_columns):
    """Map transformed data back to its original numeric and categorical values.

    Parameters
    ----------
    transformed_data : 2-D array
        Output of the saved preprocessor: numeric columns first, followed by
        the categorical columns.
    load_path : str or path-like
        Location of the saved preprocessor.
    new_data_columns : sequence of str
        Column order wanted in the returned frame.

    Returns
    -------
    pandas.DataFrame
        The inverse-transformed values, with columns arranged as
        ``new_data_columns``.
    """
    fitted = load(load_path)

    # Each fitted entry is (name, transformer, columns); the first entry is
    # the numeric group and the second is the categorical group.
    num_names = fitted.transformers_[0][2]
    cat_names = fitted.transformers_[1][2]

    # Split the array at the boundary between the two groups.
    boundary = len(num_names)
    num_block = transformed_data[:, :boundary]
    cat_block = transformed_data[:, boundary:boundary + len(cat_names)]

    # Undo the scaling of the numeric block.
    num_scaler = fitted.named_transformers_['num']['scaler']
    num_frame = pd.DataFrame(num_scaler.inverse_transform(num_block), columns=num_names)

    # Turn the ordinal codes back into their category labels.
    cat_encoder = fitted.named_transformers_['cat']['encoder']
    cat_frame = pd.DataFrame(cat_encoder.inverse_transform(cat_block), columns=cat_names)

    # Join both groups side by side, then arrange columns in the requested order.
    return pd.concat([num_frame, cat_frame], axis=1).reindex(columns=new_data_columns)

In [25]:
import numpy as np  # no longer used in this cell; kept in case other cells rely on it

# Location of the fitted preprocessing pipeline written by preprocess_data
pipeline_path = 'preprocessor_pipeline.joblib'
# Header-only CSV written by preprocess_data; supplies the feature column names
col = pd.read_csv('data.csv')

# Raw values for one new sample, ordered like the columns in data.csv
sample_values = [
    45, # Age (int)
    'F', # Sex
    'NAP', # ChestPainType
    130,  # RestingBP
    220, # Cholesterol
    0, # FastingBS
    'ST', # RestingECG
    160, # MaxHR
    'N', # ExerciseAngina
    0.5, # Oldpeak
    'Up' # ST_Slope
]

# Build the one-row frame straight from the Python list so every column keeps
# its own dtype. Routing the list through np.array first would coerce all
# values to strings, because a NumPy array holds a single dtype.
new_data = pd.DataFrame([sample_values], columns=col.columns)

# Run inference with the saved pipeline
transformed_data = inference(new_data, pipeline_path)

# Map the transformed values back to the original feature space
inversed_data = inverse_transform_data(transformed_data, pipeline_path, new_data.columns)

# Show the raw input, its transformed form, and the round-tripped values
print("Data setelah preprocessing (training):")
print(new_data)
print("\nData baru setelah transformasi:")
print(transformed_data)
print("\nData setelah inverse transform:")
print(inversed_data)

Pipeline preprocessing dimuat dari: preprocessor_pipeline.joblib
Data setelah preprocessing (training):
  Age Sex ChestPainType RestingBP Cholesterol FastingBS RestingECG MaxHR  \
0  45   F           NAP       130         220         0         ST   160   

  ExerciseAngina Oldpeak ST_Slope  
0              N     0.5       Up  

Data baru setelah transformasi:
[[0.33333333 0.65       0.36484245 0.         0.70422535 0.35227273
  0.         2.         2.         0.         2.        ]]

Data setelah inverse transform:
    Age Sex ChestPainType  RestingBP  Cholesterol  FastingBS RestingECG  \
0  45.0   F           NAP      130.0        220.0        0.0         ST   

   MaxHR ExerciseAngina  Oldpeak ST_Slope  
0  160.0              N      0.5       Up  
