<a href="https://colab.research.google.com/github/faisalrizqin/Data-Mining-Tugas-Preprocessing-Data/blob/main/DM_Tugas_Preprocessing_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# =====================================
# DATA MINING – Preprocessing Data
# Dataset : Heart Disease (Processed Cleveland) - UCI Machine Learning Repository
# =====================================

In [3]:
# Importing the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [34]:
# Importing the dataset
# File 'cleveland.data' tidak memiliki header, jadi kita tambahkan manual berdasarkan heart-disease.names
# Tanda '?' diubah menjadi NaN agar bisa ditangani dengan SimpleImputer
dataset = pd.read_csv('processed.cleveland.data', header=None, na_values='?')

In [87]:
# nama kolom
dataset.columns = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs',
    'restecg', 'thalach', 'exang', 'oldpeak', 'slope',
    'ca', 'thal', 'target'
]

In [35]:
# Cek missing value
print("===== Jumlah Missing Value per Kolom =====")
print(dataset.isnull().sum(), "\n")

===== Jumlah Missing Value per Kolom =====
0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    4
12    2
13    0
dtype: int64 



In [66]:
# Tampilkan hanya baris yang memiliki missing value
missing_data = dataset[dataset.isnull().any(axis=1)]

print("===== Data yang Memiliki Missing Value =====")
print(missing_data)
print("\nJumlah data dengan missing value:", len(missing_data))

# Simpan index baris yang punya missing value agar bisa dilihat lagi setelah imputasi
missing_index = missing_data.index

===== Data yang Memiliki Missing Value =====
       0    1    2      3      4    5    6      7    8    9    10   11   12  \
87   53.0  0.0  3.0  128.0  216.0  0.0  2.0  115.0  0.0  0.0  1.0  0.0  NaN   
166  52.0  1.0  3.0  138.0  223.0  0.0  0.0  169.0  0.0  0.0  1.0  NaN  3.0   
192  43.0  1.0  4.0  132.0  247.0  1.0  2.0  143.0  1.0  0.1  2.0  NaN  7.0   
266  52.0  1.0  4.0  128.0  204.0  1.0  0.0  156.0  1.0  1.0  2.0  0.0  NaN   
287  58.0  1.0  2.0  125.0  220.0  0.0  0.0  144.0  0.0  0.4  2.0  NaN  7.0   
302  38.0  1.0  3.0  138.0  175.0  0.0  0.0  173.0  0.0  0.0  1.0  NaN  3.0   

     13  
87    0  
166   0  
192   1  
266   2  
287   0  
302   0  

Jumlah data dengan missing value: 6


In [67]:
# Pisahkan fitur (X) dan label (Y)
x = dataset.iloc[:, :-1].values   # semua kolom kecuali target
y = dataset.iloc[:, -1].values    # kolom target

In [68]:
pd.set_option('display.max_columns', None)  # tampilkan semua kolom
pd.set_option('display.width', None)        # agat tidak dipotong per baris
pd.set_option('display.max_rows', 20)       # tampilkan maksimal 20 baris saja (opsional)

print(pd.DataFrame(x, columns=dataset.columns[:-1]))

       0    1    2      3      4    5    6      7    8    9    10   11   12
0    63.0  1.0  1.0  145.0  233.0  1.0  2.0  150.0  0.0  2.3  3.0  0.0  6.0
1    67.0  1.0  4.0  160.0  286.0  0.0  2.0  108.0  1.0  1.5  2.0  3.0  3.0
2    67.0  1.0  4.0  120.0  229.0  0.0  2.0  129.0  1.0  2.6  2.0  2.0  7.0
3    37.0  1.0  3.0  130.0  250.0  0.0  0.0  187.0  0.0  3.5  3.0  0.0  3.0
4    41.0  0.0  2.0  130.0  204.0  0.0  2.0  172.0  0.0  1.4  1.0  0.0  3.0
..    ...  ...  ...    ...    ...  ...  ...    ...  ...  ...  ...  ...  ...
298  45.0  1.0  1.0  110.0  264.0  0.0  0.0  132.0  0.0  1.2  2.0  0.0  7.0
299  68.0  1.0  4.0  144.0  193.0  1.0  0.0  141.0  0.0  3.4  2.0  2.0  7.0
300  57.0  1.0  4.0  130.0  131.0  0.0  0.0  115.0  1.0  1.2  2.0  1.0  7.0
301  57.0  0.0  2.0  130.0  236.0  0.0  2.0  174.0  0.0  0.0  2.0  1.0  3.0
302  38.0  1.0  3.0  138.0  175.0  0.0  0.0  173.0  0.0  0.0  1.0  NaN  3.0

[303 rows x 13 columns]


In [69]:
print(y)

[0 2 1 0 0 0 3 0 2 1 0 0 2 0 0 0 1 0 0 0 0 0 1 3 4 0 0 0 0 3 0 2 1 0 0 0 3
 1 3 0 4 0 0 0 1 4 0 4 0 0 0 0 2 0 1 1 1 1 0 0 2 0 1 0 2 2 1 0 2 1 0 3 1 1
 1 0 1 0 0 3 0 0 0 3 0 0 0 0 0 0 0 3 0 0 0 1 2 3 0 0 0 0 0 0 3 0 2 1 2 3 1
 1 0 2 2 0 0 0 3 2 3 4 0 3 1 0 3 3 0 0 0 0 0 0 0 0 4 3 1 0 0 1 0 1 0 1 4 0
 0 0 0 0 0 4 3 1 1 1 2 0 0 4 0 0 0 0 0 0 1 0 3 0 1 0 4 1 0 1 0 0 3 2 0 0 1
 0 0 2 1 2 0 3 1 2 0 3 0 0 0 1 0 0 0 0 0 3 3 3 0 1 0 4 0 3 1 0 0 0 0 0 0 0
 0 3 1 0 0 0 3 2 0 2 1 0 0 3 2 1 0 0 0 0 0 2 0 2 2 1 3 0 0 1 0 0 0 0 0 0 0
 1 0 3 0 0 4 2 2 2 1 0 1 0 2 0 1 0 0 0 1 0 2 0 3 0 2 4 2 0 0 0 1 0 2 2 1 0
 3 1 1 2 3 1 0]


In [70]:
# Menangani Missing Value
# Gunakan SimpleImputer untuk mengganti nilai NaN dengan rata-rata kolom
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
x = imputer.fit_transform(x)

In [71]:
# Ubah kembali ke DataFrame agar bisa ditampilkan dengan nama kolom
x_df = pd.DataFrame(x, columns=dataset.columns[:-1])

# Tampilkan baris yang dulu punya missing value, setelah diganti rata-rata
print("===== Data Setelah Missing Value Diganti Dengan Nilai Rata-rata =====")
print(x_df.loc[missing_index])

===== Data Setelah Missing Value Diganti Dengan Nilai Rata-rata =====
       0    1    2      3      4    5    6      7    8    9    10        11  \
87   53.0  0.0  3.0  128.0  216.0  0.0  2.0  115.0  0.0  0.0  1.0  0.000000   
166  52.0  1.0  3.0  138.0  223.0  0.0  0.0  169.0  0.0  0.0  1.0  0.672241   
192  43.0  1.0  4.0  132.0  247.0  1.0  2.0  143.0  1.0  0.1  2.0  0.672241   
266  52.0  1.0  4.0  128.0  204.0  1.0  0.0  156.0  1.0  1.0  2.0  0.000000   
287  58.0  1.0  2.0  125.0  220.0  0.0  0.0  144.0  0.0  0.4  2.0  0.672241   
302  38.0  1.0  3.0  138.0  175.0  0.0  0.0  173.0  0.0  0.0  1.0  0.672241   

           12  
87   4.734219  
166  3.000000  
192  7.000000  
266  4.734219  
287  7.000000  
302  3.000000  


In [77]:
# Encoding data kategori (Atribut)
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Kolom kategori berdasarkan indeks:
# 1=sex, 2=cp, 5=fbs, 6=restecg, 8=exang, 10=slope, 11=ca, 12=thal
kategori_index = [1, 2, 5, 6, 8, 10, 11, 12]

In [89]:
# ===== Sebelum Encoding =====
print("===== Dataset Sebelum Encoding =====")
df_before = x_df.copy()
print(df_before.head())
print("\nJumlah kolom sebelum encoding:", df_before.shape[1], "\n")

===== Dataset Sebelum Encoding =====
     0    1    2      3      4    5    6      7    8    9    10   11   12
0  63.0  1.0  1.0  145.0  233.0  1.0  2.0  150.0  0.0  2.3  3.0  0.0  6.0
1  67.0  1.0  4.0  160.0  286.0  0.0  2.0  108.0  1.0  1.5  2.0  3.0  3.0
2  67.0  1.0  4.0  120.0  229.0  0.0  2.0  129.0  1.0  2.6  2.0  2.0  7.0
3  37.0  1.0  3.0  130.0  250.0  0.0  0.0  187.0  0.0  3.5  3.0  0.0  3.0
4  41.0  0.0  2.0  130.0  204.0  0.0  2.0  172.0  0.0  1.4  1.0  0.0  3.0

Jumlah kolom sebelum encoding: 13 



In [90]:
# ===== Proses Encoding =====
# Buat dan latih encoder
encoder = OneHotEncoder()
encoder.fit(x[:, kategori_index])

# Ambil nama kolom hasil encoding dalam format yang mudah dibaca
encoded_feature_names = []
for i, col_name in enumerate(np.array(dataset.columns[:-1])[kategori_index]):
    categories = encoder.categories_[i]
    encoded_feature_names += [f"{col_name}={int(cat)}" for cat in categories]

# Gabungkan hasil encoding dan kolom numerik
ct = ColumnTransformer(
    transformers=[('encoder', encoder, kategori_index)],
    remainder='passthrough'
)
x_encoded = np.array(ct.fit_transform(x))

# Ambil kolom non-kategori (yang tidak diubah)
non_cat_columns = [col for i, col in enumerate(dataset.columns[:-1]) if i not in kategori_index]
all_feature_names = encoded_feature_names + non_cat_columns

In [91]:
# ===== Setelah Encoding =====
print("===== Dataset Setelah Encoding =====")
df_after = pd.DataFrame(x_encoded, columns=all_feature_names)
print(df_after.head())
print("\nJumlah kolom setelah encoding:", df_after.shape[1])

===== Dataset Setelah Encoding =====
   sex=0  sex=1  cp=1  cp=2  cp=3  cp=4  fbs=0  fbs=1  restecg=0  restecg=1  \
0    0.0    1.0   1.0   0.0   0.0   0.0    0.0    1.0        0.0        0.0   
1    0.0    1.0   0.0   0.0   0.0   1.0    1.0    0.0        0.0        0.0   
2    0.0    1.0   0.0   0.0   0.0   1.0    1.0    0.0        0.0        0.0   
3    0.0    1.0   0.0   0.0   1.0   0.0    1.0    0.0        1.0        0.0   
4    1.0    0.0   0.0   1.0   0.0   0.0    1.0    0.0        0.0        0.0   

   restecg=2  exang=0  exang=1  slope=1  slope=2  slope=3  ca=0  ca=0  ca=1  \
0        1.0      1.0      0.0      0.0      0.0      1.0   1.0   0.0   0.0   
1        1.0      0.0      1.0      0.0      1.0      0.0   0.0   0.0   0.0   
2        1.0      0.0      1.0      0.0      1.0      0.0   0.0   0.0   0.0   
3        0.0      1.0      0.0      0.0      0.0      1.0   1.0   0.0   0.0   
4        1.0      1.0      0.0      1.0      0.0      0.0   1.0   0.0   0.0   

   ca=2  ca=3

In [92]:
# Encoding data label (Class/Target)
# Label (target) pada dataset: 0 = sehat, 1–4 = memiliki tingkat penyakit
# Kita ubah menjadi biner (0 = sehat, 1 = sakit)
y = np.where(y > 0, 1, 0)

In [93]:
print("===== Setelah Label Encoding (y) =====")
print(y[:20], "\n")

===== Setelah Label Encoding (y) =====
[0 1 1 0 0 0 1 0 1 1 0 0 1 0 0 0 1 0 0 0] 



In [107]:
# Membagi dataset ke dalam training set dan test set
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=1
)

In [109]:
# ===== Ubah ke DataFrame agar tampil dengan nama kolom =====
x_train_df = pd.DataFrame(x_train, columns=dataset.columns[:-1])
x_test_df = pd.DataFrame(x_test, columns=dataset.columns[:-1])
y_train_df = pd.DataFrame(y_train, columns=['target'])
y_test_df = pd.DataFrame(y_test, columns=['target'])

In [99]:
# ===== Tampilkan hasil =====
print("===== Data Training (x_train) =====")
print(x_train_df.head(10))   # tampilkan 10 baris pertama
print("\n===== Data Testing (x_test) =====")
print(x_test_df.head(10))

print("\n===== Label Training (y_train) =====")
print(y_train_df.head(10))
print("\n===== Label Testing (y_test) =====")
print(y_test_df.head(10))

===== Data Training (x_train) =====
    age  sex   cp  trestbps   chol  fbs  restecg  thalach  exang  oldpeak  \
0  58.0  1.0  4.0     128.0  216.0  0.0      2.0    131.0    1.0      2.2   
1  54.0  1.0  4.0     110.0  239.0  0.0      0.0    126.0    1.0      2.8   
2  56.0  1.0  4.0     125.0  249.0  1.0      2.0    144.0    1.0      1.2   
3  58.0  1.0  2.0     125.0  220.0  0.0      0.0    144.0    0.0      0.4   
4  61.0  1.0  4.0     120.0  260.0  0.0      0.0    140.0    1.0      3.6   
5  44.0  1.0  3.0     140.0  235.0  0.0      2.0    180.0    0.0      0.0   
6  54.0  0.0  2.0     132.0  288.0  1.0      2.0    159.0    1.0      0.0   
7  48.0  1.0  4.0     130.0  256.0  1.0      2.0    150.0    1.0      0.0   
8  48.0  1.0  4.0     124.0  274.0  0.0      2.0    166.0    0.0      0.5   
9  61.0  1.0  3.0     150.0  243.0  1.0      0.0    137.0    1.0      1.0   

   slope        ca  thal  
0    2.0  3.000000   7.0  
1    2.0  1.000000   7.0  
2    2.0  1.000000   3.0  
3    2.0

In [101]:
# Feature Scaling
print("===== Sebelum Feature Scaling =====")
print(x_train_df.head(10))

===== Sebelum Feature Scaling =====
    age  sex   cp  trestbps   chol  fbs  restecg  thalach  exang  oldpeak  \
0  58.0  1.0  4.0     128.0  216.0  0.0      2.0    131.0    1.0      2.2   
1  54.0  1.0  4.0     110.0  239.0  0.0      0.0    126.0    1.0      2.8   
2  56.0  1.0  4.0     125.0  249.0  1.0      2.0    144.0    1.0      1.2   
3  58.0  1.0  2.0     125.0  220.0  0.0      0.0    144.0    0.0      0.4   
4  61.0  1.0  4.0     120.0  260.0  0.0      0.0    140.0    1.0      3.6   
5  44.0  1.0  3.0     140.0  235.0  0.0      2.0    180.0    0.0      0.0   
6  54.0  0.0  2.0     132.0  288.0  1.0      2.0    159.0    1.0      0.0   
7  48.0  1.0  4.0     130.0  256.0  1.0      2.0    150.0    1.0      0.0   
8  48.0  1.0  4.0     124.0  274.0  0.0      2.0    166.0    0.0      0.5   
9  61.0  1.0  3.0     150.0  243.0  1.0      0.0    137.0    1.0      1.0   

   slope        ca  thal  
0    2.0  3.000000   7.0  
1    2.0  1.000000   7.0  
2    2.0  1.000000   3.0  
3    2.0

In [102]:
# Lakukan Standarisasi (StandardScaler)
sc = StandardScaler()
x_train_scaled = sc.fit_transform(x_train)
x_test_scaled = sc.transform(x_test)

In [103]:
# Ubah kembali hasil scaling ke DataFrame agar tetap ada nama kolom
x_train_scaled_df = pd.DataFrame(x_train_scaled, columns=dataset.columns[:-1])
x_test_scaled_df = pd.DataFrame(x_test_scaled, columns=dataset.columns[:-1])

In [104]:
# Tampilkan hasil scaling
print("\n===== Sesudah Feature Scaling (x_train) =====")
print(x_train_scaled_df.head(10))


===== Sesudah Feature Scaling (x_train) =====
        age       sex        cp  trestbps      chol       fbs   restecg  \
0  0.394913  0.715891  0.861173 -0.205289 -0.613153 -0.411196  1.061864   
1 -0.048049  0.715891  0.861173 -1.212316 -0.185429 -0.411196 -0.953596   
2  0.173432  0.715891  0.861173 -0.373127  0.000538  2.431930  1.061864   
3  0.394913  0.715891 -1.276302 -0.373127 -0.538766 -0.411196 -0.953596   
4  0.727135  0.715891  0.861173 -0.652857  0.205102 -0.411196 -0.953596   
5 -1.155453  0.715891 -0.207565  0.466062 -0.259816 -0.411196  1.061864   
6 -0.048049 -1.396861 -1.276302  0.018495  0.725809  2.431930  1.061864   
7 -0.712491  0.715891  0.861173 -0.093397  0.130715  2.431930  1.061864   
8 -0.712491  0.715891  0.861173 -0.429073  0.465456 -0.411196  1.061864   
9  0.727135  0.715891 -0.207565  1.025522 -0.111042  2.431930 -0.953596   

    thalach     exang   oldpeak     slope        ca      thal  
0 -0.852478  1.396861  0.979719  0.678075  2.555198  1.161003  

In [105]:
print("\n===== Sesudah Feature Scaling (x_test) =====")
print(x_test_scaled_df.head(10))


===== Sesudah Feature Scaling (x_test) =====
        age       sex        cp  trestbps      chol       fbs   restecg  \
0 -1.266193  0.715891  0.861173 -1.212316 -0.706137 -0.411196 -0.953596   
1  1.502318  0.715891 -0.207565 -0.764749  0.521246 -0.411196 -0.953596   
2  0.505654  0.715891  0.861173  0.354170  0.409665 -0.411196  1.061864   
3  1.059356  0.715891  0.861173  0.745792 -0.687540 -0.411196  1.061864   
4  0.616394 -1.396861  0.861173  1.473089  1.041953 -0.411196  1.061864   
5 -1.487674  0.715891 -1.276302 -0.652857 -1.710359 -0.411196 -0.953596   
6 -1.376934  0.715891 -0.207565 -0.093397 -1.282635 -0.411196 -0.953596   
7  1.170096  0.715891  0.861173  0.186332  0.093521 -0.411196  1.061864   
8  0.837875 -1.396861  0.861173  0.354170  0.837390  2.431930 -0.953596   
9  1.059356  0.715891  0.861173 -0.652857 -0.055252 -0.411196  1.061864   

    thalach     exang   oldpeak     slope        ca      thal  
0  0.495005 -0.715891 -0.923939 -0.996355 -0.710873  1.161003  


In [106]:
# Tampilkan ukuran data
print("\n===== Ukuran Data =====")
print("x_train:", x_train_scaled_df.shape)
print("x_test :", x_test_scaled_df.shape)
print("y_train:", y_train_df.shape)
print("y_test :", y_test_df.shape)


===== Ukuran Data =====
x_train: (242, 13)
x_test : (61, 13)
y_train: (242, 1)
y_test : (61, 1)
