In [49]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.impute import SimpleImputer 

# 1. Load the dataset and separate the features from the target
dataset = pd.read_csv('Data.csv', sep=';', index_col=0)

# X takes every row of all columns except the last one; per the printed
# output these are Country (0), Age (1) and Salary (2).
# y takes every row of the last column.
X, y = dataset.iloc[:, :-1].values, dataset.iloc[:, -1].values

print("X sebelum Imputasi:\n", X)

# 2. Replace missing values (NaN) with the column mean
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# Only the numeric columns at positions 1 and 2 (slice 1:3) are imputed:
# the imputer is fitted on them and the filled values are written back
# into X in place. Column 0 (Country) is left untouched.
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

print("\n" + "-" * 30)
print("X setelah Imputasi:\n", X)

X sebelum Imputasi:
 [['France' 47.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 61000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 670000.0]
 ['France' 45.0 72000.0]
 ['Spain' 44.0 48000.0]
 ['Germany' 36.0 61000.0]
 ['Spain' 33.0 61000.0]
 ['Germany' nan nan]
 ['France' 41.0 58000.0]
 ['Spain' 42.0 52000.0]
 ['France' 43.0 79000.0]
 ['Germany' 34.0 83000.0]
 ['France' 32.0 670000.0]]

------------------------------
X setelah Imputasi:
 [['France' 47.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 61000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 131555.55555555556]
 ['France' 35.0 58000.0]
 ['Spain' 39.0 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 670000.0]
 ['France' 45.0 72000.0]
 ['Spain' 44.0 48000.0]
 ['Germany' 36.0 61000.0]
 ['Spain' 33.0 61000.0]
 ['Germany' 39.0 131555.55555555556]
 ['France' 41.0 58000.0]
 ['Spain' 

In [51]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler 

# One-hot encode the categorical column at index 0 (Country) and pass the
# remaining columns (Age, Salary) through unchanged.
country_encoding = ('encoder', OneHotEncoder(), [0])
ct = ColumnTransformer(transformers=[country_encoding], remainder='passthrough')

# X is overwritten with its encoded form, so re-running this cell would
# apply the encoder to an already-encoded array.
X = np.array(ct.fit_transform(X))

print("\nX setelah One-Hot Encoding:\n", X)


X setelah One-Hot Encoding:
 [[1.0 0.0 0.0 47.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 61000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 131555.55555555556]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 39.0 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 670000.0]
 [1.0 0.0 0.0 45.0 72000.0]
 [0.0 0.0 1.0 44.0 48000.0]
 [0.0 1.0 0.0 36.0 61000.0]
 [0.0 0.0 1.0 33.0 61000.0]
 [0.0 1.0 0.0 39.0 131555.55555555556]
 [1.0 0.0 0.0 41.0 58000.0]
 [0.0 0.0 1.0 42.0 52000.0]
 [1.0 0.0 0.0 43.0 79000.0]
 [0.0 1.0 0.0 34.0 83000.0]
 [1.0 0.0 0.0 32.0 670000.0]]


In [53]:
from sklearn.preprocessing import LabelEncoder

# Fit a LabelEncoder on the target vector, then replace y with the
# integer codes it assigns to each class.
le = LabelEncoder().fit(y)
y = le.transform(y)

print("\ny setelah Label Encoding:\n", y)


y setelah Label Encoding:
 [0 1 0 0 1 1 0 1 0 1 0 1 0 0 1 1 0 1 0 1]


In [55]:

# train_test_split is called below but is not imported by any visible cell;
# import it here so this cell also runs after a kernel restart.
from sklearn.model_selection import train_test_split

# 1. Load the dataset
dataset = pd.read_csv('Data.csv', sep=';', index_col=0)

# Separate the features (X: all columns but the last) and the target (y: last column)
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

# 2. Impute missing values: replace NaN in columns 1 and 2 with the column mean
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

# 3. Encode the categorical feature (column 0, Country) as one-hot columns;
#    the remaining columns are passed through unchanged
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough'
)
X = np.array(ct.fit_transform(X))

# 4. Encode the categorical target as integer labels
le = LabelEncoder()
y = le.fit_transform(y)

# 5. Split the dataset: 20% held out for testing, fixed seed for reproducibility
# NOTE(review): the recorded output shows y_test containing only class 0;
# consider passing stratify=y so both classes appear in the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# 6. Print the resulting splits
print("--- Hasil Pembagian Dataset ---")
print("\nX_train (Fitur untuk Training):\n", X_train)
print("\n" + "-"*30)
print("\nX_test (Fitur untuk Testing):\n", X_test)
print("\n" + "-"*30)
print("\ny_train (Target untuk Training):\n", y_train)
print("\n" + "-"*30)
print("\ny_test (Target untuk Testing):\n", y_test)

--- Hasil Pembagian Dataset ---

X_train (Fitur untuk Training):
 [[0.0 1.0 0.0 30.0 61000.0]
 [0.0 1.0 0.0 39.0 131555.55555555556]
 [0.0 1.0 0.0 40.0 131555.55555555556]
 [1.0 0.0 0.0 43.0 79000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 0.0 1.0 33.0 61000.0]
 [1.0 0.0 0.0 47.0 72000.0]
 [1.0 0.0 0.0 32.0 670000.0]
 [0.0 1.0 0.0 34.0 83000.0]
 [1.0 0.0 0.0 37.0 670000.0]
 [1.0 0.0 0.0 41.0 58000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [0.0 1.0 0.0 36.0 61000.0]
 [0.0 0.0 1.0 44.0 48000.0]
 [1.0 0.0 0.0 35.0 58000.0]]

------------------------------

X_test (Fitur untuk Testing):
 [[0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 42.0 52000.0]
 [0.0 0.0 1.0 39.0 52000.0]
 [1.0 0.0 0.0 45.0 72000.0]]

------------------------------

y_train (Target untuk Training):
 [0 1 1 1 1 1 0 0 1 0 1 1 0 0 1 1]

------------------------------

y_test (Target untuk Testing):
 [0 0 0 0]


In [57]:
# 6. Feature scaling
# Only the columns from index 3 onward are standardized; the first three
# columns are left as they are. The scaler is fitted on the training split
# alone and then applied to both splits.
sc = StandardScaler()
sc.fit(X_train[:, 3:])

X_train[:, 3:] = sc.transform(X_train[:, 3:])
X_test[:, 3:] = sc.transform(X_test[:, 3:])

# 7. Print the scaled splits
print("--- Hasil Feature Scaling ---")
print("\nX_train setelah Scaling:\n", X_train)
print("\n" + "-" * 30)
print("\nX_test setelah Scaling:\n", X_test)

--- Hasil Feature Scaling ---

X_train setelah Scaling:
 [[0.0 1.0 0.0 -1.3115784746777812 -0.44733052338482543]
 [0.0 1.0 0.0 0.07715167498104596 -0.09123257481553709]
 [0.0 1.0 0.0 0.23145502494313785 -0.09123257481553709]
 [1.0 0.0 0.0 0.6943650748294136 -0.35648348768683374]
 [1.0 0.0 0.0 1.465881824639873 -0.35648348768683374]
 [0.0 0.0 1.0 -1.774488524564057 -0.5129422713889306]
 [0.0 0.0 1.0 -0.8486684247915055 -0.44733052338482543]
 [1.0 0.0 0.0 1.3115784746777812 -0.39181289045827494]
 [1.0 0.0 0.0 -1.0029717747535973 2.626327517730559]
 [0.0 1.0 0.0 -0.6943650748294136 -0.3362952575317245]
 [1.0 0.0 0.0 -0.23145502494313785 2.626327517730559]
 [1.0 0.0 0.0 0.38575837490522974 -0.4624716960011574]
 [0.0 1.0 0.0 1.774488524564057 -0.3362952575317245]
 [0.0 1.0 0.0 -0.38575837490522974 -0.44733052338482543]
 [0.0 0.0 1.0 0.8486684247915055 -0.5129422713889306]
 [1.0 0.0 0.0 -0.5400617248673216 -0.4624716960011574]]

------------------------------

X_test setelah Scaling:
 [[0.0 