In [126]:
# Import library yang dibutuhkan
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [127]:
california_housing = fetch_california_housing(as_frame=True)
cali_df = california_housing.frame

In [128]:
cali_df.columns = cali_df.columns.str.replace(' ', '')

In [129]:
cali_df['MedInc'].max()

np.float64(15.0001)

In [130]:
cali_df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


In [131]:
X = cali_df.drop('MedHouseVal', axis=1)
y = cali_df['MedHouseVal']

In [132]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [133]:
labels = ['low_income', 'medium_income', 'high_income', 'very_high_income']
bins = [0, 2.5, 5.0, 7.5, np.inf]
X_train['income_categorical'] = pd.cut(X_train['MedInc'], bins=bins, labels=labels, right=False)
X_test['income_categorical'] = pd.cut(X_test['MedInc'], bins=bins, labels=labels, right=False)

In [134]:
X_train[['MedInc', 'income_categorical']].head()

Unnamed: 0,MedInc,income_categorical
14196,3.2596,medium_income
8267,3.8125,medium_income
17445,4.1563,medium_income
14265,1.9425,low_income
2271,3.5542,medium_income


### One Hot Encoding

In [135]:
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

In [136]:
# fit_transform(), transform()
X_train_cat = encoder.fit_transform(X_train[['income_categorical']])
X_test_cat = encoder.transform(X_test[['income_categorical']])

In [137]:
X_train_cat

array([[0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       ...,
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.]])

In [138]:
feature_name = encoder.get_feature_names_out(['income_categorical'])
X_train_cat_df = pd.DataFrame(X_train_cat, columns=feature_name, index=X_train.index)
X_test_cat_df = pd.DataFrame(X_test_cat, columns=feature_name, index=X_test.index)

# encoded_feature_names = encoder.get_feature_names_out(['income_category'])

In [139]:
X_train_cat_df

Unnamed: 0,income_categorical_high_income,income_categorical_low_income,income_categorical_medium_income,income_categorical_very_high_income
14196,0.0,0.0,1.0,0.0
8267,0.0,0.0,1.0,0.0
17445,0.0,0.0,1.0,0.0
14265,0.0,1.0,0.0,0.0
2271,0.0,0.0,1.0,0.0
...,...,...,...,...
11284,1.0,0.0,0.0,0.0
11964,0.0,0.0,1.0,0.0
5390,0.0,0.0,1.0,0.0
860,1.0,0.0,0.0,0.0


### Polynomial Feature HouseAge and Population

In [140]:
poly = PolynomialFeatures(degree=2, include_bias=False)

In [141]:
X_train_poly = poly.fit_transform(X_train[['HouseAge', 'Population']])
X_test_poly = poly.transform(X_test[['HouseAge', 'Population']])

In [142]:
poly_feature_names = poly.get_feature_names_out(['HouseAge', 'Population'])
X_train_poly_df = pd.DataFrame(X_train_poly, columns=poly_feature_names, index=X_train.index)
X_test_poly_df = pd.DataFrame(X_test_poly, columns=poly_feature_names, index=X_test.index)

In [143]:
X_train_poly_df

Unnamed: 0,HouseAge,Population,HouseAge^2,HouseAge Population,Population^2
14196,33.0,2300.0,1089.0,75900.0,5290000.0
8267,49.0,1314.0,2401.0,64386.0,1726596.0
17445,4.0,915.0,16.0,3660.0,837225.0
14265,36.0,1418.0,1296.0,51048.0,2010724.0
2271,43.0,874.0,1849.0,37582.0,763876.0
...,...,...,...,...,...
11284,35.0,658.0,1225.0,23030.0,432964.0
11964,33.0,1753.0,1089.0,57849.0,3073009.0
5390,36.0,1756.0,1296.0,63216.0,3083536.0
860,15.0,1777.0,225.0,26655.0,3157729.0


In [144]:
X_train_cleaned = X_train.drop(columns=['income_categorical', 'HouseAge', 'Population'])
X_test_cleaned = X_test.drop(columns=['income_categorical', 'HouseAge', 'Population'])

In [145]:
X_train_engineered = pd.concat([X_train_cleaned, X_train_cat_df, X_train_poly_df], axis=1)
X_test_engineered = pd.concat([X_test_cleaned, X_test_cat_df, X_test_poly_df], axis=1)

In [146]:
X_train_engineered

Unnamed: 0,MedInc,AveRooms,AveBedrms,AveOccup,Latitude,Longitude,income_categorical_high_income,income_categorical_low_income,income_categorical_medium_income,income_categorical_very_high_income,HouseAge,Population,HouseAge^2,HouseAge Population,Population^2
14196,3.2596,5.017657,1.006421,3.691814,32.71,-117.03,0.0,0.0,1.0,0.0,33.0,2300.0,1089.0,75900.0,5290000.0
8267,3.8125,4.473545,1.041005,1.738095,33.77,-118.16,0.0,0.0,1.0,0.0,49.0,1314.0,2401.0,64386.0,1726596.0
17445,4.1563,5.645833,0.985119,2.723214,34.66,-120.48,0.0,0.0,1.0,0.0,4.0,915.0,16.0,3660.0,837225.0
14265,1.9425,4.002817,1.033803,3.994366,32.69,-117.11,0.0,1.0,0.0,0.0,36.0,1418.0,1296.0,51048.0,2010724.0
2271,3.5542,6.268421,1.134211,2.300000,36.78,-119.80,0.0,0.0,1.0,0.0,43.0,874.0,1849.0,37582.0,763876.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11284,6.3700,6.129032,0.926267,3.032258,33.78,-117.96,1.0,0.0,0.0,0.0,35.0,658.0,1225.0,23030.0,432964.0
11964,3.0500,6.868597,1.269488,3.904232,34.02,-117.43,0.0,0.0,1.0,0.0,33.0,1753.0,1089.0,57849.0,3073009.0
5390,2.9344,3.986717,1.079696,3.332068,34.03,-118.38,0.0,0.0,1.0,0.0,36.0,1756.0,1296.0,63216.0,3083536.0
860,5.7192,6.395349,1.067979,3.178891,37.58,-121.96,1.0,0.0,0.0,0.0,15.0,1777.0,225.0,26655.0,3157729.0


### Feature Scaling

In [147]:
scaler_standard = StandardScaler()
X_train_scaled_standard = scaler_standard.fit_transform(X_train_engineered)
X_test_scaled_standard = scaler_standard.transform(X_test_engineered)

In [148]:
feature_names = scaler_standard.get_feature_names_out()
X_train_scaled_standard_df = pd.DataFrame(X_train_scaled_standard, columns=feature_names, index=X_train.index)
X_test_scaled_standard_df = pd.DataFrame(X_test_scaled_standard, columns=feature_names, index=X_test.index)

In [151]:
X_train_scaled_standard_df.describe()

Unnamed: 0,MedInc,AveRooms,AveBedrms,AveOccup,Latitude,Longitude,income_categorical_high_income,income_categorical_low_income,income_categorical_medium_income,income_categorical_very_high_income,HouseAge,Population,HouseAge^2,HouseAge Population,Population^2
count,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0
mean,-6.519333000000001e-17,-1.981081e-16,-1.707291e-16,4.9365660000000006e-17,6.400995e-17,1.753335e-15,6.562365e-18,-3.61468e-17,9.337922e-17,-3.571648e-17,-9.251859000000001e-18,-2.1515949999999998e-19,-1.9471930000000003e-17,-3.378004e-17,1.180688e-17
std,1.00003,1.00003,1.00003,1.00003,1.00003,1.00003,1.00003,1.00003,1.00003,1.00003,1.00003,1.00003,1.00003,1.00003,1.00003
min,-1.775438,-1.904386,-1.762117,-0.2076846,-1.447697,-2.377207,-0.4608946,-0.5479597,-1.100201,-0.2202592,-2.190766,-1.251913,-1.29918,-1.442046,-0.2349694
25%,-0.6900689,-0.4118373,-0.2081645,-0.05770769,-0.8018107,-1.110749,-0.4608946,-0.5479597,-1.100201,-0.2202592,-0.8417859,-0.5606339,-0.8693349,-0.6756449,-0.1910121
50%,-0.1758995,-0.08350905,-0.109416,-0.02415892,-0.6473597,0.5346501,-0.4608946,-0.5479597,0.908925,-0.2202592,0.03108328,-0.2281865,-0.1813166,-0.1883631,-0.1388033
75%,0.4686502,0.2621376,0.008455177,0.01580865,0.9720351,0.783953,-0.4608946,-0.5479597,0.908925,-0.2202592,0.6658972,0.2634487,0.5213405,0.453718,-0.02460932
max,5.839268,57.16655,56.64727,107.1164,2.951816,2.628794,2.169694,1.824952,0.908925,4.540106,1.856173,30.12743,2.297945,18.24128,89.66953


In [None]:
model_raw = LinearRegression()
# Kita harus mengambil data numerik saja, karena model tidak bisa memproses data kategorikal
X_train_raw = X_train.select_dtypes(include=np.number)
X_test_raw = X_test.select_dtypes(include=np.number)
model_raw.fit(X_train_raw, y_train)
y_pred_raw = model_raw.predict(X_test_raw)
print("Metrik Model dengan Data Mentah:")
print(f"MSE: {mean_squared_error(y_test, y_pred_raw):.2f}")
print(f"NMSE: {(mean_squared_error(y_test, y_pred_raw) / (y_pred_raw.max() - y_pred_raw.min())):.2f}")

Metrik Model dengan Data Mentah:
MSE: 0.56
NMSE: 0.04


In [None]:
model_eng = LinearRegression()
model_eng.fit(X_train_engineered, y_train)
y_pred_fe = model_eng.predict(X_test_engineered)
print("Metrik Model dengan Data Feture Engineering:")
print(f"MSE: {mean_squared_error(y_test, y_pred_fe):.2f}")
print(f"NMSE: {(mean_squared_error(y_test, y_pred_fe) / (y_pred_fe.max() - y_pred_fe.min())):.2f}")


Metrik Model dengan Data Feture Engineering:
MSE: 0.55
NMSE: 0.04


In [156]:
model_scaling = LinearRegression()
model_scaling.fit(X_train_scaled_standard, y_train)
y_pred_std = model_scaling.predict(X_test_scaled_standard)
print("Metrik Model dengan Data Feture Engineering:")
print(f"MSE: {mean_squared_error(y_test, y_pred_std):.2f}")
print(f"NMSE: {(mean_squared_error(y_test, y_pred_std) / (y_pred_std.max() - y_pred_std.min())):.2f}")


Metrik Model dengan Data Feture Engineering:
MSE: 0.55
NMSE: 0.04


In [5]:

california_housing = fetch_california_housing(as_frame=True)
cali_df = california_housing.frame
cali_df.columns = cali_df.columns.str.replace(' ', '') # Menghapus spasi di nama kolom


In [6]:
cali_df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


In [7]:
cali_df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [55]:
cali_df['MedInc'].min()

np.float64(0.4999)

In [8]:
X = cali_df.drop('MedHouseVal', axis=1)
y = cali_df['MedHouseVal']

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Fitur Categorical Buatan dengan Binning

In [10]:
# Kita akan membagi Median Income menjadi 4 kategori (bins)
bins = [0, 2.5, 5, 7.5, np.inf]
labels = ['low_income', 'medium_income', 'high_income', 'very_high_income']
X_train['income_category'] = pd.cut(X_train['MedInc'], bins=bins, labels=labels, right=False)
X_test['income_category'] = pd.cut(X_test['MedInc'], bins=bins, labels=labels, right=False)

In [56]:
print(X_train[['MedInc', 'income_category']].sample())

     MedInc income_category
829  3.7931   medium_income


### One hot encoding

In [14]:
# kategori tadi kita one hot encoding
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
# Fit dan transform encoder pada data latih
X_train_cat = encoder.fit_transform(X_train[['income_category']])
# Transform hanya pada data uji (jangan fit ulang)
X_test_cat = encoder.transform(X_test[['income_category']])

In [16]:
X_test_cat

array([[0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       ...,
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.]])

In [None]:
# Buat DataFrame dari hasil encoding
encoded_feature_names = encoder.get_feature_names_out(['income_category'])
X_train_cat_df = pd.DataFrame(X_train_cat, columns=encoded_feature_names, index=X_train.index)
X_test_cat_df = pd.DataFrame(X_test_cat, columns=encoded_feature_names, index=X_test.index)


In [22]:
X_train_cat_df.head()

Unnamed: 0,income_category_high_income,income_category_low_income,income_category_medium_income,income_category_very_high_income
14196,0.0,0.0,1.0,0.0
8267,0.0,0.0,1.0,0.0
17445,0.0,0.0,1.0,0.0
14265,0.0,1.0,0.0,0.0
2271,0.0,0.0,1.0,0.0


In [None]:
# Polynomial Features untuk fitur numerik
# Pilih fitur numerik yang relevan untuk dikombinasikan (misal: HouseAge dan Population)
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train[['HouseAge', 'Population']])
X_test_poly = poly.transform(X_test[['HouseAge', 'Population']])

In [24]:
poly_feature_names = poly.get_feature_names_out(['HouseAge', 'Population'])
X_train_poly_df = pd.DataFrame(X_train_poly, columns=poly_feature_names, index=X_train.index)
X_test_poly_df = pd.DataFrame(X_test_poly, columns=poly_feature_names, index=X_test.index)


In [25]:
print(X_train_poly_df.head())

       HouseAge  Population  HouseAge^2  HouseAge Population  Population^2
14196      33.0      2300.0      1089.0              75900.0     5290000.0
8267       49.0      1314.0      2401.0              64386.0     1726596.0
17445       4.0       915.0        16.0               3660.0      837225.0
14265      36.0      1418.0      1296.0              51048.0     2010724.0
2271       43.0       874.0      1849.0              37582.0      763876.0


In [26]:
X_train_cleaned = X_train.drop(columns=['income_category', 'HouseAge', 'Population'])
X_test_cleaned = X_test.drop(columns=['income_category', 'HouseAge', 'Population'])

In [27]:
# Gabungkan semua fitur yang sudah di-engineer
# Hapus fitur yang sudah diolah dan fitur buatan
X_train_cleaned = X_train.drop(columns=['income_category', 'HouseAge', 'Population'])
X_test_cleaned = X_test.drop(columns=['income_category', 'HouseAge', 'Population'])


In [28]:
X_train_engineered = pd.concat([X_train_cleaned, X_train_cat_df, X_train_poly_df], axis=1)
X_test_engineered = pd.concat([X_test_cleaned, X_test_cat_df, X_test_poly_df], axis=1)

In [29]:
X_train_engineered.head()

Unnamed: 0,MedInc,AveRooms,AveBedrms,AveOccup,Latitude,Longitude,income_category_high_income,income_category_low_income,income_category_medium_income,income_category_very_high_income,HouseAge,Population,HouseAge^2,HouseAge Population,Population^2
14196,3.2596,5.017657,1.006421,3.691814,32.71,-117.03,0.0,0.0,1.0,0.0,33.0,2300.0,1089.0,75900.0,5290000.0
8267,3.8125,4.473545,1.041005,1.738095,33.77,-118.16,0.0,0.0,1.0,0.0,49.0,1314.0,2401.0,64386.0,1726596.0
17445,4.1563,5.645833,0.985119,2.723214,34.66,-120.48,0.0,0.0,1.0,0.0,4.0,915.0,16.0,3660.0,837225.0
14265,1.9425,4.002817,1.033803,3.994366,32.69,-117.11,0.0,1.0,0.0,0.0,36.0,1418.0,1296.0,51048.0,2010724.0
2271,3.5542,6.268421,1.134211,2.3,36.78,-119.8,0.0,0.0,1.0,0.0,43.0,874.0,1849.0,37582.0,763876.0


In [30]:
X_test_engineered.head()

Unnamed: 0,MedInc,AveRooms,AveBedrms,AveOccup,Latitude,Longitude,income_category_high_income,income_category_low_income,income_category_medium_income,income_category_very_high_income,HouseAge,Population,HouseAge^2,HouseAge Population,Population^2
20046,1.6812,4.192201,1.022284,3.877437,36.06,-119.01,0.0,1.0,0.0,0.0,25.0,1392.0,625.0,34800.0,1937664.0
3024,2.5313,5.039384,1.193493,2.679795,35.14,-119.46,0.0,0.0,1.0,0.0,30.0,1565.0,900.0,46950.0,2449225.0
15663,3.4801,3.977155,1.185877,1.360332,37.8,-122.44,0.0,0.0,1.0,0.0,52.0,1310.0,2704.0,68120.0,1716100.0
20484,5.7376,6.163636,1.020202,3.444444,34.28,-118.72,1.0,0.0,0.0,0.0,17.0,1705.0,289.0,28985.0,2907025.0
9814,3.725,5.492991,1.028037,2.483645,36.62,-121.93,0.0,0.0,1.0,0.0,34.0,1063.0,1156.0,36142.0,1129969.0


In [31]:
# a) StandardScaler
scaler_standard = StandardScaler()
X_train_scaled_standard = scaler_standard.fit_transform(X_train_engineered)
X_test_scaled_standard = scaler_standard.transform(X_test_engineered)

In [32]:
X_train_scaled_standard_df = pd.DataFrame(X_train_scaled_standard, columns=X_train_engineered.columns)

In [36]:
X_train_scaled_standard_df.describe()

Unnamed: 0,MedInc,AveRooms,AveBedrms,AveOccup,Latitude,Longitude,income_category_high_income,income_category_low_income,income_category_medium_income,income_category_very_high_income,HouseAge,Population,HouseAge^2,HouseAge Population,Population^2
count,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0
mean,-6.519333000000001e-17,-1.981081e-16,-1.707291e-16,4.9365660000000006e-17,6.400995e-17,1.753335e-15,6.562365e-18,-3.61468e-17,9.337922e-17,-3.571648e-17,-9.251859000000001e-18,-2.1515949999999998e-19,-1.9471930000000003e-17,-3.378004e-17,1.180688e-17
std,1.00003,1.00003,1.00003,1.00003,1.00003,1.00003,1.00003,1.00003,1.00003,1.00003,1.00003,1.00003,1.00003,1.00003,1.00003
min,-1.775438,-1.904386,-1.762117,-0.2076846,-1.447697,-2.377207,-0.4608946,-0.5479597,-1.100201,-0.2202592,-2.190766,-1.251913,-1.29918,-1.442046,-0.2349694
25%,-0.6900689,-0.4118373,-0.2081645,-0.05770769,-0.8018107,-1.110749,-0.4608946,-0.5479597,-1.100201,-0.2202592,-0.8417859,-0.5606339,-0.8693349,-0.6756449,-0.1910121
50%,-0.1758995,-0.08350905,-0.109416,-0.02415892,-0.6473597,0.5346501,-0.4608946,-0.5479597,0.908925,-0.2202592,0.03108328,-0.2281865,-0.1813166,-0.1883631,-0.1388033
75%,0.4686502,0.2621376,0.008455177,0.01580865,0.9720351,0.783953,-0.4608946,-0.5479597,0.908925,-0.2202592,0.6658972,0.2634487,0.5213405,0.453718,-0.02460932
max,5.839268,57.16655,56.64727,107.1164,2.951816,2.628794,2.169694,1.824952,0.908925,4.540106,1.856173,30.12743,2.297945,18.24128,89.66953


In [38]:
scaler_minmax = MinMaxScaler()
X_train_scaled_minmax = scaler_minmax.fit_transform(X_train_engineered)
X_test_scaled_minmax = scaler_minmax.transform(X_test_engineered)

X_train_scaled_minmax_df = pd.DataFrame(X_train_scaled_minmax, columns=X_train_engineered.columns)


In [39]:

X_train_scaled_minmax_df.head()


Unnamed: 0,MedInc,AveRooms,AveBedrms,AveOccup,Latitude,Longitude,income_category_high_income,income_category_low_income,income_category_medium_income,income_category_very_high_income,HouseAge,Population,HouseAge^2,HouseAge Population,Population^2
0,0.190322,0.029278,0.026601,0.002414,0.017021,0.729084,0.0,0.0,1.0,0.0,0.627451,0.06438,0.402516,0.151897,0.004155
1,0.228452,0.025419,0.027968,0.000842,0.129787,0.616534,0.0,0.0,1.0,0.0,0.941176,0.036744,0.887902,0.128847,0.001356
2,0.252162,0.033732,0.025759,0.001634,0.224468,0.385458,0.0,0.0,1.0,0.0,0.058824,0.025561,0.005549,0.007279,0.000658
3,0.099488,0.022081,0.027683,0.002657,0.014894,0.721116,0.0,1.0,0.0,0.0,0.686275,0.039659,0.479097,0.102145,0.001579
4,0.210638,0.038147,0.031651,0.001294,0.45,0.453187,0.0,0.0,1.0,0.0,0.823529,0.024412,0.683685,0.075188,0.0006


In [40]:
X_train_scaled_minmax_df.describe()

Unnamed: 0,MedInc,AveRooms,AveBedrms,AveOccup,Latitude,Longitude,income_category_high_income,income_category_low_income,income_category_medium_income,income_category_very_high_income,HouseAge,Population,HouseAge^2,HouseAge Population,Population^2
count,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0
mean,0.233159,0.032239,0.030168,0.001935,0.329058,0.474871,0.175206,0.230923,0.547602,0.046269,0.541339,0.039896,0.361172,0.073262,0.002614
std,0.131329,0.016929,0.017121,0.009318,0.227305,0.199766,0.380155,0.421436,0.497744,0.210074,0.247108,0.031869,0.278008,0.050806,0.011123
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.142536,0.025267,0.026604,0.001397,0.146809,0.252988,0.0,0.0,0.0,0.0,0.333333,0.02203,0.119497,0.038937,0.000489
50%,0.210059,0.030825,0.028295,0.00171,0.181915,0.581673,0.0,0.0,1.0,0.0,0.54902,0.032624,0.310766,0.063693,0.00107
75%,0.294705,0.036677,0.030313,0.002082,0.55,0.631474,0.0,0.0,1.0,0.0,0.705882,0.048292,0.506104,0.096313,0.00234
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [46]:
from sklearn.metrics import r2_score

In [48]:
from sklearn.metrics import r2_score


# Model 1: Menggunakan data MENTAH
model_raw = LinearRegression()
# Kita harus mengambil data numerik saja, karena model tidak bisa memproses data kategorikal
X_train_raw = X_train.select_dtypes(include=np.number)
X_test_raw = X_test.select_dtypes(include=np.number)
model_raw.fit(X_train_raw, y_train)
y_pred_raw = model_raw.predict(X_test_raw)
print("Metrik Model dengan Data Mentah:")
print(f"MSE: {mean_squared_error(y_test, y_pred_raw):.2f}")
print(f"NMSE: {(mean_squared_error(y_test, y_pred_raw) / (y_pred_raw.max() - y_pred_raw.min())):.2f}")
print(f"R2 Score: {r2_score(y_test, y_pred_raw):.2f}")

Metrik Model dengan Data Mentah:
MSE: 0.56
NMSE: 0.04
R2 Score: 0.58


In [57]:
# Model 2: Menggunakan data setelah FEATURE ENGINEERING (tanpa scaling)
model_fe = LinearRegression()
model_fe.fit(X_train_engineered, y_train)
y_pred_fe = model_fe.predict(X_test_engineered)
print("Metrik Model dengan Data Feature Engineering:")
print(f"MSE: {mean_squared_error(y_test, y_pred_fe):.2f}")
print(f"NMSE: {(mean_squared_error(y_test, y_pred_fe) / (y_pred_fe.max() - y_pred_fe.min())):.2f}")
print(f"R2 Score: {r2_score(y_test, y_pred_fe):.2f}")

Metrik Model dengan Data Feature Engineering:
MSE: 0.55
NMSE: 0.04
R2 Score: 0.58


In [50]:
# Model 3: Menggunakan data setelah FEATURE ENGINEERING + FEATURE SCALING (StandardScaler)
model_scaled = LinearRegression()
model_scaled.fit(X_train_scaled_standard, y_train)
y_pred_scaled = model_scaled.predict(X_test_scaled_standard)
print("Metrik Model dengan Data setelah Feature Engineering & Scaling:")
print(f"MSE: {mean_squared_error(y_test, y_pred_scaled):.2f}")
print(f"NMSE: {(mean_squared_error(y_test, y_pred_fe) / (y_pred_scaled.max() - y_pred_scaled.min())):.2f}")
print(f"R2 Score: {r2_score(y_test, y_pred_scaled):.2f}")

Metrik Model dengan Data setelah Feature Engineering & Scaling:
MSE: 0.55
NMSE: 0.04
R2 Score: 0.58
