# Linear Regression - Cross Validation and Performance Tuning

## Kode Tahapan Sebelumnya

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Build a reproducible synthetic dataset: 200 houses whose price is a linear
# function of land area (base 150000 + 300 per unit of area) plus integer noise.
np.random.seed(42)
land_area = np.random.randint(low=100, high=500, size=200)
# Drawn after land_area so the random stream is consumed in the same order.
price_noise = np.random.randint(low=-50000, high=50000, size=200)
house_price = 150000 + 300 * land_area + price_noise

data = pd.DataFrame(
    {
        'Luas_Tanah': land_area,
        'Harga_Rumah': house_price,
    }
)

# Separate the independent variable (X, kept 2-D as a DataFrame) from the
# dependent variable (y, a Series).
X = data.loc[:, ['Luas_Tanah']]
y = data.loc[:, 'Harga_Rumah']

# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardise the feature: the scaler is fitted on the training rows only and
# the same fitted transform is then applied to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("X Train", X_train)

print("Y Train", y_train)

from sklearn.linear_model import Lasso, Ridge
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.metrics import mean_squared_error

# Example of univariate feature selection: score features with f_regression.
# k="all" keeps every feature, so this step only demonstrates the API here.
selector = SelectKBest(score_func=f_regression, k="all").fit(X_train_scaled, y_train)
X_train_selected = selector.transform(X_train_scaled)
X_test_selected = selector.transform(X_test_scaled)


def _fit_and_score(estimator):
    """Fit ``estimator`` on the selected training features.

    Returns the fitted estimator together with its mean squared error on the
    held-out test set.
    """
    estimator.fit(X_train_selected, y_train)
    predictions = estimator.predict(X_test_selected)
    return estimator, mean_squared_error(y_test, predictions)


# Example of regularisation with Lasso (L1) regression.
lasso_model, lasso_mse = _fit_and_score(Lasso(alpha=0.01))

# Example of regularisation with Ridge (L2) regression.
ridge_model, ridge_mse = _fit_and_score(Ridge(alpha=0.01))

print("Lasso Regression MSE:", lasso_mse)
print("Ridge Regression MSE:", ridge_mse)


X Train      Luas_Tanah
79          301
197         458
38          150
24          376
122         300
..          ...
106         108
14          459
92          466
179         127
102         400

[160 rows x 1 columns]
Y Train 79     212218
197    273177
38     197662
24     239536
122    240108
        ...  
106    145274
14     277053
92     269655
179    204335
102    226295
Name: Harga_Rumah, Length: 160, dtype: int32
Lasso Regression MSE: 833819407.7700809
Ridge Regression MSE: 833813011.9268351


## Cross Validation

In [2]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Plain linear regression estimator (left unfitted; cross_val_score works on
# clones of the estimator it is given).
model = LinearRegression()

# Scale inside the cross-validation loop. Cross-validating on data that was
# already scaled with statistics from the whole training set lets each
# validation fold influence the preprocessing (data leakage). Wrapping the
# scaler and the model in one pipeline refits the scaler on the training part
# of every fold instead.
cv_pipeline = make_pipeline(StandardScaler(), model)

# 5-fold cross-validation on the raw (unscaled) training features.
# scikit-learn reports the *negated* MSE so that larger is always better.
scores = cross_val_score(
    cv_pipeline,
    X_train,
    y_train,
    cv=5,
    scoring='neg_mean_squared_error',
)

# Flip the sign back to obtain the average mean squared error across folds.
mse_cv = -scores.mean()

print("Average MSE from Cross Validation:", mse_cv)


Average MSE from Cross Validation: 868869330.0939802


## Performance Tuning

In [3]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso, Ridge

# Lasso-regularised linear regression with default settings; alpha is the
# hyper-parameter tuned below.
lasso_model = Lasso()

# Candidate alpha values to evaluate in the grid search.
param_grid = dict(alpha=[0.01, 0.1, 1.0, 10.0])

# Exhaustive search over param_grid, scoring each candidate with 5-fold
# cross-validation on negated MSE (higher is better).
grid_search = GridSearchCV(
    estimator=lasso_model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
).fit(X_train_selected, y_train)

# Alpha value with the best cross-validated score.
# NOTE(review): the recorded run picks the smallest candidate in the grid, so
# the optimum may lie below 0.01 - consider extending the grid downwards.
best_alpha = grid_search.best_params_['alpha']

print("Best alpha:", best_alpha)


Best alpha: 0.01
