## Mean Imputation

In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor

# Tải dữ liệu từ file CSV
data = pd.read_csv('energy-data-filtered.csv')

# Chọn các cột quan trọng và kiểm tra missing values
features = ['electricity_generation', 'renewables_electricity', 'fossil_electricity', 'electricity_demand']
target = 'electricity_generation'

X = data[features]
y = X[target]
X = X.drop(columns=[target])

# Kiểm tra và xử lý missing values trong y
y = y.dropna()
X = X.iloc[y.index]  # Điều chỉnh X sao cho phù hợp với các chỉ số của y sau khi loại bỏ missing values

# Chia dữ liệu thành train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Phương pháp 1: Mean Imputation
mean_imputer = SimpleImputer(strategy='mean')
X_train_imputed_mean = mean_imputer.fit_transform(X_train)
X_test_imputed_mean = mean_imputer.transform(X_test)

# Các mô hình học máy
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(),
    "Ridge Regression": Ridge(),
    "Decision Tree": DecisionTreeRegressor(),
    "KNN Regressor": KNeighborsRegressor(),
    "Gradient Boosting": GradientBoostingRegressor(),
    "XGBoost": XGBRegressor(),
}

# Cross-validation cho các mô hình với Mean Imputation
for model_name, model in models.items():
    cv_scores = cross_val_score(model, X_train_imputed_mean, y_train, cv=5, scoring='neg_mean_squared_error')
    print(f'Mean Imputation + {model_name} CV MSE: {-cv_scores.mean()}')


Mean Imputation + Linear Regression CV MSE: 163656.38005856174
Mean Imputation + Random Forest CV MSE: 42636.506010769604
Mean Imputation + Ridge Regression CV MSE: 163656.3795616895
Mean Imputation + Decision Tree CV MSE: 74809.82022592222
Mean Imputation + KNN Regressor CV MSE: 122976.06456132785
Mean Imputation + Gradient Boosting CV MSE: 42378.51678959475
Mean Imputation + XGBoost CV MSE: 59283.92540622699


## Median Imputation

In [15]:
# Phương pháp 2: Median Imputation
median_imputer = SimpleImputer(strategy='median')
X_train_imputed_median = median_imputer.fit_transform(X_train)
X_test_imputed_median = median_imputer.transform(X_test)

# Cross-validation cho các mô hình với Median Imputation
for model_name, model in models.items():
    cv_scores = cross_val_score(model, X_train_imputed_median, y_train, cv=5, scoring='neg_mean_squared_error')
    print(f'Median Imputation + {model_name} CV MSE: {-cv_scores.mean()}')


Median Imputation + Linear Regression CV MSE: 12102.698604457242
Median Imputation + Random Forest CV MSE: 42095.8565382724
Median Imputation + Ridge Regression CV MSE: 12102.698596524442
Median Imputation + Decision Tree CV MSE: 81072.66251442223
Median Imputation + KNN Regressor CV MSE: 118100.70553262283
Median Imputation + Gradient Boosting CV MSE: 42957.83483561207
Median Imputation + XGBoost CV MSE: 54363.81588022523


## KNN Imputation

In [16]:
from sklearn.impute import KNNImputer

# Phương pháp 3: KNN Imputation
knn_imputer = KNNImputer(n_neighbors=5)
X_train_imputed_knn = knn_imputer.fit_transform(X_train)
X_test_imputed_knn = knn_imputer.transform(X_test)

# Cross-validation cho các mô hình với KNN Imputation
for model_name, model in models.items():
    cv_scores = cross_val_score(model, X_train_imputed_knn, y_train, cv=5, scoring='neg_mean_squared_error')
    print(f'KNN Imputation + {model_name} CV MSE: {-cv_scores.mean()}')


KNN Imputation + Linear Regression CV MSE: 9703.711418175182
KNN Imputation + Random Forest CV MSE: 33756.55459255226
KNN Imputation + Ridge Regression CV MSE: 9703.711331604747
KNN Imputation + Decision Tree CV MSE: 52918.001057729634
KNN Imputation + KNN Regressor CV MSE: 51590.79981363745
KNN Imputation + Gradient Boosting CV MSE: 36150.63686945978
KNN Imputation + XGBoost CV MSE: 58492.82583035702


## Interpolation 

In [17]:
# Phương pháp 4: Interpolation
X_train_imputed_interp = X_train.interpolate(method='linear', axis=0)
X_test_imputed_interp = X_test.interpolate(method='linear', axis=0)

# Kiểm tra và xử lý NaN sau interpolation
if X_train_imputed_interp.isna().sum().sum() > 0:
    print("Có NaN trong X_train sau interpolation, điền lại bằng mean imputation.")
    X_train_imputed_interp = X_train_imputed_interp.fillna(X_train_imputed_interp.mean())

if X_test_imputed_interp.isna().sum().sum() > 0:
    print("Có NaN trong X_test sau interpolation, điền lại bằng mean imputation.")
    X_test_imputed_interp = X_test_imputed_interp.fillna(X_test_imputed_interp.mean())

# Cross-validation cho các mô hình với Interpolation
for model_name, model in models.items():
    cv_scores = cross_val_score(model, X_train_imputed_interp, y_train, cv=5, scoring='neg_mean_squared_error')
    print(f'Interpolation + {model_name} CV MSE: {-cv_scores.mean()}')


Có NaN trong X_train sau interpolation, điền lại bằng mean imputation.
Có NaN trong X_test sau interpolation, điền lại bằng mean imputation.
Interpolation + Linear Regression CV MSE: 509822.3539393477
Interpolation + Random Forest CV MSE: 60564.62586743284
Interpolation + Ridge Regression CV MSE: 509822.3530053556
Interpolation + Decision Tree CV MSE: 98718.27669972592
Interpolation + KNN Regressor CV MSE: 1141059.7687162766
Interpolation + Gradient Boosting CV MSE: 68638.96675235454
Interpolation + XGBoost CV MSE: 82979.16800374315


## Drop Missing Values

In [18]:
# Phương pháp 5: Drop Missing Values
X_train_dropped = X_train.dropna()
y_train_dropped = y_train.loc[X_train_dropped.index]  # Đảm bảo rằng y_train có các chỉ số tương ứng
X_test_dropped = X_test.dropna()
y_test_dropped = y_test.loc[X_test_dropped.index]  # Đảm bảo rằng y_test có các chỉ số tương ứng

# Cross-validation cho các mô hình với Drop Missing Values
for model_name, model in models.items():
    cv_scores = cross_val_score(model, X_train_dropped, y_train_dropped, cv=5, scoring='neg_mean_squared_error')
    print(f'Drop Missing Values + {model_name} CV MSE: {-cv_scores.mean()}')


Drop Missing Values + Linear Regression CV MSE: 207.21992945689598
Drop Missing Values + Random Forest CV MSE: 37509.969294779185
Drop Missing Values + Ridge Regression CV MSE: 207.21995774667826
Drop Missing Values + Decision Tree CV MSE: 84819.71152520916
Drop Missing Values + KNN Regressor CV MSE: 107992.43810111037
Drop Missing Values + Gradient Boosting CV MSE: 46402.59221620422
Drop Missing Values + XGBoost CV MSE: 97340.52664382555


In [19]:
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(),
    "Ridge Regression": Ridge(),
    "Decision Tree": DecisionTreeRegressor(),
    "KNN Regressor": KNeighborsRegressor(),
    "Gradient Boosting": GradientBoostingRegressor(),
    "XGBoost": XGBRegressor(),
}

# Các phương pháp Imputation đã có sẵn (các biến đã được tính toán trước)
imputation_methods = {
    "Mean Imputation": X_train_imputed_mean,
    "Median Imputation": X_train_imputed_median,
    "KNN Imputation": X_train_imputed_knn,
    "Interpolation": X_train_imputed_interp,
    "Drop Missing Values": X_train_dropped
}

# Tạo dictionary để lưu kết quả MSE của mỗi phương pháp và mô hình
results = {}

# Cross-validation cho các mô hình với các phương pháp khác nhau
for imputer_name, X_train_imputed in imputation_methods.items():
    model_scores = []
    for model_name, model in models.items():
        if imputer_name == "Drop Missing Values":  # Dữ liệu đã được loại bỏ missing values
            X_train_imputed_model = X_train_dropped
            y_train_imputed_model = y_train_dropped
        else:
            X_train_imputed_model = X_train_imputed
            y_train_imputed_model = y_train
        
        cv_scores = cross_val_score(model, X_train_imputed_model, y_train_imputed_model, cv=5, scoring='neg_mean_squared_error')
        model_scores.append((model_name, -cv_scores.mean()))  # Lưu MSE cho mô hình

    results[imputer_name] = model_scores

# In ra phương pháp và mô hình tốt nhất
best_imputation = None
best_model_name = None
best_mse = float('inf')

for imputer_name, model_scores in results.items():
    for model_name, mse in model_scores:
        if mse < best_mse:
            best_imputation = imputer_name
            best_model_name = model_name
            best_mse = mse

print(f"Best method: {best_imputation} with model {best_model_name} with MSE = {best_mse}")


Best method: Drop Missing Values with model Linear Regression with MSE = 207.21992945689598
