# Import Required Libraries
Import the libraries used throughout the notebook: NumPy, pandas, Matplotlib, Seaborn, scikit-learn, and tabulate.

In [None]:
# Import Required Libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, r2_score
from sklearn.decomposition import PCA
from tabulate import tabulate

# Load and Preprocess Data
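Load the training data and split it 70/30 into training and validation sets, then load the held-out test set. Features are standardized with a scaler fitted on the training split only; no feature selection is applied in this cell.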

In [None]:
# Load and preprocess data
from sklearn.preprocessing import StandardScaler

# Training data: every column except 'Price' is a feature; 'Price' is the target.
data = pd.read_csv('../clean_data/BigDS_train.csv')
X = data.drop(columns=['Price'])
y = data['Price']

# Train/validation split (70/30; fixed random_state keeps the split reproducible)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=0)

# Scale features. The scaler is fitted on the training split only, so the
# validation and test sets are transformed with training statistics.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Load test data
testset = pd.read_csv('../clean_data/BigDS_test.csv')
X_test = testset.drop(columns=['Price'])
y_test = testset['Price']

# Transform the test features with the same fitted scaler so that a model
# trained on X_train_scaled can be evaluated on consistently scaled inputs.
# Columns are selected in the training order because the scaler was fitted on
# that layout. X_test itself is left unscaled for callers that use raw values.
X_test_scaled = scaler.transform(X_test[X_train.columns])

# Adjust Prediction Plot
Refactor the prediction plot function to improve visualization, including better axis labels, legends, and gridlines.

In [None]:
# Adjusted prediction plot function
def plot_predictions(y_true, y_pred, title="Prediction vs Actual", n_samples=250):
    """Plot the leading actual and predicted values on a shared sample axis.

    Both inputs are converted to NumPy arrays before plotting. A pandas
    Series would otherwise be drawn against its own index labels (which are
    shuffled after ``train_test_split``), while a NumPy prediction array is
    drawn against 0..n-1, so the two curves would not line up.

    Args:
        y_true: Actual target values (Series, array or list).
        y_pred: Predicted target values, in the same order as ``y_true``.
        title: Title shown above the figure.
        n_samples: Number of leading samples to draw (default 250).
    """
    # Positional arrays guarantee both curves use x = 0, 1, 2, ...
    actual = np.asarray(y_true)[:n_samples]
    predicted = np.asarray(y_pred)[:n_samples]

    plt.figure(figsize=(12, 6))
    plt.plot(actual, label="Actual", color="blue")
    plt.plot(predicted, label="Predicted", color="orange")
    plt.xlabel("Sample Index")
    plt.ylabel("Price (VND)")
    plt.title(title)
    plt.legend()
    plt.grid(True)
    plt.show()

# Adjust Feature Importance Chart
Enhance the feature importance chart by adding titles, axis labels, and sorting features by importance.

In [None]:
# Adjusted feature importance chart
def plot_feature_importance(model, feature_names):
    """Show a horizontal bar chart of the model's feature importances.

    Args:
        model: Fitted estimator exposing ``feature_importances_``.
        feature_names: Indexable collection of names, positionally aligned
            with ``model.feature_importances_``.
    """
    weights = model.feature_importances_
    # Ascending order: the most important feature gets the highest bar position.
    order = np.argsort(weights)
    positions = range(len(order))
    labels = [feature_names[idx] for idx in order]

    plt.figure(figsize=(10, 6))
    plt.barh(positions, weights[order], align="center")
    plt.yticks(positions, labels)
    plt.title("Feature Importance Chart")
    plt.xlabel("Feature Importance")
    plt.ylabel("Features")
    plt.grid(True)
    plt.show()

# Adjust PCA Comparison Chart
Refactor the PCA comparison chart to include clear legends, titles, and consistent color schemes.

In [None]:
# Adjusted PCA comparison chart
def plot_pca_comparison(scores, metrics=("MAE", "RMSE", "MAPE")):
    """Draw one bar chart per metric comparing Random Forest with and without PCA.

    Args:
        scores: Sequence whose second-to-last entry holds the PCA results and
            whose last entry holds the no-PCA results. Each entry is read as
            ``entry[metric][0]``.
        metrics: Names of the metrics to chart, one subplot per name.

    Raises:
        ValueError: If ``metrics`` is empty.
    """
    metrics = list(metrics)
    if not metrics:
        raise ValueError("metrics must contain at least one metric name")

    # squeeze=False always yields a 2-D array of Axes, so indexing also works
    # when only one metric is requested (the default would return a bare Axes).
    fig, axs = plt.subplots(1, len(metrics), figsize=(15, 5), squeeze=False)
    axes_row = axs[0]
    fig.suptitle("PCA Effect on Model Performance", fontsize=16)

    for ax, metric in zip(axes_row, metrics):
        df_comparison = pd.DataFrame({
            "Algorithms": ["Random Forest"],
            "PCA": [scores[-2][metric][0]],
            "No PCA": [scores[-1][metric][0]],
        })
        df_comparison.plot(
            x="Algorithms",
            y=["PCA", "No PCA"],
            kind="bar",
            ax=ax,
            color=["#b5ead7", "#fdcf76"],
        )
        ax.set_ylabel(metric)
        ax.set_title(f"Comparison of {metric}")

    plt.tight_layout()
    plt.show()

# Adjust Metrics Display
Improve the tabular display of metrics using libraries like Tabulate or Pandas for better formatting.

In [None]:
# Adjusted metrics display
def display_metrics_table(metrics_dict):
    """Print the metrics as a two-column grid table, one row per metric.

    Args:
        metrics_dict: Mapping of metric name to value; rows follow the
            mapping's iteration order.
    """
    rows = [[name, score] for name, score in metrics_dict.items()]
    rendered = tabulate(
        rows,
        headers=["Metric", "Value"],
        tablefmt="fancy_grid",
        numalign="right",
        floatfmt=".2f",
    )
    print(rendered)

# Adjust Test Data Evaluation Charts
Refactor test data evaluation plots to include better annotations, axis labels, and titles for clarity.

In [None]:
# Adjusted test data evaluation plot
def evaluate_test_data(model, X_test, y_test):
    """Score the model on the test set, print the metrics, and plot the fit.

    Prints MAE, MSE, RMSE and R² as a table, then shows a scatter of actual
    against predicted values with a dashed identity line for reference.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Test features passed to ``model.predict``.
        y_test: Actual target values; must support ``.min()`` and ``.max()``.
    """
    y_pred = model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    display_metrics_table({
        "MAE": mean_absolute_error(y_test, y_pred),
        "MSE": mse,
        "RMSE": np.sqrt(mse),
        "R²": r2_score(y_test, y_pred),
    })

    # Endpoints of the y = x reference line, spanning the actual value range.
    diagonal = [y_test.min(), y_test.max()]

    plt.figure(figsize=(12, 6))
    plt.scatter(y_test, y_pred, alpha=0.6, color="orange")
    plt.plot(diagonal, diagonal, color="blue", linestyle="--")
    plt.title("Test Data Evaluation")
    plt.xlabel("Actual Price")
    plt.ylabel("Predicted Price")
    plt.grid(True)
    plt.show()