# In [None]:
# 1. Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor, VotingRegressor, StackingRegressor
from sklearn.neighbors import KNeighborsRegressor
# 2. Load Dataset (Example dataset: Boston Housing Data)
# `load_boston` was removed in scikit-learn 1.2, where importing it raises
# ImportError. Keep using it when it is still available, and otherwise fall
# back to the California housing data so the rest of the script can still run.
try:
    from sklearn.datasets import load_boston
except ImportError:
    import warnings

    from sklearn.datasets import fetch_california_housing

    warnings.warn(
        "sklearn.datasets.load_boston is unavailable (removed in scikit-learn 1.2); "
        "using fetch_california_housing instead.",
        stacklevel=2,
    )
    # NOTE: this loader may download the data on first use and returns a
    # different feature set than Boston; everything below only relies on the
    # 'Price' target column, so it works with either dataset.
    data = fetch_california_housing()
else:
    data = load_boston()

# Features become columns; the regression target is stored as 'Price'.
df = pd.DataFrame(data.data, columns=data.feature_names)
df['Price'] = data.target

# Display the first few rows of the dataset
df.head()

# 3. Data Preprocessing
# Handle missing values (if any) by imputing each column's mean.
# NOTE(review): the mean is computed over all rows, including future test
# rows; harmless when nothing is missing, but move this after the split if the
# data ever contains NaNs.
df.fillna(df.mean(), inplace=True)

# Separate the features from the target column.
X = df.drop('Price', axis=1)
y = df['Price']

# Split data into training and testing sets (80% train, 20% test) BEFORE
# scaling, so the test rows never influence the scaler's mean/variance.
# The same random_state and row count select the same train/test rows as before.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature Scaling using StandardScaler: fit on the training rows only, then
# apply the learned statistics to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Full feature matrix scaled with the training statistics; kept so that any
# later code referring to X_scaled keeps working.
X_scaled = scaler.transform(X)

# 4. Individual Models: Linear Regression, Decision Tree, Random Forest, KNN
# Baseline regressors, keyed by the label used in the comparison table.
# Seeds are fixed for the estimators constructed with random_state.
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
    'KNN': KNeighborsRegressor()
}

# Train and evaluate each model on the shared train/test split.
# model_metrics maps model label -> {'MSE', 'RMSE', 'MAE', 'R²'}.
model_metrics = {}
for model_name, model in models.items():
    # fit() returns the fitted estimator, so prediction can be chained.
    y_pred = model.fit(X_train, y_train).predict(X_test)

    # RMSE is derived from the MSE computed once for this model.
    mse = mean_squared_error(y_test, y_pred)
    model_metrics[model_name] = {
        'MSE': mse,
        'RMSE': np.sqrt(mse),
        'MAE': mean_absolute_error(y_test, y_pred),
        'R²': r2_score(y_test, y_pred)
    }

# Display performance metrics of individual models
model_metrics

# 5. Bagging Ensemble Method: Random Forest as Bagging Model
# NOTE: this is configured exactly like the 'Random Forest' entry in `models`
# (same estimator class, same random_state).
bagging_model = RandomForestRegressor(random_state=42)
y_pred_bagging = bagging_model.fit(X_train, y_train).predict(X_test)

# Evaluate Bagging model; entries are added in the order MSE, RMSE, MAE, R².
bagging_metrics = {'MSE': mean_squared_error(y_test, y_pred_bagging)}
bagging_metrics['RMSE'] = np.sqrt(bagging_metrics['MSE'])
bagging_metrics['MAE'] = mean_absolute_error(y_test, y_pred_bagging)
bagging_metrics['R²'] = r2_score(y_test, y_pred_bagging)

bagging_metrics

# 6. Boosting Ensemble Method: AdaBoost, GradientBoosting
# Boosted regressors with fixed seeds, keyed by their comparison-table label.
boosting_models = {
    'AdaBoost': AdaBoostRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42)
}

# Fit each booster and score it on the held-out test split.
# boosting_metrics maps model label -> {'MSE', 'RMSE', 'MAE', 'R²'}.
boosting_metrics = {}
for model_name, model in boosting_models.items():
    # fit() returns the fitted estimator, so prediction can be chained.
    y_pred_boosting = model.fit(X_train, y_train).predict(X_test)

    # RMSE is derived from the MSE computed once for this model.
    mse = mean_squared_error(y_test, y_pred_boosting)
    boosting_metrics[model_name] = {
        'MSE': mse,
        'RMSE': np.sqrt(mse),
        'MAE': mean_absolute_error(y_test, y_pred_boosting),
        'R²': r2_score(y_test, y_pred_boosting)
    }

boosting_metrics

# 7. Voting Ensemble Method: Hard and Soft Voting
# NOTE(review): VotingRegressor has no hard/soft mode (that distinction belongs
# to VotingClassifier); it averages the base estimators' predictions. The
# "soft" model only adds explicit equal weights, which describes the same
# unweighted average, so both ensembles are expected to score the same.
# The variable names are kept because the comparison table refers to them.
hard_voting_model = VotingRegressor(
    estimators=[
        ('lr', LinearRegression()),
        ('dt', DecisionTreeRegressor(random_state=42)),
        ('rf', RandomForestRegressor(random_state=42)),
    ]
)

soft_voting_model = VotingRegressor(
    estimators=[
        ('lr', LinearRegression()),
        ('dt', DecisionTreeRegressor(random_state=42)),
        ('rf', RandomForestRegressor(random_state=42)),
    ],
    weights=[1, 1, 1],
)

# Train both ensembles, then predict on the held-out test split.
hard_voting_model.fit(X_train, y_train)
soft_voting_model.fit(X_train, y_train)

y_pred_hard_voting = hard_voting_model.predict(X_test)
y_pred_soft_voting = soft_voting_model.predict(X_test)

# Evaluate both models; entries are added in the order MSE, RMSE, MAE, R².
hard_voting_metrics = {'MSE': mean_squared_error(y_test, y_pred_hard_voting)}
hard_voting_metrics['RMSE'] = np.sqrt(hard_voting_metrics['MSE'])
hard_voting_metrics['MAE'] = mean_absolute_error(y_test, y_pred_hard_voting)
hard_voting_metrics['R²'] = r2_score(y_test, y_pred_hard_voting)

soft_voting_metrics = {'MSE': mean_squared_error(y_test, y_pred_soft_voting)}
soft_voting_metrics['RMSE'] = np.sqrt(soft_voting_metrics['MSE'])
soft_voting_metrics['MAE'] = mean_absolute_error(y_test, y_pred_soft_voting)
soft_voting_metrics['R²'] = r2_score(y_test, y_pred_soft_voting)

hard_voting_metrics, soft_voting_metrics

# 8. Stacking Ensemble Method
# Three base regressors whose predictions feed a LinearRegression
# final estimator.
stacking_model = StackingRegressor(
    estimators=[
        ('lr', LinearRegression()),
        ('dt', DecisionTreeRegressor(random_state=42)),
        ('rf', RandomForestRegressor(random_state=42)),
    ],
    final_estimator=LinearRegression(),
)

# fit() returns the fitted estimator, so prediction can be chained.
y_pred_stacking = stacking_model.fit(X_train, y_train).predict(X_test)

# Evaluate Stacking model; entries are added in the order MSE, RMSE, MAE, R².
stacking_metrics = {'MSE': mean_squared_error(y_test, y_pred_stacking)}
stacking_metrics['RMSE'] = np.sqrt(stacking_metrics['MSE'])
stacking_metrics['MAE'] = mean_absolute_error(y_test, y_pred_stacking)
stacking_metrics['R²'] = r2_score(y_test, y_pred_stacking)

stacking_metrics

# 9. Performance Comparison Table
# One (label, metrics dict) pair per table row, in display order. Each metrics
# dict carries the keys 'MSE', 'RMSE', 'MAE' and 'R²'.
summary_rows = [
    ('Linear Regression', model_metrics['Linear Regression']),
    ('Decision Tree', model_metrics['Decision Tree']),
    ('Random Forest', model_metrics['Random Forest']),
    ('KNN', model_metrics['KNN']),
    ('Bagging', bagging_metrics),
    ('AdaBoost', boosting_metrics['AdaBoost']),
    ('Gradient Boosting', boosting_metrics['Gradient Boosting']),
    ('Hard Voting', hard_voting_metrics),
    ('Soft Voting', soft_voting_metrics),
    ('Stacking', stacking_metrics),
]

# Column-oriented dict: the 'Model' labels first, then one list per metric.
metrics_summary = {'Model': [label for label, _ in summary_rows]}
for metric_key in ('MSE', 'RMSE', 'MAE', 'R²'):
    metrics_summary[metric_key] = [scores[metric_key] for _, scores in summary_rows]

# Create a DataFrame to display the comparison
metrics_df = pd.DataFrame(metrics_summary)
metrics_df

# 10. Visualization: Bar plots comparing MSE, RMSE, R² scores
# One panel per metric, so all three scores named in the heading are shown
# (previously only MSE was plotted).
plot_metrics = ['MSE', 'RMSE', 'R²']
fig, axes = plt.subplots(1, len(plot_metrics), figsize=(18, 6))
for ax, metric in zip(axes, plot_metrics):
    sns.barplot(x='Model', y=metric, data=metrics_df, ax=ax)
    # Model names are long; rotate them so they do not overlap.
    ax.tick_params(axis='x', rotation=90)
    ax.set_title(f'Model Comparison by {metric}')
fig.tight_layout()
plt.show()
