<table class="table table-bordered">
    <tr>
       <th style="text-align:center;"><h3>IS217 Practice 1 - Pandas</h3></th>
    </tr>
</table>

### Learning Outcomes

At the end of this lesson, you should be able to:
<ul>
<li>Import Pandas Library</li>
<li>Create and manipulate Pandas Series (with custom indices)</li>
<li>Generate descriptive statistics for Pandas Series</li>
<li>Create Pandas DataFrames (with custom indices)</li>
<li>Access selectively some rows and columns of a DataFrame</li>
<li>Generate descriptive statistics for Pandas DataFrames</li>
</ul>

### Importing the Pandas Library

In [1]:
# ==============================================================================
# 0. Setup and Imports
# ==============================================================================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PowerTransformer, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.utils import resample
from sklearn.exceptions import ConvergenceWarning
from scipy import stats
from statsmodels.stats.outliers_influence import variance_inflation_factor
import warnings
import time

# Suppress warnings for cleaner output
warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)# Set random seed as required (888)
np.random.seed(888)

In [2]:
# Set random seed as required (888)
np.random.seed(888)

In [3]:
# ==============================================================================
# 1. Data Loading and Cleaning
# ==============================================================================
print("Loading and cleaning data...")
try:
    df = pd.read_csv('HDBResaleFlatPrices.csv')
except FileNotFoundError:
    print("ERROR: HDBResaleFlatPrices.csv not found. Please ensure the file is in the directory.")
    raise

Loading and cleaning data...


In [4]:
df.columns

Index(['month', 'town', 'flat_type', 'block', 'street_name', 'storey_range',
       'floor_area_sqm', 'flat_model', 'lease_commence_date', 'resale_price'],
      dtype='object')

In [5]:
# ==============================================================================
# 2. Feature Engineering & Transformation
# ==============================================================================
print("Starting Feature Engineering...")

# --- 2.1 Engineer Time-Based Features ---
df['sale_year'] = pd.to_datetime(df['month'], format='%Y-%m').dt.year
df['remaining_lease'] = (df['lease_commence_date'] + 99) - df['sale_year']

# --- 2.2 Engineer Ordinal Features ---
def parse_storey(storey_range):
    parts = storey_range.split(' TO ')
    return (int(parts[0]) + int(parts[1])) / 2
df['storey_avg'] = df['storey_range'].apply(parse_storey)

# --- 2.3 Define Target ---
TARGET_COLUMN = 'resale_price'

Starting Feature Engineering...


In [6]:
# ==============================================================================
# 3. BRUTE FORCE SCENARIO DEFINITION
# ==============================================================================

# --- 3.1 Define Feature Sets to Test ---
feature_sets = {
    "Full_Set": {
        "num": ['floor_area_sqm', 'remaining_lease', 'sale_year', 'storey_avg'],
        "cat": ['town', 'flat_type', 'flat_model']
    },
    "Simple_Set": {
        "num": ['floor_area_sqm', 'remaining_lease'],
        "cat": ['town']
    },
    "No_Year_Set": {
        "num": ['floor_area_sqm', 'remaining_lease', 'storey_avg'],
        "cat": ['town', 'flat_type', 'flat_model']
    }
}

# --- 3.2 Define Train/Test Splits to Test ---
test_splits = [0.3, 0.2] # (70/30 and 80/20)

# --- 3.3 Master Results List ---
grand_results = []
kf = KFold(n_splits=5, shuffle=True, random_state=888)

# --- Helper function to evaluate models (UPDATED to show Train/Test RMSE) ---
def evaluate_model(name, pipeline, params, X_train, y_train_log, X_test, y_test_log):
    gs = GridSearchCV(pipeline, params, cv=kf, scoring='neg_mean_squared_error', n_jobs=-1)
    gs.fit(X_train, y_train_log)
    best_model = gs.best_estimator_

    # Predict on TRAIN set
    y_pred_log_train = best_model.predict(X_train)
    y_pred_orig_train = np.expm1(y_pred_log_train)
    y_train_orig = np.expm1(y_train_log)
    rmse_train = np.sqrt(mean_squared_error(y_train_orig, y_pred_orig_train))
    r2_train = r2_score(y_train_orig, y_pred_orig_train)

    # Predict on TEST set
    y_pred_log_test = best_model.predict(X_test)
    y_pred_orig_test = np.expm1(y_pred_log_test)
    y_test_orig = np.expm1(y_test_log)
    rmse_test = np.sqrt(mean_squared_error(y_test_orig, y_pred_orig_test))
    r2_test = r2_score(y_test_orig, y_pred_orig_test)

    print(f"  {name} Complete. Train RMSE: ${rmse_train:,.2f} | Test RMSE: ${rmse_test:,.2f} | Test R2: {r2_test:.4f}")
    return {
        'Train_RMSE': rmse_train, 'Test_RMSE': rmse_test,
        'Train_R2': r2_train, 'Test_R2': r2_test,
        'Best_Params': gs.best_params_
    }

In [7]:
# ==============================================================================
# 4. START BRUTE FORCE LOOP
# ==============================================================================
overall_start_time = time.time()

for set_name, features in feature_sets.items():
    for test_size in test_splits:

        scenario_name = f"{set_name} (Split: {int((1-test_size)*100)}/{int(test_size*100)})"
        print(f"\n--- RUNNING SCENARIO: {scenario_name} ---")

        # --- 4.1 Define Features for this Scenario ---
        NUMERICAL_FEATURES = features['num']
        CATEGORICAL_FEATURES = features['cat']
        ALL_FEATURES = NUMERICAL_FEATURES + CATEGORICAL_FEATURES

        # --- 4.2 Clean and Split Data for this Scenario ---
        df_clean = df[ALL_FEATURES + [TARGET_COLUMN]].dropna()
        X = df_clean[ALL_FEATURES]
        y = df_clean[TARGET_COLUMN]
        y_log = np.log1p(y)

        X_train, X_test, y_train_log, y_test_log = train_test_split(
            X, y_log, test_size=test_size, random_state=888
        )
        y_train_orig = np.expm1(y_train_log)
        y_test_orig = np.expm1(y_test_log)

        # --- 4.3 Define Preprocessing Pipelines for this Scenario ---
        power_transformer_pipeline = Pipeline(steps=[
            ('power', PowerTransformer(method='yeo-johnson')),
            ('scaler', StandardScaler())
        ])
        preprocessor_power = ColumnTransformer(
            transformers=[
                ('num', power_transformer_pipeline, NUMERICAL_FEATURES),
                ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), CATEGORICAL_FEATURES)
            ], remainder='passthrough'
        )
        preprocessor_tree = ColumnTransformer(
            transformers=[
                ('num', 'passthrough', NUMERICAL_FEATURES),
                ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), CATEGORICAL_FEATURES)
            ], remainder='passthrough'
        )
        # Pre-process data for non-pipeline models (Stepwise, etc.)
        X_train_processed_power = preprocessor_power.fit_transform(X_train)
        X_test_processed_power = preprocessor_power.transform(X_test)

        # --- 4.4 Run Model Gauntlet for this Scenario ---
        model_results = {}

        # --- A. Linear Regression (OLS) ---
        ols_pipeline = Pipeline(steps=[('preprocessor', preprocessor_power), ('regressor', LinearRegression())])
        model_results['OLS'] = evaluate_model('OLS', ols_pipeline, {}, X_train, y_train_log, X_test, y_test_log)

        # --- B. Ridge Regression ---
        ridge_pipeline = Pipeline(steps=[('preprocessor', preprocessor_power), ('regressor', Ridge(random_state=888))])
        model_results['Ridge'] = evaluate_model('Ridge', ridge_pipeline, {'regressor__alpha': np.logspace(-2, 3, 6)}, X_train, y_train_log, X_test, y_test_log)

        # --- C. Lasso Regression ---
        lasso_pipeline = Pipeline(steps=[('preprocessor', preprocessor_power), ('regressor', Lasso(random_state=888, max_iter=2000))])
        model_results['Lasso'] = evaluate_model('Lasso', lasso_pipeline, {'regressor__alpha': np.logspace(-4, -1, 5)}, X_train, y_train_log, X_test, y_test_log)

        # --- D. Elastic Net Regression ---
        elastic_pipeline = Pipeline(steps=[('preprocessor', preprocessor_power), ('regressor', ElasticNet(random_state=888, max_iter=2000))])
        model_results['Elastic Net'] = evaluate_model('Elastic Net', elastic_pipeline, {'regressor__alpha': [0.001], 'regressor__l1_ratio': [0.5]}, X_train, y_train_log, X_test, y_test_log)

        # --- E. K-Nearest Neighbors (KNN) Regressor ---
        knn_pipeline = Pipeline(steps=[('preprocessor', preprocessor_power), ('regressor', KNeighborsRegressor())])
        model_results['KNN'] = evaluate_model('KNN', knn_pipeline, {'regressor__n_neighbors': [10]}, X_train, y_train_log, X_test, y_test_log)

        # --- F. Bagging Regressor ---
        bagging_pipeline = Pipeline(steps=[('preprocessor', preprocessor_tree), ('regressor', BaggingRegressor(random_state=888))])
        model_results['Bagging'] = evaluate_model('Bagging', bagging_pipeline, {'regressor__n_estimators': [50]}, X_train, y_train_log, X_test, y_test_log)

        # --- G. Random Forest Regressor ---
        rf_pipeline = Pipeline(steps=[('preprocessor', preprocessor_tree), ('regressor', RandomForestRegressor(random_state=888))])
        model_results['Random Forest'] = evaluate_model('Random Forest', rf_pipeline, {'regressor__n_estimators': [100], 'regressor__max_depth': [20]}, X_train, y_train_log, X_test, y_test_log)

        # --- H. Neural Network (MLP Regressor) ---
        mlp_pipeline = Pipeline(steps=[('preprocessor', preprocessor_power), ('regressor', MLPRegressor(random_state=888, max_iter=500, early_stopping=True))])
        model_results['Neural Network'] = evaluate_model('Neural Network', mlp_pipeline, {'regressor__hidden_layer_sizes': [(100, 50)], 'regressor__alpha': [0.001]}, X_train, y_train_log, X_test, y_test_log)

        # --- WARNING: The following models are EXTREMELY slow ---

        # --- I. Polynomial / Interaction Regression ---
        print(f"  Fitting Polynomial (Interaction) Model... (This may be slow)")
        poly_pipeline_full = Pipeline(steps=[
            ('preprocessor', preprocessor_power), # Use the 'power' preprocessor
            ('poly', PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)),
            ('regressor', LinearRegression())
        ])
        poly_pipeline_full.fit(X_train, y_train_log)
        y_pred_log_poly_train = poly_pipeline_full.predict(X_train)
        y_pred_log_poly_test = poly_pipeline_full.predict(X_test)
        rmse_poly_train = np.sqrt(mean_squared_error(y_train_orig, np.expm1(y_pred_log_poly_train)))
        rmse_poly_test = np.sqrt(mean_squared_error(y_test_orig, np.expm1(y_pred_log_poly_test)))
        r2_poly_train = r2_score(y_train_orig, np.expm1(y_pred_log_poly_train))
        r2_poly_test = r2_score(y_test_orig, np.expm1(y_pred_log_poly_test))
        model_results['Polynomial/Interaction'] = {'Train_RMSE': rmse_poly_train, 'Test_RMSE': rmse_poly_test, 'Train_R2': r2_poly_train, 'Test_R2': r2_poly_test, 'Best_Params': 'degree=2'}
        print(f"  Polynomial/Interaction Complete. Train RMSE: ${rmse_poly_train:,.2f} | Test RMSE: ${rmse_poly_test:,.2f} | Test R2: {r2_poly_test:.4f}")

        # --- J. Forward Stepwise Regression ---
        print(f"  Fitting Forward Stepwise Model... (This is very slow)")
        lr = LinearRegression()
        sfs_forward = SequentialFeatureSelector(lr, n_features_to_select='auto', direction='forward', tol=None, cv=3, n_jobs=-1) # cv=3 for speed
        sfs_forward.fit(X_train_processed_power, y_train_log)
        X_train_sfs_f = sfs_forward.transform(X_train_processed_power)
        X_test_sfs_f = sfs_forward.transform(X_test_processed_power)

        lr_sfs_f = LinearRegression().fit(X_train_sfs_f, y_train_log)
        y_pred_log_sfs_f_train = lr_sfs_f.predict(X_train_sfs_f)
        y_pred_log_sfs_f_test = lr_sfs_f.predict(X_test_sfs_f)
        rmse_sfs_f_train = np.sqrt(mean_squared_error(y_train_orig, np.expm1(y_pred_log_sfs_f_train)))
        rmse_sfs_f_test = np.sqrt(mean_squared_error(y_test_orig, np.expm1(y_pred_log_sfs_f_test)))
        r2_sfs_f_train = r2_score(y_train_orig, np.expm1(y_pred_log_sfs_f_train))
        r2_sfs_f_test = r2_score(y_test_orig, np.expm1(y_pred_log_sfs_f_test))
        model_results['Forward Stepwise'] = {'Train_RMSE': rmse_sfs_f_train, 'Test_RMSE': rmse_sfs_f_test, 'Train_R2': r2_sfs_f_train, 'Test_R2': r2_sfs_f_test, 'Best_Params': f'{X_train_sfs_f.shape[1]} features'}
        print(f"  Forward Stepwise Complete. Train RMSE: ${rmse_sfs_f_train:,.2f} | Test RMSE: ${rmse_sfs_f_test:,.2f} | Test R2: {r2_sfs_f_test:.4f}")

        # --- K. Backward Stepwise Regression ---
        print(f"  Fitting Backward Stepwise Model... (This is EXTREMELY slow, will cap at 40 features)")
        lr_back = LinearRegression()
        
        # Reduce memory usage by:
        # 1. Limiting the number of features even more
        n_features_backward = min(20, X_train_processed_power.shape[1] - 1)  # Reduced from 50 to 20
        
        # 2. Use a smaller subset of data for feature selection if dataset is large
        sample_size = min(10000, X_train_processed_power.shape[0])
        if X_train_processed_power.shape[0] > sample_size:
            # Create a random sample for feature selection
            random_indices = np.random.choice(X_train_processed_power.shape[0], sample_size, replace=False)
            X_train_sample = X_train_processed_power[random_indices]
            y_train_sample = y_train_log[random_indices]
        else:
            X_train_sample = X_train_processed_power
            y_train_sample = y_train_log
        
        # 3. Reduce number of cross-validation folds
        sfs_backward = SequentialFeatureSelector(lr_back, n_features_to_select=n_features_backward, 
                                               direction='backward', cv=2, n_jobs=1)  # Reduced CV and using single job
        
        # Fit on the sample
        sfs_backward.fit(X_train_sample, y_train_sample)
        X_train_sfs_b = sfs_backward.transform(X_train_processed_power)
        X_test_sfs_b = sfs_backward.transform(X_test_processed_power)

        lr_sfs_b = LinearRegression().fit(X_train_sfs_b, y_train_log)
        y_pred_log_sfs_b_train = lr_sfs_b.predict(X_train_sfs_b)
        y_pred_log_sfs_b_test = lr_sfs_b.predict(X_test_sfs_b)
        rmse_sfs_b_train = np.sqrt(mean_squared_error(y_train_orig, np.expm1(y_pred_log_sfs_b_train)))
        rmse_sfs_b_test = np.sqrt(mean_squared_error(y_test_orig, np.expm1(y_pred_log_sfs_b_test)))
        r2_sfs_b_train = r2_score(y_train_orig, np.expm1(y_pred_log_sfs_b_train))
        r2_sfs_b_test = r2_score(y_test_orig, np.expm1(y_pred_log_sfs_b_test))
        model_results['Backward Stepwise'] = {'Train_RMSE': rmse_sfs_b_train, 'Test_RMSE': rmse_sfs_b_test, 'Train_R2': r2_sfs_b_train, 'Test_R2': r2_sfs_b_test, 'Best_Params': f'{X_train_sfs_b.shape[1]} features'}
        print(f"  Backward Stepwise Complete. Train RMSE: ${rmse_sfs_b_train:,.2f} | Test RMSE: ${rmse_sfs_b_test:,.2f} | Test R2: {r2_sfs_b_test:.4f}")

        # --- 4.5 Store Scenario Results ---
        scenario_df = pd.DataFrame(model_results).T
        scenario_df['scenario'] = scenario_name
        scenario_df['model'] = scenario_df.index
        grand_results.append(scenario_df)

        print(f"--- SCENARIO COMPLETE: {scenario_name} ---")


--- RUNNING SCENARIO: Full_Set (Split: 70/30) ---
  OLS Complete. Train RMSE: $70,999.13 | Test RMSE: $71,224.58 | Test R2: 0.8451
  Ridge Complete. Train RMSE: $70,999.13 | Test RMSE: $71,224.66 | Test R2: 0.8451
  Lasso Complete. Train RMSE: $71,361.68 | Test RMSE: $71,574.23 | Test R2: 0.8436
  Elastic Net Complete. Train RMSE: $73,347.54 | Test RMSE: $73,512.00 | Test R2: 0.8350
  KNN Complete. Train RMSE: $31,043.48 | Test RMSE: $34,634.15 | Test R2: 0.9634
  Bagging Complete. Train RMSE: $17,464.32 | Test RMSE: $28,883.62 | Test R2: 0.9745
  Random Forest Complete. Train RMSE: $31,090.20 | Test RMSE: $34,638.33 | Test R2: 0.9634
  Neural Network Complete. Train RMSE: $33,075.60 | Test RMSE: $33,331.65 | Test R2: 0.9661
  Fitting Polynomial (Interaction) Model... (This may be slow)


MemoryError: Unable to allocate 12.6 GiB for an array with shape (609000, 2774) and data type float64

In [None]:
# ==============================================================================
# 5. FINAL BRUTE FORCE RESULTS
# ==============================================================================
print("\n" + "="*50)
print("BRUTE FORCE ANALYSIS COMPLETE")
print(f"Total time taken: {(time.time() - overall_start_time)/60:.2f} minutes")
print("="*50)

# Concatenate all results into one big DataFrame
final_results_df = pd.concat(grand_results)

# --- 5.1 Print the Grand Champion Table ---
print("\n--- Grand Results Table (All Scenarios) ---")
final_table = final_results_df[
    ['scenario', 'model', 'Train_RMSE', 'Test_RMSE', 'Test_R2']
].sort_values(by='Test_RMSE')
print(final_table.to_string()) # .to_string() prints the full table

# --- 5.2 Find and Recommend the Absolute Best Model ---
best_idx = final_results_df['Test_RMSE'].idxmin()
best_overall_run = final_results_df.loc[best_idx]

print("\n" + "="*50)
print("FINAL RECOMMENDATION (GRAND CHAMPION)")
print("="*50)

print(f"The best overall model/scenario combination is:")
print(f"  Scenario: **{best_overall_run['scenario']}**")
print(f"  Model: **{best_overall_run['model']}**")

# Convert values to Python scalars before formatting
print(f"\n  Lowest Test RMSE: **${best_overall_run['Test_RMSE'].values[0]:,.2f}**")
print(f"  Test R-squared: **{best_overall_run['Test_R2'].values[0]:.4f}**")
print(f"  Train RMSE: ${best_overall_run['Train_RMSE'].values[0]:,.2f} (Check for overfitting)")


# Handle the Best_Params which might be a complex object
print(f"  Best parameters: {best_overall_run.get('Best_Params', 'N/A')}")

print("\nJustification: After running a comprehensive tournament of 11 models across 6 different")
print(f"scenarios (3 feature sets x 2 train/test splits), the combination of")
print(f"**{best_overall_run['model']}** on the **'{best_overall_run['scenario']}'**")
print(f"provided the lowest test error, indicating the best combination")
print(f"of features, split, and algorithm for generalization.")

In [None]:
# ==============================================================================
# 6. FINAL DEEP DIVE on the Winning Scenario
# ==============================================================================
print("\n" + "="*50)
print(f"STARTING DEEP DIVE ON WINNING SCENARIO: {best_overall_run['scenario']}")
print("="*50)

# --- 6.1 Re-create the winning scenario's data ---
# First, access the specific string value from the 'scenario' Series/column
# We use .iloc[0] to get the value from the first row
scenario_string = best_overall_run['scenario'].iloc[0]

# Now you can safely split the string
winning_set_name = scenario_string.split(" (")[0]
winning_split = float(scenario_string.split("/")[1][:2]) / 100.0
winning_features = feature_sets[winning_set_name]

# You can print these to verify
print(f"Winning Set Name: {winning_set_name}")
print(f"Winning Split: {winning_split}")

NUMERICAL_FEATURES = winning_features['num']
CATEGORICAL_FEATURES = winning_features['cat']
ALL_FEATURES = NUMERICAL_FEATURES + CATEGORICAL_FEATURES
df_clean = df[ALL_FEATURES + [TARGET_COLUMN]].dropna()
X = df_clean[ALL_FEATURES]
y = df_clean[TARGET_COLUMN]
y_log = np.log1p(y)

X_train, X_test, y_train_log, y_test_log = train_test_split(
    X, y_log, test_size=winning_split, random_state=888
)

# --- 6.2 Re-create the winning preprocessor ---
preprocessor_power_final = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[('power', PowerTransformer(method='yeo-johnson')), ('scaler', StandardScaler())]), NUMERICAL_FEATURES),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), CATEGORICAL_FEATURES)
    ], remainder='passthrough'
)

# --- 6.3 OLS Summary (Equivalent to R's summary(lm_model)) ---
print("\n--- OLS Model Summary (from statsmodels) ---")
X_train_processed_power = preprocessor_power_final.fit_transform(X_train)
X_test_processed_power = preprocessor_power_final.transform(X_test)
feature_names_out = preprocessor_power_final.get_feature_names_out()
X_train_sm = sm.add_constant(X_train_processed_power, prepend=False)

sm_model = sm.OLS(y_train_log, X_train_sm).fit()
print(sm_model.summary(xname=['const'] + list(feature_names_out)))
print("\nInterpretation: Review p-values (P>|t|) to see which features are statistically significant.")

# --- Residual Plots ---
y_pred_log_train = sm_model.predict(X_train_sm)
ols_residuals_log = y_train_log - y_pred_log_train
plt.figure(figsize=(8, 4))
sns.scatterplot(x=y_pred_log_train, y=ols_residuals_log, alpha=0.5)
plt.hlines(y=0, xmin=min(y_pred_log_train), xmax=max(y_pred_log_train), color='red', linestyle='--')
plt.title('Residuals vs Fitted Values (Log Scale)')
plt.show()

sm.qqplot(ols_residuals_log, line='45', fit=True)
plt.title('Q-Q Plot of Residuals (Log Scale)')
plt.show()
print("Interpretation: Check plots for randomness and normality.")

# --- 6.5 Multicollinearity (VIF) ---
print("\n--- Calculating VIF scores ---")
# Make sure we're using a list of column names here
X_train_processed_df = pd.DataFrame(X_train_processed_power, columns=feature_names_out)
vif_data = pd.DataFrame()
vif_data["Feature"] = X_train_processed_df.columns
vif_scores = []
for i in range(X_train_processed_df.shape[1]):
    vif_scores.append(variance_inflation_factor(X_train_processed_df.values, i))
vif_data["VIF"] = vif_scores
print("\nVIF Scores (Top 10):")
print(vif_data.sort_values(by='VIF', ascending=False))
print("Interpretation: VIF scores > 5 or 10 suggest multicollinearity.")

# --- 6.6 Bootstrap for Coefficient Stability ---
print("\n--- Bootstrapping OLS Coefficients (100 iterations) ---")
n_iterations = 100
n_size = int(len(X_train_processed_power) * 0.50)
boot_coefs = []
for i in range(n_iterations):
    X_sample, y_sample = resample(X_train_processed_power, y_train_log, n_samples=n_size, random_state=i)
    boot_model = LinearRegression().fit(X_sample, y_sample)
    boot_coefs.append(boot_model.coef_)

boot_coefs_df = pd.DataFrame(boot_coefs, columns=feature_names_out)
print("Bootstrap Coefficient Summary (Mean and Std Dev):")
print(boot_coefs_df.describe().loc[['mean', 'std']].T)
print("Interpretation: Low standard deviations suggest stable coefficients.")

print("\n--- FULL ANALYSIS COMPLETE ---")