## Import Stuff

In [1]:
pip install pandas numpy matplotlib seaborn jupyter scikit-learn pdfplumber geopandas contextily fiona


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [110]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge



## Cleaning Dataset

In [140]:
cost = pd.read_csv('interconnection_costs.csv')
cost = cost.drop(columns=['Project # in Queued Up', '$2024 Total Cost/kW', 'POI Transmission Line', 'Project #'])

In [141]:
cols_to_remove_na = ['Latitude of POI', 'Longitude of POI', 'Queue Date', 'Upgrade of Existing Generator', 
                     'Substation Newly Built', 'State', 'County', 'Study Date', 'Transmission Voltage', 'Nameplate MW']
for i in cols_to_remove_na:
    cost = cost[~cost[i].isna()]

cost_cleaned = cost.copy()

In [142]:
for i in ["Study Date", "Queue Date"]:
    cost_cleaned[i] = pd.to_datetime(cost_cleaned[i], errors="coerce")
    cost_cleaned[i[0] + "_" + "Year"] = cost_cleaned[i].dt.year
    cost_cleaned[i[0] + "_" + "Month"] = cost_cleaned[i].dt.month
    cost_cleaned[i[0] + "_" + "Day"] = cost_cleaned[i].dt.day
     #separated each date value to own column

cost_cleaned = cost_cleaned.drop(columns=[ "Study Date", "Queue Date", "S_Year"])
cat_fillna = ['Study Type', 'Service Type', 'Restudy', 'Revision of Study']
cost_cleaned[cat_fillna] = cost_cleaned[cat_fillna].fillna('Unknown')
cost_cleaned = cost_cleaned[cost_cleaned["Transmission Voltage"].astype(str).str.match(r"^\d+(\.\d+)?$")]
cost_cleaned["Transmission Voltage"] = cost_cleaned["Transmission Voltage"].astype(float)



  cost_cleaned[i] = pd.to_datetime(cost_cleaned[i], errors="coerce")
  cost_cleaned[i] = pd.to_datetime(cost_cleaned[i], errors="coerce")


In [143]:
cost_cleaned.columns

Index(['Balancing Authority', 'BA', 'State', 'County',
       'Upgrade of Existing Generator', 'Latitude of POI', 'Longitude of POI',
       'Substation Newly Built', 'Transmission Voltage', 'Study Type',
       'Study Year', 'Restudy', 'Revision of Study', 'Fuel', 'Nameplate MW',
       'Request Status', 'Service Type', '$2024 POI Cost/kW',
       '$2024 Network Cost/kW', 'S_Month', 'S_Day', 'Q_Year', 'Q_Month',
       'Q_Day'],
      dtype='object')

In [147]:
cost_cleaned.to_csv("interconnection_costs_cleaned.csv")


## EDA

In [120]:
target = "$2024 POI Cost/kW"


X = cost_cleaned.drop(columns=[target])
y = cost_cleaned[target]

#Preprocessing
num_features = ['Latitude of POI', 'Longitude of POI', 'Study Year', 'Transmission Voltage', 'Study Year', 'Nameplate MW', '$2024 Network Cost/kW', 'S_Month', 'S_Day', 'Q_Year', 'Q_Month', 'Q_Day']
cat_features = ['Balancing Authority', 'BA', 'State', 'County', 'Upgrade of Existing Generator',  'Substation Newly Built', 'Study Type', 'Restudy', 'Revision of Study', 'Fuel', 'Request Status', 'Service Type']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=67
)

In [139]:
cat_pipeline = Pipeline([
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])
preprocess = ColumnTransformer([
    ("cat", cat_pipeline, cat_features),
    ("num", "passthrough", num_features)
])

model = Pipeline([
    ("preprocess", preprocess),
    ("ridge", Ridge())
])

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(mse)

203972.64842007525


In [135]:
models = {
    # "RandomForest": {
    #     "model": RandomForestRegressor(),
    #     "params": {
    #         "n_estimators": [50, 100, 200, 300],
    #         "max_depth": [5, 10, None, 15, 3],
    #         "min_samples_split": [2, 5, 6, 9],
    #         "min_samples_leaf": [1, 2, 7, 4],
    #     }
    # },
    "RidgeRegression": {
        "model": Ridge(),
        "params": {
            "alpha": [0.01, 0.1, 1, 10, 20, 100, 200],
            "solver": ["auto", "lsqr", "sparse_cg", "sag", "lbfgs"],  # remove 'svd'
        }
    }
    # },
    # "GradientBoosting": {
    #     "model": GradientBoostingRegressor(),
    #     "params": {
    #         "n_estimators": [50, 100, 200, 300],
    #         "learning_rate": [0.01, 0.05, 0.07, 0.08, 0.1],
    #         "max_depth": [3, 4, 5, 7],
    #         "subsample": [0.8, 1.0, 0.5, 0.9]
    #     }
    # }
}

#OHE

cat_pipeline = Pipeline([
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])
preprocess = ColumnTransformer([
    ("cat", cat_pipeline, cat_features),
    ("num", "passthrough", num_features)
])

#models

best_models = {}

for name, m in models.items():
    print(f"Running GridSearchCV for {name}...")
    
    pipeline = Pipeline([
        ("preprocess", preprocess),
        ("model", m["model"])
    ])
    
    grid = GridSearchCV(
        pipeline,
        param_grid={"model__" + k: v for k, v in m["params"].items()},
        cv=5,
        scoring="neg_mean_absolute_error",   # for the regression metric
        n_jobs=-1
    )
    
    grid.fit(X_train, y_train)
    best_models[name] = grid

    y_pred = grid.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    print(f"Best RMSE for {name}: {rmse:.4f}")
    print(f"Best R² for {name}: {r2:.4f}")
    print(f"Best parameters: {grid.best_params_}\n")


Running GridSearchCV for RidgeRegression...
Best RMSE for RidgeRegression: 453.4328
Best R² for RidgeRegression: -4.6806
Best parameters: {'model__alpha': 200, 'model__solver': 'auto'}



35 fits failed out of a total of 175.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
35 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/pipeline.py", line 663, in fit
    self._final_estimat

In [None]:
#best parameters: Ridge Regression: alpha 100, model solver lsqr
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=67
)
ridge_model = Ridge(alpha=100, solver='lsqr')
ridge_model.fit(X_train, y_train)
ridge_pred = ridge_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
print(f"Best RMSE for {name}: {rmse:.4f}")
print(f"Best R² for {name}: {r2:.4f}")

## Sample Code

In [134]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score

df = cost_cleaned.copy()

# Replace any null values in categorical columns
cat_cols = ['Balancing Authority', 'BA', 'State', 'County', 'Upgrade of Existing Generator',  
            'Substation Newly Built', 'Study Type', 'Restudy', 'Revision of Study', 
            'Fuel', 'Request Status', 'Service Type']
for col in cat_cols:
    df[col] = df[col].fillna("Unknown")

# Convert Transmission Voltage to numeric, remove range values like '230/115'
df['Transmission Voltage'] = pd.to_numeric(df['Transmission Voltage'].astype(str).str.split(r'[-/]').str[0], errors='coerce')

# Define target and features
target = '$2024 POI Cost/kW'
X = df.drop(columns=[target])
y = df[target]

# Identify numeric and categorical features
num_features = ['Latitude of POI', 'Longitude of POI', 'Transmission Voltage', 
                'Study Year', 'Nameplate MW', '$2024 Network Cost/kW', 
                'S_Month', 'S_Day', 'Q_Year', 'Q_Month', 'Q_Day']
cat_features = cat_cols

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=67)

# Preprocessing pipelines
num_pipeline = Pipeline([
    ('scaler', StandardScaler())
])

cat_pipeline = Pipeline([
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))  # dense to avoid sparse issues
])

preprocess = ColumnTransformer([
    ('num', num_pipeline, num_features),
    ('cat', cat_pipeline, cat_features)
])

# Define models and their hyperparameters
models = {
    "RandomForest": {
        "model": RandomForestRegressor(random_state=42),
        "params": {
            "n_estimators": [100, 200],
            "max_depth": [5, 10, None]
        }
    },
    "GradientBoosting": {
        "model": GradientBoostingRegressor(random_state=42),
        "params": {
            "n_estimators": [100, 200],
            "learning_rate": [0.05, 0.1],
            "max_depth": [3, 5]
        }
    },
    "Ridge": {
        "model": Ridge(),
        "params": {
            "alpha": [1, 10, 100],
            "solver": ["auto", "lsqr", "sparse_cg"]
        }
    }
}

best_models = {}

# Train models using GridSearchCV
for name, m in models.items():
    print(f"Running GridSearchCV for {name}...")
    
    pipeline = Pipeline([
        ('preprocess', preprocess),
        ('model', m['model'])
    ])
    
    grid = GridSearchCV(
        pipeline,
        param_grid={"model__" + k: v for k, v in m["params"].items()},
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1
    )
    
    grid.fit(X_train, y_train)
    best_models[name] = grid
    
    # Evaluate
    y_pred = grid.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    print(f"{name} Best RMSE: {rmse:.4f}")
    print(f"{name} Best R²: {r2:.4f}")
    print(f"{name} Best Parameters: {grid.best_params_}\n")


Running GridSearchCV for RandomForest...
RandomForest Best RMSE: 616.3401
RandomForest Best R²: -9.4957
RandomForest Best Parameters: {'model__max_depth': None, 'model__n_estimators': 100}

Running GridSearchCV for GradientBoosting...
GradientBoosting Best RMSE: 1091.7786
GradientBoosting Best R²: -31.9336
GradientBoosting Best Parameters: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 100}

Running GridSearchCV for Ridge...
Ridge Best RMSE: 417.3975
Ridge Best R²: -3.8136
Ridge Best Parameters: {'model__alpha': 100, 'model__solver': 'lsqr'}

