In [2]:
import itertools
# NOTE: `time` below is the datetime.time class, not the `time` module, so it
# has no `.time()` function; use `perf_counter` for elapsed-time measurements.
from datetime import time
from time import perf_counter

import numpy as np
import pandas as pd
import statsmodels.api as sm

## Function: processSubset - Linear Regression Model Fitting and RMSE Calculation

In [None]:
# create a function called processSubset to fit the models and return information about
# the model as well as the RMSE



def processSubset(feature_set):
    """Fit an OLS model on a subset of predictors and report its RMSE.

    Uses the notebook-level globals `X` (predictor frame) and `y` (response).
    A constant column is added to the selected predictors before fitting.

    Args:
        feature_set: Iterable of column labels of `X` to use as predictors.

    Returns:
        dict: {"model": fitted OLS results, "RMSE": root-mean-squared error
        of the fitted model's predictions on the same data it was fit to}.
    """
    # Build the design matrix once; it is needed for both fitting and predicting
    design = sm.add_constant(X[list(feature_set)])
    regr = sm.OLS(y, design).fit()

    # In-sample prediction error
    errors = regr.predict(design) - y
    rmse = np.sqrt((errors ** 2).mean())
    return {"model": regr, "RMSE": rmse}

## Function: getBest - Finding the Best Linear Regression Model with k Predictors

In [None]:



def getBest(k):
    """Find the best OLS model that uses exactly k predictors.

    Fits one model (via `processSubset`) for every k-column combination of the
    notebook-level predictor frame `X` and keeps the one with the lowest RMSE.
    Prints how many models were fitted and how long it took.

    Args:
        k (int): Number of predictors in each candidate subset.

    Returns:
        pandas.Series: The row of the results table with the smallest RMSE;
        it holds the "model" and "RMSE" entries produced by `processSubset`.

    Raises:
        ValueError: If no k-column subset can be drawn from `X`
            (for example, k is larger than the number of columns).
    """
    # start tracking the time (perf_counter is a clock meant for measuring intervals)
    tic = perf_counter()

    # fit one model per combination of k predictors
    results = [processSubset(combo) for combo in itertools.combinations(X.columns, k)]
    if not results:
        # Without this guard the failure would be an opaque KeyError on 'RMSE'
        raise ValueError(
            f"No predictor subsets of size {k} can be drawn from the "
            f"{len(X.columns)} columns of X"
        )

    # Wrap everything up in a nice dataframe
    models = pd.DataFrame(results)

    # Choose the model with the smallest RMSE; idxmin returns an index label,
    # which is what the label-based .loc expects
    best_model = models.loc[models['RMSE'].idxmin()]

    # stop tracking the time
    toc = perf_counter()
    print("Processed", models.shape[0], "models on", k, "predictors in", (toc-tic), "seconds.")

    # Return the best model, along with some other useful information about the model
    return best_model

### Iterating Through Predictor Counts to Identify Optimal Models

In [None]:
# Could take quite a while to complete: getBest(i) fits one model for every
# combination of i predictors.

# One row per predictor count i, holding the best model found and its RMSE
models_best = pd.DataFrame(columns=["RMSE", "model"])

# perf_counter is used because the `time` name imported at the top of the
# notebook is the datetime.time class, which has no .time() function
tic = perf_counter()
for i in range(1,7):
    models_best.loc[i] = getBest(i)

toc = perf_counter()
print("Total elapsed time:", (toc-tic), "seconds.")

In [None]:
# Display the best model and its RMSE for each predictor count (rows are indexed by that count)
models_best

In [None]:
# Print the regression summary of the best model with 2 predictors
# (processSubset adds a constant column before fitting, so the fit also
# includes a constant term in addition to the 2 predictors)
print(models_best.loc[2, "model"].summary())

## Function: getBestDT - Finding the Best Decision Tree Regression Model with k Predictors

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

def processSubsetDT(feature_set):
    """Fit a decision tree regressor on a subset of predictors and report its RMSE.

    Uses the notebook-level globals `X` (predictor frame) and `y` (response).
    The tree is built with default settings, and the RMSE is measured on the
    same data the tree was fit to.

    Args:
        feature_set: Iterable of column labels of `X` to use as predictors.

    Returns:
        dict: {"model": fitted DecisionTreeRegressor, "RMSE": in-sample
        root-mean-squared error}.
    """
    # NOTE(review): the error is computed on the training data, so it may
    # flatter flexible trees - consider a held-out set or cross-validation.
    features = X[list(feature_set)]

    tree = DecisionTreeRegressor()
    tree.fit(features, y)

    in_sample_mse = mean_squared_error(y, tree.predict(features))
    return {"model": tree, "RMSE": np.sqrt(in_sample_mse)}

def getBestDT(k):
    """Find the best decision tree model that uses exactly k predictors.

    Fits one tree (via `processSubsetDT`) for every k-column combination of
    the notebook-level predictor frame `X` and keeps the one with the lowest
    RMSE. Prints how many models were fitted and how long it took.

    Args:
        k (int): Number of predictors in each candidate subset.

    Returns:
        pandas.Series: The row of the results table with the smallest RMSE;
        it holds the "model" and "RMSE" entries produced by `processSubsetDT`.

    Raises:
        ValueError: If no k-column subset can be drawn from `X`
            (for example, k is larger than the number of columns).
    """
    # perf_counter is a clock meant for measuring elapsed intervals
    tic = perf_counter()

    results = [processSubsetDT(combo) for combo in itertools.combinations(X.columns, k)]
    if not results:
        # Without this guard the failure would be an opaque KeyError on 'RMSE'
        raise ValueError(
            f"No predictor subsets of size {k} can be drawn from the "
            f"{len(X.columns)} columns of X"
        )

    models = pd.DataFrame(results)

    # idxmin returns an index label, which is what the label-based .loc expects
    best_model = models.loc[models['RMSE'].idxmin()]

    toc = perf_counter()
    print("Processed", models.shape[0], "models on", k, "predictors in", (toc-tic), "seconds.")
    return best_model

In [None]:
# One row per predictor count i, holding the best decision tree found and its RMSE
models_best_DT = pd.DataFrame(columns=["RMSE", "model"])

# perf_counter is used because the `time` name imported at the top of the
# notebook is the datetime.time class, which has no .time() function
tic = perf_counter()
for i in range(1,7):
    models_best_DT.loc[i] = getBestDT(i)

toc = perf_counter()
print("Total elapsed time:", (toc-tic), "seconds.")

In [None]:
# End the cell with the frame itself so the notebook renders it as a table,
# the same way `models_best` is displayed for the linear models
models_best_DT

## Function: specificity_score - Calculating Specificity for Binary Classification Models

In [None]:

def specificity_score(y_true, y_pred):
    """
    Calculate specificity (the true-negative rate) for binary labels.

    Specificity is TN / (TN + FP), where label 0 is treated as the negative
    class and label 1 as the positive class.

    Args:
        y_true (list or numpy.ndarray): True labels.
        y_pred (list or numpy.ndarray): Predicted labels.

    Returns:
        float: Specificity score, or 0.0 when there are neither true
            negatives nor false positives to divide by.

    Raises:
        ValueError: If y_true and y_pred do not have the same shape.
    """
    # Convert up front: comparing a plain list with `== 0` yields a single
    # bool instead of an element-wise mask, so list inputs would otherwise fail.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Reject mismatched inputs rather than letting numpy broadcast them silently
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, "
            f"got {y_true.shape} and {y_pred.shape}"
        )

    actual_negative = y_true == 0
    true_negatives = int(np.count_nonzero(actual_negative & (y_pred == 0)))
    false_positives = int(np.count_nonzero(actual_negative & (y_pred == 1)))

    denominator = true_negatives + false_positives
    if denominator == 0:
        return 0.0

    return true_negatives / denominator