## 1. Supervised Learning

### (a)

Defining the EN loss function as

$$
\sum_{i=1}^n(Y_i-\beta X_i)^2+\lambda_1||\beta||_2^2+\lambda_2||\beta||_1
$$

EN reduces to a Lasso regression when $\lambda_2 > 0$ and $\lambda_1 = 0$, and it reduces to Ridge when when $\lambda_2 = 0$ and $\lambda_1 > 0$

### (b)

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.preprocessing import OneHotEncoder, scale, StandardScaler
import statsmodels.api as sm
from itertools import product
from joblib import Parallel, delayed

# define all of the objects we need
%run functions/prepare_ames_data.py


pd.set_option('display.max_columns', None)
pd.options.display.max_colwidth = 500
pd.options.display.max_rows = 100

In [2]:
X = ames_train_preprocessed.drop(columns=['saleprice'])
X_val = ames_val_preprocessed.drop(columns=['saleprice'])

# scale the predictors
X_std = (X - X.mean()) / X.std()
X_val_std = (X_val - X.mean()) / X.std()
y = ames_train_preprocessed['saleprice']
y_val = ames_val_preprocessed['saleprice']

For the sake of saving computational resources, I will only find the optimal $\lambda$ value through cross-fitting, and not the weight of the penalty terms, which will be set to the default value of 0.5.

In [3]:
# use 10-fold cross-validation to select the best lambda (alpha) value for the elastic_net regression model

# define the alpha values to test
# note that the start/stop values in the first two arguments are the exponents


alphas = np.logspace(-1, 6, 100)

# create an empty list to store the cross-validation scores
elastic_net_cv_scores = []

# create a for loop to compute the cross-validation score for each alpha value

for alpha in alphas:
    elastic_net = ElasticNet(alpha=alpha)
    elastic_net_cv = cross_validate(estimator=elastic_net,
                                    X=X_std,
                                    y=y,
                                    cv=10,
                                    scoring='neg_root_mean_squared_error')
    elastic_net_cv_scores.append({'alpha': alpha,
                                  'log_alpha': np.log(alpha),
                                  'test_mse': -np.mean(elastic_net_cv['test_score'])})

# convert the cross-validation scores into a data frame
elastic_net_cv_scores_df = pd.DataFrame(elastic_net_cv_scores)



In [4]:
# identify the value of alpha that minimizes the cross-validation score for elastic_net
elastic_net_alpha_min = elastic_net_cv_scores_df.sort_values(by='test_mse').head(1).alpha.values[0]
# compute the min MSE and the SE of the MSE
mse_se_elastic_net = elastic_net_cv_scores_df['test_mse'].std() / np.sqrt(10)
mse_min_elastic_net = elastic_net_cv_scores_df['test_mse'].min()

# identify the value of alpha that minimizes the cross-validation score for elastic_net within 1SE
elastic_net_alpha_1se = elastic_net_cv_scores_df[(elastic_net_cv_scores_df['test_mse'] <= mse_min_elastic_net + mse_se_elastic_net) & 
                                     (elastic_net_cv_scores_df['test_mse'] >= mse_min_elastic_net - mse_se_elastic_net)].sort_values(by='alpha', ascending=False).head(1).alpha.values[0]

In [5]:
print('Elastic Net (min): ', elastic_net_alpha_min)
print('Elastic Net (1SE): ', elastic_net_alpha_1se)

Elastic Net (min):  0.1
Elastic Net (1SE):  3.5938136638046276


#### Fitting Performance

In [6]:
elastic_net_1se_fit = ElasticNet(alpha = elastic_net_alpha_1se).fit(X=X_std, y=y)

In [7]:
elastic_net_pred = elastic_net_1se_fit.predict(X_val_std)

In [8]:
val_results = [{
        "model": "elastic_net",
        "rmse": np.sqrt(np.mean((y_val - elastic_net_pred)**2)),
        "mae": np.mean(abs(y_val - elastic_net_pred)),
        "corr": np.corrcoef(y_val, elastic_net_pred)[0, 1]
}]
pd.DataFrame(val_results)

Unnamed: 0,model,rmse,mae,corr
0,elastic_net,26794.936942,18179.929714,0.931281


#### Stability to Perturbations in Data

In [9]:
def perturb_ames(df):
    # create a copy of the data frame
    df_copy = df.copy()
    # generate a random number between -250 and 250 for 30% of the rows
    sampled_index = df_copy.sample(frac=0.3).index
    df_copy.loc[sampled_index, 'gr_liv_area'] = df_copy.loc[sampled_index, 'gr_liv_area'] + np.random.randint(-250, 250, size=sampled_index.size)
    # conduct bootstrap sample
    df_copy = df_copy.sample(frac=1, replace=True)
    return df_copy

In [10]:
perturbed_ames = [perturb_ames(ames_train_preprocessed) for i in range(100)]

In [11]:
def fit_models(df, reg=True, fit_area_multi=True):
        # standardize predictor variables in df for ridge and lasso
        df_x = df.drop(columns='saleprice')
        df_x_std = (df_x - df_x.mean()) / df_x.std()
        df_y = df['saleprice']

        alphas = np.logspace(-1, 5, 100)
        en_cv_scores = []

        for alpha in alphas:
                elastic_net = ElasticNet(alpha=alpha)
                elastic_net_cv = cross_validate(estimator=elastic_net,
                                                X=df_x_std,
                                                y=df_y,
                                                cv=10,
                                                scoring='neg_root_mean_squared_error')
                en_cv_scores.append({'alpha': alpha,
                                        'log_alpha': np.log(alpha),
                                        'test_mse': -np.mean(elastic_net_cv['test_score'])})

        # convert the cross-validation scores into a data frame
        en_cv_scores_df = pd.DataFrame(elastic_net_cv_scores)
        en_alpha_min = en_cv_scores_df.sort_values(by='test_mse').head(1).alpha.values[0]
        # compute the min MSE and the SE of the MSE
        mse_se_en = en_cv_scores_df['test_mse'].std() / np.sqrt(10)
        mse_min_en = en_cv_scores_df['test_mse'].min()

        # identify the value of alpha that minimizes the cross-validation score for elastic_net within 1SE
        en_alpha_1se = en_cv_scores_df[(en_cv_scores_df['test_mse'] <= mse_min_en + mse_se_en) & 
                                        (en_cv_scores_df['test_mse'] >= mse_min_en - mse_se_en)].sort_values(by='alpha', ascending=False).head(1).alpha.values[0]
        
        elastic_net = ElasticNet(alpha = en_alpha_1se).fit(X=df_x_std, y=df_y)

        return elastic_net

In [12]:
results = Parallel(n_jobs=-1)(delayed(fit_models)(df) for df in perturbed_ames)
en_perturbed = [*results]
en_val_pred_perturbed = [en.predict(X_val_std) for en in en_perturbed]

In [13]:
def plot_prediction_range(pred_list, title=None, sample_index=None):
    if sample_index is None:
        sample_index = list(range(pred_list[0].size))
        
    pred_list = [pred_list[i][sample_index] for i in range(100)]
    pred_list_df = pd.DataFrame(pred_list).T
    pred_list_df['id'] = ames_val_preprocessed.index[sample_index]
    pred_list_df['true'] = ames_val_preprocessed['saleprice'].values[sample_index]
    pred_list_df = pd.melt(pred_list_df, id_vars=['id','true'], var_name='iter', value_name='pred')
    pred_list_df = pred_list_df.groupby(['id', 'true']).agg({'pred': ['min', 'max']})
    pred_list_df = pred_list_df.reset_index()
    pred_list_df = pred_list_df.set_index('id')

    # plot a series of horizontal line segments for each id where the lines range from the minimum and maximum predicted values on the x-axis and have the true value on the y-axis
    fig = go.Figure()

    for i in pred_list_df.index:
        fig.add_trace(
            go.Scatter(x=[pred_list_df.loc[i, ('pred', 'min')], pred_list_df.loc[i, ('pred', 'max')]],
                        y=[pred_list_df.loc[i, 'true'].values[0], pred_list_df.loc[i, 'true'].values[0]],
                        mode='lines',
                        line={'color': 'black'}, 
                        showlegend=False)
            )
    # add a single diagonal line to the plot
    fig.add_trace(
        go.Scatter(x=[0, 400000], y=[0, 400000], mode='lines', line={'color': 'black'}, showlegend=False)
    )
        
    fig.update_layout(xaxis_title='Predicted sale price range',
                        yaxis_title='Observed sale price',
                        title=title)
    return fig

In [14]:
val_sample_id = np.random.choice(ames_val_preprocessed.shape[0], 150, replace=False)
plot_prediction_range(en_val_pred_perturbed, 'Elastic Net', sample_index=val_sample_id)

Similarly to what happens in the case of Lasso and Ridge, the range of predictions seems seems to stabilize thanks to regularization, as compared to an unregularized LS model.

#### Coefficient Importance

In [15]:
def extract_coefficients(fit_perturbed_list, model=None):
    coefs_list = []
    for i in range(100):
        # The single-predictor fit doesn't provide variable names, so we need to manually provide this
        if fit_perturbed_list[0].coef_.shape[0] == 1:
            var_names = 'gr_liv_area'
        else: 
            var_names = fit_perturbed_list[i].feature_names_in_
        
        coefs = pd.DataFrame({'variable': var_names,
                              'coef': fit_perturbed_list[i].coef_,
                              'model': model})
        coefs['iter'] = i
        coefs_list.append(coefs)
    coefs_combined_df = pd.concat(coefs_list)
    return coefs_combined_df

In [16]:
perturbed_std_coefs_en = extract_coefficients(en_perturbed, 'elastic_net')

In [17]:
top_20_coefs_ls_all = ['gr_liv_area', 'overall_qual', 'mas_vnr_area', 'total_bsmt_sf',
       'year_built', 'overall_cond', 'bsmt_exposure', 'lot_frontage',
       'garage_area', 'kitchen_qual', 'bathrooms', 'exter_qual', 'lot_area',
       'basement_finished_rating', 'fireplaces', 'heating_qc',
       'irregular_lot_shape', 'foundation_concrete', 'garage_yr_blt',
       'garage_finish']

In [18]:
# extract the coefficients for the top 20 variables from perturbed_std_coefs_combined and visualize their distributions using boxplots
top_20_coefs_df = perturbed_std_coefs_en.query('variable in @top_20_coefs_ls_all')

fig = px.box(top_20_coefs_df,
             x='variable',
             y='coef',
             facet_col='model',
             facet_col_wrap=1,
             height=1200,
             category_orders={'variable': top_20_coefs_ls_all})
fig.update_traces(width=0.5)
fig

The coefficients are fairly well behaved, and have a similar distribution as the case of Ridge

#### Sensitivity to Judgement Calls

In [19]:
perturb_options = list(product([0.65, 0.8, 0.95], 
                               [10, 20],
                               ['other', 'mode'],
                               [True, False],
                               ['none', 'sqrt'],
                               [0, 0.5],
                               ['numeric', 'simplified_dummy', 'dummy']))
perturb_options = pd.DataFrame(perturb_options, columns=('max_identical_thresh', 
                                                         'n_neighborhoods',
                                                         'impute_missing_categorical',
                                                         'simplify_vars',
                                                         'transform_response',
                                                         'cor_feature_selection_threshold',
                                                         'convert_categorical'))
perturb_options

Unnamed: 0,max_identical_thresh,n_neighborhoods,impute_missing_categorical,simplify_vars,transform_response,cor_feature_selection_threshold,convert_categorical
0,0.65,10,other,True,none,0.0,numeric
1,0.65,10,other,True,none,0.0,simplified_dummy
2,0.65,10,other,True,none,0.0,dummy
3,0.65,10,other,True,none,0.5,numeric
4,0.65,10,other,True,none,0.5,simplified_dummy
...,...,...,...,...,...,...,...
283,0.95,20,mode,False,sqrt,0.0,simplified_dummy
284,0.95,20,mode,False,sqrt,0.0,dummy
285,0.95,20,mode,False,sqrt,0.5,numeric
286,0.95,20,mode,False,sqrt,0.5,simplified_dummy


In [None]:
# conduct judgment call perturbations of training data
ames_jc_perturb = [preprocess_ames_data(ames_train_clean,
                                        max_identical_thresh=perturb_options['max_identical_thresh'][i],
                                        n_neighborhoods=perturb_options['n_neighborhoods'][i],
                                        impute_missing_categorical=perturb_options['impute_missing_categorical'][i],
                                        simplify_vars=perturb_options['simplify_vars'][i],
                                        transform_response=perturb_options['transform_response'][i],
                                        cor_feature_selection_threshold=perturb_options['cor_feature_selection_threshold'][i],
                                        convert_categorical=perturb_options['convert_categorical'][i])
                   for i in range(perturb_options.shape[0])]

# conduct judgment call perturbations of validation data data (we need to make sure each validation set is compartible with the relevant training set)
ames_val_jc_perturb = []
for i in range(perturb_options.shape[0]):
    
    # extract relevant neighborhoods from  relevant training data
    train_neighborhood_cols = list(ames_jc_perturb[i].filter(regex="neighborhood").columns)
    train_neighborhoods = [x.replace("neighborhood_", "") for x in train_neighborhood_cols]
    
    # create preprocessed validation set
    ames_val_jc_perturb.append(
        preprocess_ames_data(ames_val_clean,
                             max_identical_thresh=perturb_options['max_identical_thresh'][i],
                             n_neighborhoods=perturb_options['n_neighborhoods'][i],
                             impute_missing_categorical=perturb_options['impute_missing_categorical'][i],
                             simplify_vars=perturb_options['simplify_vars'][i],
                             transform_response=perturb_options['transform_response'][i],
                             cor_feature_selection_threshold=perturb_options['cor_feature_selection_threshold'][i],
                             convert_categorical=perturb_options['convert_categorical'][i],
                             # make sure val set matches training set
                             column_selection=list(ames_jc_perturb[i].columns),
                             neighborhood_levels=train_neighborhoods)
        )

# create a standardized version of the validation datasets
ames_val_jc_perturb_std = []
for i in range(len(ames_val_jc_perturb)):
    df = ames_val_jc_perturb[i].drop(columns=['saleprice'])
    df_std = (df - df.mean()) / df.std()
    df_std['saleprice'] = ames_val_jc_perturb[i]['saleprice']
    ames_val_jc_perturb_std.append(df_std)

In [21]:
results_jc = Parallel(n_jobs=-1)(delayed(fit_models)(df, fit_area_multi=False) for df in ames_jc_perturb)
en_jc_perturbed = [*results_jc]

In [22]:
# compute the predictions on the validaion set for en_perturbed
en_val_jc_pred_perturbed = [en_jc_perturbed[i].predict(X=ames_val_jc_perturb_std[i].drop(columns='saleprice')) for i in range(len(ames_val_jc_perturb_std))]

# for predictions where the response was sqrt-transformed, undo the sqrt transformation
en_val_jc_pred_perturbed = [pred**2 if perturb_options['transform_response'][i] == 'sqrt' else pred for i, pred in enumerate(en_val_jc_pred_perturbed)]

In [23]:
# compute the correlation between the predictions and the true values for each model
# note that we are using the sale price from the unperturbed validation set because to use the perturbed sale price
# we would need to sqrt-transform the sale price where relevant
# this wouldn't work if any of our judgment call modifications changed the *number* of observations in the data
en_val_jc_corr = [np.corrcoef(ames_val_preprocessed['saleprice'], pred)[0, 1] for pred in en_val_jc_pred_perturbed]

In [24]:
corr_df = pd.DataFrame({'elastic_net': en_val_jc_corr,
                        'max_identical_thresh': perturb_options['max_identical_thresh'],
                        'n_neighborhoods': perturb_options['n_neighborhoods'],
                        'impute_missing_categorical': perturb_options['impute_missing_categorical'],
                        'simplify_vars': perturb_options['simplify_vars'],
                        'transform_response': perturb_options['transform_response'],
                        'cor_feature_selection_threshold': perturb_options['cor_feature_selection_threshold'],
                        'convert_categorical': perturb_options['convert_categorical']})

corr_df = pd.melt(corr_df, id_vars=['max_identical_thresh', 
                                    'n_neighborhoods',
                                    'impute_missing_categorical',
                                    'simplify_vars',
                                    'transform_response',
                                    'cor_feature_selection_threshold',
                                    'convert_categorical'],
                  value_vars=['elastic_net'],
                  var_name='model',
                  value_name='corr')

corr_df.sort_values(by='corr', ascending=False)

Unnamed: 0,max_identical_thresh,n_neighborhoods,impute_missing_categorical,simplify_vars,transform_response,cor_feature_selection_threshold,convert_categorical,model,corr
282,0.95,20,mode,False,sqrt,0.0,numeric,elastic_net,0.954198
186,0.80,20,mode,False,sqrt,0.0,numeric,elastic_net,0.953248
258,0.95,20,other,False,sqrt,0.0,numeric,elastic_net,0.951968
234,0.95,10,mode,False,sqrt,0.0,numeric,elastic_net,0.951825
162,0.80,20,other,False,sqrt,0.0,numeric,elastic_net,0.951693
...,...,...,...,...,...,...,...,...,...
5,0.65,10,other,True,none,0.5,dummy,elastic_net,0.900289
101,0.80,10,other,True,none,0.5,dummy,elastic_net,0.900289
221,0.95,10,mode,True,none,0.5,dummy,elastic_net,0.900289
125,0.80,10,mode,True,none,0.5,dummy,elastic_net,0.900289


In [25]:
corr_df_long = corr_df.melt(id_vars=['corr'],
                            value_vars=['max_identical_thresh',
                                        'n_neighborhoods',
                                        'impute_missing_categorical',
                                        'simplify_vars',
                                        'transform_response',
                                        'cor_feature_selection_threshold',
                                        'convert_categorical'],
                            var_name='judgment_call',
                            value_name='option')
fig = px.box(corr_df_long, x='option', y='corr', 
       # color='model', 
       facet_col='judgment_call', 
       facet_col_wrap=2, 
       height=1200)

# give each plot in the 4 by 2 fig above its own x-axis
fig.update_xaxes(matches=None, showticklabels=True)
fig.update_yaxes(matches=None)

Like before, transforming the response helps improve the performance of our model by quite a bit.

### (c)

In [26]:
coefs = np.array([(-0.9) ** (i + 1) for i in range(10)])
coef_0 = .5
X = np.random.normal(0, 1, (1000, 10))
eps = np.random.normal(0, 1, 1000)
y = coef_0 + X @ coefs + eps

In [27]:
boot_index = np.random.choice(np.arange(0, y.shape[0]), 500)
X_boot = X[boot_index]
y_boot = y[boot_index]

The elastic net model will be fit with equal weighting for the Lasso and the Ridge terms. All the models will use $\lambda = 1$

In [28]:
en_fit = ElasticNet().fit(X_boot, y_boot)
print(en_fit.coef_)

[-0.230375    0.27223301 -0.05457884  0.00877959 -0.06961766  0.
 -0.12426149  0.         -0.          0.        ]


In [29]:
lasso_fit = Lasso().fit(X_boot, y_boot)
ridge_fit = Lasso().fit(X_boot, y_boot)
print(lasso_fit.coef_ + en_fit.coef_)

[-0.230375    0.27223301 -0.05457884  0.00877959 -0.06961766  0.
 -0.12426149  0.         -0.          0.        ]


We can easily find weights for the Ridge and Lasso coefficients that give us the same result as the elastic net procedure. This is because elastic net can be see as adding two non-negative loss functions, such that the overall objective function can be separated into two smaller ones (Ridge and Lasso). This will mean that the optimum can be found by independently optimizing both functions separately, and the using the fact that we used a linear combination to find both the optimum value of the function and the parameters that reach that optimum.

# (d)

12. A continuous resonse algorithm could be fed the binary $Y$ as the outcome, and the resulting predictions would be continuous. However, there are two issues because of this. First of all, the predictions may not be bounded between 0 and 1, and we would have to add more post-processing steps for the response to be a useful prediction. The second issue is that, because of the first issue, we cannot have a probabilistic interpretation of the predictions.

14. The true positive rate is the proportion of positive observations that we predict correctly. The true negative rate is the proportion of negative observations that we predict correctly.

15. a. In this problem, sensitivity is the proportion of situations where the user clicks on the advertisement in which we correctly predict that they will click. Specificity measures the proportion of situations where the user does not click on the advertisement in which we correctly predict they will not click.

15. b. We would likely want to minimize the amount of false negatives, because they represent an unnecesary cost of serving the advertisement to users that will not click on it. Therefor, we would focus on improving specificity.

17. I would chose algorithm A. This is because it is able to reach very high sensitivity at the expense of a very low specificity. However because we are prioritizing sensitivity, we we can use a low threshold that may lead to a lower true negative rate, but a higher true positive rate too.

18. The condition

$$
\frac{1}{1+e^{-(b_0+b_1x_1+b_2x_2)}} = 0.5
$$

can be rewritten as

$$
1 = e^{-(b_0+b_1x_1+b_2x_2)}\text{.}
$$

By taking the $\log$ of both sides, we have that

$$
0=b_0+b_1x_1+b_2x_2\text{,}
$$

which implies that the threshold values in one feature depend linearly of the values of the other feature:

$$
x_1=-\frac{b_0+b_2x_2}{b_1}
$$

### (e)

#### Predicting diabetes status using NHANES

[DSLC stages]: Analysis



The following code sets up the libraries and creates cleaned and pre-processed training, validation and test data that we will use in this document.

In [30]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.model_selection import train_test_split

from functions.load_diabetes_data import load_diabetes_data
# load the diabetes data
diabetes = load_diabetes_data()

pd.set_option('display.max_columns', None)


Fill in this document to complete the diabetes prediction exercise.

In [31]:
diabetes.describe()

Unnamed: 0,diabetes,age,smoker,sex,coronary_heart_disease,weight,bmi,height,hypertension,heart_condition,cancer,family_history_diabetes
count,32499.0,32499.0,32499.0,32499.0,32499.0,32499.0,32499.0,32499.0,32499.0,32499.0,32499.0,32499.0
mean,0.107142,51.051017,1.608942,1.547494,0.055232,251.11385,3045.910736,68.817748,0.356042,0.089603,0.112865,0.354134
std,0.309298,18.51112,0.609761,0.497747,0.228437,240.280025,1451.552127,8.45921,0.478835,0.285616,0.316433,0.478257
min,0.0,18.0,1.0,1.0,0.0,100.0,1167.0,59.0,0.0,0.0,0.0,0.0
25%,0.0,35.0,1.0,1.0,0.0,150.0,2374.0,64.0,0.0,0.0,0.0,0.0
50%,0.0,52.0,2.0,2.0,0.0,176.0,2724.0,67.0,0.0,0.0,0.0,0.0
75%,0.0,66.0,2.0,2.0,0.0,214.0,3173.0,70.0,1.0,0.0,0.0,1.0
max,1.0,85.0,9.0,2.0,1.0,999.0,9999.0,99.0,1.0,1.0,1.0,1.0


In [32]:
def train_val_test_split(df):
        train, val = train_test_split(df, train_size=.8)
        val, test = train_test_split(val, train_size = .5)
        return train, val, test

In [33]:
train, val, test = train_val_test_split(diabetes)

In [34]:
def clean_data(df: pd.DataFrame, log_weight = False, log_bmi = False, log_height = False):
        df_copy = df.copy().drop(columns=["house_family_person_id"])

        train, val, test = train_val_test_split(df_copy)
        # smoker nan
        train[train["smoker"] >= 7] = np.nan
        val[val["smoker"] >= 7] = np.nan
        test[test["smoker"] >= 7] = np.nan
        # weight nan
        train[train["weight"] >= 996] = np.nan
        val[val["weight"] >= 996] = np.nan
        test[test["weight"] >= 996] = np.nan
        # bmi nan
        train[train["bmi"] == 9999] = np.nan
        val[val["bmi"] == 9999] = np.nan
        test[test["bmi"] == 9999] = np.nan
        # height nan
        train[train["height"] >= 96] = np.nan
        val[val["height"] >= 96] = np.nan
        test[test["height"] >= 96] = np.nan

        smoker_mode = train["smoker"].mode()
        weight_median = train["weight"].median()
        bmi_median = train["bmi"].median()
        height_median = train["height"].median()

        train["smoker"].fillna(smoker_mode, inplace=True)
        val["smoker"].fillna(smoker_mode, inplace=True)
        test["smoker"].fillna(smoker_mode, inplace=True)

        train["weight"].fillna(weight_median, inplace=True)
        val["weight"].fillna(weight_median, inplace=True)
        test["weight"].fillna(weight_median, inplace=True)

        train["bmi"].fillna(bmi_median, inplace=True)
        val["bmi"].fillna(bmi_median, inplace=True)
        test["bmi"].fillna(bmi_median, inplace=True)

        train["height"].fillna(height_median, inplace=True)
        val["height"].fillna(height_median, inplace=True)
        test["height"].fillna(height_median, inplace=True)

        train["smoker"] -= 1
        val["smoker"] -= 1
        test["smoker"] -= 1

        train["sex"] -= 1
        val["sex"] -= 1
        test["sex"] -= 1

        if log_weight:
                df_copy["weight"] = np.log(df_copy["weight"])
        
        if log_bmi:
                df_copy["bmi"] = np.log(df_copy["bmi"])
        
        if log_height:
                df_copy["height"] = np.log(df_copy["height"])
        
        if log_weight:
                train["weight"] = np.log(train["weight"])
                val["weight"] = np.log(val["weight"])
                test["weight"] = np.log(test["weight"])
        if log_bmi:
                train["bmi"] = np.log(train["bmi"])
                val["bmi"] = np.log(val["bmi"])
                test["bmi"] = np.log(test["bmi"])
        if log_height:
                train["height"] = np.log(train["height"])
                val["height"] = np.log(val["height"])
                test["height"] = np.log(test["height"])

        return train.dropna(), val.dropna(), test.dropna()

In [None]:
train, val, test = clean_data(diabetes)

In [36]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

standardizer = StandardScaler()
X = standardizer.fit_transform(train.drop(columns=["diabetes"]))
y = train["diabetes"]

log_fit = LogisticRegression(penalty=None).fit(X, y)
tree_fit = DecisionTreeClassifier().fit(X, y)
rf_fit = RandomForestClassifier(n_jobs = -1).fit(X, y)

#### Fit Performance

In [37]:
X_val = standardizer.transform(val.drop(columns=["diabetes"]))
y_val = val["diabetes"]
log_pred = log_fit.predict_proba(X_val)[:, 1]
rf_pred = rf_fit.predict_proba(X_val)[:, 1]
tree_pred = tree_fit.predict_proba(X_val)[:, 1]

In [38]:
threshold = train["diabetes"].mean()

In [39]:
from sklearn import metrics

print(metrics.classification_report(y_true=y_val,
                                    y_pred=log_pred > threshold))

              precision    recall  f1-score   support

         0.0       0.97      0.75      0.85      2632
         1.0       0.24      0.77      0.37       278

    accuracy                           0.75      2910
   macro avg       0.61      0.76      0.61      2910
weighted avg       0.90      0.75      0.80      2910



In [40]:
print(metrics.classification_report(y_true=y_val,
                                    y_pred=tree_pred > threshold))

              precision    recall  f1-score   support

         0.0       0.92      0.91      0.91      2632
         1.0       0.23      0.26      0.25       278

    accuracy                           0.85      2910
   macro avg       0.58      0.59      0.58      2910
weighted avg       0.86      0.85      0.85      2910



In [41]:
print(metrics.classification_report(y_true=y_val,
                                    y_pred=rf_pred > threshold))

              precision    recall  f1-score   support

         0.0       0.96      0.74      0.84      2632
         1.0       0.23      0.73      0.35       278

    accuracy                           0.74      2910
   macro avg       0.60      0.74      0.59      2910
weighted avg       0.89      0.74      0.79      2910



The logistic regression model is the best performing, followed closely by the Random Forest model. It is expected that the performance of the singular CART model would be the worst, as it tends to lead to overfitting.

#### Stability to perturbances

In [42]:
def perturb_diabetes(df):
    # create a copy of the data frame
    df_copy = df.copy()
    # generate a random number between -250 and 250 for 30% of the rows
    sampled_index = df_copy.sample(frac=0.3).index
    df_copy.loc[sampled_index, 'age'] = df_copy.loc[sampled_index, 'age'] + np.random.randint(-18, 18, size=sampled_index.size)
    # conduct bootstrap sample
    df_copy = df_copy.sample(frac=1, replace=True)
    return df_copy

In [43]:
perturbed_diabetes = [perturb_diabetes(train) for _ in range(100)]

In [44]:
def fit_models(df):
        X = standardizer.fit_transform(df.drop(columns=["diabetes"]))
        y = df["diabetes"]
        
        log_fit = LogisticRegression(penalty=None).fit(X, y)
        tree_fit = DecisionTreeClassifier().fit(X, y)
        rf_fit = RandomForestClassifier().fit(X, y)

        return log_fit, tree_fit, rf_fit

In [45]:
from joblib import Parallel, delayed

results = Parallel(n_jobs=-1)(delayed(fit_models)(df) for df in perturbed_diabetes)
log_perturbed, tree_perturbed, rf_perturbed = zip(*results)

In [46]:
log_val_pred_perturbed = [log.predict_proba(X_val)[:,1] for log in log_perturbed]
tree_val_pred_perturbed = [tree.predict_proba(X_val)[:,1] for tree in tree_perturbed]
rf_val_pred_perturbed = [rf.predict_proba(X_val)[:,1] for rf in rf_perturbed]

In [47]:

roc_curves_tree = []
roc_curves_log = []
roc_curves_rf = []

for i in range(100):
    log_fpr, log_tpr, log_thresholds = metrics.roc_curve(y_val, log_val_pred_perturbed[i])
    tree_fpr, tree_tpr, tree_thresholds = metrics.roc_curve(y_val, tree_val_pred_perturbed[i])
    rf_fpr, rf_tpr, rf_thresholds = metrics.roc_curve(y_val, rf_val_pred_perturbed[i])

    roc_log = pd.DataFrame({
        'False Positive Rate': log_fpr,
        'True Positive Rate': log_tpr,
        'Model': f'Logistic Regression (Perturbed {i+1})'
    }, index=log_thresholds)

    roc_tree = pd.DataFrame({
        'False Positive Rate': tree_fpr,
        'True Positive Rate': tree_tpr,
        'Model': f'tree (Perturbed {i+1})'
    }, index=tree_thresholds)

    roc_rf = pd.DataFrame({
        'False Positive Rate': rf_fpr,
        'True Positive Rate': rf_tpr,
        'Model': f'tree (Perturbed {i+1})'
    }, index=rf_thresholds)

    roc_curves_tree.append(pd.concat([roc_tree]))
    roc_curves_log.append(pd.concat([roc_log]))
    roc_curves_rf.append(pd.concat([roc_rf]))

In [48]:
fig = go.Figure()

for i, roc_curve in enumerate(roc_curves_log):
    fig.add_trace(go.Scatter(
        x=roc_curve['False Positive Rate'],
        y=roc_curve['True Positive Rate'],
        mode='lines',
        name=f'ROC Curve {i+1}',
        showlegend=False  # Remove the legend for each trace
    ))

fig.update_layout(
    title='(a) LR',
    xaxis_title='False Positive Rate',
    yaxis_title='True Positive Rate'
)

fig.show()

In [49]:
fig = go.Figure()

for i, roc_curve in enumerate(roc_curves_tree):
    fig.add_trace(go.Scatter(
        x=roc_curve['False Positive Rate'],
        y=roc_curve['True Positive Rate'],
        mode='lines',
        name=f'ROC Curve {i+1}',
        showlegend=False  # Remove the legend for each trace
    ))

fig.update_layout(
    title='(b) Tree',
    xaxis_title='False Positive Rate',
    yaxis_title='True Positive Rate'
)

fig.show()

In [50]:
fig = go.Figure()

for i, roc_curve in enumerate(roc_curves_rf):
    fig.add_trace(go.Scatter(
        x=roc_curve['False Positive Rate'],
        y=roc_curve['True Positive Rate'],
        mode='lines',
        name=f'ROC Curve {i+1}',
        showlegend=False  # Remove the legend for each trace
    ))

fig.update_layout(
    title='(c) Forest',
    xaxis_title='False Positive Rate',
    yaxis_title='True Positive Rate'
)

fig.show()

In this case, the Logistic Regression model is much more stable than the CART model, and somewhat more stable to perturbances in the data than the Random Forest model

#### Stability to Judgement Calls

In [51]:
from itertools import product

perturb_options = list(product([True, False], 
                               [True, False],
                               [True, False]))
perturb_options = pd.DataFrame(perturb_options, columns=('log_weight', 
                                                         'log_bmi',
                                                         'log_height'))
perturb_options

Unnamed: 0,log_weight,log_bmi,log_height
0,True,True,True
1,True,True,False
2,True,False,True
3,True,False,False
4,False,True,True
5,False,True,False
6,False,False,True
7,False,False,False


In [None]:
perturbed_dfs = [clean_data(diabetes,
                            log_weight=perturb_options["log_weight"][i],
                            log_bmi=perturb_options["log_bmi"][i],
                            log_height=perturb_options["log_height"][i])
                            for i in range(perturb_options.shape[0])]

train_pert, val_pert, test_pert = zip(*perturbed_dfs)

In [53]:
results_jc_perturbed = Parallel(n_jobs=-1)(delayed(fit_models)(df) for df in train_pert)
log_jc_perturbed, tree_jc_perturbed, rf_jc_perturbed = zip(*results_jc_perturbed)

In [None]:
log_val_pred_jc_perturbed = [log_jc_perturbed[i].predict_proba(perturb_jc_X.drop(columns=["diabetes"]))[:, 1] for i, perturb_jc_X in enumerate(val_pert)]
tree_val_pred_jc_perturbed = [tree_jc_perturbed[i].predict_proba(perturb_jc_X.drop(columns=["diabetes"]))[:, 1] for i, perturb_jc_X in enumerate(val_pert)]
rf_val_pred_jc_perturbed = [rf_jc_perturbed[i].predict_proba(perturb_jc_X.drop(columns=["diabetes"]))[:, 1] for i, perturb_jc_X in enumerate(val_pert)]

In [55]:
log_val_pred_binary_jc_perturbed = [log_val_pred_jc_perturbed[i] > threshold for i in range(len(log_jc_perturbed))]
tree_val_pred_binary_jc_perturbed = [tree_val_pred_jc_perturbed[i] > threshold for i in range(len(tree_jc_perturbed))]
rf_val_pred_binary_jc_perturbed = [rf_val_pred_jc_perturbed[i] > threshold for i in range(len(rf_jc_perturbed))]

In [56]:
roc_curves_tree = []
roc_curves_log = []
roc_curves_rf = []

for i in range(perturb_options.shape[0]):
    log_fpr, log_tpr, log_thresholds = metrics.roc_curve(val_pert[i]["diabetes"], log_val_pred_jc_perturbed[i])
    tree_fpr, tree_tpr, tree_thresholds = metrics.roc_curve(val_pert[i]["diabetes"], tree_val_pred_jc_perturbed[i])
    rf_fpr, rf_tpr, rf_thresholds = metrics.roc_curve(val_pert[i]["diabetes"], rf_val_pred_jc_perturbed[i])

    roc_log = pd.DataFrame({
        'False Positive Rate': log_fpr,
        'True Positive Rate': log_tpr,
        'Model': f'Logistic Regression (Perturbed {i+1})'
    }, index=log_thresholds)

    roc_tree = pd.DataFrame({
        'False Positive Rate': tree_fpr,
        'True Positive Rate': tree_tpr,
        'Model': f'tree (Perturbed {i+1})'
    }, index=tree_thresholds)

    roc_rf = pd.DataFrame({
        'False Positive Rate': rf_fpr,
        'True Positive Rate': rf_tpr,
        'Model': f'tree (Perturbed {i+1})'
    }, index=rf_thresholds)

    roc_curves_tree.append(pd.concat([roc_tree]))
    roc_curves_log.append(pd.concat([roc_log]))
    roc_curves_rf.append(pd.concat([roc_rf]))

In [57]:
fig = go.Figure()

for i, roc_curve in enumerate(roc_curves_log):
    fig.add_trace(go.Scatter(
        x=roc_curve['False Positive Rate'],
        y=roc_curve['True Positive Rate'],
        mode='lines',
        name=f'ROC Curve {i+1}',
        showlegend=False  # Remove the legend for each trace
    ))

fig.update_layout(
    title='(a) LS',
    xaxis_title='False Positive Rate',
    yaxis_title='True Positive Rate'
)

fig.show()

In [58]:
fig = go.Figure()

for i, roc_curve in enumerate(roc_curves_tree):
    fig.add_trace(go.Scatter(
        x=roc_curve['False Positive Rate'],
        y=roc_curve['True Positive Rate'],
        mode='lines',
        name=f'ROC Curve {i+1}',
        showlegend=False  # Remove the legend for each trace
    ))

fig.update_layout(
    title='(b) CART',
    xaxis_title='False Positive Rate',
    yaxis_title='True Positive Rate'
)

fig.show()

In [59]:
fig = go.Figure()

for i, roc_curve in enumerate(roc_curves_rf):
    fig.add_trace(go.Scatter(
        x=roc_curve['False Positive Rate'],
        y=roc_curve['True Positive Rate'],
        mode='lines',
        name=f'ROC Curve {i+1}',
        showlegend=False  # Remove the legend for each trace
    ))

fig.update_layout(
    title='(c) RF',
    xaxis_title='False Positive Rate',
    yaxis_title='True Positive Rate'
)

fig.show()

While there are some large changes in performance resulting from the data transformations, the Random Forest model seems to be the most stable one, remaining at least as good as random guessing. The other models are much more sensitive to changes in the pre-processing of the data