In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import ElasticNet, LinearRegression
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import GroupKFold, GridSearchCV, GroupShuffleSplit
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

In [None]:
# Read the dataset from an external CSV file into a DataFrame.
# The `...` argument is a placeholder for the real file path.

df = pd.read_csv(...)
print(df)

In [None]:
# ElasticNet modeling

In [None]:
# X is the input of predictor variables. Dummy names shown below instead of the real variable names.
# y is the outcome variable
# groups is the array indicating the hierarchical structure (e.g., state)

X = df[['var1','var2','etc']]
y = df[['outcome']]
groups = df[['STATE']]

# Convert groups to a numeric representation. LabelEncoder expects a 1-D array, so the
# single-column frame is flattened first; passing the (n, 1) frame directly makes
# scikit-learn emit a DataConversionWarning before flattening it itself.
le = LabelEncoder()
groups_numeric = le.fit_transform(groups.to_numpy().ravel())

# Hold out 20% of the groups as the outer test set. Splitting by group keeps all rows
# of a group on the same side of the split.
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=16)
train_index, test_index = next(gss.split(X, y, groups_numeric))

# X and y are DataFrames by construction (see above), so positional indexing applies.
X_train, X_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]
groups_train, groups_test = groups_numeric[train_index], groups_numeric[test_index]

# Group-aware folds for hyperparameter tuning inside the training split
gkf = GroupKFold(n_splits=5)

# Define the parameter grid for hyperparameter tuning. E.g.,
param_grid = {'alpha': [0.001, 0.01, 0.05, 0.1, 1, 10, 100], 'l1_ratio': [0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.99, 1.0]}

# Create an ElasticNet regression model
elastic_net = ElasticNet()

# Grid search with grouped cross-validation on the training split only
grid_search = GridSearchCV(estimator=elastic_net, param_grid=param_grid, cv=gkf, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train, groups=groups_train)

# Get the best model and its parameters
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

print("Best parameters: ", best_params)

# 'mean_test_score' holds the negated MSE averaged over the folds, one entry per
# hyperparameter combination in the grid (not one per fold). Negate to get MSE;
# the square root of that is reported separately and labelled as RMSE.
cv_scores = -grid_search.cv_results_['mean_test_score']
cv_rmse_scores = np.sqrt(cv_scores)

print("Cross-validation MSE scores: ", cv_scores)
print("Mean cross-validation MSE: ", np.mean(cv_scores))
print("Cross-validation RMSE scores: ", cv_rmse_scores)
print("Best-model cross-validation MSE: ", -grid_search.best_score_)

# Evaluate the best model on the unseen outer-loop test set.
# GridSearchCV (refit=True by default) has already refit best_estimator_ on the whole
# training split with the best parameters, so no additional fit is needed here.
y_pred_train = best_model.predict(X_train)
y_pred_test = best_model.predict(X_test)

# Calculate MSE for the test set (plain MSE; no `squared=False`, which would return RMSE)
test_mse = mean_squared_error(y_test, y_pred_test)
print("Test set MSE: ", test_mse)

# Calculate RMSE for the test set
test_rmse = np.sqrt(test_mse)
print("Test set RMSE: ", test_rmse)

# Get the final feature importances (model coefficients) from the best model
importances = best_model.coef_

# Store absolute coefficients per feature; ravel guards against a (1, n_features) shape
feature_importances = pd.DataFrame({'Feature': X.columns, 'Importance': np.abs(np.ravel(importances))})

# Sort the feature importances in descending order
feature_importances = feature_importances.sort_values(by='Importance', ascending=False)

# Display the top predictors and their absolute coefficient values
top_n = 10
print(f"Top {top_n} Important Predictors:")
print(feature_importances.head(top_n))

In [None]:
# Non-regularized modeling (ordinary least-squares linear regression)

In [None]:
# X is the input of predictor variables. Dummy names shown below instead of the real variable names.
# y is the outcome variable
# groups is the array indicating the hierarchical structure (e.g., state)

X = df[['var1','var2','etc']]
y = df[['outcome']]
groups = df[['STATE']]

# Encode group labels as integers. LabelEncoder expects a 1-D array, so the
# single-column frame is flattened first; passing the (n, 1) frame directly makes
# scikit-learn emit a DataConversionWarning before flattening it itself.
le = LabelEncoder()
groups_numeric = le.fit_transform(groups.to_numpy().ravel())

# Hold out 20% of the groups as the test set (same seed as the other model cells)
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=16)
train_index, test_index = next(gss.split(X, y, groups_numeric))

# X and y are DataFrames by construction (see above), so positional indexing applies.
X_train, X_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]
groups_train, groups_test = groups_numeric[train_index], groups_numeric[test_index]

gkf = GroupKFold(n_splits=5)

linear_model = LinearRegression()

# Grouped cross-validation on the training split: one MSE per fold
cv_scores = []

for train_idx, test_idx in gkf.split(X_train, y_train, groups_train):
    X_train_fold, X_test_fold = X_train.iloc[train_idx], X_train.iloc[test_idx]
    y_train_fold, y_test_fold = y_train.iloc[train_idx], y_train.iloc[test_idx]

    linear_model.fit(X_train_fold, y_train_fold)
    y_pred_fold = linear_model.predict(X_test_fold)
    mse_fold = mean_squared_error(y_test_fold, y_pred_fold)
    cv_scores.append(mse_fold)

print("Cross-validation MSE scores: ", cv_scores)
print("Mean cross-validation MSE: ", np.mean(cv_scores))

# Refit on the whole training split before scoring the held-out test set
linear_model.fit(X_train, y_train)

y_pred_train = linear_model.predict(X_train)
y_pred_test = linear_model.predict(X_test)

test_mse = mean_squared_error(y_test, y_pred_test)
print("Test set MSE: ", test_mse)

test_rmse = np.sqrt(test_mse)
print("Test set RMSE: ", test_rmse)

In [None]:
# Dummy regressor modeling (baseline that always predicts the training mean)

In [None]:
# X is the input of predictor variables. Dummy names shown below instead of the real variable names.
# y is the outcome variable
# groups is the array indicating the hierarchical structure (e.g., state)

X = df[['var1','var2','etc']]
y = df[['outcome']]
groups = df[['STATE']]

# Encode group labels as integers. LabelEncoder expects a 1-D array, so the
# single-column frame is flattened first; passing the (n, 1) frame directly makes
# scikit-learn emit a DataConversionWarning before flattening it itself.
le = LabelEncoder()
groups_numeric = le.fit_transform(groups.to_numpy().ravel())

# Hold out 20% of the groups as the test set (same seed as the other model cells)
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=16)
train_index, test_index = next(gss.split(X, y, groups_numeric))

# X and y are DataFrames by construction (see above), so positional indexing applies.
X_train, X_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]
groups_train, groups_test = groups_numeric[train_index], groups_numeric[test_index]

gkf = GroupKFold(n_splits=5)

# Baseline that predicts the mean of the training outcome
dummy_model = DummyRegressor(strategy='mean')

# Grouped cross-validation on the training split: one MSE per fold
cv_scores = []

for train_idx, test_idx in gkf.split(X_train, y_train, groups_train):
    y_train_fold, y_test_fold = y_train.iloc[train_idx], y_train.iloc[test_idx]

    # A zero-filled placeholder matrix of matching length is used as the feature input;
    # the real predictors are deliberately not given to the baseline.
    dummy_model.fit(np.zeros((len(y_train_fold), 1)), y_train_fold)
    y_pred_fold = dummy_model.predict(np.zeros((len(y_test_fold), 1)))
    mse_fold = mean_squared_error(y_test_fold, y_pred_fold)
    cv_scores.append(mse_fold)

print("Cross-validation MSE scores: ", cv_scores)
print("Mean cross-validation MSE: ", np.mean(cv_scores))

# Refit on the whole training split before scoring the held-out test set
dummy_model.fit(np.zeros((len(y_train), 1)), y_train)

y_pred_train = dummy_model.predict(np.zeros((len(y_train), 1)))
y_pred_test = dummy_model.predict(np.zeros((len(y_test), 1)))

test_mse = mean_squared_error(y_test, y_pred_test)
print("Test set MSE: ", test_mse)

test_rmse = np.sqrt(test_mse)
print("Test set RMSE: ", test_rmse)