# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, RepeatedKFold
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
from scipy.stats import pearsonr
import multiprocessing
import warnings
warnings.filterwarnings('ignore')

def run_elastic_net_analysis(X_train, X_test, y_train, y_test, 
                           random_state=123):
    """
    Run elastic net regression analysis predicting p-factor

    Tunes an ElasticNet model with an exhaustive grid search under repeated
    k-fold cross-validation on the training data, then scores the refit best
    model once on the held-out test data.

    Parameters:
    -----------
    X_train : pandas DataFrame or numpy array
        Training features
    X_test : pandas DataFrame or numpy array  
        Test features
    y_train : pandas Series or numpy array
        Training target (p-factor)
    y_test : pandas Series or numpy array
        Test target (p-factor)
    random_state : int
        Random seed for reproducibility (like set.seed() in R). Used for both
        the cross-validation splits and the ElasticNet estimator.

    Returns:
    --------
    dict : Results dictionary with model, predictions, and metrics. Keys:
        'fit_elastic'        - best estimator, refit on the full training set
        'best_tune'          - best hyperparameters found by the grid search
        'best_cv_score'      - mean cross-validated R-squared of the best model
        'predicted_testdata' - predictions for X_test
        'corr_final'         - Pearson correlation of predictions vs. y_test
        'R_squared_final'    - R-squared of predictions vs. y_test
        'grid_search'        - the fitted GridSearchCV object
    """

    # Set up parallel processing (2 fewer cores than max, but never fewer
    # than 1). Without the clamp, a 2-core machine would yield n_jobs=0,
    # which joblib rejects, and a 1-core machine would yield n_jobs=-1,
    # which joblib interprets as "use all cores".
    num_cores = max(1, multiprocessing.cpu_count() - 2)

    # Define hyperparameter grid.
    # NOTE: the local names follow the R/glmnet convention, which is the
    # reverse of scikit-learn's: glmnet's "alpha" (mix ratio) is sklearn's
    # `l1_ratio`, and glmnet's "lambda" (penalty strength) is sklearn's
    # `alpha`. The mapping below is therefore intentional.
    alpha_range = [0, 0.2, 0.5, 0.8, 1.0]  # Mix ratio: Ridge to Lasso
    lambda_range = [0.01, 0.1, 1.0, 10.0]  # Penalty strength on log scale

    elastic_grid = {
        'alpha': lambda_range,    # Penalty strength
        'l1_ratio': alpha_range   # Mix ratio
    }

    # Set up cross-validation (5 folds, 10 repeats)
    fit_control = RepeatedKFold(n_splits=5, n_repeats=10, random_state=random_state)

    # Initialize ElasticNet
    fit_elastic = ElasticNet(max_iter=2000, random_state=random_state)

    # Grid search with cross-validation
    print("Running grid search with cross-validation...")
    grid_search = GridSearchCV(
        estimator=fit_elastic,
        param_grid=elastic_grid,
        cv=fit_control,
        scoring='r2',
        n_jobs=num_cores,
        verbose=1
    )

    # Fit the grid search
    grid_search.fit(X_train, y_train)

    # Get best model and parameters
    best_model = grid_search.best_estimator_
    best_tune = grid_search.best_params_

    print(f"Best model parameters: {best_tune}")
    print(f"Best CV R-squared: {grid_search.best_score_}")

    # Evaluate model performance on the held-out test set
    predicted_testdata = best_model.predict(X_test)

    # Calculate final metrics
    corr_final = pearsonr(predicted_testdata, y_test)[0]
    R_squared_final = r2_score(y_test, predicted_testdata)

    # Print results
    print("\nFINAL TEST SET RESULTS")
    print(f"Correlation Coefficient R: {corr_final}")
    print(f"R-squared: {R_squared_final}")

    # Return results dictionary
    results = {
        'fit_elastic': best_model,
        'best_tune': best_tune,
        'best_cv_score': grid_search.best_score_,
        'predicted_testdata': predicted_testdata,
        'corr_final': corr_final,
        'R_squared_final': R_squared_final,
        'grid_search': grid_search
    }
    return results