In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('target1.csv')

In [3]:
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


**Read the features (which are column names in the csv) and figure out what missing imputation needs to be applied and apply that to the columns loaded in a dataframe**

In [6]:
df['sepal_length'].fillna(df['sepal_length'].mean(), inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['sepal_length'].fillna(df['sepal_length'].mean(), inplace=True)


**3) Compute feature reduction based on input. See the screenshot below where there can be No Reduction, Corr with Target, Tree-based, PCA. Please make sure you write code so that all options can work. If we rerun your code with a different Json it should work if we switch No Reduction to say PCA.**

In [7]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_regression, f_classif

def apply_feature_reduction(X, y, config):
    """
    Apply feature reduction based on the provided configuration.

    Parameters:
    X (pd.DataFrame): Input features
    y (pd.Series): Target variable
    config (dict): Feature reduction configuration

    Returns:
    pd.DataFrame: Reduced feature set
    dict: Information about the feature reduction process
    """
    method = config["feature_reduction_method"]
    method_config = config[method]
    info = {"method": method}

    if not method_config["is_selected"]:
        return X, {"method": "None", "reason": "Selected method is not enabled in config"}

    if method == "No Reduction":
        # Simply select top N features (no actual reduction, just filtering)
        num_features = min(method_config["num_of_features_to_keep"], X.shape[1])
        selected_features = X.columns[:num_features]
        reduced_X = X[selected_features]
        info["num_features_kept"] = num_features
        info["selected_features"] = list(selected_features)

    elif method == "Correlation with target":
        # Select features based on correlation with target
        num_features = method_config["num_of_features_to_keep"]
        if num_features <= 0:
            return X, {"method": method, "reason": "num_of_features_to_keep must be positive"}

        # Determine if regression or classification problem
        problem_type = 'regression' if pd.api.types.is_numeric_dtype(y) else 'classification'
        score_func = f_regression if problem_type == 'regression' else f_classif

        selector = SelectKBest(score_func=score_func, k=num_features)
        selector.fit(X, y)

        selected_features = X.columns[selector.get_support()]
        reduced_X = X[selected_features]
        info["num_features_kept"] = num_features
        info["selected_features"] = list(selected_features)
        info["scores"] = selector.scores_.tolist()
        info["pvalues"] = selector.pvalues_.tolist()

    elif method == "Tree-based":
        # Select features using tree-based importance
        num_features = method_config["num_of_features_to_keep"]
        depth = method_config["depth_of_trees"]
        num_trees = method_config["num_of_trees"]

        if num_features <= 0:
            return X, {"method": method, "reason": "num_of_features_to_keep must be positive"}

        # Determine if regression or classification problem
        problem_type = 'regression' if pd.api.types.is_numeric_dtype(y) else 'classification'

        if problem_type == 'regression':
            model = RandomForestRegressor(
                n_estimators=num_trees,
                max_depth=depth,
                random_state=42
            )
        else:
            model = RandomForestClassifier(
                n_estimators=num_trees,
                max_depth=depth,
                random_state=42
            )

        model.fit(X, y)
        importances = model.feature_importances_
        indices = np.argsort(importances)[-num_features:]

        selected_features = X.columns[indices]
        reduced_X = X[selected_features]
        info["num_features_kept"] = num_features
        info["selected_features"] = list(selected_features)
        info["feature_importances"] = importances.tolist()
        info["model_params"] = model.get_params()

    elif method == "Principal Component Analysis":
        # Apply PCA
        num_features = method_config["num_of_features_to_keep"]

        if num_features <= 0:
            return X, {"method": method, "reason": "num_of_features_to_keep must be positive"}

        pca = PCA(n_components=num_features)
        reduced_X = pd.DataFrame(
            pca.fit_transform(X),
            columns=[f"PC_{i+1}" for i in range(num_features)],
            index=X.index
        )
        info["num_features_kept"] = num_features
        info["explained_variance_ratio"] = pca.explained_variance_ratio_.tolist()
        info["components"] = pca.components_.tolist()

    else:
        raise ValueError(f"Unknown feature reduction method: {method}")

    return reduced_X, info

# Example usage:
config = {
    "feature_reduction_method": "No Reduction",
    "No Reduction": {
        "is_selected": True,
        "num_of_features_to_keep": 5
    },
    "Correlation with target": {
        "is_selected": False,
        "num_of_features_to_keep": 0
    },
    "Tree-based": {
        "is_selected": False,
        "num_of_features_to_keep": 0,
        "depth_of_trees": 0,
        "num_of_trees": 0
    },
    "Principal Component Analysis": {
        "is_selected": False,
        "num_of_features_to_keep": 0
    }
}

# Assuming X is your feature DataFrame and y is your target Series
# reduced_X, reduction_info = apply_feature_reduction(X, y, config)


**4) Parse the Json and make the model objects (using sklean) that can handle what is required in the “prediction_type” specified in the JSON (See #1 where “prediction_type” is specified). Keep in mind not to pick models that don’t apply for the prediction_type.**

In [8]:
from sklearn.linear_model import LogisticRegression
import numpy as np

def create_logistic_regression_model(config, prediction_type):
    """
    Create a LogisticRegression model based on the provided configuration and prediction type.

    Parameters:
    config (dict): Configuration dictionary for Logistic Regression
    prediction_type (str): Either "classification" or "regression"

    Returns:
    tuple: (model object, message) - message indicates any warnings or adjustments made
    """
    if not config["is_selected"]:
        return None, "Model is not selected in configuration"

    if prediction_type != "classification":
        return None, f"LogisticRegression is not suitable for {prediction_type} (only for classification)"

    # Extract parameters from config with defaults
    params = {
        "n_jobs": config.get("parallelism", 1),
        "max_iter": np.random.randint(config.get("min_iter", 30),
                                     config.get("max_iter", 50)),
        "C": np.random.uniform(config.get("min_reparam", 0.5),
                               config.get("max_reparam", 0.8)),
        "l1_ratio": np.random.uniform(config.get("min_elasticnet", 0.5),
                                     config.get("max_elasticnet", 0.8)),
        "solver": 'saga',  # Only solver that supports elasticnet
        "penalty": 'elasticnet',
        "random_state": 42
    }

    try:
        model = LogisticRegression(**params)
        message = "LogisticRegression model created successfully"
        return model, message
    except Exception as e:
        return None, f"Error creating LogisticRegression model: {str(e)}"

# Example usage:
logistic_config = {
    "model_name": "LogisticRegression",
    "is_selected": True,
    "parallelism": 2,
    "min_iter": 30,
    "max_iter": 50,
    "min_reparam": 0.5,
    "max_reparam": 0.8,
    "min_elasticnet": 0.5,
    "max_elasticnet": 0.8
}

# For classification problem
model, message = create_logistic_regression_model(logistic_config, "classification")
print(message)
if model:
    print(model.get_params())

# For regression problem (will return None)
model, message = create_logistic_regression_model(logistic_config, "regression")
print(message)


LogisticRegression model created successfully
{'C': 0.6631697308861249, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': 0.6713760996167615, 'max_iter': 36, 'multi_class': 'deprecated', 'n_jobs': 2, 'penalty': 'elasticnet', 'random_state': 42, 'solver': 'saga', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
LogisticRegression is not suitable for regression (only for classification)



**Run the fit and predict on each model – keep in mind that you need to do hyper parameter tuning i.e., use GridSearchCV**


Parses the model configuration (including hyperparameter ranges)

Performs hyperparameter tuning using GridSearchCV

Fits the best model

Makes predictions

Handles both classification and regression tasks appropriate.

In [9]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.preprocessing import StandardScaler

def create_and_tune_logistic_regression(X_train, y_train, X_test, y_test, config, prediction_type):
    """
    Creates, tunes, fits, and evaluates a LogisticRegression model using GridSearchCV

    Parameters:
    X_train, y_train: Training data
    X_test, y_test: Test data (can be None if just want to fit)
    config: Model configuration dictionary
    prediction_type: 'classification' or 'regression'

    Returns:
    dict: Contains best model, predictions, scores, and tuning information
    """
    results = {
        'model_name': 'LogisticRegression',
        'best_model': None,
        'best_params': None,
        'best_score': None,
        'predictions': None,
        'test_score': None,
        'warning': None
    }

    # Validate prediction type
    if prediction_type != 'classification':
        results['warning'] = f"LogisticRegression not suitable for {prediction_type}, only classification"
        return results

    if not config['is_selected']:
        results['warning'] = "Model not selected in configuration"
        return results

    # Standardize features (important for regularized models)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    if X_test is not None:
        X_test_scaled = scaler.transform(X_test)

    # Create parameter grid for GridSearchCV
    param_grid = {
        'C': np.linspace(config['min_reparam'], config['max_reparam'], 5),
        'l1_ratio': np.linspace(config['min_elasticnet'], config['max_elasticnet'], 5),
        'penalty': ['elasticnet'],
        'solver': ['saga'],
        'max_iter': [config['min_iter'], config['max_iter']],
        'n_jobs': [config['parallelism']]
    }

    # Create and tune model
    try:
        grid_search = GridSearchCV(
            estimator=LogisticRegression(random_state=42),
            param_grid=param_grid,
            cv=5,
            scoring='accuracy',
            n_jobs=config['parallelism']
        )

        grid_search.fit(X_train_scaled, y_train)

        # Store results
        results['best_model'] = grid_search.best_estimator_
        results['best_params'] = grid_search.best_params_
        results['best_score'] = grid_search.best_score_

        # Make predictions if test data provided
        if X_test is not None and y_test is not None:
            y_pred = grid_search.predict(X_test_scaled)
            results['predictions'] = y_pred
            results['test_score'] = accuracy_score(y_test, y_pred)

    except Exception as e:
        results['warning'] = f"Model training failed: {str(e)}"

    return results

# Example usage:
logistic_config = {
    "model_name": "LogisticRegression",
    "is_selected": True,
    "parallelism": 2,
    "min_iter": 30,
    "max_iter": 50,
    "min_reparam": 0.5,
    "max_reparam": 0.8,
    "min_elasticnet": 0.5,
    "max_elasticnet": 0.8
}

# Assuming we have some data
# from sklearn.datasets import load_iris
# data = load_iris()
# X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.2)

# For demonstration, we'll create dummy data
np.random.seed(42)
X_train = np.random.rand(100, 4)
y_train = np.random.randint(0, 2, 100)
X_test = np.random.rand(20, 4)
y_test = np.random.randint(0, 2, 20)

# Run the tuning and prediction
results = create_and_tune_logistic_regression(
    X_train, y_train,
    X_test, y_test,
    logistic_config,
    'classification'
)

print("Best parameters:", results['best_params'])
print("CV accuracy:", results['best_score'])
print("Test accuracy:", results['test_score'])
print("Predictions:", results['predictions'])
if results['warning']:
    print("Warning:", results['warning'])



Best parameters: {'C': np.float64(0.5), 'l1_ratio': np.float64(0.7250000000000001), 'max_iter': 30, 'n_jobs': 2, 'penalty': 'elasticnet', 'solver': 'saga'}
CV accuracy: 0.5199999999999999
Test accuracy: 0.75
Predictions: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


**6) Log to the console the standard model metrics that apply**

In [10]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                            f1_score, roc_auc_score, confusion_matrix,
                            mean_squared_error, mean_absolute_error, r2_score)
from sklearn.preprocessing import StandardScaler, LabelBinarizer
import sys

def log_metrics(y_true, y_pred, y_proba=None, prediction_type='classification'):
    """Log appropriate metrics based on prediction type"""
    print("\n" + "="*50)
    print("MODEL EVALUATION METRICS")
    print("="*50)

    if prediction_type == 'classification':
        # Classification metrics
        print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")
        print(f"Precision (macro avg): {precision_score(y_true, y_pred, average='macro'):.4f}")
        print(f"Recall (macro avg): {recall_score(y_true, y_pred, average='macro'):.4f}")
        print(f"F1 Score (macro avg): {f1_score(y_true, y_pred, average='macro'):.4f}")

        if y_proba is not None:
            # Handle binary and   multiclass ROC AUC
            if len(np.unique(y_true)) == 2:
                print(f"ROC AUC: {roc_auc_score(y_true, y_proba[:, 1]):.4f}")
            else:
                # One-vs-rest approach for multiclass
                lb = LabelBinarizer()
                y_true_bin = lb.fit_transform(y_true)
                print(f"ROC AUC (ovr): {roc_auc_score(y_true_bin, y_proba, multi_class='ovr'):.4f}")

        # Confusion matrix
        print("\nConfusion Matrix:")
        print(confusion_matrix(y_true, y_pred))

    else:
        # Regression metrics
        print(f"Mean Squared Error: {mean_squared_error(y_true, y_pred):.4f}")
        print(f"Root Mean Squared Error: {np.sqrt(mean_squared_error(y_true, y_pred)):.4f}")
        print(f"Mean Absolute Error: {mean_absolute_error(y_true, y_pred):.4f}")
        print(f"R² Score: {r2_score(y_true, y_pred):.4f}")

    print("="*50 + "\n")

def create_and_tune_logistic_regression(X_train, y_train, X_test, y_test, config, prediction_type):
    """
    Creates, tunes, fits, and evaluates a LogisticRegression model using GridSearchCV
    with comprehensive metric logging
    """
    results = {
        'model_name': 'LogisticRegression',
        'best_model': None,
        'best_params': None,
        'best_score': None,
        'predictions': None,
        'test_score': None,
        'warning': None
    }

    # Checking prediction type
    if prediction_type != 'classification':
        results['warning'] = f"LogisticRegression not suitable for {prediction_type}, only classification"
        print(results['warning'])
        return results

    if not config['is_selected']:
        results['warning'] = "Model not selected in configuration"
        print(results['warning'])
        return results

    # Standardize features
    print("\nPreprocessing: Standardizing features...")
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    if X_test is not None:
        X_test_scaled = scaler.transform(X_test)

    # Create parameter grid
    param_grid = {
        'C': np.linspace(config['min_reparam'], config['max_reparam'], 5),
        'l1_ratio': np.linspace(config['min_elasticnet'], config['max_elasticnet'], 5),
        'penalty': ['elasticnet'],
        'solver': ['saga'],
        'max_iter': [config['min_iter'], config['max_iter']],
        'n_jobs': [config['parallelism']],
        'random_state': [42]
    }

    print("\nStarting GridSearchCV for Logistic Regression...")
    print(f"Parameter grid: {param_grid}")

    try:
        grid_search = GridSearchCV(
            estimator=LogisticRegression(),
            param_grid=param_grid,
            cv=5,
            scoring='accuracy',
            n_jobs=config['parallelism'],
            verbose=1
        )

        print("\nFitting model...")
        grid_search.fit(X_train_scaled, y_train)

        # Store results
        results['best_model'] = grid_search.best_estimator_
        results['best_params'] = grid_search.best_params_
        results['best_score'] = grid_search.best_score_

        print("\nBest parameters found:")
        for param, value in grid_search.best_params_.items():
            print(f"{param}: {value}")
        print(f"Cross-validated accuracy: {grid_search.best_score_:.4f}")

        # Make predictions if test data provided
        if X_test is not None and y_test is not None:
            print("\nEvaluating on test set...")
            y_pred = grid_search.predict(X_test_scaled)
            y_proba = grid_search.predict_proba(X_test_scaled) if hasattr(grid_search, 'predict_proba') else None

            results['predictions'] = y_pred
            results['test_score'] = accuracy_score(y_test, y_pred)

            # Log metrics
            log_metrics(y_test, y_pred, y_proba, prediction_type)

    except Exception as e:
        results['warning'] = f"Model training failed: {str(e)}"
        print(results['warning'], file=sys.stderr)

    return results

# Example usage
if __name__ == "__main__":
    logistic_config = {
        "model_name": "LogisticRegression",
        "is_selected": True,
        "parallelism": 2,
        "min_iter": 30,
        "max_iter": 50,
        "min_reparam": 0.5,
        "max_reparam": 0.8,
        "min_elasticnet": 0.5,
        "max_elasticnet": 0.8
    }

    # Create sample classification data
    from sklearn.datasets import make_classification
    from sklearn.model_selection import train_test_split

    X, y = make_classification(n_samples=1000, n_features=20, n_classes=3, n_informative=10, random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    print("="*50)
    print("STARTING LOGISTIC REGRESSION PIPELINE")
    print("="*50)

    results = create_and_tune_logistic_regression(
        X_train, y_train,
        X_test, y_test,
        logistic_config,
        'classification'
    )



STARTING LOGISTIC REGRESSION PIPELINE

Preprocessing: Standardizing features...

Starting GridSearchCV for Logistic Regression...
Parameter grid: {'C': array([0.5  , 0.575, 0.65 , 0.725, 0.8  ]), 'l1_ratio': array([0.5  , 0.575, 0.65 , 0.725, 0.8  ]), 'penalty': ['elasticnet'], 'solver': ['saga'], 'max_iter': [30, 50], 'n_jobs': [2], 'random_state': [42]}

Fitting model...
Fitting 5 folds for each of 50 candidates, totalling 250 fits

Best parameters found:
C: 0.5
l1_ratio: 0.575
max_iter: 30
n_jobs: 2
penalty: elasticnet
random_state: 42
solver: saga
Cross-validated accuracy: 0.6700

Evaluating on test set...

MODEL EVALUATION METRICS
Accuracy: 0.7400
Precision (macro avg): 0.7422
Recall (macro avg): 0.7392
F1 Score (macro avg): 0.7383
ROC AUC (ovr): 0.9003

Confusion Matrix:
[[42 10  9]
 [16 61  4]
 [13  0 45]]



