In [21]:
ls -lt $(conda info --base)/envs/myenv/lib/python*/site-packages/notebook

658.29s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


zsh:1: no matches found: /Users/derekdewald/anaconda3/envs/myenv/lib/python*/site-packages/notebook


In [None]:
def get_param_grid(model_name):
    """
    Automatically assigns reasonable hyperparameter ranges based on model name.
    """
    grids = {
        "RandomForestRegressor": {
            "n_estimators": [10, 50, 100],
            "max_depth": [None, 10, 20],
            "min_samples_split": [2, 5]
        },
        "LinearRegression": {
            "fit_intercept": [True, False]
        },
        "DecisionTreeRegressor": {
            "max_depth": [None, 5, 10, 20],
            "min_samples_split": [2, 5, 10]
        },
        "SVR": {
            "C": [0.1, 1, 10],
            "kernel": ["linear", "poly", "rbf"]
        },
        "GradientBoostingRegressor": {
            "n_estimators": [50, 100, 200],
            "learning_rate": [0.01, 0.1, 0.2],
            "max_depth": [3, 5, 10]
        },
        "KNeighborsRegressor": {
            "n_neighbors": [3, 5, 10],
            "weights": ["uniform", "distance"],
            "metric": ["euclidean", "manhattan"]
        },
        "Ridge": {
            "alpha": [0.1, 1, 10, 100]
        },
        "Lasso": {
            "alpha": [0.01, 0.1, 1, 10]
        },
        "ElasticNet": {
            "alpha": [0.01, 0.1, 1, 10],
            "l1_ratio": [0.1, 0.5, 0.9]
        }
    }
    
    return grids.get(model_name, None)  # Returns None if model not found

In [None]:
import inspect


def run_model(model_class, X_train, X_test, y_train, y_test, model_name, ml_model_type, **kwargs):
    """
    Initializes and runs a given model with only the relevant hyperparameters.
    
    Args:
        model_class: The ML model class to be instantiated.
        X_train, X_test, y_train, y_test: Train-test datasets.
        model_name (str): Name of the model.
        ml_model_type (str): "classifier" or "regressor".
        **kwargs: All potential hyperparameters. Only relevant ones will be passed.

    Returns:
        dict: A dictionary containing model name, metric, runtime, and best parameters.
    """
    start_time = time.time()

    # Get only the relevant parameters for this model
    model_params = inspect.signature(model_class).parameters
    filtered_kwargs = {k: v for k, v in kwargs.items() if k in model_params}

    # Initialize model with relevant hyperparameters
    model = model_class(**filtered_kwargs)

    # Train model
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Evaluate performance
    if ml_model_type == "classifier":
        metric = accuracy_score(y_test, y_pred)
    else:
        metric = mean_squared_error(y_test, y_pred) ** 0.5  # RMSE

    # Log everything in MLflow
    with mlflow.start_run(run_name=model_name):
        mlflow.log_metric("RMSE" if ml_model_type == "regressor" else "Accuracy", metric)
        mlflow.log_params(filtered_kwargs)
        mlflow.sklearn.log_model(model, "model")

    return {
        "Model": model_name,
        "Metric": metric,
        "Time (s)": round(time.time() - start_time, 2),
        "Used Params": filtered_kwargs
    }

In [64]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Example model and hyperparameter search
model = RandomForestClassifier()
param_grid = {'n_estimators': [10, 50, 100]}

grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Extract the chosen model's class path
chosen_model_class = str(type(grid_search.best_estimator_))
print(f"Chosen Model: {chosen_model_class}")

# Match it with the primary key in our DataFrame
df_estimators = GenerateSKLearnModelList()
selected_model_info = df_estimators[df_estimators['Primary Key'] == chosen_model_class]
print(selected_model_info)


Unnamed: 0,Model Name,Estimator Class,Part_2,Part_3,Part_4,Part_5
0,ARDRegression,<class 'sklearn.linear_model._bayes.ARDRegress...,linear_model,_bayes,ARDRegression'>,
1,AdaBoostClassifier,<class 'sklearn.ensemble._weight_boosting.AdaB...,ensemble,_weight_boosting,AdaBoostClassifier'>,
2,AdaBoostRegressor,<class 'sklearn.ensemble._weight_boosting.AdaB...,ensemble,_weight_boosting,AdaBoostRegressor'>,
3,AdditiveChi2Sampler,<class 'sklearn.kernel_approximation.AdditiveC...,kernel_approximation,AdditiveChi2Sampler'>,,
4,AffinityPropagation,<class 'sklearn.cluster._affinity_propagation....,cluster,_affinity_propagation,AffinityPropagation'>,
...,...,...,...,...,...,...
202,TunedThresholdClassifierCV,<class 'sklearn.model_selection._classificatio...,model_selection,_classification_threshold,TunedThresholdClassifierCV'>,
203,TweedieRegressor,<class 'sklearn.linear_model._glm.glm.TweedieR...,linear_model,_glm,glm,TweedieRegressor'>
204,VarianceThreshold,<class 'sklearn.feature_selection._variance_th...,feature_selection,_variance_threshold,VarianceThreshold'>,
205,VotingClassifier,<class 'sklearn.ensemble._voting.VotingClassif...,ensemble,_voting,VotingClassifier'>,


In [34]:
import pandas as pd
import numpy as np

import sys
sys.path.append("/Users/derekdewald/Documents/Python/Github_Repo/d_py_functions")

from MLPipeline import apply_scaling


Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,Target
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204,75.0
2,0.085299,0.05068,0.044451,-0.00567,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.02593,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641,135.0


In [65]:
all_estimators

<function sklearn.utils.discovery.all_estimators(type_filter=None)>

In [49]:
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.model_selection import train_test_split

import mlflow
import time

from sklearn import datasets
diabetes = datasets.load_diabetes()
df = pd.DataFrame(diabetes['data'],columns=diabetes['feature_names'])
df = pd.concat([df,pd.DataFrame(diabetes['target'],columns=['Target'])],axis=1)
df.head()

def MLPipeline(df, 
               project_name,
               scaler,
               ml_model_type='regressor',
               target_column='Target',
               test_size=0.2):
    """
    Runs multiple ML algorithms, tracks results with MLflow, and saves models.

    Args:
        df (dataframe)
        project_name (str):
        scaler (str): None, normal,standard
        ml_model_type (str): Option to pronpt all_estimators as to what model type requested.
        classifier, regressor, cluster, transformer

    Returns:
        None (Results are logged in MLflow)
    """
    # Set up MLflow experiment
    mlflow.set_experiment(project_name)

    # Prepare data
    X = df.drop(columns=[target_column])
    y = df[target_column]

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
    
    # Apply Scaler as Necessary
    X_train, X_test = apply_scaling(X_train, X_test, scaler=scaler)
    
    # Get all available models
    model_list = all_estimators(type_filter=ml_model_type)
    
    results = []
    
    for name, model_class in model_list:
        try:
            model = model_class()
            start_time = time.time()

            with mlflow.start_run(run_name=name):  # Start MLflow run
                model.fit(X_train, y_train)  # Train model
                y_pred = model.predict(X_test)  # Predict

                # Evaluate performance
                if ml_model_type == "classifier":
                    metric = accuracy_score(y_test, y_pred)
                    mlflow.log_metric("Accuracy", metric)
                else:
                    metric = mean_squared_error(y_test, y_pred) ** 0.5  # RMSE manually computed
                    mlflow.log_metric("RMSE", metric)

                # Log model
                mlflow.sklearn.log_model(model, name)

                # Log metadata
                mlflow.log_param("Model", name)
                mlflow.log_param("Training Time", round(time.time() - start_time, 2))

                # Append results
                results.append({
                    "Model": name,
                    "Metric": metric,
                    "Time (s)": round(time.time() - start_time, 2)
                })

        except Exception as e:
            print(f"{name} failed: {str(e)}")  # Handle errors but continue

    # Convert results to DataFrame
    results_df = pd.DataFrame(results)
    results_df = results_df#.sort_values(by="Metric", ascending=(task_type == "regression"))

    return results_df
    
results_df = MLPipeline(df,'Test',None)



⚠️ CCA failed: `n_components` upper bound is 1. Got 2 instead. Reduce `n_components`.


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


⚠️ IsotonicRegression failed: Isotonic regression input X should be a 1d array or 2d array with 1 feature




⚠️ MultiOutputRegressor failed: MultiOutputRegressor.__init__() missing 1 required positional argument: 'estimator'
⚠️ MultiTaskElasticNet failed: For mono-task outputs, use ElasticNet
⚠️ MultiTaskElasticNetCV failed: For mono-task outputs, use ElasticNetCVCV
⚠️ MultiTaskLasso failed: For mono-task outputs, use ElasticNet
⚠️ MultiTaskLassoCV failed: For mono-task outputs, use LassoCVCV




⚠️ PLSCanonical failed: `n_components` upper bound is 1. Got 2 instead. Reduce `n_components`.




⚠️ RegressorChain failed: _BaseChain.__init__() missing 1 required positional argument: 'base_estimator'




⚠️ StackingRegressor failed: StackingRegressor.__init__() missing 1 required positional argument: 'estimators'




⚠️ VotingRegressor failed: VotingRegressor.__init__() missing 1 required positional argument: 'estimators'


In [51]:
mlflow.__version__

'2.18.0'

In [56]:
from mlflow.tracking import MlflowClient

client = MlflowClient()

experiments = client.search_experiments()  # Alternative method
for exp in experiments:
    print(f"Experiment ID: {exp.experiment_id}, Name: {exp.name}")

Experiment ID: 981125948236662726, Name: Test


In [60]:
import mlflow
import pandas as pd

experiment_id = "981125948236662726"  # Replace with your actual experiment ID

runs_df = mlflow.search_runs(experiment_ids=[experiment_id])
pd.set_option('display.max_columns', None)  # Show all columns
runs_df


Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.RMSE,metrics.Accuracy,params.Training Time,params.Model,tags.mlflow.source.name,tags.mlflow.user,tags.mlflow.log-model.history,tags.mlflow.runName,tags.mlflow.source.type
0,688051793a514e22883997274b642911,981125948236662726,FINISHED,file:///Users/derekdewald/Documents/Python/Git...,2025-02-25 07:00:11.929000+00:00,2025-02-25 07:00:12.863000+00:00,72.966869,,0.93,TweedieRegressor,/Users/derekdewald/anaconda3/envs/BaseRequirem...,derekdewald,"[{""run_id"": ""688051793a514e22883997274b642911""...",TweedieRegressor,LOCAL
1,8c89dcf05eb84423b5373e76350e9416,981125948236662726,FINISHED,file:///Users/derekdewald/Documents/Python/Git...,2025-02-25 07:00:11.001000+00:00,2025-02-25 07:00:11.928000+00:00,53.853446,,0.93,TransformedTargetRegressor,/Users/derekdewald/anaconda3/envs/BaseRequirem...,derekdewald,"[{""run_id"": ""8c89dcf05eb84423b5373e76350e9416""...",TransformedTargetRegressor,LOCAL
2,10df54fddafa4d47b1b5496b8443d934,981125948236662726,FINISHED,file:///Users/derekdewald/Documents/Python/Git...,2025-02-25 07:00:09.875000+00:00,2025-02-25 07:00:11+00:00,53.486031,,1.12,TheilSenRegressor,/Users/derekdewald/anaconda3/envs/BaseRequirem...,derekdewald,"[{""run_id"": ""10df54fddafa4d47b1b5496b8443d934""...",TheilSenRegressor,LOCAL
3,5f9960bf2aac4ee98f5ceabbe438d2a5,981125948236662726,FINISHED,file:///Users/derekdewald/Documents/Python/Git...,2025-02-25 07:00:08.944000+00:00,2025-02-25 07:00:09.874000+00:00,65.827699,,0.93,SVR,/Users/derekdewald/anaconda3/envs/BaseRequirem...,derekdewald,"[{""run_id"": ""5f9960bf2aac4ee98f5ceabbe438d2a5""...",SVR,LOCAL
4,210e9b03e0d840328bda9c3c7464104a,981125948236662726,FINISHED,file:///Users/derekdewald/Documents/Python/Git...,2025-02-25 07:00:08.027000+00:00,2025-02-25 07:00:08.943000+00:00,55.783767,,0.92,SGDRegressor,/Users/derekdewald/anaconda3/envs/BaseRequirem...,derekdewald,"[{""run_id"": ""210e9b03e0d840328bda9c3c7464104a""...",SGDRegressor,LOCAL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
301,ed3417208a6d469fbc7c5b37cd3bf1ba,981125948236662726,FAILED,file:///Users/derekdewald/Documents/Python/Git...,2025-02-25 06:46:46.506000+00:00,2025-02-25 06:46:46.509000+00:00,,,,,/Users/derekdewald/anaconda3/envs/BaseRequirem...,derekdewald,,CCA,LOCAL
302,8ad19c93e6cb4252936e5d6552410512,981125948236662726,FAILED,file:///Users/derekdewald/Documents/Python/Git...,2025-02-25 06:46:46.500000+00:00,2025-02-25 06:46:46.505000+00:00,,,,,/Users/derekdewald/anaconda3/envs/BaseRequirem...,derekdewald,,BayesianRidge,LOCAL
303,1d051b119f5d4796ad15f3124663ba2b,981125948236662726,FAILED,file:///Users/derekdewald/Documents/Python/Git...,2025-02-25 06:46:46.480000+00:00,2025-02-25 06:46:46.499000+00:00,,,,,/Users/derekdewald/anaconda3/envs/BaseRequirem...,derekdewald,,BaggingRegressor,LOCAL
304,3b29e8f5e0594504877a1c0095998e12,981125948236662726,FAILED,file:///Users/derekdewald/Documents/Python/Git...,2025-02-25 06:46:46.448000+00:00,2025-02-25 06:46:46.478000+00:00,,,,,/Users/derekdewald/anaconda3/envs/BaseRequirem...,derekdewald,,AdaBoostRegressor,LOCAL


In [61]:
from mlflow.tracking import MlflowClient

client = MlflowClient()
run_id = runs_df.iloc[0]["run_id"]  # Get the first run's ID
artifacts = client.list_artifacts(run_id)

print("Artifacts stored in MLflow:")
for artifact in artifacts:
    print(artifact.path)


Artifacts stored in MLflow:
TweedieRegressor


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import classification_report, mean_squared_error
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.utils import all_estimators


from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout


import matplotlib.pyplot as plt


In [None]:

# Example usage:


In [None]:

import mlflow.sklearn
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder

from sklearn.utils import all_estimators
import xgboost as xgb
import lightgbm as lgb
import catboost as cat

# Function to run ML pipeline with MLflow tracking


from 



    # Additional models
    extra_models = [
        ("XGBoost", xgb.XGBClassifier() if task_type == "classification" else xgb.XGBRegressor()),
        ("LightGBM", lgb.LGBMClassifier() if task_type == "classification" else lgb.LGBMRegressor()),
        ("CatBoost", cat.CatBoostClassifier(silent=True) if task_type == "classification" else cat.CatBoostRegressor(silent=True))
    ]

    results = []

   
    # Display results in a pandas table
    import ace_tools as tools
    tools.display_dataframe_to_user(name="ML Model Performance with MLflow", dataframe=results_df)

# Example Usage
df = pd.read_csv("path_to_your_dataset.csv")  # Load your dataset
run_ml_pipeline_with_mlflow(df, target_column="target_column_name", task_type="classification")


In [None]:
def MLManualPipeline(df,
                     X_Cols,
                     y_Col,
                     scaler='MinMaxScaler',
                     model_list=['Linear Regression'],
                     test_size=.3,
                     random_state=42):

    if len(X_Cols) == 0:
        X = np.array(df.drop(y_Col,axis=1).copy())
    else:
        X = np.array(df[X_Cols])
    
    y = df[y_Col]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    if scaler =='MinMaxScaler':
        scaler = MinMaxScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.fit_transform(X_test)

    if len(model_list)==0:
        return X_train,X_test,y_train,y_test
    
    else:
        for model in model_list:
            if model == 'Linear Regression':
                lr = LinearRegression()
                lr.fit(X_train, y_train)
                y_pred_lr = lr.predict(X_test)
                
            elif model =='Logistic Regression':
                logreg=LogisticRegression()
                logreg.fit(X_train, y_train)
                y_pred = logreg.predict(X_test)
                print(f"Logisitic Regression Model:\n{confusion_matrix(y_test, y_pred)}\n{classification_report(y_test, y_pred)})")

            elif model =='Random Forest':
                
                ############################################ ESTIMATORS
                rf = RandomForestClassifier(random_state=random_state, n_estimators=25)
                rf.fit(X_train, y_train)
                y_pred_rf = rf.predict(X_test)
                print(f"Random Forest with 25 Nodes?>?>?>:\n{confusion_matrix(y_test, y_pred_rf)}\n{classification_report(y_test, y_pred_rf)})")

MLManualPipeline(df=df3.drop(['MEMBERNBR','ATTRITION_FLAG_1M'],axis=1),
                 X_Cols="",
                 y_Col='ATTRITION_FLAG_2M',
                model_list=['Logistic Regression','Random Forest'])


### Grid Search

In [None]:
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'subsample': [0.8, 1.0],
}

# Grid Search
grid_search = GridSearchCV(
    estimator=xgb.XGBClassifier(eval_metric='logloss', use_label_encoder=False),
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    verbose=2,
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

# Best parameters and model
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

best_model = grid_search.best_estimator_



## LSTM

In [None]:
# Build LSTM model
model = Sequential([
    LSTM(64, input_shape=(X.shape[1], X.shape[2]), return_sequences=False),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')  # Binary classification
])

# Compile model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train model
history = model.fit(
    X_train, y_train,
    epochs=20,
    batch_size=32,
    validation_data=(X_test, y_test),
    verbose=1
)

# Evaluate model
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"Test Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}")

# Predict and display results
y_pred = model.predict(X_test)
y_pred_classes = (y_pred > 0.5).astype(int)
print("Predictions:", y_pred_classes.flatten())
print("Actual:", y_test)

# Plot accuracy and loss
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='Train Accuracy')
plt.plot(history.history['val_accuracy'], label='Val Accuracy')
plt.legend()
plt.title('Model Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')

plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Val Loss')
plt.legend()
plt.title('Model Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.tight_layout()
plt.show()

## Neural Network

In [None]:
model = keras.models.Sequential()

model.add(layers.Dense(8,activation='relu'))
model.add(layers.Dense(64, activation='tanh'))
model.add(layers.Dense(512, activation='tanh'))
model.add(layers.Dense(1024, activation='tanh'))
model.add(layers.Dense(2028, activation='tanh'))
model.add(layers.Dense(512, activation='tanh'))
model.add(layers.Dense(512, activation='tanh'))
model.add(layers.Dense(128, activation='tanh'))
model.add(layers.Dense(32, activation='tanh'))
model.add(layers.Dense(1,activation='softmax'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['binary_crossentropy'])

# Train the model
model.fit(X_train.fillna(0), y_train, epochs=50, batch_size=1000, validation_split=0.1, verbose=1)



In [None]:
Feature crosses - add non linear relationships to linear model
Build up intuition gradually by slowly building model up.
Analyze Errors, does it make the mistakes that it’s making.
Accuracy and Error are dependent on a specific chosen threshold and are not differentiable. 
Convexity Good - No matter where you are, you will come to global minimum with following the right direction. 
Multinominal probability distribution - set of numbers equal to 1. 
Most Bias Model - Mean. Simple, Generalizes Well. Undercuts.
Models have noise, if you overfit the noise, it overfits. Variance.
Can perfectly fit n points with Polynomial of N-1. Pure Noise.

- What is your baseline

Output of a prediction, vs output of a bounding window, vs output which highlights area of interest.

Training Population and Usage Population
- NA Testing Group vs Asian Testing Group. Men vs Women. Etc..

Features - Need to generalize. Name of a car vs Engine, Age, Etc.
Turn Learning into a numerical optimization task. Computers really good at this. Opposed to Expert Logic.


Loss Function. 
Model is a function of the inputs. 
Loss is a function of the parameters of the model.
For MSE why a Parabolic Shape? - Squared Function.
Unlikely to generate a single Linear Function that perfectly fits all data points. Need to generate some super complex function
Positive Gradient - Reduce
Negative Gradient - Increase
Convergence / divergence
Greater the slope, the more we want to change

Global vs Local Minimum. No great solution.
Linear regression is convex, always get to global minima

When predicting a car, you need to find common features which generalize well.

Linear Model limitations
- Sensitive to size, color, orientation
- No concept of edges
- No concept of structure relationship between pixels

Edge detection
- Sharp change pixel intensity
- Spatial gradient
- Convolution