In [4]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.metrics import accuracy_score, classification_report
import mlflow
import mlflow.sklearn

# Assemble the Iris measurements and their class labels into one DataFrame.
# The label is appended as the last column, named "target"; because it is
# stacked together with the float measurements it is stored as a float too.
iris = load_iris()
data = np.column_stack((iris.data, iris.target))
columns = np.array([*iris.feature_names, "target"])
df = pd.DataFrame(data=data, columns=columns)

def automated_pipeline(df, target_variable, features_to_predict, n_top_features=20):
    """Train, tune and evaluate a Random Forest classifier and log the run to MLflow.

    Steps performed, in order:
      1. Select the requested feature columns plus the target from ``df``.
      2. Drop rows whose target is missing; impute missing predictors
         ('OTHER' for object-dtype columns, 0 for all other columns).
      3. One-hot encode object-dtype predictors.
      4. Hold out 20% of the rows as a test set (``random_state=42``).
      5. Rank features with a baseline Random Forest fitted on the training
         split and keep the ``n_top_features`` most important ones.
      6. Apply ``StandardScaler`` followed by ``RobustScaler`` (both fitted on
         the training split only).
      7. Tune a Random Forest with ``RandomizedSearchCV`` (10 candidates,
         5-fold stratified CV, accuracy scoring).
      8. Log parameters, best hyperparameters and test accuracy to MLflow and
         print a summary including the classification report.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data containing every column named in ``features_to_predict``
        and ``target_variable``. It is not modified.
    target_variable : str
        Name of the label column.
    features_to_predict : sequence of str
        Names of the predictor columns (any sequence; a list is not required).
    n_top_features : int, default 20
        Maximum number of features, ranked by importance, passed on to the
        tuned model.

    Returns
    -------
    None
        Results are printed and logged to MLflow.

    Raises
    ------
    ValueError
        If ``n_top_features`` is smaller than 1.
    KeyError
        If a requested column is not present in ``df``.
    """
    if n_top_features < 1:
        raise ValueError("n_top_features must be at least 1")

    # Accept tuples / Index objects as well as lists
    feature_names = list(features_to_predict)

    # Work on an explicit copy: the fill steps below assign into this frame,
    # and assigning into a plain slice of ``df`` triggers pandas'
    # SettingWithCopyWarning (and may or may not touch the caller's data).
    df_model = df[feature_names + [target_variable]].copy()

    # Rows without a label cannot be used for supervised learning. Imputing
    # the target (previously filled with 0) would silently invent class-0 rows.
    df_model = df_model.dropna(subset=[target_variable])

    # Split the predictors by dtype; the target is deliberately excluded
    cat_feature = [name for name in feature_names if df_model[name].dtype == 'O']
    numeric_features = [name for name in feature_names if df_model[name].dtype != 'O']

    # Handle missing values in categorical and numeric predictors
    if cat_feature:
        df_model[cat_feature] = df_model[cat_feature].fillna('OTHER')
    if numeric_features:
        df_model[numeric_features] = df_model[numeric_features].fillna(0)

    X = df_model.drop(target_variable, axis=1)
    y = df_model[target_variable]

    # The estimators below need numeric input, so string-valued predictors are
    # one-hot encoded. Encoding before the split keeps train and test columns
    # identical.
    if cat_feature:
        X = pd.get_dummies(X, columns=cat_feature, dtype=float)

    # Step 2: Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.2,
                                                        random_state=42)

    # Step 3: Feature selection using Random Forest (fitted on training data
    # only, so the held-out rows do not influence which features are kept)
    rf_model = RandomForestClassifier(random_state=42)
    rf_model.fit(X_train, y_train)

    # Get feature importances, most important first
    feature_importances = pd.DataFrame(rf_model.feature_importances_,
                                       index=X_train.columns,
                                       columns=['importance']).sort_values('importance', ascending=False)

    # Select the top-ranked features (all of them when fewer are available)
    top_features = feature_importances.head(n_top_features).index.tolist()

    # Filter data with top features
    X_train_selected = X_train[top_features]
    X_test_selected = X_test[top_features]

    # Step 4: Standard Scaling
    # NOTE(review): tree ensembles are largely insensitive to per-feature
    # rescaling, so steps 4 and 5 are probably redundant; they are kept to
    # preserve the original pipeline. TODO confirm and simplify.
    scaler_standard = StandardScaler()
    X_train_standard = scaler_standard.fit_transform(X_train_selected)
    X_test_standard = scaler_standard.transform(X_test_selected)

    # Step 5: Robust Scaling (applied on top of the standard-scaled values)
    scaler_robust = RobustScaler()
    X_train_robust = scaler_robust.fit_transform(X_train_standard)
    X_test_robust = scaler_robust.transform(X_test_standard)

    # Step 6: Build a Random Forest model with hyperparameter tuning
    param_dist = {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }

    rf_model_standard = RandomForestClassifier(random_state=42)

    # Using RandomizedSearchCV for hyperparameter tuning
    random_search = RandomizedSearchCV(
        rf_model_standard,
        param_distributions=param_dist,
        n_iter=10,
        cv=StratifiedKFold(n_splits=5),
        scoring='accuracy',
        random_state=42,
        n_jobs=-1
    )

    # Everything inside the run context is recorded against one MLflow run
    with mlflow.start_run():
        mlflow.log_param("features_to_predict", features_to_predict)
        mlflow.log_param("target_variable", target_variable)

        # Train the model
        random_search.fit(X_train_robust, y_train)

        # Log the winning hyperparameters to MLflow
        mlflow.log_params(random_search.best_params_)

        # Make predictions with the best estimator found by the search
        y_pred_standard = random_search.predict(X_test_robust)

        # Step 7: Evaluate the model and log metrics to MLflow
        accuracy = accuracy_score(y_test, y_pred_standard)
        mlflow.log_metric("accuracy", accuracy)
        print("Random Forest Model with Hyperparameter Tuning:")
        print(f"Top {n_top_features} Features:", top_features)
        print("Best Hyperparameters:", random_search.best_params_)
        print("Accuracy:", accuracy)
        print("Classification Report:\n", classification_report(y_test, y_pred_standard))

# Example usage: predict the Iris class label from all four measurements.
target_variable = 'target'
features_to_predict = [
    'sepal length (cm)',
    'sepal width (cm)',
    'petal length (cm)',
    'petal width (cm)',
]

# Run the pipeline end to end on the Iris frame built above
automated_pipeline(
    df,
    target_variable=target_variable,
    features_to_predict=features_to_predict,
)


Random Forest Model with Hyperparameter Tuning:
Top 20 Features: ['petal length (cm)', 'petal width (cm)', 'sepal length (cm)', 'sepal width (cm)']
Best Hyperparameters: {'n_estimators': 50, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 10}
Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        10
         1.0       1.00      1.00      1.00         9
         2.0       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30

