In [1]:
%run data_prep

In [4]:
import data_prep

In [7]:
from data_prep import X_train_1, X_test_1, y_train_1, y_test_1, X_train_2, X_test_2, y_train_2, y_test_2

In [None]:
import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

model_name = 'Logistic Regression'


def _fit_and_log_logreg(estimator, X_train, X_test, y_train, y_test, run_name, artifact_path):
    """Fit ``estimator``, score it on the held-out split, and log the result to MLflow.

    A single MLflow run named ``run_name`` is opened. Accuracy and F1
    (``f1_score`` with its default arguments) computed on ``(X_test, y_test)``
    are logged as metrics, and the fitted estimator is logged as a
    scikit-learn model under ``artifact_path``.

    Args:
        estimator: scikit-learn classifier; it is fitted in place.
        X_train: Training features. Must support ``.head()`` because its
            first five rows are logged as the model's input example.
        X_test: Held-out features used for prediction and for the signature.
        y_train: Training labels.
        y_test: Held-out labels used for scoring.
        run_name: Name given to the MLflow run.
        artifact_path: Artifact path the model is logged under.

    Returns:
        Tuple ``(y_pred, accuracy, f1, signature)``.
    """
    with mlflow.start_run(run_name=run_name):
        estimator.fit(X_train, y_train)
        y_pred = estimator.predict(X_test)

        # Log metrics
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("f1_score", f1)

        # Log the model. The signature pairs X_test with the predictions that
        # were actually produced from it, so inputs and outputs describe the
        # same rows (previously X_train was paired with test-set predictions).
        signature = infer_signature(X_test, y_pred)
        mlflow.sklearn.log_model(
            estimator, artifact_path,
            signature=signature,
            input_example=X_train.head(5)
        )

    return y_pred, accuracy, f1, signature


# Dataset 1 - fraud data
# NOTE(review): the artifact paths below have no separator between the model
# name and the dataset suffix ("Logistic Regressionfraud_data"). They are left
# unchanged because existing consumers may reference them - confirm before renaming.
model = LogisticRegression(class_weight='balanced')
y_pred_1, accuracy_1, f1_1, signature_1 = _fit_and_log_logreg(
    model, X_train_1, X_test_1, y_train_1, y_test_1,
    run_name=f"{model_name}_fraud_data",
    artifact_path=model_name + "fraud_data",
)
print(f"{model_name} fraud_Data: Accuracy = {accuracy_1}, F1 Score = {f1_1}")

# Dataset 2 - credit data
# This is NOT fine-tuning: LogisticRegression.fit() re-trains from scratch
# (warm_start is off by default), so nothing learned on dataset 1 carries over.
# A fresh estimator is created to make that independence explicit; as before,
# `model` ends up referring to the dataset-2 classifier.
model = LogisticRegression(class_weight='balanced')
y_pred_2, accuracy_2, f1_2, signature_2 = _fit_and_log_logreg(
    model, X_train_2, X_test_2, y_train_2, y_test_2,
    run_name=f"{model_name}_dataset2",
    artifact_path=model_name + "credit",
)
print(f"{model_name} credit : Accuracy = {accuracy_2}, F1 Score = {f1_2}")


In [None]:
import mlflow
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, f1_score
from mlflow.models.signature import infer_signature

def train_and_log_decision_tree(X, y, dataset_name):
    """Tune a decision tree with grid search, evaluate it, and log it to MLflow.

    The data is split 70/30 (``random_state=42``). A ``DecisionTreeClassifier``
    is tuned with 5-fold ``GridSearchCV`` using the ``'f1'`` scorer, and the
    best estimator is scored on the held-out 30%. Accuracy, F1, the winning
    hyper-parameters and the fitted model are logged to one MLflow run named
    ``'Decision Tree on {dataset_name}'``.

    Args:
        X: Feature table. Must support ``.head()`` after splitting because the
            first five training rows are logged as the model's input example.
        y: Labels aligned with ``X``.
        dataset_name: Human-readable dataset label used in the printed
            summary, the MLflow run name and the model artifact path.

    Returns:
        None. Results are printed and logged to MLflow.
    """
    # Split dataset into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Define parameter grid for fine-tuning.
    # 'auto' was removed from max_features in scikit-learn 1.3 (deprecated in
    # 1.1) and makes every fit using it fail on current versions. For
    # classifiers it was an alias of 'sqrt', so dropping it keeps the search
    # space unchanged.
    param_grid = {
        'max_depth': [5, 10, 15, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2']
    }

    # Initialize model and GridSearchCV for tuning
    tree_model = DecisionTreeClassifier(random_state=42)
    grid_search = GridSearchCV(estimator=tree_model, param_grid=param_grid, cv=5, scoring='f1', n_jobs=-1)

    # Train and tune the model
    grid_search.fit(X_train, y_train)
    best_tree_model = grid_search.best_estimator_
    y_pred = best_tree_model.predict(X_test)

    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    print(f"Dataset: {dataset_name} | Accuracy: {accuracy}, F1 Score: {f1}")

    # Log the model and metrics with MLflow
    with mlflow.start_run(run_name=f'Decision Tree on {dataset_name}'):
        # Record which hyper-parameters won so the run can be reproduced.
        mlflow.log_params(grid_search.best_params_)
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("f1_score", f1)

        # Infer model signature from X_test and the predictions made on it, so
        # inputs and outputs describe the same rows (previously X_train was
        # paired with test-set predictions).
        signature = infer_signature(X_test, y_pred)
        mlflow.sklearn.log_model(
            best_tree_model, f"Decision Tree - {dataset_name}",
            signature=signature,
            input_example=X_train.head(5)
        )

# Run the decision-tree pipeline once per dataset.
# NOTE(review): X1/y1 and X2/y2 are not assigned in any visible cell - presumably
# they are injected by `%run data_prep`; confirm, otherwise this raises NameError
# on a fresh kernel.
for features, labels, dataset_label in (
    (X1, y1, "Dataset 1"),
    (X2, y2, "Dataset 2"),
):
    train_and_log_decision_tree(features, labels, dataset_label)
