In [3]:
# Import necessary libraries
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

# Send every run started in this cell to the MLflow server at 127.0.0.1:5000
# (edit the URI to target a different tracking server).
mlflow.set_tracking_uri("http://127.0.0.1:5000")

# Read the dataset from data.csv: 'promotion' is the target column and every
# remaining column is used as a feature.
df = pd.read_csv('data.csv')
y = df['promotion']
X = df.drop(columns='promotion')

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardise the features: the scaler is fitted on the training rows only and
# the same fitted transform is then applied to both the training and test rows.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Candidate models, keyed by the display name used for each MLflow run.
#
# Every estimator that draws random numbers while fitting is given an explicit
# seed so that re-running the cell reproduces the same models and metrics.
# LogisticRegression is left unseeded because its default solver does not use
# random_state (see the scikit-learn documentation).
classifiers = {
    "Logistic Regression": LogisticRegression(C=1.0, max_iter=100),
    "Decision Tree": DecisionTreeClassifier(max_depth=5, min_samples_split=10, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42),
    # probability=True makes predict_proba available, which the ROC-AUC
    # computation relies on; the probability calibration is seeded as well.
    "SVM": SVC(C=1.0, kernel='rbf', probability=True, random_state=42),
}

# Group every run below under a single MLflow experiment.
mlflow.set_experiment("promotion_prediction")

# The two hyper-parameters recorded for each model, keyed by the names used in
# `classifiers`. A model without an entry here simply logs no parameters.
LOGGED_PARAMS = {
    "Logistic Regression": ("C", "max_iter"),
    "Decision Tree": ("max_depth", "min_samples_split"),
    "Random Forest": ("n_estimators", "max_depth"),
    "Gradient Boosting": ("n_estimators", "learning_rate"),
    "SVM": ("C", "kernel"),
}

# One MLflow run per classifier: log its key parameters, fit it, score it on
# the train and test splits, then store the metrics and the fitted model.
for name, clf in classifiers.items():
    with mlflow.start_run(run_name=name):

        # Read the selected hyper-parameters straight off the estimator.
        params = {key: getattr(clf, key) for key in LOGGED_PARAMS.get(name, ())}
        if params:
            mlflow.log_params(params)

        # Train the model
        clf.fit(X_train, y_train)

        # Predict each split exactly once and reuse the labels for every
        # label-based metric instead of re-running inference per metric.
        y_train_pred = clf.predict(X_train)
        y_test_pred = clf.predict(X_test)

        # Column 1 of predict_proba is used as the positive-class score for ROC-AUC.
        # NOTE(review): this and f1_score's default averaging assume a binary
        # 'promotion' target — confirm against the data.
        y_train_prob = clf.predict_proba(X_train)[:, 1]
        y_test_prob = clf.predict_proba(X_test)[:, 1]

        # Calculate metrics for train and test sets
        auc_train = roc_auc_score(y_train, y_train_prob)
        auc_test = roc_auc_score(y_test, y_test_prob)
        f1_train = f1_score(y_train, y_train_pred)
        f1_test = f1_score(y_test, y_test_pred)
        accuracy_test = accuracy_score(y_test, y_test_pred)

        # Send all metrics to the tracking server in one batched call.
        mlflow.log_metrics({
            "train_auc": auc_train,
            "test_auc": auc_test,
            "train_f1": f1_train,
            "test_f1": f1_test,
            "test_accuracy": accuracy_test,
        })

        # Store the fitted estimator as a run artifact under "model".
        mlflow.sklearn.log_model(clf, "model")


2024/10/29 11:01:36 INFO mlflow.tracking.fluent: Experiment with name 'promotion_prediction' does not exist. Creating a new experiment.
2024/10/29 11:01:40 INFO mlflow.tracking._tracking_service.client: 🏃 View run Logistic Regression at: http://127.0.0.1:5000/#/experiments/479887071395289261/runs/51a9c6647692467b8b1654e70d67fd04.
2024/10/29 11:01:40 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/479887071395289261.
2024/10/29 11:01:45 INFO mlflow.tracking._tracking_service.client: 🏃 View run Decision Tree at: http://127.0.0.1:5000/#/experiments/479887071395289261/runs/961e59d23f8d4267aeb95881ac5f1842.
2024/10/29 11:01:45 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/479887071395289261.
2024/10/29 11:01:50 INFO mlflow.tracking._tracking_service.client: 🏃 View run Random Forest at: http://127.0.0.1:5000/#/experiments/479887071395289261/runs/bec6ec6394a8473ba2c665a4e6e3a0e9

In [4]:
# Import necessary libraries
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

# Set MLflow tracking URI to your DagsHub repository
#mlflow.set_tracking_uri("https://dagshub.com/jenishaedwin2003/promotion_pred_mlflow.mlflow")  # Replace with your DagsHub repo URL

# Read the dataset from data.csv: 'promotion' is the target column and every
# remaining column is used as a feature.
df = pd.read_csv('data.csv')
y = df['promotion']
X = df.drop(columns='promotion')

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardise the features: the scaler is fitted on the training rows only and
# the same fitted transform is then applied to both the training and test rows.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Candidate models, keyed by the display name used for each MLflow run.
#
# Every estimator that draws random numbers while fitting is given an explicit
# seed so that re-running the cell reproduces the same models and metrics.
# LogisticRegression is left unseeded because its default solver does not use
# random_state (see the scikit-learn documentation).
classifiers = {
    "Logistic Regression": LogisticRegression(C=1.0, max_iter=100),
    "Decision Tree": DecisionTreeClassifier(max_depth=5, min_samples_split=10, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42),
    # probability=True makes predict_proba available, which the ROC-AUC
    # computation relies on; the probability calibration is seeded as well.
    "SVM": SVC(C=1.0, kernel='rbf', probability=True, random_state=42),
}
import os
from getpass import getpass

import dagshub

# Connect this session to the DagsHub repository with MLflow integration enabled.
dagshub.init(repo_owner='jenishaedwin2003', repo_name='promotion_pred_mlflow', mlflow=True)

# SECURITY: the access token must never be written into the notebook source.
# It is taken from MLFLOW_TRACKING_PASSWORD when that variable is already set;
# otherwise it is requested interactively so it never lands in the saved file.
# NOTE(review): a token was previously hard-coded here and must be treated as
# leaked — revoke it and issue a new one.
os.environ['MLFLOW_TRACKING_USERNAME'] = 'jenishaedwin2003'
if not os.environ.get('MLFLOW_TRACKING_PASSWORD'):
    os.environ['MLFLOW_TRACKING_PASSWORD'] = getpass('DagsHub access token: ')
os.environ['MLFLOW_TRACKING_URI'] = 'https://dagshub.com/jenishaedwin2003/promotion_pred_mlflow.mlflow'
# Group every run below under a single MLflow experiment.
mlflow.set_experiment("promotion_prediction")

# The two hyper-parameters recorded for each model, keyed by the names used in
# `classifiers`. A model without an entry here simply logs no parameters.
LOGGED_PARAMS = {
    "Logistic Regression": ("C", "max_iter"),
    "Decision Tree": ("max_depth", "min_samples_split"),
    "Random Forest": ("n_estimators", "max_depth"),
    "Gradient Boosting": ("n_estimators", "learning_rate"),
    "SVM": ("C", "kernel"),
}

# One MLflow run per classifier: log its key parameters, fit it, score it on
# the train and test splits, then store the metrics and the fitted model.
for name, clf in classifiers.items():
    with mlflow.start_run(run_name=name):

        # Read the selected hyper-parameters straight off the estimator.
        params = {key: getattr(clf, key) for key in LOGGED_PARAMS.get(name, ())}
        if params:
            mlflow.log_params(params)

        # Train the model
        clf.fit(X_train, y_train)

        # Predict each split exactly once and reuse the labels for every
        # label-based metric instead of re-running inference per metric.
        y_train_pred = clf.predict(X_train)
        y_test_pred = clf.predict(X_test)

        # Column 1 of predict_proba is used as the positive-class score for ROC-AUC.
        # NOTE(review): this and f1_score's default averaging assume a binary
        # 'promotion' target — confirm against the data.
        y_train_prob = clf.predict_proba(X_train)[:, 1]
        y_test_prob = clf.predict_proba(X_test)[:, 1]

        # Calculate metrics
        auc_train = roc_auc_score(y_train, y_train_prob)
        auc_test = roc_auc_score(y_test, y_test_prob)
        f1_train = f1_score(y_train, y_train_pred)
        f1_test = f1_score(y_test, y_test_pred)
        accuracy_test = accuracy_score(y_test, y_test_pred)

        # Send all metrics in one batched call; this matters more here because
        # the tracking server is remote and each call is a network round trip.
        mlflow.log_metrics({
            "train_auc": auc_train,
            "test_auc": auc_test,
            "train_f1": f1_train,
            "test_f1": f1_test,
            "test_accuracy": accuracy_test,
        })

        # Store the fitted estimator as a run artifact under "model".
        mlflow.sklearn.log_model(clf, "model")


2024/10/29 11:33:55 INFO mlflow.tracking.fluent: Experiment with name 'promotion_prediction' does not exist. Creating a new experiment.
2024/10/29 11:34:21 INFO mlflow.tracking._tracking_service.client: 🏃 View run Logistic Regression at: https://dagshub.com/jenishaedwin2003/promotion_pred_mlflow.mlflow/#/experiments/0/runs/79d74f42cdcb4f7aa9f90e8a03f1d596.
2024/10/29 11:34:21 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/jenishaedwin2003/promotion_pred_mlflow.mlflow/#/experiments/0.
2024/10/29 11:34:34 INFO mlflow.tracking._tracking_service.client: 🏃 View run Decision Tree at: https://dagshub.com/jenishaedwin2003/promotion_pred_mlflow.mlflow/#/experiments/0/runs/85991c7c489747118caa512c2e74c0d5.
2024/10/29 11:34:34 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/jenishaedwin2003/promotion_pred_mlflow.mlflow/#/experiments/0.
2024/10/29 11:34:48 INFO mlflow.tracking._tracking_service.client: 🏃 View run Ra