<a href="https://colab.research.google.com/github/diksha-139/MlFlow/blob/master/titanic_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install mlflow


Collecting mlflow
  Downloading mlflow-2.7.1-py3-none-any.whl (18.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.5/18.5 MB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
Collecting databricks-cli<1,>=0.8.7 (from mlflow)
  Downloading databricks_cli-0.18.0-py2.py3-none-any.whl (150 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m150.3/150.3 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
Collecting gitpython<4,>=2.1.0 (from mlflow)
  Downloading GitPython-3.1.40-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.6/190.6 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.12.0-py3-none-any.whl (226 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m226.0/226.0 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docker<7,>=4.0.0 (from mlflow)
  Downloading docker-6.1.3-py3-none-any.whl (148 kB)
[2K     [90m━━━━━━━━

In [None]:
import os
import warnings
import sys

import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from urllib.parse import urlparse
import numpy as np
import mlflow
import mlflow.sklearn

import logging

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)

# Input columns read from the feature-engineered training CSV, and the label column.
FEATURE_COLUMNS = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare_log", "Embarked"]
TARGET_COLUMN = "Survived"


def model_registry_kwargs(tracking_uri, registered_model_name):
    """Return the extra keyword arguments to pass to ``mlflow.sklearn.log_model``.

    The model registry does not work with a file store, so a registered model
    name is only supplied when the tracking URI scheme is something other
    than ``file``.

    Args:
        tracking_uri: Tracking URI string, e.g. from ``mlflow.get_tracking_uri()``.
        registered_model_name: Name to register the model under when possible.

    Returns:
        ``{"registered_model_name": registered_model_name}`` for non-file
        stores, otherwise an empty dict.
    """
    if urlparse(tracking_uri).scheme != "file":
        return {"registered_model_name": registered_model_name}
    return {}


def evaluate_and_log_run(model, params, label, run_name, registered_model_name,
                         train_x, train_y, test_x, test_y):
    """Fit ``model``, evaluate it on the held-out split and record one MLflow run.

    Args:
        model: Unfitted estimator exposing ``fit``, ``predict`` and ``score``.
            It must already be constructed with the hyperparameters in ``params``.
        params: Mapping of hyperparameter name to value; printed and logged as
            MLflow params in iteration order.
        label: Human-readable model name used as the prefix of the printed
            hyperparameter lines.
        run_name: Name given to the MLflow run.
        registered_model_name: Registry name used when the tracking store
            supports the model registry.
        train_x, train_y: Training features and labels.
        test_x, test_y: Held-out features and labels.

    Returns:
        Dict with the ``score``, ``accuracy``, ``precision`` and ``recall``
        values that were logged.
    """
    with mlflow.start_run(run_name=run_name):
        model.fit(train_x, train_y)
        y_pred = model.predict(test_x)

        # Named run_metrics (not metrics) so the sklearn ``metrics`` module is not shadowed.
        run_metrics = {
            "score": model.score(test_x, test_y),
            "accuracy": metrics.accuracy_score(test_y, y_pred),
            "precision": metrics.precision_score(test_y, y_pred),
            "recall": metrics.recall_score(test_y, y_pred),
        }

        for param_name, param_value in params.items():
            print("{} ({}={}):".format(label, param_name, param_value))
        for metric_name, metric_value in run_metrics.items():
            print("{} ({}={}):".format(metric_name.capitalize(), metric_name, metric_value))

        for param_name, param_value in params.items():
            mlflow.log_param(param_name, param_value)
        for metric_name, metric_value in run_metrics.items():
            mlflow.log_metric(metric_name, metric_value)

        mlflow.sklearn.log_model(
            model,
            "model",
            **model_registry_kwargs(mlflow.get_tracking_uri(), registered_model_name)
        )

    return run_metrics


if __name__ == "__main__":
    warnings.filterwarnings("ignore")
    np.random.seed(41)

    try:
        data_train = pd.read_csv("titanic_train_fe.csv")
    except Exception as e:
        logger.exception(
            "Unable to read the training CSV 'titanic_train_fe.csv'. Error: %s", e
        )
        # Nothing below can run without the data; re-raise instead of failing
        # later with a confusing NameError on ``data_train``.
        raise

    # Split the data into training and test sets. (0.75, 0.25) split.
    train, test = train_test_split(data_train)

    train_x = train[FEATURE_COLUMNS]
    test_x = test[FEATURE_COLUMNS]
    # Labels are selected as 1-D Series rather than single-column DataFrames.
    train_y = train[TARGET_COLUMN]
    test_y = test[TARGET_COLUMN]

    # Logistic Regression: one run per solver. The solver is passed to the
    # estimator so that each run really trains with the value it logs.
    for solver in ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']:
        lr = LogisticRegression(solver=solver)
        evaluate_and_log_run(
            lr,
            {"solver": solver},
            label="Logistic Regression model",
            run_name="Logistic_Regression_Classifier",
            registered_model_name="LogisticRegressionModel",
            train_x=train_x, train_y=train_y, test_x=test_x, test_y=test_y,
        )

    # Random Forest Classifier: one run per forest size.
    for n_estimators in np.arange(10, 100, 10):
        rf = RandomForestClassifier(n_estimators=n_estimators)
        evaluate_and_log_run(
            rf,
            {"n_estimators": n_estimators},
            label="Random Forest model",
            run_name="Random_Forest_Classifier",
            registered_model_name="RandomForestClassifierModel",
            train_x=train_x, train_y=train_y, test_x=test_x, test_y=test_y,
        )

    # Gradient Boosting Classifier: grid over number of stages and learning rate.
    for n_estimators in np.arange(10, 100, 10):
        # NOTE(review): with a float step, np.arange may also yield a value at or
        # right next to the stop (1.0) because of rounding -- confirm the intended
        # grid; np.linspace gives an exact number of points.
        for learning_rate in np.arange(0.7, 1.0, 0.1):
            gb = GradientBoostingClassifier(
                n_estimators=n_estimators, learning_rate=learning_rate
            )
            evaluate_and_log_run(
                gb,
                {"n_estimators": n_estimators, "learning_rate": learning_rate},
                label="Gradient Boosting Classifier",
                run_name="Gradient_Boosting_Classifier",
                registered_model_name="GradientBoostingClassifierModel",
                train_x=train_x, train_y=train_y, test_x=test_x, test_y=test_y,
            )


Logistic Regression model (solver=newton-cg):
Score (score=0.7784090909090909):
Accuracy (accuracy=0.7784090909090909):
Precision (precision=0.7971014492753623):
Recall (recall=0.6875):
Logistic Regression model (solver=lbfgs):
Score (score=0.7784090909090909):
Accuracy (accuracy=0.7784090909090909):
Precision (precision=0.7971014492753623):
Recall (recall=0.6875):
Logistic Regression model (solver=liblinear):
Score (score=0.7784090909090909):
Accuracy (accuracy=0.7784090909090909):
Precision (precision=0.7971014492753623):
Recall (recall=0.6875):
Logistic Regression model (solver=sag):
Score (score=0.7784090909090909):
Accuracy (accuracy=0.7784090909090909):
Precision (precision=0.7971014492753623):
Recall (recall=0.6875):
Logistic Regression model (solver=saga):
Score (score=0.7784090909090909):
Accuracy (accuracy=0.7784090909090909):
Precision (precision=0.7971014492753623):
Recall (recall=0.6875):
Random Forest model (n_estimators=10):
Score (score=0.7954545454545454):
Accuracy (ac

In [None]:
# Start the MLflow tracking UI as a background process. Run with `!mlflow ui`
# the server stays in the foreground, so the cell blocks until it is manually
# interrupted and the cells below never execute on "Run All".
import subprocess

# Re-running the cell reuses a server that is still alive instead of starting
# a second one on the same port.
_running_ui = globals().get("mlflow_ui_proc")
if _running_ui is None or _running_ui.poll() is not None:
    mlflow_ui_proc = subprocess.Popen(["mlflow", "ui"])

# The server keeps running after this cell finishes; stop it explicitly with
# mlflow_ui_proc.terminate() when it is no longer needed.
print("MLflow UI running in the background (pid={})".format(mlflow_ui_proc.pid))

[2023-10-22 14:10:33 +0000] [4784] [INFO] Starting gunicorn 21.2.0
[2023-10-22 14:10:33 +0000] [4784] [INFO] Listening at: http://127.0.0.1:5000 (4784)
[2023-10-22 14:10:33 +0000] [4784] [INFO] Using worker: sync
[2023-10-22 14:10:33 +0000] [4789] [INFO] Booting worker with pid: 4789
[2023-10-22 14:10:33 +0000] [4790] [INFO] Booting worker with pid: 4790
[2023-10-22 14:10:33 +0000] [4791] [INFO] Booting worker with pid: 4791
[2023-10-22 14:10:33 +0000] [4792] [INFO] Booting worker with pid: 4792
[2023-10-22 14:20:21 +0000] [4784] [INFO] Handling signal: int

Aborted!
[2023-10-22 14:20:21 +0000] [4789] [INFO] Worker exiting (pid: 4789)
[2023-10-22 14:20:21 +0000] [4790] [INFO] Worker exiting (pid: 4790)
[2023-10-22 14:20:21 +0000] [4791] [INFO] Worker exiting (pid: 4791)
[2023-10-22 14:20:21 +0000] [4792] [INFO] Worker exiting (pid: 4792)
[2023-10-22 14:20:22 +0000] [4784] [INFO] Shutting down: Master


In [None]:
# Load the model used for the submission. The run is looked up in the tracking
# store instead of through a hard-coded absolute artifact path: run IDs are
# generated anew every time the training cell runs, so a literal ID only works
# in the session that produced it.
best_runs = mlflow.search_runs(order_by=["metrics.accuracy DESC"], max_results=1)
if best_runs.empty:
    raise RuntimeError("No MLflow runs found; run the training cell before loading a model.")

# Highest held-out accuracy among the logged runs.
best_run_id = best_runs.iloc[0]["run_id"]
sk_model = mlflow.sklearn.load_model("runs:/{}/model".format(best_run_id))

In [None]:
# Read the feature-engineered test CSV, keep only the model's input columns
# (same names and order as used for training) and predict with the loaded model.
titanic_test_fe_df = pd.read_csv("titanic_test_fe.csv")
submission_feature_names = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare_log", "Embarked"]
titanic_test_fe_df2 = titanic_test_fe_df.loc[:, submission_feature_names]
predictions = sk_model.predict(titanic_test_fe_df2)

In [None]:
# Pair each passenger ID with its predicted label and write the table to
# disk without the DataFrame index.
submission_columns = {
    'PassengerId': titanic_test_fe_df["PassengerId"],
    'Survived': predictions,
}
output = pd.DataFrame(submission_columns)
output.to_csv('my_submission.csv', index=False)
print("Submission saved")

Submission saved
