<a href="https://colab.research.google.com/github/fatemafaria142/Exploring-Different-Models-with-MLFlow/blob/main/Project1_Restaurants_Revenue_Prediction_MLFlow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **Installing MLflow and pyngrok**

In [1]:
%%capture
!pip install mlflow
!pip install pyngrok

#### **Mount Google Drive in Colab**

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#### **Importing Necessary Libraries for MLflow and ngrok Setup**

In [83]:
import mlflow
import subprocess
from pyngrok import ngrok, conf
import getpass

#### **MLflow Tracking UI Setup for Experiment Visualization**

In [84]:
MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"
subprocess.Popen(["mlflow", "ui", "--backend-store-uri", MLFLOW_TRACKING_URI])

<Popen: returncode: None args: ['mlflow', 'ui', '--backend-store-uri', 'sqli...>

#### **Set our tracking server uri for logging**

In [85]:
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

* **Create a account in** https://dashboard.ngrok.com/

In [86]:
print("Enter your authtoken, which can be copied from https://dashboard.ngrok.com/auth")
conf.get_default().auth_token = getpass.getpass()
port=5000
public_url = ngrok.connect(port).public_url
print(f' * ngrok tunnel \"{public_url}\" -> \"http://127.0.0.1:{port}\"')

Enter your authtoken, which can be copied from https://dashboard.ngrok.com/auth
··········
 * ngrok tunnel "https://b6cd-35-229-38-176.ngrok-free.app" -> "http://127.0.0.1:5000"


#### **Data Preprocessing**

In [62]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [63]:
# Read the CSV file
df = pd.read_csv('/content/drive/MyDrive/Kaggle Datasets/Restaurant_revenue.csv')

# Display the first few rows of the dataframe
df.head()

Unnamed: 0,Number_of_Customers,Menu_Price,Marketing_Spend,Cuisine_Type,Average_Customer_Spending,Promotions,Reviews,Monthly_Revenue
0,61,43.117635,12.663793,Japanese,36.236133,0,45,350.91204
1,24,40.020077,4.577892,Italian,17.952562,0,36,221.319091
2,81,41.981485,4.652911,Japanese,22.60042,1,91,326.529763
3,70,43.005307,4.416053,Italian,18.984098,1,59,348.190573
4,30,17.456199,3.475052,Italian,12.766143,1,30,185.009121


#### **Now, let's go through some common preprocessing steps**

In [64]:
# Check for missing values
print(df.isnull().sum())

Number_of_Customers          0
Menu_Price                   0
Marketing_Spend              0
Cuisine_Type                 0
Average_Customer_Spending    0
Promotions                   0
Reviews                      0
Monthly_Revenue              0
dtype: int64


In [65]:
# Check the data types of each column
print(df.dtypes)

Number_of_Customers            int64
Menu_Price                   float64
Marketing_Spend              float64
Cuisine_Type                  object
Average_Customer_Spending    float64
Promotions                     int64
Reviews                        int64
Monthly_Revenue              float64
dtype: object


#### **Feature Scaling**
* Standardizing ensures that numerical features are on a similar scale, which can be important for some machine learning algorithms.

In [66]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
numerical_cols = ['Number_of_Customers', 'Menu_Price', 'Marketing_Spend', 'Average_Customer_Spending', 'Promotions', 'Reviews']
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])


In [67]:
# Display the first few rows of the dataframe
df.head()

Unnamed: 0,Number_of_Customers,Menu_Price,Marketing_Spend,Cuisine_Type,Average_Customer_Spending,Promotions,Reviews,Monthly_Revenue
0,0.293301,1.144183,0.462985,Japanese,0.589489,-0.994018,-0.165584,350.91204
1,-1.110781,0.86941,-0.920956,Italian,-1.005109,-0.994018,-0.47368,221.319091
2,1.052265,1.043399,-0.908116,Japanese,-0.599747,1.006018,1.409126,326.529763
3,0.634835,1.134219,-0.948655,Italian,-0.915143,1.006018,0.313675,348.190573
4,-0.883092,-1.132155,-1.109712,Italian,-1.457441,1.006018,-0.679077,185.009121


In [68]:
# Check the number of unique values in the 'Cuisine_Type' column
cuisine_type_count = df['Cuisine_Type'].nunique()

# Display the result
print(f"Number of unique values in 'Cuisine_Type': {cuisine_type_count}")


Number of unique values in 'Cuisine_Type': 4


In [69]:
# Display the unique values in the 'Cuisine_Type' column
cuisine_types = df['Cuisine_Type'].unique()

# Display the result
print("Unique values in 'Cuisine_Type':", cuisine_types)


Unique values in 'Cuisine_Type': ['Japanese' 'Italian' 'American' 'Mexican']


In [70]:
from sklearn.preprocessing import LabelEncoder

# Create a label encoder
label_encoder = LabelEncoder()

# Apply label encoding to 'Cuisine_Type'
df['Cuisine_Type_LabelEncoded'] = label_encoder.fit_transform(df['Cuisine_Type'])

# Display the updated dataframe
df.head()


Unnamed: 0,Number_of_Customers,Menu_Price,Marketing_Spend,Cuisine_Type,Average_Customer_Spending,Promotions,Reviews,Monthly_Revenue,Cuisine_Type_LabelEncoded
0,0.293301,1.144183,0.462985,Japanese,0.589489,-0.994018,-0.165584,350.91204,2
1,-1.110781,0.86941,-0.920956,Italian,-1.005109,-0.994018,-0.47368,221.319091,1
2,1.052265,1.043399,-0.908116,Japanese,-0.599747,1.006018,1.409126,326.529763,2
3,0.634835,1.134219,-0.948655,Italian,-0.915143,1.006018,0.313675,348.190573,1
4,-0.883092,-1.132155,-1.109712,Italian,-1.457441,1.006018,-0.679077,185.009121,1


#### **Exploratory Data Analysis (EDA)**

In [54]:
# Display summary statistics of the numerical columns
# This will provide you with basic statistics such as mean, standard deviation, minimum, 25th percentile, median (50th percentile), 75th percentile, and maximum for each numerical column.
df.describe()

Unnamed: 0,Number_of_Customers,Menu_Price,Marketing_Spend,Average_Customer_Spending,Promotions,Reviews,Monthly_Revenue,Cuisine_Type_LabelEncoded
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,53.271,30.21912,9.958726,29.477085,0.497,49.837,268.724172,1.506
std,26.364914,11.27876,5.845586,11.471686,0.500241,29.226334,103.98295,1.123934
min,10.0,10.009501,0.003768,10.037177,0.0,0.0,-28.977809,0.0
25%,30.0,20.396828,4.690724,19.603041,0.0,24.0,197.103642,0.0
50%,54.0,30.860614,10.092047,29.251365,0.0,50.0,270.213964,2.0
75%,74.0,39.843868,14.992436,39.55322,1.0,76.0,343.395793,2.25
max,99.0,49.97414,19.994276,49.900725,1.0,99.0,563.381332,3.0


#### **Machine Learning Regression Model Comparison using scikit-learn and MLflow**

In [91]:
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [80]:
# Select features and target variable
features = ['Number_of_Customers', 'Menu_Price', 'Marketing_Spend', 'Average_Customer_Spending', 'Promotions', 'Reviews', 'Cuisine_Type_LabelEncoded']
target = 'Monthly_Revenue'

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], test_size=0.2, random_state=42)

#### **Create a new MLflow Experiment**

In [81]:
mlflow.set_experiment("Restaurants_Revenue_Prediction") # Create a new MLflow Experiment if it doesn't exist

2024/01/27 14:45:18 INFO mlflow.tracking.fluent: Experiment with name 'Restaurants_Revenue_Prediction' does not exist. Creating a new experiment.


<Experiment: artifact_location='/content/mlruns/1', creation_time=1706366718445, experiment_id='1', last_update_time=1706366718445, lifecycle_stage='active', name='Restaurants_Revenue_Prediction', tags={}>

### **1. Linear Regression Experiment** ✅

In [89]:
# Start an MLflow run
with mlflow.start_run(run_name="Revenue Prediction using Linear Regression") as run:

    # train the model
    linear_model = LinearRegression()
    linear_model.fit(X_train, y_train)
    predictions = linear_model.predict(X_test)

    # save the model artifact for deployment
    # this will save the model locally or to the S3 bucket if using a server
    mlflow.sklearn.log_model(linear_model, "linear-regression-model-3") # Change this name everytime while running the model

    # log model performance
    mse = mean_squared_error(y_test, predictions)
    mlflow.log_metric("mse", mse)  # Log the loss metric
    print("  mse: %f" % mse)

    # log model performance
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    mlflow.log_metric("rmse", rmse)  # Log the RMSE metric
    print("  rmse: %f" % rmse)

    # log model performance
    mae = mean_absolute_error(y_test, predictions)
    mlflow.log_metric("mae", mae)
    print("  mae: %f" % mae)

   # log model performance
    r2 = r2_score(y_test, predictions)
    mlflow.log_metric("r2", r2)
    print("  r2 score: %f" % r2)

    run_id = run.info.run_uuid
    experiment_id = run.info.experiment_id
    mlflow.end_run()
    print(mlflow.get_artifact_uri())
    print("runID: %s" % run_id)

  mse: 3559.480160
  rmse: 59.661379
  mae: 47.154402
  r2 score: 0.674150
/content/mlruns/1/36cd2fb54bbc4cfb8adb23ae9b4c732d/artifacts
runID: 01900874458e4a95adaa092483cc068e


### **2. Decision Tree Regression Experiment** ✅

In [90]:
# Start an MLflow run
with mlflow.start_run(run_name="Revenue Prediction using Decision Tree Regression") as run:

    # add parameters for tuning
    criterion='squared_error'
    splitter='best'
    max_depth=10
    min_samples_split=2
    min_samples_leaf=1

    mlflow.log_param("criterion",criterion) # Log the hyperparameters
    mlflow.log_param("splitter",splitter) # Log the hyperparameters
    mlflow.log_param("max_depth",max_depth) # Log the hyperparameters
    mlflow.log_param("min_samples_split",min_samples_split) # Log the hyperparameters
    mlflow.log_param("min_samples_leaf",min_samples_leaf) # Log the hyperparameters

    # train the model
    DecisionTreeRegressor_model = DecisionTreeRegressor(criterion=criterion, splitter=splitter, max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf)
    DecisionTreeRegressor_model.fit(X_train, y_train)
    predictions = DecisionTreeRegressor_model.predict(X_test)

    # save the model artifact for deployment
    # this will save the model locally or to the S3 bucket if using a server
    mlflow.sklearn.log_model(DecisionTreeRegressor_model, "decision-tree-regression-model-1") # Change this name everytime while running the model

    # log model performance
    mse = mean_squared_error(y_test, predictions)
    mlflow.log_metric("mse", mse)  # Log the MSE metric
    print("  mse: %f" % mse)

    # log model performance
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    mlflow.log_metric("rmse", rmse)  # Log the RMSE metric
    print("  rmse: %f" % rmse)

    # log model performance
    mae = mean_absolute_error(y_test, predictions)
    mlflow.log_metric("mae", mae)  # Log the MAE metric
    print("  mae: %f" % mae)

   # log model performance
    r2 = r2_score(y_test, predictions)
    mlflow.log_metric("r2", r2)  # Log the R2 metric
    print("  r2 score: %f" % r2)

    run_id = run.info.run_uuid
    experiment_id = run.info.experiment_id
    mlflow.end_run()
    print(mlflow.get_artifact_uri())
    print("runID: %s" % run_id)

  mse: 7070.005207
  rmse: 84.083323
  mae: 68.006277
  r2 score: 0.352782
/content/mlruns/1/93fd1ce70e2047e7bf9f09fe1a658f04/artifacts
runID: 172f7d9a5d5344a5bcd7d20bd3b7fd3e


### **3. Random Forest Regression Experiment** ✅

In [93]:
# Start an MLflow run
with mlflow.start_run(run_name="Revenue Prediction using Random Forest Regression") as run:

    # add parameters for tuning
    n_estimators=50
    criterion='squared_error'
    max_depth=5
    min_samples_split=2
    min_samples_leaf=1

    mlflow.log_param("n_estimators",n_estimators) # Log the hyperparameters
    mlflow.log_param("criterion",criterion) # Log the hyperparameters
    mlflow.log_param("max_depth",max_depth) # Log the hyperparameters
    mlflow.log_param("min_samples_split",min_samples_split) # Log the hyperparameters
    mlflow.log_param("min_samples_leaf",min_samples_leaf) # Log the hyperparameters

    # train the model
    RandomForestRegressor_model = RandomForestRegressor(n_estimators=n_estimators, criterion=criterion,max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf)
    RandomForestRegressor_model.fit(X_train, y_train)
    predictions = RandomForestRegressor_model.predict(X_test)

    # save the model artifact for deployment
    # this will save the model locally or to the S3 bucket if using a server
    mlflow.sklearn.log_model(RandomForestRegressor_model, "random-forest-regression-model-2") # Change this name everytime while running the model

    # log model performance
    mse = mean_squared_error(y_test, predictions)
    mlflow.log_metric("mse", mse)  # Log the MSE metric
    print("  mse: %f" % mse)

    # log model performance
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    mlflow.log_metric("rmse", rmse)  # Log the RMSE metric
    print("  rmse: %f" % rmse)

    # log model performance
    mae = mean_absolute_error(y_test, predictions)
    mlflow.log_metric("mae", mae)  # Log the MAE metric
    print("  mae: %f" % mae)

   # log model performance
    r2 = r2_score(y_test, predictions)
    mlflow.log_metric("r2", r2)  # Log the R2 metric
    print("  r2 score: %f" % r2)

    run_id = run.info.run_uuid
    experiment_id = run.info.experiment_id
    mlflow.end_run()
    print(mlflow.get_artifact_uri())
    print("runID: %s" % run_id)

  mse: 3901.557375
  rmse: 62.462448
  mae: 50.359078
  r2 score: 0.642835
/content/mlruns/1/f08171d2ad224f86adc33a322012fdbb/artifacts
runID: 1a0670d4e6a6444dbaf771ddd627c303
