Load & Prepare Data

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# Load dataset
url = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2025-01.parquet"
df = pd.read_parquet(url)

# Feature engineering
df['trip_duration'] = (pd.to_datetime(df['tpep_dropoff_datetime']) - pd.to_datetime(df['tpep_pickup_datetime'])).dt.total_seconds() / 60
df = df[(df['trip_duration'] > 0) & (df['trip_duration'] < 240)]

features = ['trip_distance', 'passenger_count', 'trip_duration']
target = 'fare_amount'
df = df[features + [target]].dropna()

X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], test_size=0.2, random_state=42)


Train & Log with MLflow

In [4]:
import mlflow
import mlflow.sklearn
from xgboost import XGBRegressor

# Set tracking URI (local MLflow setup)
mlflow.set_tracking_uri("http://localhost:5000")  # Or leave blank for default file-based logging

with mlflow.start_run(run_name="xgboost_taxi_fare_lab"):

    # Define and train model
    model = XGBRegressor(n_estimators=5, max_depth=3, learning_rate=0.1, random_state=42)
    model.fit(X_train, y_train)

    # Predict and calculate MAE
    preds = model.predict(X_test)
    mae = mean_absolute_error(y_test, preds)

    '''
    # Log parameters
    mlflow.log_param("n_estimators", 100)
    mlflow.log_param("max_depth", 4)
    mlflow.log_param("learning_rate", 0.1)
    '''

    # Log metrics
    mlflow.log_metric("mae", mae)

    # Log model
    mlflow.sklearn.log_model(model, "model")

    print(f"MAE logged: {mae:.2f}")




MAE logged: 7.94
🏃 View run xgboost_taxi_fare_lab at: http://localhost:5000/#/experiments/0/runs/250ab0bcf5c849e7b2c1648b66dac00c
🧪 View experiment at: http://localhost:5000/#/experiments/0


Start MLflow UI

In [None]:
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import mlflow.pyfunc
import uvicorn
import os
import pickle

app = FastAPI()

MODEL_NAME = os.getenv("MODEL_NAME", "green-duration-model")
# MODEL_STAGE = os.getenv("MODEL_STAGE", "Production")
MLFLOW_TRACKING_URI = os.getenv("MLFLOW_TRACKING_URI", "http://172.191.241.245/:5000")

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
model = mlflow.pyfunc.load_model(model_uri=f"models:/{MODEL_NAME}")
print(f"Model loaded from: {MLFLOW_TRACKING_URI}")
with open("dv.pkl", "rb") as f_in:
    dv = pickle.load(f_in)

class RideFeatures(BaseModel):
    PULocationID: str
    DOLocationID: str
    trip_distance: float
    pickup_hour: int

@app.post("/predict")
async def predict(ride: RideFeatures):
    X = dv.transform([ride.dict()])
    pred = model.predict(X)
    return {"ride_duration": float(pred[0])}



MlflowException: API request to http://172.191.241.245/:5000/api/2.0/mlflow/registered-models/get-latest-versions failed with exception HTTPConnectionPool(host='172.191.241.245', port=80): Max retries exceeded with url: /:5000/api/2.0/mlflow/registered-models/get-latest-versions (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x000002423A625040>: Failed to establish a new connection: [WinError 10061] No connection could be made because the target machine actively refused it'))