In [None]:
import requests
import datetime
import pandas as pd
import numpy as np
from pathlib import Path

from evidently import ColumnMapping
from evidently.report import Report
from evidently.metrics import ColumnDriftMetric, DatasetDriftMetric, DatasetMissingValuesMetric, ColumnQuantileMetric, DatasetSummaryMetric

from joblib import load, dump
from tqdm import tqdm

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

# Make sure the folders that later cells write into are present.
for _folder in ("data", "models"):
    Path(_folder).mkdir(exist_ok=True)


In [None]:
def download_data():
    """Download the March 2024 Green Taxi trip data unless it is already on disk.

    The response body is streamed in large chunks into a temporary ``.part``
    file that is renamed to its final name only once the whole body has been
    written. An interrupted or failed download therefore never leaves a
    truncated parquet file that a later call would mistake for a complete one.

    Returns:
        str: Relative path of the local parquet file (``data/<file name>``).

    Raises:
        requests.HTTPError: If the server answers with an error status code.
    """

    file = 'green_tripdata_2024-03.parquet'
    url = f"https://d37ci6vzurychx.cloudfront.net/trip-data/{file}"
    save_path = f"data/{file}"

    if Path(save_path).exists():
        print(f"{file} already exists, skipping download")
        return save_path

    print(f"Downloading {file}...")
    target = Path(save_path)
    target.parent.mkdir(parents=True, exist_ok=True)
    partial = target.with_name(target.name + ".part")

    try:
        with requests.get(url, stream=True, timeout=60) as resp:
            # Fail loudly instead of saving an HTML/XML error page as parquet.
            resp.raise_for_status()

            # Content-Length is optional; tqdm shows an open-ended bar without it.
            size = resp.headers.get("Content-Length")
            with open(partial, "wb") as handle, tqdm(
                desc=f"{file}",
                postfix=f"save to {save_path}",
                total=int(size) if size is not None else None,
                unit="B",
                unit_scale=True,
            ) as progress:
                # 1 MiB chunks: the default chunk size of 1 byte means one
                # Python-level iteration per byte of the file.
                for chunk in resp.iter_content(chunk_size=1024 * 1024):
                    handle.write(chunk)
                    progress.update(len(chunk))

        # Publish the file only after it has been written completely.
        partial.replace(target)
    finally:
        # No-op after a successful rename; removes the leftover on failure.
        partial.unlink(missing_ok=True)

    return save_path


In [None]:
def prepare_data(data_path):
    """Load the taxi trips, derive the trip duration and drop outlier rows.

    Args:
        data_path: Location of the parquet file, handed to ``pd.read_parquet``.

    Returns:
        pandas.DataFrame: An independent copy containing only the trips with
        ``0 <= duration_min <= 60`` and ``0 < passenger_count <= 8``. The
        added ``duration_min`` column holds dropoff minus pickup in minutes.
        The original row index is preserved.
    """
    print("=== Loading and preparing data ===")

    # Load data
    mar_data = pd.read_parquet(data_path)
    print(f" Data shape is {mar_data.shape} - {mar_data.shape[0]} rows")

    # Create target variable (vectorised instead of a per-row Python lambda)
    trip_time = mar_data.lpep_dropoff_datetime - mar_data.lpep_pickup_datetime
    mar_data["duration_min"] = trip_time.dt.total_seconds() / 60

    # Filter outliers; rows with a missing value fail the comparisons and are dropped
    keep = (
        (mar_data.duration_min >= 0)
        & (mar_data.duration_min <= 60)
        & (mar_data.passenger_count > 0)
        & (mar_data.passenger_count <= 8)
    )
    # Explicit copy so callers can add columns without SettingWithCopyWarning.
    mar_data = mar_data[keep].copy()

    print(f"After filtering: {mar_data.shape}")

    return mar_data


In [None]:
def train_model(mar_data):
    """Fit the baseline linear regression and persist it with the reference set.

    The first 30000 rows are used for fitting and the remaining rows for
    validation. Both splits receive a ``prediction`` column and their MAE is
    printed. The fitted model is dumped to ``models/lin_reg.bin`` and the
    validation split is written to ``data/reference.parquet``.

    Args:
        mar_data: Prepared trips frame holding the feature columns and
            ``duration_min``.

    Returns:
        tuple: ``(train_data, val_data, num_features, cat_features)``.
    """
    print("=== Training baseline model ===")

    # Column roles
    target = "duration_min"
    num_features = ["passenger_count", "trip_distance", "fare_amount", "total_amount"]
    cat_features = ["PULocationID", "DOLocationID"]
    model_inputs = num_features + cat_features

    # Positional split: head for training, tail for validation
    split_at = 30000
    train_data = mar_data[:split_at].copy()
    val_data = mar_data[split_at:].copy()

    # Fit on the training split only
    model = LinearRegression()
    model.fit(train_data[model_inputs], train_data[target])

    # Score both splits before attaching the results as a new column
    scored = [
        (split, model.predict(split[model_inputs]))
        for split in (train_data, val_data)
    ]
    for split, preds in scored:
        split['prediction'] = preds

    # Report the error of each split
    maes = {
        label: mean_absolute_error(split.duration_min, split.prediction)
        for label, split in (("Train", train_data), ("Validation", val_data))
    }
    for label, mae in maes.items():
        print(f"{label} MAE: {mae:.4f}")

    # Persist the model and the reference (validation) data
    with open('models/lin_reg.bin', 'wb') as f_out:
        dump(model, f_out)

    val_data.to_parquet('data/reference.parquet')

    return train_data, val_data, num_features, cat_features


In [None]:
def setup_monitoring(train_data, val_data, num_features, cat_features):
    """Build the Evidently column mapping and run one baseline report.

    The report compares ``val_data`` (current) against ``train_data``
    (reference). The report object itself is discarded; only the mapping is
    handed back.

    Args:
        train_data: Frame used as reference data for the report.
        val_data: Frame used as current data for the report.
        num_features: Column names declared as numerical features.
        cat_features: Column names declared as categorical features.

    Returns:
        The ``ColumnMapping`` instance built from the feature lists.
    """

    # Only the model features and the prediction are declared; no target is set.
    mapping_kwargs = {
        "target": None,
        "prediction": 'prediction',
        "numerical_features": num_features,
        "categorical_features": cat_features,
    }
    column_mapping = ColumnMapping(**mapping_kwargs)

    tracked_metrics = [
        ColumnDriftMetric(column_name='prediction'),
        DatasetDriftMetric(),
        DatasetMissingValuesMetric(),
        ColumnQuantileMetric(column_name='fare_amount', quantile=0.5),
        DatasetSummaryMetric(),
    ]

    # Run the initial report: training split as reference, validation as current.
    baseline_report = Report(metrics=tracked_metrics)
    baseline_report.run(
        reference_data=train_data,
        current_data=val_data,
        column_mapping=column_mapping,
    )

    return column_mapping


In [None]:
def daily_monitoring(mar_data, column_mapping):
    """Run a per-day Evidently quantile report and return the largest daily value.

    For every pickup date with more than 10 trips, a one-metric report
    (``ColumnQuantileMetric`` on ``fare_amount`` at quantile 0.5) is run on
    that day's rows. A day whose report raises is printed and skipped. The
    input frame is not modified: the helper ``date`` column lives on an
    internal copy only.

    Args:
        mar_data: Trips frame with ``lpep_pickup_datetime`` and
            ``fare_amount`` columns.
        column_mapping: Evidently column mapping forwarded to each report run.

    Returns:
        The largest daily 0.5 quantile of ``fare_amount``, or ``None`` when no
        day could be evaluated.
    """

    # Add the date column on a copy so the caller's frame is left untouched.
    dated = mar_data.assign(date=mar_data.lpep_pickup_datetime.dt.date)
    unique_dates = sorted(dated['date'].unique())

    # One (date, quantile, row count) record per successfully processed day.
    daily_results = []

    print(f"Processing {len(unique_dates)} days...")

    for date in unique_dates:
        # Filter data for this specific date
        daily_data = dated[dated['date'] == date].copy()

        # Only process if there's sufficient data
        if len(daily_data) <= 10:
            continue

        try:
            # Create daily report with quantile metric, stamped at midnight of that day
            daily_report = Report(
                metrics=[ColumnQuantileMetric(column_name='fare_amount', quantile=0.5)],
                timestamp=datetime.datetime.combine(date, datetime.time())
            )

            # No reference data: only the current day's value is needed.
            daily_report.run(reference_data=None, current_data=daily_data, column_mapping=column_mapping)

            # Extract quantile value
            result = daily_report.as_dict()
            quantile_value = result['metrics'][0]['result']['current']['value']
        except Exception as e:
            # Best effort: a single bad day must not abort the whole scan.
            print(f"Error processing {date}: {e}")
            continue

        daily_results.append((date, quantile_value, len(daily_data)))
        print(f"Date: {date}, Rows: {len(daily_data):4d}, Quantile 0.5: {quantile_value:.2f}")

    if not daily_results:
        print("No quantile values calculated!")
        return None

    # max() keeps the first record on ties, i.e. the earliest date.
    max_date, max_quantile, _ = max(daily_results, key=lambda record: record[1])

    print(f"\nMaximum quantile 0.5 value for fare_amount in March 2024: {max_quantile:.1f}")
    print(f"This maximum occurred on: {max_date}")

    return max_quantile


In [None]:
# Fetch the March 2024 Green Taxi parquet file (skipped when it is already on disk),
# then load it, derive duration_min and drop outlier trips.
data_path = download_data()
mar_data = prepare_data(data_path)


green_tripdata_2024-03.parquet already exists, skipping download
=== Loading and preparing data ===
 Data shape is (57457, 20) - 57457 rows
After filtering: (54135, 21)


In [80]:
# Train the baseline model; returns both splits (with a 'prediction' column)
# together with the numerical and categorical feature lists.
train_data, val_data, num_features, cat_features = train_model(mar_data)


=== Training baseline model ===
Train MAE: 3.7725
Validation MAE: 3.7168


In [81]:
# Q2: Run the baseline Evidently report with the expanded metric set and keep
# the column mapping for the daily reports.
column_mapping = setup_monitoring(train_data, val_data, num_features, cat_features)


In [82]:
# Q3: Run the per-day fare_amount quantile (0.5) report and keep the largest daily value.
max_quantile = daily_monitoring(mar_data, column_mapping)


Processing 37 days...
Date: 2024-03-01, Rows: 1994, Quantile 0.5: 13.50
Date: 2024-03-02, Rows: 1509, Quantile 0.5: 12.80
Date: 2024-03-03, Rows: 1385, Quantile 0.5: 14.20
Date: 2024-03-04, Rows: 1820, Quantile 0.5: 12.80
Date: 2024-03-05, Rows: 1867, Quantile 0.5: 12.80
Date: 2024-03-06, Rows: 2174, Quantile 0.5: 12.80
Date: 2024-03-07, Rows: 2012, Quantile 0.5: 13.50
Date: 2024-03-08, Rows: 1964, Quantile 0.5: 12.80
Date: 2024-03-09, Rows: 1655, Quantile 0.5: 13.50
Date: 2024-03-10, Rows: 1337, Quantile 0.5: 14.20
Date: 2024-03-11, Rows: 1742, Quantile 0.5: 12.80
Date: 2024-03-12, Rows: 1792, Quantile 0.5: 12.80
Date: 2024-03-13, Rows: 1961, Quantile 0.5: 13.50
Date: 2024-03-14, Rows: 1974, Quantile 0.5: 14.20
Date: 2024-03-15, Rows: 1906, Quantile 0.5: 13.50
Date: 2024-03-16, Rows: 1580, Quantile 0.5: 13.50
Date: 2024-03-17, Rows: 1349, Quantile 0.5: 13.50
Date: 2024-03-18, Rows: 1775, Quantile 0.5: 12.80
Date: 2024-03-19, Rows: 1859, Quantile 0.5: 13.50
Date: 2024-03-20, Rows: 1938