In [42]:
import requests
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.metrics import root_mean_squared_error

## Data Loading

In [1]:
!mkdir data

In [6]:
files = [('green_tripdata_2024-10.parquet', './data'), 
         ('green_tripdata_2024-11.parquet', './data')]

print("Downloading started:...")

for file, path in files:
    url = f"https://d37ci6vzurychx.cloudfront.net/trip-data/{file}"
    save_path = f"{path}/{file}"
    response = requests.get(url, stream=True)

    with open(save_path, "wb") as handle:
        for data in response.iter_content(chunk_size=1024):
            handle.write(data)
    

Downloading started:...


In [8]:
train_data = pd.read_parquet("data/green_tripdata_2024-10.parquet")
test_data = pd.read_parquet("data/green_tripdata_2024-11.parquet")

In [9]:
train_data.head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,2,2024-10-01 00:52:13,2024-10-01 01:02:39,N,1.0,75,238,1.0,2.1,12.8,1.0,0.5,0.0,0.0,,1.0,18.05,1.0,1.0,2.75
1,2,2024-10-01 00:56:34,2024-10-01 01:03:51,N,1.0,134,82,1.0,4.86,19.8,1.0,0.5,0.0,0.0,,1.0,22.3,2.0,1.0,0.0
2,2,2024-10-01 00:23:31,2024-10-01 00:45:17,N,1.0,202,260,1.0,3.77,22.6,1.0,0.5,0.0,0.0,,1.0,25.1,2.0,1.0,0.0
3,2,2024-10-01 00:25:02,2024-10-01 00:37:16,N,1.0,130,218,1.0,3.11,15.6,1.0,0.5,0.0,0.0,,1.0,18.1,2.0,1.0,0.0
4,2,2024-10-01 00:11:11,2024-10-01 00:25:43,N,1.0,42,94,2.0,4.48,21.9,1.0,0.5,1.0,0.0,,1.0,25.4,1.0,1.0,0.0


In [11]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56147 entries, 0 to 56146
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   VendorID               56147 non-null  int32         
 1   lpep_pickup_datetime   56147 non-null  datetime64[us]
 2   lpep_dropoff_datetime  56147 non-null  datetime64[us]
 3   store_and_fwd_flag     54502 non-null  object        
 4   RatecodeID             54502 non-null  float64       
 5   PULocationID           56147 non-null  int32         
 6   DOLocationID           56147 non-null  int32         
 7   passenger_count        54502 non-null  float64       
 8   trip_distance          56147 non-null  float64       
 9   fare_amount            56147 non-null  float64       
 10  extra                  56147 non-null  float64       
 11  mta_tax                56147 non-null  float64       
 12  tip_amount             56147 non-null  float64       
 13  t

In [10]:
num_features = ['total_amount', 'trip_distance', 'passenger_count']
cat_features = ['PULocationID', 'DOLocationID']

In [37]:
def preprocessing(data):
    data['duration'] = data['lpep_dropoff_datetime'] - data['lpep_pickup_datetime']
    data.duration = data.duration.apply(lambda td: td.total_seconds()/60)
    data = data[(data.duration >= 3.) & (data.duration <= 90.)] 
    data.fillna(0, inplace=True) #maybe debug this later
    return data

In [38]:
X_train = preprocessing(train_data)[num_features + cat_features]
y_train = preprocessing(train_data)['duration']

X_test = preprocessing(test_data)[num_features + cat_features]
y_test = preprocessing(test_data)['duration']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.fillna(0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.fillna(0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.fillna(0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.fillna(0, inplace=True)


In [35]:
X_train[X_train.passenger_count == np.nan]

Unnamed: 0,total_amount,trip_distance,passenger_count,PULocationID,DOLocationID


In [39]:
X_train.isnull().sum()

total_amount       0
trip_distance      0
passenger_count    0
PULocationID       0
DOLocationID       0
dtype: int64

In [40]:
X_test.isnull().sum()

total_amount       0
trip_distance      0
passenger_count    0
PULocationID       0
DOLocationID       0
dtype: int64

In [99]:
X_test.loc[0].values

array([68.75,  6.68,  1.  , 97.  , 50.  ])

In [100]:
y_test[0]

np.float64(40.416666666666664)

## Baseline model

In [44]:
model = LinearRegression()
model.fit(X_train, y_train)

In [45]:
train_pred = model.predict(X_train)
root_mean_squared_error(train_pred, y_train)

6.929022561152728

In [46]:
test_pred = model.predict(X_test)
root_mean_squared_error(test_pred, y_test)

6.671806092249931

In [47]:
test_pred.mean(), test_pred.std()

(np.float64(15.068721986079776), np.float64(7.186011922929579))

## Experiments

In [48]:
import mlflow
import xgboost

In [49]:
mlflow.set_tracking_uri('sqlite:///mlflow.db')

In [50]:
mlflow.set_experiment('experiment_1')

2025/02/10 14:59:10 INFO mlflow.tracking.fluent: Experiment with name 'experiment_1' does not exist. Creating a new experiment.


<Experiment: artifact_location='/Users/emelidral/Dev/iml/mlruns/1', creation_time=1739174350542, experiment_id='1', last_update_time=1739174350542, lifecycle_stage='active', name='experiment_1', tags={}>

In [52]:
with mlflow.start_run():
    mlflow.set_tag("workspace", "inclass")
    mlflow.log_param("train_data", "data/green_tripdata_2024-10.parquet")
    mlflow.log_param("test_data", "data/green_tripdata_2024-11.parquet")
    model = LinearRegression()
    model.fit(X_train, y_train)

    test_pred = model.predict(X_test)
    rmse = root_mean_squared_error(test_pred, y_test)
    mlflow.log_metric("rmse", rmse)

In [69]:
mlflow.end_run()

## Mlflow autolog

In [53]:
mlflow.sklearn.autolog()
model = Lasso(alpha=0.5)
model.fit(X_train, y_train)

2025/02/10 15:07:45 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '703e432e43bf482e85ac11c537a679ff', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


## Hyper Parameter Optimization

In [55]:
! pip install optuna 

Collecting optuna
  Using cached optuna-4.2.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Using cached colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting tqdm (from optuna)
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Using cached optuna-4.2.0-py3-none-any.whl (383 kB)
Using cached colorlog-6.9.0-py3-none-any.whl (11 kB)
Using cached tqdm-4.67.1-py3-none-any.whl (78 kB)
Installing collected packages: tqdm, colorlog, optuna
Successfully installed colorlog-6.9.0 optuna-4.2.0 tqdm-4.67.1


In [56]:
import optuna

In [70]:
def objective(trial):
    with mlflow.start_run():
        params = {
            #'learning_rate':trial.suggest_float('learning_rate', 1e-10, 1e10, log=True),
            'max_depth': trial.suggest_int('max_depth', 4, 20),
            'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-5, 1e-1),
            'min_child_weight': trial.suggest_loguniform('min_child_weight', 0.1, 10),
            'objective': 'reg:squarederror'
        }
    
        model = xgboost.XGBRegressor(**params)
        model.fit(X_train, y_train)
        test_pred = model.predict(X_test)
        rmse = root_mean_squared_error(test_pred, y_test)
        mlflow.log_params(params)
        mlflow.log_metric('rmse', rmse)
        return rmse 

In [71]:
study = optuna.create_study()
study.optimize(objective, n_trials=6) 

[I 2025-02-10 15:33:07,102] A new study created in memory with name: no-name-c98e07ce-5839-47ce-bc4e-4aef63da932b
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-5, 1e-1),
  'min_child_weight': trial.suggest_loguniform('min_child_weight', 0.1, 10),
[I 2025-02-10 15:33:10,209] Trial 0 finished with value: 4.799095475534573 and parameters: {'max_depth': 8, 'reg_alpha': 1.3890786999443535e-05, 'min_child_weight': 3.2639892350382445}. Best is trial 0 with value: 4.799095475534573.
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-5, 1e-1),
  'min_child_weight': trial.suggest_loguniform('min_child_weight', 0.1, 10),
[I 2025-02-10 15:33:17,621] Trial 1 finished with value: 5.244096627656075 and parameters: {'max_depth': 20, 'reg_alpha': 0.028149554530345833, 'min_child_weight': 0.30356629651673045}. Best is trial 0 with value: 4.799095475534573.
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-5, 1e-1),
  'min_child_weight': trial.suggest_loguniform('min_child_weight', 0

## MLflow client

In [73]:
from mlflow.tracking import MlflowClient

In [74]:
from mlflow.entities import ViewType

In [75]:
client = MlflowClient("sqlite:///mlflow.db")

In [76]:
old_experiment = client.get_experiment_by_name("experiment_1")

In [77]:
old_experiment

<Experiment: artifact_location='/Users/emelidral/Dev/iml/mlruns/1', creation_time=1739174350542, experiment_id='1', last_update_time=1739174350542, lifecycle_stage='active', name='experiment_1', tags={}>

In [92]:
runs = client.search_runs(
    experiment_ids='1',
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=10,
    filter_string='metrics.rmse < 10',
)

In [80]:
runs

[<Run: data=<RunData: metrics={'rmse': 4.727144313107067}, params={'base_score': 'None',
  'booster': 'None',
  'colsample_bylevel': 'None',
  'colsample_bynode': 'None',
  'colsample_bytree': 'None',
  'custom_metric': 'None',
  'device': 'None',
  'early_stopping_rounds': 'None',
  'eval_metric': 'None',
  'gamma': 'None',
  'grow_policy': 'None',
  'interaction_constraints': 'None',
  'learning_rate': 'None',
  'max_bin': 'None',
  'max_cat_threshold': 'None',
  'max_cat_to_onehot': 'None',
  'max_delta_step': 'None',
  'max_depth': '7',
  'max_leaves': 'None',
  'maximize': 'None',
  'min_child_weight': '1.0657267002989208',
  'monotone_constraints': 'None',
  'multi_strategy': 'None',
  'n_jobs': 'None',
  'num_boost_round': '100',
  'num_parallel_tree': 'None',
  'objective': 'reg:squarederror',
  'random_state': 'None',
  'reg_alpha': '8.676949985124119e-05',
  'reg_lambda': 'None',
  'sampling_method': 'None',
  'scale_pos_weight': 'None',
  'subsample': 'None',
  'tree_method'

In [89]:
runs[0]

<Run: data=<RunData: metrics={'rmse': 4.727144313107067}, params={'base_score': 'None',
 'booster': 'None',
 'colsample_bylevel': 'None',
 'colsample_bynode': 'None',
 'colsample_bytree': 'None',
 'custom_metric': 'None',
 'device': 'None',
 'early_stopping_rounds': 'None',
 'eval_metric': 'None',
 'gamma': 'None',
 'grow_policy': 'None',
 'interaction_constraints': 'None',
 'learning_rate': 'None',
 'max_bin': 'None',
 'max_cat_threshold': 'None',
 'max_cat_to_onehot': 'None',
 'max_delta_step': 'None',
 'max_depth': '7',
 'max_leaves': 'None',
 'maximize': 'None',
 'min_child_weight': '1.0657267002989208',
 'monotone_constraints': 'None',
 'multi_strategy': 'None',
 'n_jobs': 'None',
 'num_boost_round': '100',
 'num_parallel_tree': 'None',
 'objective': 'reg:squarederror',
 'random_state': 'None',
 'reg_alpha': '8.676949985124119e-05',
 'reg_lambda': 'None',
 'sampling_method': 'None',
 'scale_pos_weight': 'None',
 'subsample': 'None',
 'tree_method': 'None',
 'validate_parameters': 

In [None]:
type(runs[0].data.metrics['rmse'])

In [90]:
type(runs[0].info.run_id)

str

In [93]:
for run in runs:
    print(f"run_id:{run.info.run_id}, rmse:{run.data.metrics['rmse']}")

run_id:370958a8e9dd4b1db4c15d033d97005d, rmse:4.727144313107067
run_id:7c5a6121cfb14fcb90956ba3b6345f14, rmse:4.968081541786119
run_id:e7fa20b742ff4661b02eb42696d5b363, rmse:5.031111100380337
run_id:266d4e30d42b4a6fb70685b429117374, rmse:4.9575708531289
run_id:eef5837f253c4df4b936d27f3a1bda2f, rmse:5.244096627656075
run_id:351a8aca818642ecaa31f88741b25cd1, rmse:4.799095475534573
run_id:682ad9a69447431eb98111e7f170e11e, rmse:4.750436506516514
run_id:9f65833df16447bfbeb94fb88d985039, rmse:6.671806092249931


## Log model

In [94]:
study.best_params

{'max_depth': 7,
 'reg_alpha': 8.676949985124119e-05,
 'min_child_weight': 1.0657267002989208}

In [96]:
model = xgboost.XGBRegressor(**study.best_params)
model.fit(X_train, y_tra)
mlflow.xgboost.log_model(model, artifact_path='mlflow_models')



<mlflow.models.model.ModelInfo at 0x16ca96e10>