In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.feature_extraction import DictVectorizer


In [2]:
from mlflow.tracking import MlflowClient
MLFLOW_TRACKING_URI = "sqlite:///C:/Users/LENOVO/Documents/mlops-zoomcamp/mlflow.db"

In [3]:
import mlflow
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

mlflow.set_experiment("tracking-try")

<Experiment: artifact_location='file:///c:/Users/LENOVO/Documents/mlops-zoomcamp/mlops-zoomcamp/02-Experiment-tracking/mlruns/3', creation_time=1718029073643, experiment_id='3', last_update_time=1718029073643, lifecycle_stage='active', name='tracking-try', tags={}>

In [4]:
df = pd.read_parquet(r'C:\Users\LENOVO\Documents\mlops-zoomcamp\mlops-zoomcamp\01-intro\data\yellow_tripdata_2023-01.parquet')
#import the val data
# val_df = pd.read_parquet(r'C:\Users\LENOVO\Documents\mlops-zoomcamp\mlops-zoomcamp\01-intro\data\yellow_tripdata_2023-02.parquet')

df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,N,43,237,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0
2,2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,N,48,238,1,14.9,1.0,0.5,15.0,0.0,1.0,34.9,2.5,0.0
3,1,2023-01-01 00:03:48,2023-01-01 00:13:25,0.0,1.9,1.0,N,138,7,1,12.1,7.25,0.5,0.0,0.0,1.0,20.85,0.0,1.25
4,2,2023-01-01 00:10:29,2023-01-01 00:21:19,1.0,1.43,1.0,N,107,79,1,11.4,1.0,0.5,3.28,0.0,1.0,19.68,2.5,0.0


In [5]:
#create the target column for training
df['duration'] = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
#convert duration to minutes
df['duration'] = df.duration.apply(lambda td: td.total_seconds() / 60)
df['duration']

0           8.433333
1           6.316667
2          12.750000
3           9.616667
4          10.833333
             ...    
3066761    13.983333
3066762    19.450000
3066763    24.516667
3066764    13.000000
3066765    14.400000
Name: duration, Length: 3066766, dtype: float64

In [7]:
#create the target column for validation
val_df['duration'] = val_df['tpep_dropoff_datetime'] - val_df['tpep_pickup_datetime']
#convert duration to minutes
val_df['duration'] = val_df.duration.apply(lambda td: td.total_seconds() / 60)


In [6]:
#delete trips that are less then one minute or longer than 60 minutes
total_records = len(df)
new_df = df[(df.duration >= 1) & (df.duration <= 60)]
new_df_records = len(new_df)
fraction =   new_df_records / total_records
print(f'The fraction {fraction}')
# val_df = val_df[(val_df.duration >= 1) & (val_df.duration <= 60)]


The fraction 0.9812202822125979


In [7]:
category = ['PULocationID', 'DOLocationID']
new_df[category] = new_df[category].astype(str)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df[category] = new_df[category].astype(str)


In [10]:
val_df[category] = val_df[category].astype(str)


In [8]:
#apply one-hot encoding
train_dicts = new_df[category].to_dict(orient='records')
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
y_train = new_df['duration'].values


In [13]:
#apply one-hot encoding on validation data
val_dicts = val_df[category].to_dict(orient='records')

#transform the val data
X_val = dv.transform(val_dicts)

#predict the duration
y_val = val_df['duration'].values



In [9]:
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope


In [10]:
from sklearn.metrics import mean_squared_error


In [11]:
train = xgb.DMatrix(X_train, label=y_train)
# valid = xgb.DMatrix(X_val, label=y_val)


In [7]:
# Define the objective function for hyperparameter optimization
def objective(params):
    # Start a new MLflow run
    with mlflow.start_run():
        # Set a tag for the model type
        mlflow.set_tag("model", "xgboost")
        
        # Log the hyperparameters being used
        mlflow.log_params(params)
        
        # Train the XGBoost model with the given parameters
        booster = xgb.train(
            params=params,                 # Hyperparameters for the model
            dtrain=train,                  # Training data
            num_boost_round=1000,          # Maximum number of boosting rounds
            evals=[(train, 'validation')], # Evaluation dataset and its name
            early_stopping_rounds=50       # Early stopping after 50 rounds without improvement
        )
        
        # Make predictions on the validation set
        y_pred = booster.predict(train)
        
        # Calculate the Root Mean Squared Error (RMSE) on the validation set
        rmse = mean_squared_error(y_val, y_pred, squared=False)
        
        # Log the RMSE metric to MLflow
        mlflow.log_metric("rmse", rmse)

    # Return the RMSE as the loss and the status as successful
    return {'loss': rmse, 'status': STATUS_OK}


In [15]:
with mlflow.start_run():
    
    train = xgb.DMatrix(X_train, label=y_train)
    # valid = xgb.DMatrix(X_val, label=y_val)

    best_params = {
        'learning_rate': 0.09585355369315604,
        'max_depth': 30,
        'min_child_weight': 1.060597050922164,
        'objective': 'reg:linear',
        'reg_alpha': 0.018060244040060163,
        'reg_lambda': 0.011658731377413597,
        'seed': 42
    }

    mlflow.log_params(best_params)

    booster = xgb.train(
        params=best_params,
        dtrain=train,
        num_boost_round=20,
        evals=[(train, 'validation')],
        early_stopping_rounds=5
    )

    y_pred = booster.predict(train)
    rmse = mean_squared_error(y_train, y_pred, squared=False)
    mlflow.log_metric("rmse", rmse)

    with open("model-try/preprocessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)
    mlflow.log_artifact("model-try/preprocessor.b", artifact_path="preprocessor")

    mlflow.xgboost.log_model(booster, artifact_path="models_mlflow")




[0]	validation-rmse:9.52080
[1]	validation-rmse:9.16415
[2]	validation-rmse:8.85423
[3]	validation-rmse:8.59417
[4]	validation-rmse:8.32946
[5]	validation-rmse:8.13376
[6]	validation-rmse:7.97437
[7]	validation-rmse:7.79230
[8]	validation-rmse:7.67837
[9]	validation-rmse:7.57986
[10]	validation-rmse:7.49502
[11]	validation-rmse:7.42214
[12]	validation-rmse:7.35623
[13]	validation-rmse:7.29984
[14]	validation-rmse:7.24661
[15]	validation-rmse:7.13333
[16]	validation-rmse:7.09763
[17]	validation-rmse:7.06547
[18]	validation-rmse:7.03514
[19]	validation-rmse:6.93578




In [16]:
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)


In [17]:
model_name = "try-taxi"
# Retrieve all versions of the model
all_versions = client.search_model_versions(f"name='{model_name}'")

# Print out the version and stage
for version in all_versions:
    print(f"version: {version.version}, stage: {version.current_stage}")


version: 1, stage: None


In [18]:
model_version = 1
new_stage = "Staging"
client.transition_model_version_stage(
    name=model_name,
    version=model_version,
    stage=new_stage,
    archive_existing_versions=False
)


  client.transition_model_version_stage(


<ModelVersion: aliases=[], creation_timestamp=1718031252385, current_stage='Staging', description='', last_updated_timestamp=1718031569527, name='try-taxi', run_id='77d2b61a0f8841669426212636e8e576', run_link='', source='file:///c:/Users/LENOVO/Documents/mlops-zoomcamp/mlops-zoomcamp/02-Experiment-tracking/mlruns/3/77d2b61a0f8841669426212636e8e576/artifacts/models_mlflow', status='READY', status_message=None, tags={}, user_id=None, version=1>

In [23]:

model = mlflow.pyfunc.load_model(f"models:/{model_name}/{new_stage}")
# model = mlflow.pyfunc.load_model(f"runs:/{run_id}/model")
#y_pred = model.predict(pd.DataFrame(X_test))
model

  latest = client.get_latest_versions(name, None if stage is None else [stage])


mlflow.pyfunc.loaded_model:
  artifact_path: models_mlflow
  flavor: mlflow.xgboost
  run_id: 77d2b61a0f8841669426212636e8e576

In [22]:
y_pred = model.predict(train)




TypeError: Not supported type for data.<class 'xgboost.core.DMatrix'>

In [24]:
import mlflow
logged_model = 'runs:/77d2b61a0f8841669426212636e8e576/models_mlflow'

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)

# Predict on a Pandas DataFrame.
import pandas as pd
loaded_model.predict(pd.DataFrame(train))



ValueError: DataFrame constructor not properly called!

In [25]:
# Ensure the data is aligned and contiguous
X_test = np.ascontiguousarray(train)
y_test = np.ascontiguousarray(y_train)


In [26]:
y_pred = model.predict(X_test)


ValueError: Please reshape the input data into 2-dimensional matrix.

In [2]:
import requests
from io import BytesIO
import pandas as pd
def ingest_files() -> pd.DataFrame:
    dfs = []

    for year, months in [(2024, (1, 3))]:
        for i in range(*months):
            response = requests.get(
                'https://github.com/mage-ai/datasets/raw/master/taxi/green'
                f'/{year}/{i:02d}.parquet'
            )

            if response.status_code != 200:
                raise Exception(response.text)

            df = pd.read_parquet(BytesIO(response.content))
            dfs.append(df)

    return pd.concat(dfs)

In [3]:
ingest_files()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,2,2024-01-01 00:46:55,2024-01-01 00:58:25,N,1.0,236,239,1.0,1.98,12.80,1.0,0.5,3.61,0.0,,1.0,21.66,1.0,1.0,2.75
1,2,2024-01-01 00:31:42,2024-01-01 00:52:34,N,1.0,65,170,5.0,6.54,30.30,1.0,0.5,7.11,0.0,,1.0,42.66,1.0,1.0,2.75
2,2,2024-01-01 00:30:21,2024-01-01 00:49:23,N,1.0,74,262,1.0,3.08,19.80,1.0,0.5,3.00,0.0,,1.0,28.05,1.0,1.0,2.75
3,1,2024-01-01 00:30:20,2024-01-01 00:42:12,N,1.0,74,116,1.0,2.40,14.20,1.0,1.5,0.00,0.0,,1.0,16.70,2.0,1.0,0.00
4,2,2024-01-01 00:32:38,2024-01-01 00:43:37,N,1.0,74,243,1.0,5.14,22.60,1.0,0.5,6.28,0.0,,1.0,31.38,1.0,1.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53572,2,2024-02-29 21:07:00,2024-02-29 21:34:00,,,223,137,,5.43,32.16,0.0,0.0,7.18,0.0,,1.0,43.09,,,
53573,2,2024-02-29 22:35:38,2024-02-29 22:46:25,,,256,37,,1.50,2.70,0.0,0.0,2.74,0.0,,1.0,6.44,,,
53574,2,2024-02-29 22:14:00,2024-02-29 22:39:00,,,75,33,,10.07,47.14,0.0,0.0,15.27,0.0,,1.0,66.16,,,
53575,2,2024-02-29 22:41:00,2024-02-29 22:53:00,,,97,33,,1.54,12.22,0.0,0.0,3.30,0.0,,1.0,16.52,,,
