In [1]:
!pip freeze | grep scikit-learn

scikit-learn==1.7.0


In [2]:
!python -V

Python 3.11.6


In [17]:
import pickle
import pandas as pd

In [26]:
# Scoring period (year and month) held as module-level values.
# NOTE(review): main() is later called as main(2023, 5); these values (March)
# differ from that call, so any code that reads the globals rather than its own
# arguments would label results with the wrong month — confirm which is intended.
year = 2023
month = 3

In [23]:
def load_artifact(
    model_path='/Users/eliasdzobo/Desktop/2025/mlops-2025/model.pkl',
    dv_path='/Users/eliasdzobo/Desktop/2025/mlops-2025/dict_vectorizer.pkl',
):
    """Load the pickled model and its vectorizer from disk.

    Parameters
    ----------
    model_path : str or path-like, optional
        Location of the pickled model. Defaults to the original hard-coded
        location so existing zero-argument calls behave as before.
    dv_path : str or path-like, optional
        Location of the pickled vectorizer. Same default policy as above.

    Returns
    -------
    tuple
        ``(model, dv)`` exactly as they were unpickled.

    Raises
    ------
    FileNotFoundError
        If either path does not exist.
    """
    # SECURITY: pickle.load executes arbitrary code embedded in the file.
    # Only point these paths at artifacts you produced or fully trust.
    with open(model_path, 'rb') as f_in:
        model = pickle.load(f_in)

    with open(dv_path, 'rb') as f_in:
        dv = pickle.load(f_in)

    return model, dv

In [19]:
# Columns that are treated as categorical features.
categorical = ['PULocationID', 'DOLocationID']


def read_data(filename):
    """Read a trip Parquet file and prepare it for scoring.

    Adds a ``duration`` column (drop-off minus pick-up, in minutes), keeps only
    rows whose duration lies in the closed interval [1, 60], and converts the
    columns listed in ``categorical`` to strings (missing values become '-1').

    Parameters
    ----------
    filename : str or path-like
        Anything ``pandas.read_parquet`` accepts (local path or URL).

    Returns
    -------
    pandas.DataFrame
        The filtered copy, keeping the original row index.
    """
    trips = pd.read_parquet(filename)

    elapsed = trips['tpep_dropoff_datetime'] - trips['tpep_pickup_datetime']
    trips['duration'] = elapsed.dt.total_seconds() / 60

    # between() is inclusive on both ends, i.e. 1 <= duration <= 60.
    within_bounds = trips['duration'].between(1, 60)
    trips = trips.loc[within_bounds].copy()

    # Missing IDs -> -1, then int -> str so e.g. 1.0 becomes '1' rather than '1.0'.
    encoded = trips[categorical].fillna(-1).astype('int').astype('str')
    trips[categorical] = encoded

    return trips

In [29]:
def apply_model(year, month, metrc='mean'):
    """Score one month of yellow-taxi trips and print a summary statistic.

    Downloads the trip file for ``year``/``month``, loads the model and
    vectorizer via ``load_artifact()``, vectorizes the ``categorical`` columns
    and predicts a duration for every row, printing a progress line per step.

    Parameters
    ----------
    year, month : int
        Period to score; ``month`` is zero-padded to two digits in the URL.
    metrc : str, optional
        ``'mean'`` or ``'std'`` selects which summary of the predictions is
        printed. Any other value prints no summary.

    Returns
    -------
    tuple
        ``(df, y_pred)``: the prepared trips and the model's predictions.
    """
    trips = read_data(f'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{year}-{month:02d}.parquet')
    print(f"Data loaded for {year}-{month:02d}")

    model, dv = load_artifact()
    print("Model and DictVectorizer loaded")

    records = trips[categorical].to_dict(orient='records')
    print("Data transformed to dicts for prediction")

    features = dv.transform(records)
    print("Data vectorized for model input")

    predictions = model.predict(features)
    print("Predictions made using the model")

    # Only the requested statistic is computed and reported.
    if metrc == 'mean':
        print(f"Mean predicted duration: {predictions.mean():.2f}")
    elif metrc == 'std':
        print(f"Standard deviation of predicted duration: {predictions.std():.2f}")

    return trips, predictions

In [30]:
def save_results(df, y_pred, output_file, year=None, month=None):
    """Write one predicted duration per ride to a Parquet file.

    Parameters
    ----------
    df : pandas.DataFrame
        The trips that were scored. Only its index is read, to build
        ``ride_id``; the frame itself is not modified.
    y_pred : array-like
        Predicted durations, one per row of ``df`` in the same order.
    output_file : str or path-like
        Destination handed to ``DataFrame.to_parquet``.
    year, month : int, optional
        Period the trips belong to, used for the ``YYYY/MM_`` prefix of
        ``ride_id``. When omitted, the module-level ``year`` / ``month``
        variables are used, so existing three-argument calls keep working.
        Prefer passing them explicitly: the globals may not match the data.
    """
    # Legacy fallback: the parameters shadow the module-level names, so the
    # globals have to be looked up explicitly.
    if year is None:
        year = globals()['year']
    if month is None:
        month = globals()['month']

    # Build ride_id as a standalone Series (aligned on df's index) instead of
    # adding a column to the caller's DataFrame.
    ride_ids = pd.Series(
        f'{year:04d}/{month:02d}_' + df.index.astype(str),
        index=df.index,
    )
    df_result = pd.DataFrame({
        'ride_id': ride_ids,
        'predicted_duration': y_pred,
    })

    df_result.to_parquet(
        output_file,
        engine='pyarrow',
        compression=None,
        index=False
    )


def main(year, month):
    """Score the trips for ``year``/``month`` and save the predictions.

    The output file is named ``YYYY-MM_predictions.parquet``. The same period
    is passed on to ``save_results`` so the ``ride_id`` prefix matches the
    data that was actually scored.
    """
    df, y_pred = apply_model(year, month, metrc='mean')
    save_results(
        df,
        y_pred,
        f'{year:04d}-{month:02d}_predictions.parquet',
        year=year,
        month=month,
    )


In [32]:
# Score the yellow-taxi trips for May 2023; main() writes the predictions to
# '2023-05_predictions.parquet' (a relative path, so it lands in the working directory).
main(2023, 5)

Data loaded for 2023-05
Model and DictVectorizer loaded
Data transformed to dicts for prediction
Data vectorized for model input
Predictions made using the model
Mean predicted duration: 15.04
