In [2]:
!python -V

Python 3.11.0


In [1]:
# Standard library
import pickle

# Third-party
import numpy as np
import pandas as pd
import sklearn

In [2]:
# Scoring period: selects which monthly yellow_tripdata file is read and written.
year, month = 2023, 3

# Both paths are derived from the period, so changing `year`/`month` above
# retargets the input and the output together.
input_file = f'data/yellow_tripdata_{year:04d}-{month:02d}.parquet'
output_file = f'output/yellow_tripdata_{year:04d}-{month:02d}.parquet'

In [3]:
# -p keeps this cell safe to re-run: no error when `output` already exists.
!mkdir -p output

mkdir: output: File exists


In [4]:
# model.bin holds a pickled 2-tuple, unpacked here as (dv, model); later cells
# call dv.transform(...) and model.predict(...).
# SECURITY: pickle.load can execute arbitrary code while deserializing, so only
# load a model.bin that comes from a trusted source.
# NOTE(review): the saved output of this cell links to scikit-learn's
# model-persistence limitations page — presumably a version-mismatch warning
# between the scikit-learn that pickled the model and the one loading it; confirm.
with open('model.bin', 'rb') as f_in:
    dv, model = pickle.load(f_in)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [5]:
categorical = ['PULocationID', 'DOLocationID']


def read_data(filename):
    """Load a trip parquet file and prepare it for scoring.

    Adds a ``duration`` column (drop-off minus pick-up, in minutes), keeps
    only rows whose duration lies in [1, 60] minutes, and converts the
    ``categorical`` columns to strings, with missing values becoming ``'-1'``.

    Parameters
    ----------
    filename
        Path of the parquet file; passed straight to ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        The rows that passed the duration filter (their original index
        labels are retained), including the added ``duration`` column.
    """
    trips = pd.read_parquet(filename)

    elapsed = trips['tpep_dropoff_datetime'] - trips['tpep_pickup_datetime']
    trips['duration'] = elapsed.dt.total_seconds() / 60

    # Inclusive on both ends, same as (duration >= 1) & (duration <= 60).
    in_range = trips['duration'].between(1, 60)
    trips = trips[in_range].copy()

    # Missing IDs become -1 before the int cast, so 238.0 -> '238' and
    # NaN -> '-1'.
    as_labels = trips[categorical].fillna(-1).astype('int').astype('str')
    trips[categorical] = as_labels

    return trips

In [6]:
# Load the configured month, then tag every remaining row with an id of the
# form '<YYYY>/<MM>_<index label>'.
df = read_data(input_file).assign(
    ride_id=lambda trips: f'{year:04d}/{month:02d}_' + trips.index.astype('str')
)

In [7]:
# Preview the first three rows, including the `duration` and `ride_id` columns.
df.head(3)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,...,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,duration,ride_id
0,2,2023-03-01 00:06:43,2023-03-01 00:16:43,1.0,0.0,1.0,N,238,42,2,...,1.0,0.5,0.0,0.0,1.0,11.1,0.0,0.0,10.0,2023/03_0
1,2,2023-03-01 00:08:25,2023-03-01 00:39:30,2.0,12.4,1.0,N,138,231,1,...,6.0,0.5,12.54,0.0,1.0,76.49,2.5,1.25,31.083333,2023/03_1
2,1,2023-03-01 00:15:04,2023-03-01 00:29:26,0.0,3.3,1.0,N,140,186,1,...,3.5,0.5,4.65,0.0,1.0,28.05,2.5,0.0,14.366667,2023/03_2


In [8]:
# One {column: value} dict per row, built from the categorical columns only.
dicts = df[categorical].to_dict(orient='records')
# dv and model are the two objects unpickled from model.bin.
X_val = dv.transform(dicts)
y_pred = model.predict(X_val)

## Q1. Notebook

In [15]:
# np.std uses ddof=0 by default, i.e. this is the population standard deviation.
print(f"The standard deviation of the predicted duration {np.std(y_pred):.2f}")

The standard deviation of the predicted duration 6.25


## Q2. Preparing the output

In [10]:
# Two-column result: the row index is inherited from df['ride_id'], and
# y_pred is matched to it by position.
df_result = pd.DataFrame({
    'ride_id': df['ride_id'],
    'predicted_duration': y_pred,
})

In [11]:
# compression=None writes the file uncompressed; index=False leaves the
# DataFrame index out of the file.
df_result.to_parquet(
    output_file,
    engine='pyarrow',
    compression=None,
    index=False
)

In [12]:
# List the output directory with human-readable sizes to check the written file.
!ls -lh output

total 133448
-rw-r--r--  1 emmanuella  staff    65M Nov 18 20:06 yellow_tripdata_2023-03.parquet


## Q3. Creating the scoring script

In [None]:
# Convert this notebook to a script with either of:
#   jupyter nbconvert --to script score.ipynb
#   jupyter nbconvert --to script score.ipynb --output-dir ./scripts/

In [1]:
# Report the installed scikit-learn version.
# NOTE(review): sklearn is also imported in the notebook's first import cell,
# so this import is redundant when the notebook runs top to bottom.
import sklearn
print(sklearn.__version__)


1.5.0


## Q4. Virtual environment

In [1]:
# The hash is a hard-coded literal; nothing in this notebook computes or verifies it.
print('The first hash of the Scikit-Learn dependency is sha256:057b991ac64b3e75c9c04b5f9395eaf19a6179244c089afdebaad98264bff37c')

The first hash of the Scikit-Learn dependency is sha256:057b991ac64b3e75c9c04b5f9395eaf19a6179244c089afdebaad98264bff37c


## Q5. Parametrize the script

In [None]:
# The value is a hard-coded literal; it is not derived from y_pred in this notebook.
print('The mean predicted duration is 14.29 ')