In [2]:
!pip install pandas scikit-learn pyarrow fastparquet



In [3]:
!pip freeze | grep scikit-learn

scikit-learn==1.5.0


In [4]:
!python -V

Python 3.12.3


In [5]:
import pickle
import pandas as pd

In [6]:
with open('model.bin', 'rb') as f_in:
    dv, model = pickle.load(f_in)

In [7]:
categorical = ['PULocationID', 'DOLocationID']

def read_data(filename):
    df = pd.read_parquet(filename)
    
    df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    df['duration'] = df.duration.dt.total_seconds() / 60

    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    df[categorical] = df[categorical].fillna(-1).astype('int').astype('str')
    
    return df

In [8]:
df = read_data('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-03.parquet')

In [9]:
dicts = df[categorical].to_dict(orient='records')
X_val = dv.transform(dicts)
y_pred = model.predict(X_val)

In [10]:
import numpy as np

In [11]:
y_pred = model.predict(X_val)

In [12]:
std_dev = np.std(y_pred)
print(f"Standard deviation of predicted duration: {std_dev:.2f}")

Standard deviation of predicted duration: 6.25


In [13]:
# Create an artificial ride_id column
year = 2023
month = 3
df['ride_id'] = f'{year:04d}/{month:02d}_' + df.index.astype('str')

In [14]:
# Create a results dataframe with ride_id and predictions
df_result = pd.DataFrame({'ride_id': df['ride_id'], 'predicted_duration': y_pred})

In [15]:
# Define the output file name
output_file = 'results.parquet'

# Save the results to a parquet file
df_result.to_parquet(
    output_file,
    engine='pyarrow',
    compression=None,
    index=False
)

In [16]:
# Get the size of the output file
import os
file_size = os.path.getsize(output_file)
print(f"Output file size: {file_size:.2f} KB")

results_file_size = file_size / 1024
print(results_file_size)

Output file size: 68641880.00 KB
67033.0859375
