#### Question 1. Notebook (1 point)

6.24

#### Question 2. Preparing the output (1 point)

66M

#### Question 3. Creating the scoring script (1 point)

In [None]:
%jupyter nbconvert --to script starter.ipynb

#### Question 4. Virtual environment. Hash for Scikit-Learn (1 point)

sha256:23fb9e74b813cc2528b5167d82ed08950b11106ccf50297161875e45152fb311

#### Question 5. Parametrize the script (1 point)

taxi_pred.py

In [None]:
#!/usr/bin/env python
# coding: utf-8

import argparse
import pickle

import numpy as np
import pandas as pd

# Columns treated as categorical features; they are cast to strings in read_data.
categorical = ['PULocationID', 'DOLocationID']


def read_data(filename):
    """Load a trip parquet file and prepare it for scoring.

    Args:
        filename: Path passed straight to ``pd.read_parquet``.

    Returns:
        A DataFrame with an added ``duration`` column (minutes between
        ``tpep_pickup_datetime`` and ``tpep_dropoff_datetime``), restricted to
        rows whose duration lies in [1, 60], and with the ``categorical``
        columns converted to strings (missing values become ``'-1'``).
    """
    trips = pd.read_parquet(filename)

    # Trip length in minutes.
    elapsed = trips['tpep_dropoff_datetime'] - trips['tpep_pickup_datetime']
    trips['duration'] = elapsed.dt.total_seconds() / 60

    # Keep only trips lasting between 1 and 60 minutes (both bounds inclusive).
    in_range = trips['duration'].between(1, 60)
    trips = trips[in_range].copy()

    # Missing IDs become -1 before the int -> str conversion.
    location_ids = trips[categorical].fillna(-1)
    trips[categorical] = location_ids.astype('int').astype('str')

    return trips

def main(year, month):
    """Score one month of yellow-taxi trips and write the predictions to parquet.

    Loads a ``(dv, model)`` pair from ``model.bin`` in the working directory,
    reads ``./yellow_tripdata_{year}-{month:02d}.parquet`` via ``read_data``,
    predicts a duration for every remaining trip, prints the mean prediction,
    and writes ``ride_id`` / ``predicted_duration`` to
    ``./predicted_{year}_{month:02d}.parquet``.

    Args:
        year: Integer year of the dataset.
        month: Integer month of the dataset.
    """
    # NOTE(review): pickle.load can execute arbitrary code; only use a trusted model.bin.
    # presumably dv is a fitted vectorizer and model a fitted regressor — confirm.
    with open('model.bin', 'rb') as model_file:
        dv, model = pickle.load(model_file)

    trips = read_data(f'./yellow_tripdata_{year}-{month:02d}.parquet')

    # One feature dict per trip, built from the categorical columns only.
    records = trips[categorical].to_dict(orient='records')
    features = dv.transform(records)
    y_pred = model.predict(features)

    average_minutes = np.mean(y_pred)
    print(f'Mean predicted duration for {year}/{month:02d}: {average_minutes:.2f} minutes')

    # Ride identifier: "<year>/<month>_<row index of the trip in the input frame>".
    id_prefix = f'{year:04d}/{month:02d}_'
    trips['ride_id'] = id_prefix + trips.index.astype('str')

    predictions = pd.DataFrame({
        'ride_id': trips['ride_id'],
        'predicted_duration': y_pred
    })

    predictions.to_parquet(
        f'./predicted_{year}_{month:02d}.parquet',
        engine='pyarrow',
        compression=None,
        index=False
    )

def parse_args(argv=None):
    """Parse the command-line options of the scoring script.

    Args:
        argv: Optional list of argument strings. When None, argparse falls
            back to ``sys.argv[1:]``.

    Returns:
        An ``argparse.Namespace`` with integer attributes ``year`` and ``month``.

    Raises:
        SystemExit: Raised by argparse when an option is missing, is not an
            integer, or ``--month`` is outside 1-12.
    """
    parser = argparse.ArgumentParser(description='Predict taxi ride durations')
    parser.add_argument('--year', type=int, required=True, help='Year')
    # Reject impossible months up front instead of failing later on a
    # non-existent input file name.
    parser.add_argument(
        '--month',
        type=int,
        required=True,
        choices=range(1, 13),
        metavar='MONTH',
        help='Month of the dataset (1-12)',
    )
    return parser.parse_args(argv)


if __name__ == '__main__':
    args = parse_args()

    main(args.year, args.month)


#### Question 6. Docker container (1 point)
Dockerfile

```dockerfile
# NOTE(review): the script opens model.bin from the working directory and nothing
# below copies it, so the base image presumably provides /app/model.bin — confirm.
FROM agrigorev/zoomcamp-model:mlops-2024-3.10.13-slim

WORKDIR /app

# Install dependencies before copying the script and data so this layer stays
# cached when only taxi_pred.py or the parquet file changes.
COPY requirements.txt .

# --no-cache-dir keeps pip's download cache out of the image.
RUN pip install --no-cache-dir -r requirements.txt && pip install --no-cache-dir pyarrow

COPY taxi_pred.py .

COPY yellow_tripdata_2023-05.parquet .

CMD ["python", "taxi_pred.py", "--year", "2023", "--month", "5"]
```
