# Homework 6: Best Practices

Check the starter code:

In [2]:
!ls -l

total 88
-rw-rw-r-- 1 desenfirman desenfirman  1285 Jul 31 23:12 batch.py
-rw-rw-r-- 1 desenfirman desenfirman   268 Jul 31 23:12 Dockerfile
-rw-rw-r-- 1 desenfirman desenfirman   367 Jul 31 23:15 homework.ipynb
-rw-rw-r-- 1 desenfirman desenfirman 17760 Jul 31 23:12 model.bin
-rw-rw-r-- 1 desenfirman desenfirman   201 Jul 31 23:12 Pipfile
-rw-rw-r-- 1 desenfirman desenfirman 49540 Jul 31 23:12 Pipfile.lock


## Question 1: Refactoring

In [8]:
!cat ./batch.py | tail -n +32 | head -n 60

def main(year, month, input_file, output_file):
    """Predict ride durations for one month and save them as parquet.

    Args:
        year: Year of the batch; used as the zero-padded prefix of each ride id.
        month: Month of the batch; used as the zero-padded prefix of each ride id.
        input_file: Location handed to ``read_data``.
        output_file: Destination passed to ``DataFrame.to_parquet``.

    Relies on the module-level ``read_data``, ``dv`` and ``lr`` objects
    (presumably the vectorizer and model loaded from model.bin; confirm
    against the top of batch.py).
    """
    categorical = ['PUlocationID', 'DOlocationID']
    rides = read_data(input_file, categorical=categorical)

    # Each ride id is "<yyyy>/<mm>_<row index>".
    rides['ride_id'] = f'{year:04d}/{month:02d}_' + rides.index.astype('str')

    features = dv.transform(rides[categorical].to_dict(orient='records'))
    y_pred = lr.predict(features)

    print('predicted mean duration:', y_pred.mean())

    # Only the id and the prediction are persisted.
    predictions = pd.DataFrame({
        'ride_id': rides['ride_id'],
        'predicted_duration': y_pred,
    })
    predictions.to_parquet(output_file, engine='pyarrow', index=False)


if __name__ == '__main__':
    # Usage: python batch.py <year> <month>
    year, month = int(sys.argv[1]), int(sys.argv[2])

    # Input: the month's FHV trip file hosted on GitHub.
    input_file = f'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/nyc-tlc/fhv/fhv_tripdata_{year:04d}-{month:02d}.parquet'
    # Output: a parquet file in the current working directory.
    output_file = f'./taxi_type=fhv_year={year:04d}_month={month:02d}_predictions.parquet'

    main(year=year, month=month, input_file=input_file, output_file=output_file)


In [9]:
!python batch.py 2021 02

predicted mean duration: 16.191691679979066


In [16]:
!ls

 batch.py	  Pipfile
 Dockerfile	  Pipfile.lock
 homework.ipynb  'taxi_type=fhv_year=2021_month=02_predictions.parquet'
 model.bin


## Question 2: Installing pytest


In [17]:
!pipenv install --dev pytest

[39m[1mCreating a virtualenv for this project...[39m[22m
Pipfile: [33m[1m/home/desenfirman/development/mlops-zoomcamp/_homework/06-best-practices/homework/Pipfile[39m[22m
[39m[1mUsing[39m[22m [33m[1m/home/desenfirman/.pyenv/versions/3.9.9/bin/python3.9[39m[22m [32m[22m(3.9.9)[39m[22m [39m[1mto create virtualenv...[39m[22m
⠼[0m Creating virtual environment...[K[36m[22mcreated virtual environment CPython3.9.9.final.0-64 in 1896ms
  creator CPython3Posix(dest=/home/desenfirman/.local/share/virtualenvs/homework-GkJuXekF, clear=False, no_vcs_ignore=False, global=False)
  seeder FromAppData(download=False, pip=bundle, setuptools=bundle, wheel=bundle, via=copy, app_data_dir=/home/desenfirman/.local/share/virtualenv)
    added seed packages: pip==22.0.4, setuptools==62.1.0, wheel==0.37.1
  activators BashActivator,CShellActivator,FishActivator,NushellActivator,PowerShellActivator,PythonActivator
[39m[22m
[K[?25h[32m[22m✔ Successfully created virtual environmen

In [1]:
# -p keeps this cell re-runnable: plain `mkdir` fails once tests/ already exists.
!mkdir -p tests && touch tests/test_batch.py tests/__init__.py

## Question 3: Writing first unit test

In [14]:
!cat tests/test_batch.py 

import pandas as pd
from datetime import datetime
from batch import prepare_data

def dt(hour, minute, second=0):
    """Return a datetime on 2021-01-01 at the given hour, minute and second."""
    return datetime(year=2021, month=1, day=1, hour=hour, minute=minute, second=second)


def test_prepare_data():
    """prepare_data should keep exactly two of the four sample rides."""
    columns = ['PUlocationID', 'DOlocationID', 'pickup_datetime', 'dropOff_datetime']
    # Four rides on 2021-01-01: (pickup location, dropoff location, pickup, dropoff).
    rows = [
        (None, None, dt(1, 2), dt(1, 10)),
        (1, 1, dt(1, 2), dt(1, 10)),
        (1, 1, dt(1, 2, 0), dt(1, 2, 50)),
        (1, 1, dt(1, 2, 0), dt(2, 2, 1)),
    ]
    expected_row_count = 2

    rides = pd.DataFrame(rows, columns=columns)
    prepared = prepare_data(rides, categorical=['PUlocationID', 'DOlocationID'])

    assert prepared.shape[0] == expected_row_count

In [15]:
!pytest 

platform linux -- Python 3.8.12, pytest-6.2.5, py-1.11.0, pluggy-1.0.0
rootdir: /home/desenfirman/development/mlops-zoomcamp/_homework/06-best-practices/homework
plugins: cov-3.0.0, anyio-3.5.0, mock-1.13.0
collected 1 item                                                               [0m[1m

tests/test_batch.py [32m.[0m[32m                                                    [100%][0m



Answer: 2 rows remain in the dataframe returned by `prepare_data`.

## Question 4: Mocking S3 with Localstack

In [16]:
!touch docker-compose.yml

In [17]:
!cat docker-compose.yml

# Single-service stack: a Localstack container standing in for AWS S3.
version: '3.7'

services:
  localstack:
    image: localstack/localstack
    # Publish container port 4566 on the same host port.
    ports:
      - '4566:4566'
    # Only the S3 service is requested.
    environment:
      - SERVICES=s3


In [18]:
!docker-compose up -d

Creating network "homework_default" with the default driver
Pulling localstack (localstack/localstack:)...
latest: Pulling from localstack/localstack

[1Bb615420c: Already exists
[1Bf1d7987a: Already exists
[1B25cbf621: Pulling fs layer
[1Ba7c10346: Pulling fs layer
[1B1f8fbfcd: Pulling fs layer
[1B14a39634: Pulling fs layer
[1B115d48ea: Pulling fs layer
[1Bfc7bed51: Pulling fs layer
[1B254cd150: Pulling fs layer
[1Ba1cd7148: Pulling fs layer
[1Bd0011be9: Pulling fs layer
[1Bd3e77c3f: Pulling fs layer
[1Bc0b617bc: Pulling fs layer
[1B110a9e83: Pulling fs layer
[1Bb700ef54: Pulling fs layer
[1B54aaa182: Pulling fs layer
[1B5055c914: Pulling fs layer
[1Ba8f378fa: Pulling fs layer
[1B141f0a71: Pulling fs layer
[1Ba32aab33: Pulling fs layer
[1B871a0806: Pulling fs layer
[1Bb9d150d2: Pulling fs layer
[1B0fa1ec02: Pulling fs layer
[1Bee0ced06: Pulling fs layer
[1B6e295841: Pulling fs layer
[1Bf5b29e2a: Pulling fs layer
[1Badca8e3c: Pulling fs layer
[1B87b4b5f7: Pu

In [19]:
!aws s3 mb s3://nyc-duration


/bin/bash: line 1: aws: command not found


In [20]:
!docker ps | grep localstac

8d4b8b9d4f87   localstack/localstack                                           "docker-entrypoint.sh"   7 minutes ago   Up 7 minutes (healthy)   4510-4559/tcp, 5678/tcp, 0.0.0.0:4566->4566/tcp, :::4566->4566/tcp   homework_localstack_1


In [23]:
!docker exec -t homework_localstack_1 sh -c 'awslocal s3 mb s3://nyc-duration'

make_bucket: nyc-duration


In [24]:
!docker exec -t homework_localstack_1 sh -c 'awslocal s3 ls'

2022-07-31 17:15:59 nyc-duration


In [25]:
import os

# `!export ...` runs in a throwaway subshell, so the variables vanish as soon as
# that line finishes. Setting them on os.environ makes them visible to the kernel
# and to every later `!command` (e.g. `!python batch.py ...`), which inherits the
# kernel's environment. The `{year:04d}` / `{month:02d}` placeholders are kept
# literally; batch.py fills them in with str.format.
os.environ["INPUT_FILE_PATTERN"] = "s3://nyc-duration/in/{year:04d}-{month:02d}.parquet"
os.environ["OUTPUT_FILE_PATTERN"] = "s3://nyc-duration/out/{year:04d}-{month:02d}.parquet"

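Modify `batch.py` so that the input and output locations are read from the `INPUT_FILE_PATTERN` and `OUTPUT_FILE_PATTERN` environment variables, falling back to the original defaults when they are not set: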

In [27]:
!cat ./batch.py | tail -n +50 | head -n 60


    df_result.to_parquet(output_file, engine='pyarrow', index=False)

def get_input_path(year, month):
    """Return the input parquet location for the given year and month.

    The template is taken from the ``INPUT_FILE_PATTERN`` environment variable
    when it is set, otherwise the GitHub-hosted FHV trip data URL is used. The
    template's ``{year}`` and ``{month}`` fields are filled in with the arguments.
    """
    pattern = os.environ.get(
        'INPUT_FILE_PATTERN',
        'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/nyc-tlc/fhv/fhv_tripdata_{year:04d}-{month:02d}.parquet',
    )
    return pattern.format(year=year, month=month)


def get_output_path(year, month):
    """Return the output parquet location for the given year and month.

    The template is taken from the ``OUTPUT_FILE_PATTERN`` environment variable
    when it is set, otherwise the default S3 predictions path is used. The
    template's ``{year}`` and ``{month}`` fields are filled in with the arguments.
    """
    pattern = os.environ.get(
        'OUTPUT_FILE_PATTERN',
        's3://nyc-duration-prediction-alexey/taxi_type=fhv/year={year:04d}/month={month:02d}/predictions.parquet',
    )
    return pattern.format(year=year, month=month)


if __name__ == '__main__':
    # Usage: python batch.py <year> <month>
    year, month = int(sys.argv[1]), int(sys.argv[2])

    # Both locations come from the path helpers, which honour the
    # INPUT_FILE_PATTERN / OUTPUT_FILE_PATTERN environment overrides.
    input_file, output_file = get_input_path(year, month), get_output_path(year, month)

    main(year, month, input_file, output_file)
