# Housing prices Pachyderm pipeline

In [1]:
# Move the working directory up one level (bare `cd` relies on IPython automagic).
# NOTE(review): the path is relative, so re-running this cell moves up one more level each time.
cd ..

/home/ubuntu/dvoitekh/aihouse_mlops_course/session3/pachyderm


## Optional. Build Docker image and push to the registry
If you don't want to build your own image, just proceed to the next step to use the already available image.

In [4]:
# Each `!` line runs in its own subshell, so variables exported by
# `eval $(minikube docker-env)` would be gone by the time a separate
# `!docker build` line runs. Chain both commands in one shell so the image is
# built against minikube's Docker daemon.
!eval $(minikube docker-env) && docker build -t housing-prices-pachyderm:0.0.1 .

### Split data into 2 parts

In [5]:
import pandas as pd

# Load the full dataset and cut it at the midpoint; the two halves are
# uploaded to the Pachyderm repo in separate steps further down.
data = pd.read_parquet('../data/house_dataset_main.parquet')
midpoint = len(data) // 2
data1, data2 = data.iloc[:midpoint], data.iloc[midpoint:]

# Write each half to its own parquet file, leaving the index out.
for half, target in ((data1, 'housing1.parquet'), (data2, 'housing2.parquet')):
    half.to_parquet(target, index=False)

## Create a data repo and fill it with data

In [6]:
# Create the Pachyderm repo `housing_data` that will hold the raw input files.
!pachctl create repo housing_data

In [7]:
# Upload the local file housing1.parquet to the master branch of housing_data
# under the path housing1.parquet.
!pachctl put file housing_data@master:housing1.parquet -f housing1.parquet



In [8]:
# List the files currently on the master branch of housing_data.
!pachctl list file housing_data@master

NAME              TYPE SIZE     
/housing1.parquet file 543.2KiB 


## Deploy 2 pipelines that are executed sequentially:
1. Feature generation pipeline
2. EDA pipeline

In [9]:
# Create the feature generation pipeline from its JSON manifest.
!pachctl create pipeline -f manifests/extract_features_pipeline.json

In [10]:
# Create the EDA pipeline from its JSON manifest.
!pachctl create pipeline -f manifests/eda_pipeline.json

The pipelines' output repos are created automatically

In [11]:
# List all repos; the pipelines' output repos show up next to housing_data.
!pachctl list repo

NAME                     CREATED       SIZE (MASTER) DESCRIPTION                                        
housing_eda              1 second ago  ≤ 0B          Output repo for pipeline housing_eda.              
housing_extract_features 2 seconds ago ≤ 0B          Output repo for pipeline housing_extract_features. 
housing_data             5 seconds ago ≤ 543.2KiB                                                       


In [13]:
# Show each pipeline with its input repo and the state of the pipeline and its last job.
!pachctl list pipeline

NAME                     VERSION INPUT                      CREATED       STATE / LAST JOB   DESCRIPTION                                                   
housing_eda              1       housing_extract_features:/ 5 seconds ago running / starting A pipeline that performs EDA for the housing dataset features 
housing_extract_features 1       housing_data:/             6 seconds ago running / running  A pipeline that preprocesses housing dataset                  


### View pipeline logs

Check k8s workers that were created

In [14]:
# List pods in the `pachyderm` namespace and keep only the lines containing "housing".
!kubectl get po -n pachyderm | grep housing

pipeline-housing-eda-v1-d2zpj                2/2     Running   0          9s
pipeline-housing-extract-features-v1-h7dcc   2/2     Running   0          10s


and k8s logs

In [10]:
# Worker pod names end in a generated suffix that differs between runs, so look
# the pod up at run time instead of hard-coding a name copied from an earlier run.
# `-o name` prints entries as `pod/<name>`, which `kubectl logs` accepts directly.
pods = !kubectl get po -n pachyderm -o name | grep housing-extract-features
!kubectl logs {pods[0]} -n pachyderm

logs in a more friendly format

In [15]:
# Print the logs of the housing_extract_features pipeline via pachctl.
!pachctl logs --pipeline=housing_extract_features

start feature generation job
input files: ['/pfs/housing_data/housing1.parquet']


In [16]:
# Print the logs of the housing_eda pipeline via pachctl.
!pachctl logs --pipeline=housing_eda

start eda job
input files: ['/pfs/housing_extract_features/features_2a1bd07439054ccf6ef95e8149a31386.parquet']


In [17]:
# List the jobs of the housing_eda pipeline (singular `list job`, matching the
# form used for housing_extract_features elsewhere in this notebook).
!pachctl list job -p housing_eda

PIPELINE    ID                               STARTED        DURATION RESTART PROGRESS  DL UL STATE   
housing_eda f78c9231a1f1405890e09f96bb15da9a 11 seconds ago -        0       0 + 0 / 1 0B 0B running 


In [18]:
# List the files on the master branch of the housing_extract_features output repo.
!pachctl list file housing_extract_features@master

NAME                                               TYPE SIZE     
/features_2a1bd07439054ccf6ef95e8149a31386.parquet file 397.1KiB 


In [20]:
# List the files on the master branch of the housing_eda output repo (singular
# `list file`, matching the other file listings in this notebook).
!pachctl list file housing_eda@master

NAME                                               TYPE SIZE     
/eda_profile_6d3be99c29794813b7659abad2e41e2c.html file 3.082MiB 


### Let's check the preprocessed dataset

In [24]:
# Download everything on housing_extract_features@master into the current directory.
!pachctl get file housing_extract_features@master --recursive --output .

In [25]:
import pandas as pd

# Load the feature file downloaded from the pipeline's output repo and show
# its first rows.
features_preview = pd.read_parquet('features_2a1bd07439054ccf6ef95e8149a31386.parquet')
features_preview.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,MedHouseVal
0,2.4792,24.0,3.454704,1.134146,2251.0,3.921603,2.0
1,3.463,8.0,6.363636,1.166297,1307.0,2.898004,2.017
2,3.75,16.0,5.768719,1.023295,1478.0,2.459235,1.473
3,2.8542,34.0,3.858779,1.045802,1164.0,4.442748,1.469
4,1.3375,18.0,4.567625,1.087327,2707.0,2.882854,0.596


### We can check the EDA report now

In [27]:
# Download everything on housing_eda@master (the EDA report) into the current directory.
!pachctl get file housing_eda@master --recursive --output .

In [46]:
from pathlib import Path

# `IPython.display` is the public display API; importing `display` from
# `IPython.core.display` is deprecated.
from IPython.display import HTML, display

# Read the downloaded EDA report and render it inline in the notebook.
html = Path('eda_profile_6d3be99c29794813b7659abad2e41e2c.html').read_text()
display(HTML(html))

### Also, let's look at the commits and the data they include

In [21]:
# List the commits on the master branch of the housing_eda output repo.
!pachctl list commit housing_eda@master

REPO        BRANCH COMMIT                           FINISHED       SIZE     ORIGIN DESCRIPTION
housing_eda master f78c9231a1f1405890e09f96bb15da9a 24 seconds ago 3.082MiB AUTO    


In [23]:
# List the commits, across repos, that carry this commit ID.
# NOTE(review): the ID is copied from the `list commit housing_eda@master` output
# and presumably differs on every run — replace it with your own.
!pachctl list commit f78c9231a1f1405890e09f96bb15da9a

REPO                          BRANCH COMMIT                           FINISHED       SIZE     ORIGIN DESCRIPTION
housing_eda.spec              master f78c9231a1f1405890e09f96bb15da9a 53 seconds ago 0B       USER    
housing_extract_features.spec master f78c9231a1f1405890e09f96bb15da9a 53 seconds ago 0B       ALIAS   
housing_data                  master f78c9231a1f1405890e09f96bb15da9a 53 seconds ago 543.2KiB ALIAS   
housing_eda                   master f78c9231a1f1405890e09f96bb15da9a 32 seconds ago 3.082MiB AUTO    
housing_eda.meta              master f78c9231a1f1405890e09f96bb15da9a 32 seconds ago 3.47MiB  AUTO    
housing_extract_features.meta master f78c9231a1f1405890e09f96bb15da9a 47 seconds ago 941KiB   ALIAS   
housing_extract_features      master f78c9231a1f1405890e09f96bb15da9a 47 seconds ago 397.1KiB ALIAS   


### Let's add another data file

In [24]:
# Upload the second half of the dataset to the master branch of housing_data
# under the path housing2.parquet.
!pachctl put file housing_data@master:housing2.parquet -f housing2.parquet



In [25]:
# List the files on housing_data@master again; both halves should now be present.
!pachctl list file housing_data@master

NAME              TYPE SIZE     
/housing1.parquet file 543.2KiB 
/housing2.parquet file 540.2KiB 


In [35]:
# Print the housing_extract_features pipeline logs again after the second upload.
!pachctl logs --pipeline=housing_extract_features

start feature generation job
input files: ['/pfs/housing_data/housing1.parquet']
start feature generation job
input files: ['/pfs/housing_data/housing1.parquet', '/pfs/housing_data/housing2.parquet']


In [26]:
# List the jobs of the housing_extract_features pipeline.
!pachctl list job -p housing_extract_features

PIPELINE                 ID                               STARTED        DURATION           RESTART PROGRESS  DL       UL       STATE   
housing_extract_features e40d882420e94d2f8950ce1bda65c489 2 seconds ago  1 second           0       1 + 0 / 1 1.058MiB 740.7KiB success 
housing_extract_features 23a9c1d8b35c456392f9d9df69d2669b 53 seconds ago Less than a second 0       1 + 0 / 1 543.2KiB 397.1KiB success 


In [27]:
# List the jobs of the housing_eda pipeline (singular `list job`, matching the
# form used for housing_extract_features in this notebook).
!pachctl list job -p housing_eda

PIPELINE    ID                               STARTED        DURATION   RESTART PROGRESS  DL       UL       STATE   
housing_eda e40d882420e94d2f8950ce1bda65c489 3 seconds ago  -          0       0 + 0 / 1 0B       0B       running 
housing_eda f78c9231a1f1405890e09f96bb15da9a 54 seconds ago 14 seconds 0       1 + 0 / 1 397.1KiB 3.082MiB success 


In [28]:
# List the files on housing_extract_features@master after the second upload.
!pachctl list file housing_extract_features@master

NAME                                               TYPE SIZE     
/features_4f86d687113ab5b0c3984dc49a27b8c0.parquet file 740.7KiB 


In [30]:
# NOTE(review): identical to the preceding listing cell; it looks like a leftover re-run.
!pachctl list file housing_extract_features@master

NAME                                               TYPE SIZE     
/features_4f86d687113ab5b0c3984dc49a27b8c0.parquet file 740.7KiB 


In [33]:
# Download the current contents of housing_extract_features@master into the current directory.
!pachctl get file housing_extract_features@master --recursive --output .

In [34]:
import pandas as pd

# Load the regenerated feature file, print its row count, and show the first rows.
# NOTE(review): the count is expected to cover both uploaded halves — confirm
# against the size of the original dataset.
data = pd.read_parquet('features_4f86d687113ab5b0c3984dc49a27b8c0.parquet')
print(data.shape[0])
data.head()

20640


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,MedHouseVal
0,2.4792,24.0,3.454704,1.134146,2251.0,3.921603,2.0
1,3.463,8.0,6.363636,1.166297,1307.0,2.898004,2.017
2,3.75,16.0,5.768719,1.023295,1478.0,2.459235,1.473
3,2.8542,34.0,3.858779,1.045802,1164.0,4.442748,1.469
4,1.3375,18.0,4.567625,1.087327,2707.0,2.882854,0.596
