# ML Pipeline

V3 : simple pipeline with few features

In [53]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [54]:
import os
import json
import pickle
import pandas as pd
import geopandas as gpd
from region_new import Region

In [55]:
path = os.path.join("data","ml", "Ordu.pkl")
with open(path, 'rb') as f:     
    data = pickle.load(f)
data.head(10)

Unnamed: 0,height,orient,type_level_0,density,area,source,geometry,type_level_1,type_level_2
0,3.0,57.0,,1,589.724363,OSM,"POLYGON ((37.85878 41.01506, 37.85899 41.01504...",,
1,3.0,125.0,religious,1,517.641569,OSM,"POLYGON ((37.85890 41.01424, 37.85909 41.01425...",religious,others
2,3.0,52.0,Other buildings,1,150.135975,OSM,"POLYGON ((37.86598 41.00785, 37.86600 41.00785...",residential,residential
3,3.0,67.0,Other buildings,1,907.775199,OSM,"POLYGON ((37.86608 41.00655, 37.86608 41.00655...",residential,residential
4,3.0,41.0,Other buildings,1,1869.903589,OSM,"POLYGON ((37.86631 41.00708, 37.86631 41.00708...",residential,residential
5,3.0,49.0,Other buildings,1,1609.714757,OSM,"POLYGON ((37.86614 41.00693, 37.86619 41.00693...",residential,residential
6,3.0,55.0,Other buildings,1,384.167994,OSM,"POLYGON ((37.86624 41.00885, 37.86622 41.00875...",residential,residential
7,3.0,56.0,Other buildings,1,283.882126,OSM,"POLYGON ((37.86609 41.00870, 37.86612 41.00869...",residential,residential
8,3.0,65.0,Other buildings,1,1151.967331,OSM,"POLYGON ((37.86638 41.00979, 37.86616 41.00982...",residential,residential
9,3.0,70.0,Other buildings,1,275.170659,OSM,"POLYGON ((37.86623 41.00934, 37.86623 41.00934...",residential,residential


First, we need to ensure the dataset contains all features as it results from different sources. 

In [56]:
from ml.add_all_features import *
data = add_all_features(data,  'Ordu')

adding polygons... 
adding denstity... 
adding area... 
adding rasters... 


In [57]:
def split( data ):
        data_train = data.dropna(subset='height')
        data_test = pd.concat(
                [data, data_train]
                ).drop_duplicates(keep=False)
        return data_train, data_test

In [58]:
data, test = split(data)
data.head()

Unnamed: 0,height,orient,type_level_0,density,area,source,geometry,type_level_1,type_level_2,ntl,index_right,lcz
0,3.0,57.0,,1,589.724363,OSM,"POLYGON ((37.85878 41.01506, 37.85899 41.01504...",,,10.014004,32.0,0.0
0,3.0,57.0,,1,589.724363,OSM,"POLYGON ((37.85878 41.01506, 37.85899 41.01504...",,,10.014004,6.0,12.0
0,3.0,57.0,,1,589.724363,OSM,"POLYGON ((37.85878 41.01506, 37.85899 41.01504...",,,11.764342,32.0,0.0
0,3.0,57.0,,1,589.724363,OSM,"POLYGON ((37.85878 41.01506, 37.85899 41.01504...",,,11.764342,6.0,12.0
1,3.0,125.0,religious,1,517.641569,OSM,"POLYGON ((37.85890 41.01424, 37.85909 41.01425...",religious,others,10.014004,6.0,12.0


In [59]:
save = data
save_test = test
data.drop(['geometry', 'source', 'orient'], axis=1, inplace=True)
test.drop(['geometry', 'source', 'orient'], axis=1, inplace=True)

The actual pipeline : scale, one-hot-encode categories and fill NaN values

In [60]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.linear_model import SGDClassifier

In [61]:
y = data['height']
X = data.drop('height', axis=1)

In [62]:
numerical_features = ['ntl', 'area']
categorical_features = make_column_selector(dtype_include = object)
discrete_features = ['lcz', 'index_right']

In [87]:
categorical_feature_names = [
    X['type_level_0'].unique(), 
    (X['type_level_1']).unique(), 
    (X['type_level_2']).unique(), 
]
categorical_feature_names

[array([nan, 'religious', 'Other buildings', 'school', 'residential',
        'Accommodation', 'office', 'stadium', 'roof', 'commercial',
        'dormitory', 'public', 'mall', 'hospital', 'greenhouse', 'civic',
        'industrial', 'university', 'kindergarten'], dtype=object),
 array([nan, 'religious', 'residential', 'school', 'civic', 'sports',
        'commercial', 'agricultural', 'industrial'], dtype=object),
 array([nan, 'others', 'residential', 'commercial', 'industrial'],
       dtype=object)]

In [91]:
numerical_pipeline = make_pipeline(
    SimpleImputer( strategy='mean'),
    StandardScaler(),
    )
categorical_pipeline = make_pipeline(
    SimpleImputer( strategy='most_frequent'),
    OneHotEncoder(categories=categorical_feature_names)
)
discrete_pipeline = make_pipeline(
    SimpleImputer( strategy='most_frequent'),
    StandardScaler(),
)

In [92]:
preprocessor = make_column_transformer( 
    (numerical_pipeline, numerical_features),
    (categorical_pipeline, categorical_features),
    (discrete_pipeline, discrete_features)
)

In [93]:
model = make_pipeline( 
    preprocessor,
    SGDClassifier()
    )
model.fit(X,y)

Testing with SGDClassifier

In [None]:
X_test = test.drop('height', axis=1)

model.predict(X_test, transform=preprocessor)

`Need to fix one-hot encoding for building types`

**Grid Search** : Model comparison and Cross Validation