## MLeap Scikit-Learn Demo

The goal of this demo is to:
    1. Put together an ML pipeline using scikit transformers, pipeline and feature unions
    2. Train a linear regression to predict listing prices
    3. Demonstrate how to serialize scikit-learn transformers and models to bundle.ml
    4. TODO: use .deploy() to deploy a model to combust cloud
    5. TODO: deserialize the pipeline in Spark
    
Note: MLeap <> Scikit-Learn integration is experimental. We are planning to release a stable version with mleap-0.6.0

## Background on the Dataset

The dataset used for the demo was pulled together from individual cities' data found [here](http://insideairbnb.com/get-the-data.html). We've also gone ahead and pulled the individual datasets and relevant features into this [research dataset](https://s3-us-west-2.amazonaws.com/mleap-demo/datasources/airbnb.avro.zip) stored as avro.

### Step 0: Load libraries and data

In [None]:
import uuid

import sys
import pandas as pd
# Make sure to checkout the feature/scikit-v2 branch
sys.path.append('/Users/mikhail/combust/combust-mleap/python')

import mleap.sklearn.pipeline
import mleap.sklearn.feature_union
import mleap.sklearn.base
import mleap.sklearn.logistic
import mleap.sklearn.preprocessing.data
from mleap.sklearn.ensemble import forest

from mleap.sklearn.preprocessing.data import FeatureExtractor, NDArrayToDataFrame, ToDense

from sklearn.linear_model import LinearRegression
from sklearn.ensemble.forest import RandomForestRegressor
from sklearn.pipeline import Pipeline, FeatureUnion

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Imputer
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import LabelEncoder, OneHotEncoder

import numpy as np

In [None]:
# Load the Airbnb listings CSV from the local demo checkout.
# NOTE: `error_bad_lines` / `warn_bad_lines` were deprecated in pandas 1.3 in
# favour of `on_bad_lines`; they are kept here as-is for the pandas version
# this demo was written against.
df = pd.read_csv(
    '/Users/mikhail/combust/mleap-demo/data/airbnb/airbnb.csv',
    error_bad_lines=False,  # drop rows that fail to parse instead of raising
    warn_bad_lines=False,   # ...and do not emit a warning for each dropped row
)


### Step 1: Standardize the data for our demo

In [None]:
def _transform_state(state):
    if state in ['NY', 'CA', 'London', 'Berlin', 'TX', 'IL', 'OR', 'DC', 'WA']:
        return state
    return 'Other'


### Step 1.1: Take a look at some summary statistics of the data

In [None]:
# Row count and mean price per state; show the ten states with the most rows.
(
    df[['state', 'price']]
    .groupby('state')
    .agg([np.size, np.mean])
    .sort_values(by=('price', 'size'), ascending=False)
    .head(10)
)

In [None]:
# Row count, mean and max price per state, most expensive (by mean) first.
price_stats = (
    df[['state', 'price']]
    .groupby('state')
    .agg([np.size, np.mean, np.max])
    .sort_values(by=('price', 'mean'), ascending=False)
)
# Only show states with more than 25 rows so tiny groups do not top the list.
price_stats[price_stats[('price', 'size')] > 25].head(10)

In [None]:
# convert to categorical feature
df['host_is_superhost'] = df['host_is_superhost'].apply(str)
df['instant_bookable'] = df['instant_bookable'].apply(str)

# normalize state
df['state'] = df.state.apply(_transform_state)


### Step 2: Define continuous and categorical features and filter nulls

In [None]:
# Numeric columns of the listings data that feed the models.
continuous_features = [
    "bathrooms",
    "bedrooms",
    "security_deposit",
    "cleaning_fee",
    "extra_people",
    "number_of_reviews",
    "square_feet",
    "review_scores_rating",
]

# Columns treated as categories (label-encoded, then one-hot encoded later on).
categorical_features = [
    "room_type",
    "host_is_superhost",
    "cancellation_policy",
    "state",
    "instant_bookable",
]


In [None]:
# Column names for the imputed copies of the continuous features: 'imp_<original name>'.
imputed_continuous_features = ['imp_{}'.format(x) for x in continuous_features]

# MLeap FeatureExtractor; the arguments are presumably (input columns, name of the
# output vector, names of the output columns) -- confirm against
# mleap.sklearn.preprocessing.data.FeatureExtractor.
feature_extractor2_tf = FeatureExtractor(continuous_features, 'imputed_features', imputed_continuous_features)

# Mean imputation (strategy='mean', axis=0). Despite the variable name, the imputer
# sits behind an extractor built from *all* continuous_features, not just
# security_deposit.
impute_security_deposit_tf = Imputer(strategy='mean', axis=0)
# minit() is added to scikit-learn objects by the mleap.sklearn imports; presumably it
# records the input/output feature names needed for serialization -- TODO confirm.
impute_security_deposit_tf.minit(input_features=feature_extractor2_tf.output_vector, output_features='imputed_features')

# Pipeline steps are keyed by each transformer's `.name` attribute.
impute_pipeline = Pipeline([
        (feature_extractor2_tf.name, feature_extractor2_tf),
        (impute_security_deposit_tf.name, impute_security_deposit_tf)
    ])
impute_pipeline.minit()

# Consider doing this via a feature union
# Fit the imputer on df and attach its output as new 'imp_*' columns.
# NOTE(review): DataFrame.join aligns on the index; this presumably relies on df still
# having its default 0..n-1 index so it lines up with the freshly built frame -- confirm.
# NOTE(review): the imputer is fit on the whole of df, i.e. before the train/test split
# performed later in the notebook, so its statistics include rows that end up in the
# test set.
df2 = df.join(pd.DataFrame(impute_pipeline.fit_transform(df), columns=feature_extractor2_tf.output_vector_items))

# Full model input: imputed continuous columns followed by the categorical columns.
all_features = imputed_continuous_features + categorical_features

### Step 3: Split data into training and validation 

In [None]:
# First filter out outlier prices
df2 = df2[(df2.price>=50)&(df2.price<=500)]

# Split into training and test
X_train, X_test, y_train, y_test = train_test_split(df2[all_features], df2[['price']], test_size=0.33, random_state=42)

### Step 4: Continuous Feature Pipeline

In [None]:
# Extract the imputed continuous columns; the third argument names the outputs
# 'scaled_<imputed column name>'. (Argument meaning presumably: input columns, output
# vector name, output column names -- confirm against MLeap's FeatureExtractor.)
feature_extractor_tf = FeatureExtractor(imputed_continuous_features, 'unscaled_cont_features', ["scaled_{}".format(x) for x in imputed_continuous_features])

# Standard scaler with scikit-learn defaults, wired to the extractor's output vector.
standard_scaler_tf = StandardScaler()
# minit() comes from the mleap.sklearn imports; presumably it records the feature
# names used when the transformer is serialized -- TODO confirm.
standard_scaler_tf.minit(input_features=feature_extractor_tf.output_vector, output_features='scaled_cont_features')

# Two-step pipeline (extract, then scale); steps are keyed by each object's `.name`.
standard_scaler_pipeline = Pipeline([(feature_extractor_tf.name, feature_extractor_tf),
                            (standard_scaler_tf.name, standard_scaler_tf)])
standard_scaler_pipeline.minit()

### Step 5: Categorical Feature Pipeline

In [None]:
# TODO: Need to fix scikit's One-Hot-Encoder to drop the last column of a matrix if we're using it for ML
def _create_le_one_hot_pipeline(feature_name):
    """Build the encoding pipeline for a single categorical column.

    The returned (unfitted) ``Pipeline`` chains five steps, in this order:
    a ``FeatureExtractor`` for ``feature_name``, a ``LabelEncoder``, an
    ``NDArrayToDataFrame`` wrapper, a ``OneHotEncoder`` and a ``ToDense``
    step. Every step is registered under its own ``.name`` attribute, and
    ``minit()`` (an MLeap extension) is called on the encoders and on the
    pipeline itself.

    :param feature_name: name of the categorical column to encode.
    :return: the assembled ``Pipeline``.
    """
    # Pull out just this one column.
    extractor = FeatureExtractor(
        [feature_name],
        '{}_label'.format(feature_name),
        ['{}_label_encoded'.format(feature_name)],
    )

    # Label-encode the extracted column.
    encoder = LabelEncoder()
    encoder.minit(
        input_features=extractor.output_vector,
        output_features='{}_label_le'.format(feature_name),
    )

    # Wrap the encoder output back into a data frame rather than a 1d array.
    as_frame = NDArrayToDataFrame(extractor.output_vector_items)

    # One-hot encode the label indices.
    one_hot = OneHotEncoder()
    one_hot.minit(
        input_features=encoder.output_features,
        output_features='{}_label_one_hot_encoded'.format(feature_name),
    )

    # Final step operates on the one-hot output (presumably densifying a sparse matrix).
    densify = ToDense(one_hot.output_features)

    stages = [extractor, encoder, as_frame, one_hot, densify]
    pipeline = Pipeline([(stage.name, stage) for stage in stages])
    pipeline.minit()
    return pipeline

In [None]:
# One label-encode + one-hot pipeline per categorical column, in column order.
oh_pipelines = list(map(_create_le_one_hot_pipeline, categorical_features))

### Step 6: Assemble our features and feature pipeline

In [None]:
# Combine the scaled continuous branch with every one-hot branch (scaler first),
# registering each branch under its own `.name`.
feature_union = FeatureUnion(
    [(branch.name, branch) for branch in [standard_scaler_pipeline] + oh_pipelines]
)
feature_union.minit()

In [None]:
# Fit the continuous-feature scaling pipeline on df2 and show the transformed output
# (the value of the cell's last expression is what the notebook displays).
standard_scaler_pipeline.fit_transform(df2)

### Step 7: Define our linear regression model

In [None]:
# Put all of the categorical features into a list
oh_features_lists = [[y[1].output_features for y in x.steps if y[1].op == 'one_hot_encoder'] for x in oh_pipelines]
oh_features = [item for sublist in oh_features_lists for item in sublist]
oh_features

In [None]:
# Vector Assembler, for serialization purposes only
feature_extractor_lr_model_tf = FeatureExtractor([standard_scaler_tf.output_features] + oh_features, 'input_features', [standard_scaler_tf.output_features] + oh_features)
feature_extractor_lr_model_tf.skip_fit_transform = True

# Define our linear regression
lr_model = LinearRegression()
lr_model.minit(input_features='input_features', prediction_column='price_prediction')

lr_model_pipeline = Pipeline([
        (feature_extractor_lr_model_tf.name, feature_extractor_lr_model_tf),
        (lr_model.name, lr_model)
    ])
lr_model_pipeline.minit()

In [None]:
# End-to-end linear-regression pipeline: the feature union (scaled continuous branch
# plus one-hot branches) followed by the assembler + regression pipeline.
model_pipeline = Pipeline([(feature_union.name, feature_union),
                            (lr_model_pipeline.name, lr_model_pipeline)])

# MLeap initialization of the outer pipeline (added by the mleap.sklearn imports).
model_pipeline.minit()

### Step 9: Define our Random Forest Regression Model

In [None]:
# Vector Assembler, for serialization purposes only
feature_extractor_rf_model_tf = FeatureExtractor(imputed_continuous_features, 'input_features', imputed_continuous_features)
feature_extractor_rf_model_tf.skip_fit_transform = True


rf = RandomForestRegressor(max_depth=4, n_estimators=11)
rf.minit(input_features=feature_extractor_rf_model_tf.output_vector, prediction_column='price_prediction', feature_names=imputed_continuous_features)

rf_model_pipeline = Pipeline([
        (feature_extractor_rf_model_tf.name, feature_extractor_rf_model_tf),
        (rf.name, rf)
    ])
rf_model_pipeline.minit()


In [None]:
# Fit the random-forest pipeline with its initial hyper-parameters, using only the
# imputed continuous columns of the training split.
rf_model_pipeline.fit(X_train[imputed_continuous_features], y_train)

### Step 8: Fit our pipeline and regression


In [None]:
# NOTE: cross_val_score is imported here but not used in this cell.
from sklearn.model_selection import GridSearchCV, cross_val_score
# Grid keys follow scikit-learn's '<pipeline step name>__<parameter>' convention; the
# forest step of rf_model_pipeline is named `rf.name`. 2 x 3 = 6 combinations.
params = {
    "{}__max_depth".format(rf.name): [5, 10],
    "{}__n_estimators".format(rf.name): [10, 15, 20]
}

# Cross-validated search over the whole pipeline; n_jobs=-1 uses all available cores.
rf_grid = GridSearchCV(estimator=rf_model_pipeline, param_grid=params, n_jobs=-1)
rf_grid.fit(X_train[imputed_continuous_features], y_train)

In [None]:
# Best parameter combination found by the grid search, keyed by the same
# '<step name>__<parameter>' strings used to build the grid.
best_rf = rf_grid.best_params_
# Read both values before `rf` is rebound below, because the keys are built from the
# current rf.name (the new estimator's name may differ -- TODO confirm).
best_max_depth = best_rf["{}__max_depth".format(rf.name)]
best_n_estimators = best_rf["{}__n_estimators".format(rf.name)]

# Rebuild the forest with the winning hyper-parameters and re-register it with MLeap.
rf = RandomForestRegressor(max_depth=best_max_depth, n_estimators=best_n_estimators)
rf.minit(input_features=feature_extractor_rf_model_tf.output_vector, prediction_column='price_prediction', feature_names=imputed_continuous_features)

# Replace the earlier rf_model_pipeline with one built around the new forest; the
# existing assembler object is reused.
rf_model_pipeline = Pipeline([
        (feature_extractor_rf_model_tf.name, feature_extractor_rf_model_tf),
        (rf.name, rf)
    ])
rf_model_pipeline.minit()

In [None]:
# Fit the linear-regression pipeline on all training features (imputed continuous
# plus categorical columns).
model_pipeline.fit(X_train, y_train)
# Fit the rebuilt random-forest pipeline on the imputed continuous columns only.
rf_model_pipeline.fit(X_train[imputed_continuous_features], y_train)

### Step 9: Serialize our pipelines to bundle.ml

In [None]:
# serialize_to_bundle() is added to Pipeline by the mleap.sklearn imports. The
# arguments are presumably (target directory, bundle name); the meaning of init=True
# is not visible here -- confirm against the MLeap documentation.
model_pipeline.serialize_to_bundle('/tmp', 'scikit-airbnb.lr', init=True)
rf_model_pipeline.serialize_to_bundle('/tmp', 'scikit-airbnb.rf', init=True)