## 1 Verify environment

Note: Install `spark-sklearn` and `scikit-learn==0.19.2` on the remote cluster; the remote code path below imports `spark_sklearn`.

In [None]:
from databrickslabs_jupyterlab import is_remote

# Locally there is no Spark session, so bind `spark` to None to keep the name
# defined; remotely, show the existing session (`spark` is not created in this
# notebook -- presumably it is injected by the remote kernel).
if not is_remote():
    spark = None
else:
    display(spark)

## 2 Model development

### Configuration and imports for both use cases (remote and local)

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn import datasets

# Choose the grid-search implementation, the data location and the
# hyperparameter grid according to where the notebook is executing.
if not is_remote():
    # Local run: plain scikit-learn grid search over a reduced grid.
    from sklearn.model_selection import GridSearchCV

    print("Local execution")

    data_path = "~/Data/digits/digits.csv"

    param_grid = {
        "max_depth": [3, None],
        "max_features": [1, 3],
        "min_samples_split": [2, 10],
        "min_samples_leaf": [1, 10],
        "n_estimators": [10, 20, 40]
    }

else:
    # Remote run: spark-sklearn's grid search over the full grid.
    import spark_sklearn

    def GridSearchCV(*args, **kwargs):
        # spark_sklearn.GridSearchCV takes `sc` as its first argument; this
        # wrapper supplies it and forwards everything else unchanged, so the
        # same call works in both branches. `sc` is not defined in this
        # notebook -- presumably the SparkContext provided by the remote
        # kernel (TODO confirm).
        return spark_sklearn.GridSearchCV(sc, *args, **kwargs)

    print("Remote distributed execution")

    data_path = "/dbfs/data/digits/digits.csv"

    param_grid = {
        "max_depth": [3, None],
        "max_features": [1, 3, 10],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 3, 10],
        "bootstrap": [True, False],
        "criterion": ["gini", "entropy"],
        "n_estimators": [10, 20, 40, 80]
    }

### Model building and grid search of hyperparameters

In [None]:
from functools import reduce

# The number of hyperparameter combinations is the product of the number of
# candidate values per parameter; the initial value 1 covers an empty grid.
param_grid_size = reduce(
    lambda total, candidates: total * len(candidates),
    param_grid.values(),
    1,
)
print("Parameter grid size:", param_grid_size)

In [None]:
# Load the dataset: the "target" column is used as y, every other column
# as a feature in X.
df = pd.read_csv(data_path)
y = df["target"].values
X = df.drop("target", axis=1).values

# Search param_grid with 3-fold cross-validation, using whichever
# GridSearchCV was selected in the configuration cell.
cv = GridSearchCV(RandomForestClassifier(), param_grid, cv=3)
cv.fit(X, y)