In [1]:
!pip3 install sklearn

You should consider upgrading via the 'pip install --upgrade pip' command.


In [2]:
# Note that this code must be run from python3 or ipython3 in a session's 
# terminal, not run directly in the graphical console. See
# https://github.com/dask/dask/issues/4612

import cdsw_dask_utils
import cdsw
import numpy as np
import pandas as pd

In [3]:
# 1. Load the data from a semicolon-separated file.
input_file = "data/WineNewGBTDataSet.csv"

# The file is read with header=None, so the column names are supplied here:
# eleven measurement columns followed by the "Quality" column.
col_Names = [
    "fixedAcidity", "volatileAcidity", "citricAcid", "residualSugar",
    "chlorides", "freeSulfurDioxide", "totalSulfurDioxide", "density",
    "pH", "sulphates", "Alcohol",
    "Quality",
]

wine_df = pd.read_csv(input_file, sep=";", header=None, names=col_Names)

# Preview the first rows
wine_df.head()

Unnamed: 0,fixedAcidity,volatileAcidity,citricAcid,residualSugar,chlorides,freeSulfurDioxide,totalSulfurDioxide,density,pH,sulphates,Alcohol,Quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,Poor
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,Poor
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,Poor
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,Excellent
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,Poor


In [4]:
# #### Cleanup - Remove invalid data
# Any row whose Quality is the literal string '1' is relabelled "Excellent".
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the attribute-accessed Series: that chained
# in-place form triggers a pandas chained-assignment warning and does not
# update wine_df when copy-on-write is enabled.
wine_df["Quality"] = wine_df["Quality"].replace("1", "Excellent")

# ### encode labels
# Convert Quality to a categorical column and store its integer category
# codes in a new "Label" column, which is used as the training target.
wine_df["Quality"] = pd.Categorical(wine_df["Quality"])
wine_df["Label"] = wine_df["Quality"].cat.codes
wine_df.head()


Unnamed: 0,fixedAcidity,volatileAcidity,citricAcid,residualSugar,chlorides,freeSulfurDioxide,totalSulfurDioxide,density,pH,sulphates,Alcohol,Quality,Label
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,Poor,1
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,Poor,1
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,Poor,1
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,Excellent,0
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,Poor,1


In [5]:
# # 2. Build a classification model using scikit-learn
# ### Split Test/Train
from sklearn.model_selection import train_test_split

# Inputs are the first 11 columns of wine_df; the target is the "Label" column.
features = wine_df.iloc[:, :11]
target = wine_df['Label']

# Hold out 20% of the rows for testing; random_state is fixed so the split
# is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=30,
)



In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# ### parameters for grid search
# Tree counts 10, 15, ..., 95 and maximum depths 4, 6, ..., 30.
param_numTrees = [*range(10, 100, 5)]
param_maxDepth = [*range(4, 32, 2)]

GS_params = dict(n_estimators=param_numTrees, max_depth=param_maxDepth)

# Base estimator with a fixed seed.
# NOTE(review): n_jobs=-1 is set both here and on the grid search below;
# confirm that this nested parallelism is intended.
rfc = RandomForestClassifier(random_state=10, n_jobs=-1)

# 3-fold cross-validated search over every (n_estimators, max_depth) pair.
CV_rfc = GridSearchCV(
    estimator=rfc,
    param_grid=GS_params,
    cv=3,
    verbose=1,
    n_jobs=-1,
)

In [9]:
# Run the grid search with the default (local) joblib backend.
CV_rfc.fit(X=X_train, y=y_train)

Fitting 3 folds for each of 252 candidates, totalling 756 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    9.7s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   19.6s
[Parallel(n_jobs=-1)]: Done 756 out of 756 | elapsed:   32.3s finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=-1,
                                              oob_score=False, random_state=10,
                                              verbose=0, warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'max_

In [10]:
# Start a Dask cluster through the cdsw_dask_utils helper library and keep
# the returned object, which contains a description of the cluster.
#
# NOTE(review): n=2 is presumably the number of workers requested (an earlier
# version of this note said three) -- confirm against cdsw_dask_utils.
#
# Note that the scheduler will run in the current session, and the Dask
# dashboard will become available in the nine-dot menu at the upper
# right corner of the CDSW app.

cluster = cdsw_dask_utils.run_dask_cluster(n=2, cpu=1, memory=2, nvidia_gpu=0)

Waiting for Dask scheduler to become ready...
Dask scheduler is ready
IDs ['idff953o12g3j6f9', 'j3k9n6ur7fh2f6t6']


In [11]:
## Print Workers config

import json

# Pretty-print every entry of cluster["workers"], numbered from 0.
for worker_num, worker in enumerate(cluster["workers"]):
    print("worker {} :".format(worker_num))
    print(json.dumps(worker, indent=4))

worker 0 :
{
    "id": "idff953o12g3j6f9",
    "name": "Untitled Worker",
    "project": {
        "id": 1,
        "name": "dask_distrib",
        "slug": "admin/dask_distrib",
        "html_url": "http://mlamairessedask-4.vpc.cloudera.com/admin/dask_distrib",
        "url": "http://mlamairessedask-4.vpc.cloudera.com/api/v1/projects/admin/dask_distrib"
    },
    "owner": {
        "id": 1,
        "username": "admin",
        "name": "CDEP CREATED ACCOUNT",
        "html_url": "http://mlamairessedask-4.vpc.cloudera.com/admin",
        "url": "http://mlamairessedask-4.vpc.cloudera.com/api/v1/users/admin",
        "is_team": false
    },
    "creator": {
        "id": 1,
        "username": "admin",
        "name": "CDEP CREATED ACCOUNT",
        "html_url": "http://mlamairessedask-4.vpc.cloudera.com/admin",
        "url": "http://mlamairessedask-4.vpc.cloudera.com/api/v1/users/admin"
    },
    "biller": {
        "id": 1,
        "username": "admin",
        "name": "CDEP CREATED ACC

In [12]:
# #### Connect a Dask client to the scheduler address in the cluster

from dask.distributed import Client
import joblib

# The address comes from the cluster description returned by the helper.
scheduler_address = cluster["scheduler_address"]
client = Client(scheduler_address)

In [12]:
## print dask cluster config

import json

# Serialize the scheduler description reported by the client as indented JSON.
scheduler_info = client.scheduler_info()
print(json.dumps(scheduler_info, indent=4))

{
    "type": "Scheduler",
    "id": "Scheduler-dc4fe695-386e-48b4-be16-35abe95975e3",
    "address": "tcp://100.66.0.29:2323",
    "services": {},
    "workers": {
        "tcp://100.66.0.30:44898": {
            "type": "Worker",
            "id": "tcp://100.66.0.30:44898",
            "host": "100.66.0.30",
            "resources": {},
            "local_directory": "/home/cdsw/worker-7r41fmdn",
            "name": "tcp://100.66.0.30:44898",
            "nthreads": 8,
            "memory_limit": 1999998976,
            "last_seen": 1570088973.1057022,
            "services": {},
            "metrics": {
                "cpu": 2.0,
                "memory": 31866880,
                "time": 1570088973.1042278,
                "read_bytes": 285.8447527058926,
                "write_bytes": 763.5852834521046,
                "num_fds": 23,
                "executing": 0,
                "in_memory": 0,
                "ready": 0,
                "in_flight": 0,
                "bandwid

In [None]:
# ### Fit Model with Dask
# Same grid search as before, but run inside joblib's 'dask' backend context.
with joblib.parallel_backend('dask'):
    CV_rfc.fit(X=X_train, y=y_train)

Fitting 3 folds for each of 252 candidates, totalling 756 fits


[Parallel(n_jobs=-1)]: Using backend DaskDistributedBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   33.9s


In [None]:
# ### Show Best Parameters
# Parameter combination selected by the grid search.
best_params = CV_rfc.best_params_
print(best_params)