## Part 1.  Pre-processing in Pandas

In [1]:
# Note that this code must not run directly the Workbench in the graphical console. 
# Use python3, ipython3 or Jupyter notebook in a session's terminal, 
# See https://github.com/dask/dask/issues/4612

import cdsw_dask_utils
import cdsw
import pandas as pd

In [21]:
# read flights dataset
input_file = "data/1988.csv.bz2"
columns = ['Month','DayofMonth','DayOfWeek','CRSDepTime','CRSArrTime','UniqueCarrier','FlightNum',
           'Origin','Dest','Cancelled']
flight_df = pd.read_csv(input_file,sep=',',header=0, na_values='NA', usecols=columns )
flight_df.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,CRSDepTime,CRSArrTime,UniqueCarrier,FlightNum,Origin,Dest,Cancelled
0,1,9,6,1331,1435,PI,942,SYR,BWI,0
1,1,10,7,1331,1435,PI,942,SYR,BWI,0
2,1,11,1,1331,1435,PI,942,SYR,BWI,0
3,1,12,2,1331,1435,PI,942,SYR,BWI,0
4,1,13,3,1331,1435,PI,942,SYR,BWI,0


In [22]:
flight_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5202096 entries, 0 to 5202095
Data columns (total 10 columns):
Month            int64
DayofMonth       int64
DayOfWeek        int64
CRSDepTime       int64
CRSArrTime       int64
UniqueCarrier    object
FlightNum        int64
Origin           object
Dest             object
Cancelled        int64
dtypes: int64(7), object(3)
memory usage: 396.9+ MB


### Feature Engineering
#### 1. create routes from origin and destination

In [3]:
import numpy as np
# feature engineering - Create routes
flight_df['route'] = flight_df['Origin'] + "_" + flight_df['Dest']

# show top 20 routes - As defined by nb of flights
top_routes = flight_df['route'].value_counts(ascending=False)
top_routes.head(10)

LAX_SFO    20750
SFO_LAX    20658
LAX_PHX    13461
PHX_LAX    13273
LAX_LAS    12175
LGA_BOS    12027
LAS_LAX    11801
SJC_LAX    11535
LAX_SJC    11292
BOS_LGA    11141
Name: route, dtype: int64

In [4]:
#look at their cancelations
flight_df[['route', 'Cancelled']].groupby('route')\
        .agg({'route':'size', 'Cancelled':'sum'}) \
        .rename(columns={'route':'count','Cancelled':'nb_cancelled'}) \
        .reset_index()\
        .sort_values(['count'],ascending=False)\
        .head(10)

Unnamed: 0,route,count,nb_cancelled
1920,LAX_SFO,20750,228
3375,SFO_LAX,20658,206
1911,LAX_PHX,13461,78
2870,PHX_LAX,13273,71
1894,LAX_LAS,12175,58
1956,LGA_BOS,12027,287
1840,LAS_LAX,11801,47
3432,SJC_LAX,11535,71
1921,LAX_SJC,11292,71
366,BOS_LGA,11141,243


In [5]:
#focus on 50 biggest routes - As defined by nb of flights 
route_lst=top_routes.head(50)
flight_df = flight_df[flight_df['route'].isin(route_lst.index)]
flight_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 487253 entries, 786 to 5201496
Data columns (total 11 columns):
Month            487253 non-null int64
DayofMonth       487253 non-null int64
DayOfWeek        487253 non-null int64
CRSDepTime       487253 non-null int64
CRSArrTime       487253 non-null int64
UniqueCarrier    487253 non-null object
FlightNum        487253 non-null int64
Origin           487253 non-null object
Dest             487253 non-null object
Cancelled        487253 non-null int64
route            487253 non-null object
dtypes: int64(7), object(4)
memory usage: 44.6+ MB


In [6]:
# Quick sanity check - count number of null values()
flight_df.isnull().sum()

Month            0
DayofMonth       0
DayOfWeek        0
CRSDepTime       0
CRSArrTime       0
UniqueCarrier    0
FlightNum        0
Origin           0
Dest             0
Cancelled        0
route            0
dtype: int64

#### 2. Feature and label encoding encoding
Label encoding

In [7]:
# ### encode labels 
flight_df.Cancelled = pd.Categorical(flight_df.Cancelled)
flight_df['Label'] = flight_df.Cancelled.cat.codes
flight_df.drop(['Cancelled'], axis=1, inplace=True)

Feature encoding 

In [8]:
# Get airport unique values
import numpy as np
airport_list = np.unique(np.concatenate((np.sort(flight_df['Origin'].unique()),
                                         np.sort(flight_df['Dest'].unique())))
                        )
airport_list

array(['ATL', 'BOS', 'DAL', 'DCA', 'DEN', 'DFW', 'DTW', 'EWR', 'HOU',
       'IAH', 'LAS', 'LAX', 'LGA', 'MCO', 'MIA', 'MSP', 'ORD', 'PDX',
       'PHX', 'SAN', 'SAT', 'SEA', 'SFO', 'SJC', 'STL'], dtype=object)

In [9]:
# Encode categorical features - Sklearn only supports numerical values
from sklearn.preprocessing import LabelEncoder

# encode airlines 
le_carrier = LabelEncoder()
flight_df['Carrier_encoded'] = le_carrier.fit_transform(flight_df['UniqueCarrier'])

# Encode airports : Using same encoder for both origin and dest ( consistent encoding of airports )
le_airport = LabelEncoder()
le_airport.fit(airport_list)
flight_df['Origin_encoded'] = le_airport.transform(flight_df['Origin'])
flight_df['Dest_encoded'] = le_airport.transform(flight_df['Dest'])

# Encode routes 
le_route = LabelEncoder()
flight_df['route_encoded'] = le_route.fit_transform(flight_df['route'])


In [10]:
# Viewing few rows of make and its encoded columns
flight_df[['UniqueCarrier','Carrier_encoded','Origin','Origin_encoded',
           'Dest', 'Dest_encoded', 'route', 'route_encoded' ]].sample(10)

Unnamed: 0,UniqueCarrier,Carrier_encoded,Origin,Origin_encoded,Dest,Dest_encoded,route,route_encoded
1860161,HP,5,PHX,18,LAX,11,PHX_LAX,36
2265131,WN,13,LAS,10,PHX,18,LAS_PHX,15
932249,UA,11,LAX,11,SFO,22,LAX_SFO,20
3888083,US,12,SFO,22,SEA,21,SFO_SEA,47
1865490,HP,5,LAX,11,PHX,18,LAX_PHX,17
2266661,EA,4,BOS,1,DCA,3,BOS_DCA,1
3308115,DL,3,PHX,18,LAS,10,PHX_LAS,35
1641615,AA,0,IAH,9,DFW,5,IAH_DFW,13
4636558,EA,4,BOS,1,LGA,12,BOS_LGA,3
4735793,CO,2,EWR,7,BOS,1,EWR_BOS,10


In [11]:
# # 2. Build a classification model using MLLib
# ### Split Test/Train

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(flight_df.drop(['UniqueCarrier','Origin','Dest','route'],axis=1),
                                                    flight_df['Label'], 
                                                    test_size=0.3, 
                                                    random_state=100)
del flight_df # free up some memory

In [12]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 341077 entries, 936357 to 3164854
Data columns (total 11 columns):
Month              341077 non-null int64
DayofMonth         341077 non-null int64
DayOfWeek          341077 non-null int64
CRSDepTime         341077 non-null int64
CRSArrTime         341077 non-null int64
FlightNum          341077 non-null int64
Label              341077 non-null int8
Carrier_encoded    341077 non-null int64
Origin_encoded     341077 non-null int64
Dest_encoded       341077 non-null int64
route_encoded      341077 non-null int64
dtypes: int64(10), int8(1)
memory usage: 28.9 MB


In [13]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 146176 entries, 1856532 to 3419091
Data columns (total 11 columns):
Month              146176 non-null int64
DayofMonth         146176 non-null int64
DayOfWeek          146176 non-null int64
CRSDepTime         146176 non-null int64
CRSArrTime         146176 non-null int64
FlightNum          146176 non-null int64
Label              146176 non-null int8
Carrier_encoded    146176 non-null int64
Origin_encoded     146176 non-null int64
Dest_encoded       146176 non-null int64
route_encoded      146176 non-null int64
dtypes: int64(10), int8(1)
memory usage: 12.4 MB


## Part 2 : Model Training - Using SKLEARN 

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# ### parameters for grid search
param_numTrees = list(range(2,12,2))
print(param_numTrees)
param_maxDepth = list(range(2,12,2))
print(param_maxDepth)

rfc = RandomForestClassifier(random_state=10, n_jobs=-1)

GS_params = { 
    'n_estimators': param_numTrees,
    'max_depth' : param_maxDepth
}

CV_rfc = GridSearchCV(estimator=rfc, 
                      param_grid=GS_params, 
                      cv= 3,
                      verbose = 1,
                      n_jobs=-1)

In [None]:
## Train model
CV_rfc.fit(X_train, y_train)

In [24]:
# ### Show Best Parameters 
print(CV_rfc.best_params_)

{'max_depth': 2, 'n_estimators': 2}


### Evaluate model performance

In [None]:
from sklearn.metrics import classification_report
# ## fit test dataset 
y_true, y_pred = y_test, CV_rfc.predict(X_test)
# ## Print classification metrics 
print(classification_report(y_true, y_pred))

## Part 3. Distributed training using DASK Backend

In [14]:
# Run a Dask cluster with three workers and return an object containing
# a description of the cluster. 
# 
# Using helper library 
#
# Note that the scheduler will run in the current session, and the Dask
# dashboard will become available in the nine-dot menu at the upper
# right corner of the CDSW app.

cluster = cdsw_dask_utils.run_dask_cluster(
  n=2, \
  cpu=1, \
  memory=3, \
  nvidia_gpu=0
)

Waiting for Dask scheduler to become ready...
Dask scheduler is ready
IDs ['gxyld8kqktr76c0o', 'ltw6rul8aqawl8mj']


In [15]:
#Get the Dask Scheduler UI
import os 
engine_id = os.environ.get('CDSW_ENGINE_ID')
cdsw_domain = os.environ.get('CDSW_DOMAIN')

from IPython.core.display import HTML
HTML('<a  target="_blank" rel="noopener noreferrer" href="http://read-only-{}.{}">http://read-only-{}.{}</a>'
     .format(engine_id,cdsw_domain,engine_id,cdsw_domain))

In [16]:
# #### Connect a Dask client to the scheduler address in the cluster

from dask.distributed import Client
client = Client(cluster["scheduler_address"])

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# ### parameters for grid search
param_numTrees = list(range(2,12,2))
print(param_numTrees)
param_maxDepth = list(range(2,12,2))
print(param_maxDepth)

rfc = RandomForestClassifier(random_state=10, n_jobs=-1)

GS_params = { 
    'n_estimators': param_numTrees,
    'max_depth' : param_maxDepth
}

CV_rfc = GridSearchCV(estimator=rfc, 
                      param_grid=GS_params, 
                      cv= 3,
                      verbose = 1,
                      n_jobs=-1)

[2, 4, 6, 8, 10]
[2, 4, 6, 8, 10]


In [18]:
# ### Fit Model with Dask
from joblib import Parallel, parallel_backend
with parallel_backend('dask'):
  CV_rfc.fit(X_train, y_train)

Fitting 3 folds for each of 25 candidates, totalling 75 fits


[Parallel(n_jobs=-1)]: Using backend DaskDistributedBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    9.8s
[Parallel(n_jobs=-1)]: Done  75 out of  75 | elapsed:   20.4s finished


In [19]:
# ### Show Best Parameters 
print(CV_rfc.best_params_)

{'max_depth': 2, 'n_estimators': 2}


### Evaluate model performance

In [20]:
from sklearn.metrics import classification_report
# ## fit test dataset 
y_true, y_pred = y_test, CV_rfc.predict(X_test)
# ## Print classification metrics 
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    144393
           1       1.00      1.00      1.00      1783

    accuracy                           1.00    146176
   macro avg       1.00      1.00      1.00    146176
weighted avg       1.00      1.00      1.00    146176



In [None]:
## print dask cluster config 

import json
print(json.dumps(client.scheduler_info(), indent=4))