
# <center> PySpark with OHE using pandas


## Import required functions & libraries

In [1]:
from pyspark import SparkContext
from pyspark.sql import SQLContext
import xgboost as xgb
from pyspark.ml.feature import StringIndexer, VectorIndexer, VectorAssembler, SQLTransformer,OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from IPython.display import Image
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf
from pyspark.sql.functions import lpad
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import pandas as pd
import numpy as np
import time

## Import required Dataset

In [3]:
t_import=time.time()
df = pd.read_csv('Delay_20k.csv',header = 0, index_col = 0)

## Data Preparation

### Check data type & dimensions of dataset

In [4]:
t_Data_Prep=time.time()
print(type(df))
print(df.count(),",",len(df.columns))

<class 'pyspark.sql.dataframe.DataFrame'>
20000 , 30


### Add DepDelayFlag and DepHour fields

In [5]:
df['DepDelayFlag'] = np.where(df['DepDelay']>=15, 1, 0)

In [6]:
df['CRSDepTime'] = df['CRSDepTime'].astype(str)
df['CRSDepTime'] = df['CRSDepTime'].apply(lambda x: x.zfill(4))
df['DepHour'] = df.CRSDepTime.str[:2]

### Print schema

In [8]:
df.printSchema()

root
 |-- Year: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- DayofMonth: integer (nullable = true)
 |-- DayOfWeek: integer (nullable = true)
 |-- DepTime: integer (nullable = true)
 |-- CRSDepTime: integer (nullable = true)
 |-- ArrTime: string (nullable = true)
 |-- CRSArrTime: integer (nullable = true)
 |-- UniqueCarrier: string (nullable = true)
 |-- FlightNum: integer (nullable = true)
 |-- TailNum: string (nullable = true)
 |-- ActualElapsedTime: string (nullable = true)
 |-- CRSElapsedTime: string (nullable = true)
 |-- AirTime: string (nullable = true)
 |-- ArrDelay: string (nullable = true)
 |-- DepDelay: integer (nullable = true)
 |-- Origin: string (nullable = true)
 |-- Dest: string (nullable = true)
 |-- Distance: integer (nullable = true)
 |-- TaxiIn: string (nullable = true)
 |-- TaxiOut: string (nullable = true)
 |-- Cancelled: integer (nullable = true)
 |-- CancellationCode: string (nullable = true)
 |-- Diverted: integer (nullable = true)
 |-- C

### Print first row of data

In [9]:
df.show(1)

+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+------------+-------+
|Year|Month|DayofMonth|DayOfWeek|DepTime|CRSDepTime|ArrTime|CRSArrTime|UniqueCarrier|FlightNum|TailNum|ActualElapsedTime|CRSElapsedTime|AirTime|ArrDelay|DepDelay|Origin|Dest|Distance|TaxiIn|TaxiOut|Cancelled|CancellationCode|Diverted|CarrierDelay|WeatherDelay|NASDelay|SecurityDelay|LateAircraftDelay|DepDelayFlag|DepHour|
+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+------------+-------+
|2008|    3|        29|        

### Statistics of columns in data

In [10]:
df.describe()

Unnamed: 0,summary,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,...,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,DepDelayFlag,DepHour
0,count,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000,...,20000.0,20000,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0
1,mean,2008.0,6.064,15.6512,3.9635,1516.20085,1467.7472,1609.9583855602907,1633.8333,,...,0.00035,,0.00295,19.12344236760125,3.7969626168224297,14.982165109034268,0.1129283489096573,24.74618380062305,0.6851,14.4004
2,stddev,0.0,3.4653289057781,8.805697903958512,1.9985413326656316,453.25061016512575,426.463336690314,547.7696403543224,466.3067525894143,,...,0.0187054803399896,,0.0542350861733307,41.85493154128528,21.683215301413732,32.77319990903296,3.0435253392568837,41.30050344087439,0.4644876504697092,4.261336385383045
3,min,2008.0,1.0,1.0,1.0,1.0,10.0,1.0,1.0,9E,...,0.0,A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,max,2008.0,12.0,31.0,7.0,2400.0,2359.0,,2400.0,YV,...,1.0,N,1.0,,,,,,1.0,23.0


### Select categorical columns and target variable 

In [None]:
allData = df.filter(["Origin","Dest","Distance","Month","DayOfWeek","UniqueCarrier","DepHour","DepDelayFlag"],axis = 1)

### Convert categorical columns to dummy columns using One hot encoding

In [13]:
def process_categorical_features(allData):
    dummies_origin = pd.get_dummies(allData.Origin, prefix="Origin", drop_first=True)
    dummies_dest = pd.get_dummies(allData.Dest, prefix="Dest", drop_first=True)
    dummies_distance = pd.get_dummies(allData.Distance, prefix="Distance", drop_first=True)
    dummies_month = pd.get_dummies(allData.Month, prefix="Month", drop_first=True)
    dummies_dayofweek = pd.get_dummies(allData.DayOfWeek, prefix="DayOfWeek", drop_first=True)
    dummies_uniquecarrier = pd.get_dummies(allData.UniqueCarrier, prefix="UniqueCarrier", drop_first=True)
    dummies_dephour = pd.get_dummies(allData.DepHour, prefix = "DepHour", drop_first=True)
    allData.drop(["Origin", "Dest","Distance","Month","DayOfWeek","UniqueCarrier", "DepHour"], axis=1, inplace=True)
    return pd.concat([allData, dummies_origin, dummies_dest,dummies_distance,dummies_month,dummies_dayofweek,dummies_uniquecarrier,dummies_dephour], axis=1)
allData = process_categorical_features(allData)

['Origin_t', 'Dest_t', 'Distance_t', 'Month_t', 'DayOfWeek_t', 'UniqueCarrier_t', 'DepHour_t']


### Group predictors as "x" and response as "y"

In [15]:
y = allData['DepDelayFlag']
x = allData[[col for col in allData.columns if col!='DepDelayFlag']]

## Model Development

### Split the dataset into train & test

In [18]:
t_model=time.time()
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=.25, random_state=42)

Distribution of Pos and Neg in trainingData is:  [Row(label=0.0, count=10245), Row(label=1.0, count=4724)]


### Fit training data

In [None]:
params = {           'nthread':-1,
                      'seed' : 42,
                      'colsample_bytree': 0.8,
                      'subsample': 0.8, 
                      'learning_rate': 0.2,
                      'max_depth': 7
                       }
dtrain = xgb.DMatrix(x_train.as_matrix(),label=y_train.as_matrix())
dtest = xgb.DMatrix(x_test.as_matrix(),label=y_test.as_matrix())
classifier = xgb.train(params, dtrain, num_boost_round=60)
y_pred = classifier.predict(dtest)
cm = confusion_matrix(y_test, (y_pred>0.5))
predict_accuracy_on_test_set = (cm[0,0] + cm[1,1])/(cm[0,0] + cm[1,1]+cm[1,0] + cm[0,1])
print (predict_accuracy_on_test_set)
t_end=time.time()

## Exporting Results

In [20]:
Overall_time=t_end-t_import
Data_Extraction=t_Data_Prep-t_import
Data_Preparation=t_model-t_Data_Prep
Model=t_end-t_model

result_df=pd.DataFrame({'Overall_time':[Overall_time],
                       'Data_Extraction_time':[Data_Extraction],
                       'Data_Prepartion':[Data_Preparation],
                       'Model_time':[Model],
                       'Accuracy':predict_accuracy_on_test_set})
result_df.to_excel('Results_XGB_20k.xlsx')