### Importing dependencies and libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import  MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import pickle

  from numpy.core.umath_tests import inner1d


### Read the CSV and Perform Preprocessing

In [2]:
# Load the Texas flights extract from the project-relative data folder.
flights_csv = "data/texas_flights.csv"
df = pd.read_csv(flights_csv)

In [3]:
# Show the column labels of the freshly loaded frame.
df.columns

Index(['Unnamed: 0', 'MONTH', 'DAY', 'DAY_OF_WEEK', 'AIRLINE',
       'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT', 'SCHEDULED_DEPARTURE',
       'DEPARTURE_DELAY', 'DISTANCE', 'ARRIVAL_DELAY', 'CANCELLED',
       'CANCELLATION_REASON', 'AIR_SYSTEM_DELAY', 'SECURITY_DELAY',
       'AIRLINE_DELAY', 'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY'],
      dtype='object')

In [4]:
# Discard the 'Unnamed: 0' column that came in with the CSV.
df = df.drop(columns=['Unnamed: 0'])

In [5]:
# Preview the first rows after the column drop.
df.head()

Unnamed: 0,MONTH,DAY,DAY_OF_WEEK,AIRLINE,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_DELAY,DISTANCE,ARRIVAL_DELAY,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
0,1.0,1.0,4.0,OO,MAF,IAH,5.0,0.0,429.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0
1,1.0,1.0,4.0,AA,DFW,MIA,5.0,108.0,1121.0,102.0,0.0,0.0,0.0,0.0,0.0,0.0,102.0
2,1.0,1.0,4.0,AA,IAH,MIA,5.0,58.0,964.0,54.0,0.0,0.0,0.0,0.0,54.0,0.0,0.0
3,1.0,1.0,4.0,EV,BRO,IAH,5.0,-3.0,308.0,-5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,1.0,4.0,EV,CRP,IAH,5.0,-12.0,201.0,-21.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Summarise dtypes and non-null counts per column.
# The `null_counts` keyword is deprecated (and absent from newer pandas releases);
# with default display options a frame of this size already prints the counts,
# so the plain call works across versions.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 687946 entries, 0 to 687945
Data columns (total 17 columns):
MONTH                  687946 non-null float64
DAY                    687946 non-null float64
DAY_OF_WEEK            687946 non-null float64
AIRLINE                687946 non-null object
ORIGIN_AIRPORT         687946 non-null object
DESTINATION_AIRPORT    687946 non-null object
SCHEDULED_DEPARTURE    687946 non-null float64
DEPARTURE_DELAY        687946 non-null float64
DISTANCE               687946 non-null float64
ARRIVAL_DELAY          687946 non-null float64
CANCELLED              687946 non-null float64
CANCELLATION_REASON    687946 non-null float64
AIR_SYSTEM_DELAY       687946 non-null float64
SECURITY_DELAY         687946 non-null float64
AIRLINE_DELAY          687946 non-null float64
LATE_AIRCRAFT_DELAY    687946 non-null float64
WEATHER_DELAY          687946 non-null float64
dtypes: float64(14), object(3)
memory usage: 89.2+ MB


### Creating 3 delay classes [No delay, Short delay (<= 15 min), Long delay (> 15 min)] depending upon the departure delay time in the data.

In [7]:
def determine_delay_levels(row):
    """Map a flight's departure delay onto one of three delay classes.

    Parameters
    ----------
    row : mapping-like
        Any object indexable by ``'DEPARTURE_DELAY'`` (for example a
        DataFrame row); the value is the departure delay.

    Returns
    -------
    int
        0 when the delay is <= 0, 1 when it is greater than 0 and at most 15,
        and 2 in every other case.
    """
    delay = row['DEPARTURE_DELAY']
    if delay <= 0:
        return 0
    # Getting here already rules out delay <= 0, so only the upper bound is checked.
    if delay <= 15:
        return 1
    return 2
# Vectorised equivalent of applying determine_delay_levels to every row:
# conditions are tried in order, so the second one only ever matches delays > 0,
# and anything matching neither (including NaN) falls through to class 2.
departure_delay = df['DEPARTURE_DELAY'].values
df['Delay_levels'] = np.select(
    [departure_delay <= 0, departure_delay <= 15],
    [0, 1],
    default=2,
)

#### Forcefully one-hot-encoding the delay reason codes in the data. All the reasons are given equal weight, simply because it was seen that a 32 min delay can happen due to either Security or Weather issues. There is nothing that can distinguish a longer delay from a shorter one.

In [8]:
# Collapse every delay-reason column to a 0/1 flag: 1 when its value is positive.
for reason_col in ("AIRLINE_DELAY", "AIR_SYSTEM_DELAY", "SECURITY_DELAY",
                   "LATE_AIRCRAFT_DELAY", "WEATHER_DELAY"):
    df[reason_col] = np.where(df[reason_col] > 0, 1, 0)

###### Flights which were marked cancelled were also marked delayed. This causes noise in the final model. Hence dropping them.

In [9]:
# Rows flagged as cancelled that also carry a non-zero delay class are removed.
cancelled_and_delayed = (df['CANCELLED'] == 1) & (df['Delay_levels'] > 0)
df = df.drop(df.index[cancelled_and_delayed])


### Feature engineering. Creating separate dataframes for each feature and then concatenating them to a single dataframe.

In [10]:
# Categorical columns: one indicator (dummy) frame per source column.
# NOTE(review): MONTH, DAY_OF_WEEK and SCHEDULED_DEPARTURE hold floats, so their
# dummy columns get numeric labels that may repeat across these frames — confirm
# whether a prefix is wanted.
Month_feat = pd.get_dummies(df['MONTH'])
Day_feat = pd.get_dummies(df['DAY_OF_WEEK'])
Day_hour_feat = pd.get_dummies(df['SCHEDULED_DEPARTURE'])
Airline_feat = pd.get_dummies(df['AIRLINE'])
Origin_Airport_feat = pd.get_dummies(df['ORIGIN_AIRPORT'])
# NOTE(review): Dest_Airport_feat is not included in the concatenation that builds X —
# confirm whether that omission is intentional.
Dest_Airport_feat = pd.get_dummies(df['DESTINATION_AIRPORT'])

# Columns used as-is: each one becomes a single-column frame.
Distance_feat = df[["DISTANCE"]]
Airline_delay_feat = df[["AIRLINE_DELAY"]]
Air_System_feat = df[["AIR_SYSTEM_DELAY"]]
Security_feat = df[["SECURITY_DELAY"]]
Aircraft_feat = df[["LATE_AIRCRAFT_DELAY"]]
Weather_feat = df[["WEATHER_DELAY"]]

In [11]:

# Column-wise assembly of the model inputs; the list order fixes the column order of X.
feature_frames = [
    Month_feat, Day_feat, Day_hour_feat, Airline_feat, Origin_Airport_feat,
    Air_System_feat, Security_feat, Aircraft_feat, Weather_feat,
    Distance_feat, Airline_delay_feat,
]
X = pd.concat(feature_frames, axis=1)
# Target: the three-level delay class.
y = df["Delay_levels"]
print(X.shape, y.shape)

(687412, 83) (687412,)


In [12]:
# Preview the assembled feature matrix.
X.head()

Unnamed: 0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,...,SAT,SJT,SPS,TYR,AIR_SYSTEM_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY,DISTANCE,AIRLINE_DELAY
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,429.0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1121.0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,964.0,1
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,308.0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,201.0,0


### Performing a train-test split

In [13]:
# Hold out a test partition; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, y, random_state=1)
X_train, X_test, y_train, y_test = split_parts
X_train.head()

Unnamed: 0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,...,SAT,SJT,SPS,TYR,AIR_SYSTEM_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY,DISTANCE,AIRLINE_DELAY
335370,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1121.0,0
346495,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1372.0,0
9811,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,248.0,1
245444,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,569.0,0
219992,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3784.0,0


#### Scaling and transforming and using Random Forest as a classifier as learnt from the Cancellation exercise.

In [14]:
# Fit the scaler on the training partition only, then apply that same
# transformation to both the training and the test partition.
X_scaler = MinMaxScaler()
X_scaler.fit(X_train)
X_train_scaled, X_test_scaled = (X_scaler.transform(part) for part in (X_train, X_test))

In [15]:
 # Base forest handed to the grid search; depth and leaf size come from the grid.
 clf = RandomForestClassifier(max_features=None, n_estimators=100, random_state=42)

### Doing a grid search for the parameters and using the experience from the cancellation exercise.

In [16]:
# Candidate values explored by the grid search: two tree depths x two leaf sizes.
param_grid = dict(
    max_depth=[3, 85],
    min_samples_leaf=[2, 10],
)

In [17]:
# Search every combination in param_grid with 3-fold cross-validation;
# verbose=3 asks for a log line per fit.
grid_search = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    cv=3,
    verbose=3,
)
grid_search.fit(X_train_scaled, y_train)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV] max_depth=3, min_samples_leaf=2 .................................
[CV]  max_depth=3, min_samples_leaf=2, score=0.7498632560196445, total= 2.7min
[CV] max_depth=3, min_samples_leaf=2 .................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.7min remaining:    0.0s


[CV]  max_depth=3, min_samples_leaf=2, score=0.7495999487934456, total= 2.7min
[CV] max_depth=3, min_samples_leaf=2 .................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  5.4min remaining:    0.0s


[CV]  max_depth=3, min_samples_leaf=2, score=0.7494239229104113, total= 2.6min
[CV] max_depth=3, min_samples_leaf=10 ................................
[CV]  max_depth=3, min_samples_leaf=10, score=0.7498632560196445, total= 2.6min
[CV] max_depth=3, min_samples_leaf=10 ................................
[CV]  max_depth=3, min_samples_leaf=10, score=0.7495999487934456, total= 6.2min
[CV] max_depth=3, min_samples_leaf=10 ................................
[CV]  max_depth=3, min_samples_leaf=10, score=0.7494239229104113, total= 2.6min
[CV] max_depth=85, min_samples_leaf=2 ................................
[CV]  max_depth=85, min_samples_leaf=2, score=0.7336052695893025, total= 9.2min
[CV] max_depth=85, min_samples_leaf=2 ................................
[CV]  max_depth=85, min_samples_leaf=2, score=0.7334116948787627, total= 9.1min
[CV] max_depth=85, min_samples_leaf=2 ................................
[CV]  max_depth=85, min_samples_leaf=2, score=0.7336312641109792, total= 9.0min
[CV] max_depth=

[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed: 74.1min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': [3, 85], 'min_samples_leaf': [2, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

### The grid search above shows that max_depth = 85 combined with min_samples_leaf = 10 gives good scores

In [18]:

# Forest configured with the depth / leaf-size pair picked after the grid search.
final_forest_params = dict(
    n_estimators=100,
    max_depth=85,
    min_samples_leaf=10,
    max_features=None,
    random_state=42,
)
clf = RandomForestClassifier(**final_forest_params)

In [20]:
# Train the configured forest on the scaled training partition.
clf.fit(X_train_scaled, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=85, max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=10, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [21]:
# Score the fitted forest on the held-out, scaled test partition.
clf.score(X_test_scaled, y_test)

0.7596085026156075

In [22]:
# Predicted delay class for every row of the scaled test partition.
predictions = clf.predict(X_test_scaled)

In [23]:
# Class 1 covers delays up to and including 15 minutes (see determine_delay_levels),
# so its label reads "<=15" rather than "<15".
delay_class_names = ["Not Delayed", "Short Delay(<=15 min)", "Longer Delay(>15 min)"]
print(classification_report(y_test, predictions, target_names=delay_class_names))

                       precision    recall  f1-score   support

          Not Delayed       0.76      0.95      0.84    103301
 Short Delay(<15 min)       0.49      0.17      0.26     35375
Longer Delay(>15 min)       0.90      0.78      0.84     33177

          avg / total       0.73      0.76      0.72    171853



#### scikit-learn v0.21.3 offers a dictionary output option for the classification report, but the version I am using (0.19.2) does not; hence the predicted and actual values are exported to CSV

In [24]:
# Reuse y_test's index so each prediction carries the same row label as its
# actual value; a default 0..n-1 index could only be matched by position.
pred_series = pd.Series(predictions, index=y_test.index)

In [25]:
# Series.to_csv's default for `header` differs between pandas releases;
# pinning header=False keeps the two-column, header-less layout everywhere.
pred_series.to_csv('delay_predictions.csv', header=False)

  """Entry point for launching an IPython kernel.


In [26]:
# Series.to_csv's default for `header` differs between pandas releases;
# pinning header=False keeps the two-column, header-less layout everywhere.
y_test.to_csv('delay_actual.csv', header=False)

  """Entry point for launching an IPython kernel.
