In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import  MinMaxScaler
import pickle

  from numpy.core.umath_tests import inner1d


### Read the CSV and Perform Preprocessing

In [2]:
df = pd.read_csv("texas_flights.csv")

In [3]:
df.columns

Index(['Unnamed: 0', 'YEAR', 'MONTH', 'DAY', 'DAY_OF_WEEK', 'AIRLINE',
       'FLIGHT_NUMBER', 'TAIL_NUMBER', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT',
       'SCHEDULED_DEPARTURE', 'DEPARTURE_DELAY', 'DISTANCE', 'ARRIVAL_DELAY',
       'CANCELLED', 'CANCELLATION_REASON', 'AIR_SYSTEM_DELAY',
       'SECURITY_DELAY', 'AIRLINE_DELAY', 'LATE_AIRCRAFT_DELAY',
       'WEATHER_DELAY'],
      dtype='object')

In [4]:
df = df.drop(['Unnamed: 0','YEAR','FLIGHT_NUMBER','TAIL_NUMBER'], axis=1)

In [5]:
df.head()

Unnamed: 0,MONTH,DAY,DAY_OF_WEEK,AIRLINE,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_DELAY,DISTANCE,ARRIVAL_DELAY,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
0,1,1,4,OO,MAF,IAH,5,0,429,0,1,2,0,0,0,0,0
1,1,1,4,AA,DFW,MIA,5,108,1121,102,0,0,0,0,0,0,102
2,1,1,4,AA,IAH,MIA,5,58,964,54,0,0,0,0,54,0,0
3,1,1,4,EV,BRO,IAH,5,-3,308,-5,0,0,0,0,0,0,0
4,1,1,4,EV,CRP,IAH,5,-12,201,-21,0,0,0,0,0,0,0


In [6]:
df.info(null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 687946 entries, 0 to 687945
Data columns (total 17 columns):
MONTH                  687946 non-null int64
DAY                    687946 non-null int64
DAY_OF_WEEK            687946 non-null int64
AIRLINE                687946 non-null object
ORIGIN_AIRPORT         687946 non-null object
DESTINATION_AIRPORT    687946 non-null object
SCHEDULED_DEPARTURE    687946 non-null int64
DEPARTURE_DELAY        687946 non-null int64
DISTANCE               687946 non-null int64
ARRIVAL_DELAY          687946 non-null int64
CANCELLED              687946 non-null int64
CANCELLATION_REASON    687946 non-null int64
AIR_SYSTEM_DELAY       687946 non-null int64
SECURITY_DELAY         687946 non-null int64
AIRLINE_DELAY          687946 non-null int64
LATE_AIRCRAFT_DELAY    687946 non-null int64
WEATHER_DELAY          687946 non-null int64
dtypes: int64(14), object(3)
memory usage: 89.2+ MB


In [7]:
# def determine_delay_levels(row):
#     if row['DEPARTURE_DELAY'] <=0:
#         val = 0
#     elif (row['DEPARTURE_DELAY'] > 0) & (row['DEPARTURE_DELAY'] <= 15):
#         val = 1
#     elif (row['DEPARTURE_DELAY'] > 15) & (row['DEPARTURE_DELAY'] <= 120):
#         val = 2
#     else:
#         val = 3
#     return val
# df['Delay_levels'] = df.apply(determine_delay_levels, axis=1)

In [8]:
def determine_delay_levels(row):
    if row['DEPARTURE_DELAY'] <=0:
        val = 0
    elif (row['DEPARTURE_DELAY'] > 0) & (row['DEPARTURE_DELAY'] <= 15):
        val = 1
    else:
        val = 2
    return val
df['Delay_levels'] = df.apply(determine_delay_levels, axis=1)

In [9]:
df["AIRLINE_DELAY"] = np.where(df['AIRLINE_DELAY']> 0, 1, 0)
df["AIR_SYSTEM_DELAY"] = np.where(df['AIR_SYSTEM_DELAY']> 0, 1, 0)
df["SECURITY_DELAY"] = np.where(df['SECURITY_DELAY']> 0, 1, 0)
df["LATE_AIRCRAFT_DELAY"] = np.where(df['LATE_AIRCRAFT_DELAY']> 0, 1, 0)
df["WEATHER_DELAY"] = np.where(df['WEATHER_DELAY']> 0, 1, 0)

In [10]:
df = df.drop(df[(df['CANCELLED'] == 1) & (df['Delay_levels'] > 0)].index)


In [11]:
Day_feat = pd.get_dummies(df['DAY_OF_WEEK'])
Month_feat = pd.get_dummies(df['MONTH'])
Airline_feat = pd.get_dummies(df['AIRLINE'])
Day_hour_feat = pd.get_dummies(df['SCHEDULED_DEPARTURE'])
Origin_Airport_feat = pd.get_dummies(df['ORIGIN_AIRPORT'])
Dest_Airport_feat = pd.get_dummies(df['DESTINATION_AIRPORT'])
Distance_feat = df["DISTANCE"].to_frame()
Air_System_feat = df["AIR_SYSTEM_DELAY"].to_frame()
Security_feat = df["SECURITY_DELAY"].to_frame()
Aircraft_feat = df["LATE_AIRCRAFT_DELAY"].to_frame()
Weather_feat = df["WEATHER_DELAY"].to_frame()
Airline_delay_feat = df["AIRLINE_DELAY"].to_frame()

In [12]:
#X = pd.concat([Month_feat,Day_feat,Day_hour_feat,Airline_feat,Origin_Airport_feat,\
#              Air_System_feat,Security_feat,Aircraft_feat,Weather_feat,Distance_feat],axis=1)
X = pd.concat([Month_feat,Day_feat,Day_hour_feat,Airline_feat,Origin_Airport_feat,\
              Air_System_feat,Security_feat,Aircraft_feat,Weather_feat,Distance_feat,Airline_delay_feat],axis=1)
y = df["Delay_levels"]
print(X.shape,y.shape)

(687412, 83) (687412,)


In [13]:
X.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,SAT,SJT,SPS,TYR,AIR_SYSTEM_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY,DISTANCE,AIRLINE_DELAY
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,429,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1121,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,964,1
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,308,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,201,0


In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=1)
X_train.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,SAT,SJT,SPS,TYR,AIR_SYSTEM_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY,DISTANCE,AIRLINE_DELAY
335370,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1121,0
346495,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1372,0
9811,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,248,1
245444,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,569,0
219992,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3784,0


In [15]:
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [16]:
clf = RandomForestClassifier(n_estimators=100, max_depth=85,random_state=42, max_features = None,min_samples_leaf =10)

In [17]:
clf.fit(X_train_scaled, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=85, max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=10, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [18]:
clf.score(X_train_scaled, y_train)

0.7822402479638606

In [19]:
clf.score(X_test_scaled, y_test)

0.7596085026156075

In [20]:
from sklearn.metrics import classification_report

In [21]:
predictions = clf.predict(X_test_scaled)

In [22]:
print(classification_report(y_test, predictions))

             precision    recall  f1-score   support

          0       0.76      0.95      0.84    103301
          1       0.49      0.17      0.26     35375
          2       0.90      0.78      0.84     33177

avg / total       0.73      0.76      0.72    171853



In [23]:
df.to_csv('delay_more.csv')