In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import pickle
from sklearn.utils import resample

  from numpy.core.umath_tests import inner1d


### Read the CSV and Perform Preprocessing

In [2]:
df = pd.read_csv("data/texas_flights.csv")

In [3]:
df.columns

Index(['Unnamed: 0', 'MONTH', 'DAY', 'DAY_OF_WEEK', 'AIRLINE',
       'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT', 'SCHEDULED_DEPARTURE',
       'DEPARTURE_DELAY', 'DISTANCE', 'ARRIVAL_DELAY', 'CANCELLED',
       'CANCELLATION_REASON', 'AIR_SYSTEM_DELAY', 'SECURITY_DELAY',
       'AIRLINE_DELAY', 'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY'],
      dtype='object')

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,MONTH,DAY,DAY_OF_WEEK,AIRLINE,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_DELAY,DISTANCE,ARRIVAL_DELAY,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
0,0,1.0,1.0,4.0,OO,MAF,IAH,5.0,0.0,429.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0
1,1,1.0,1.0,4.0,AA,DFW,MIA,5.0,108.0,1121.0,102.0,0.0,0.0,0.0,0.0,0.0,0.0,102.0
2,2,1.0,1.0,4.0,AA,IAH,MIA,5.0,58.0,964.0,54.0,0.0,0.0,0.0,0.0,54.0,0.0,0.0
3,3,1.0,1.0,4.0,EV,BRO,IAH,5.0,-3.0,308.0,-5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,1.0,1.0,4.0,EV,CRP,IAH,5.0,-12.0,201.0,-21.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
df = df.drop(['Unnamed: 0'], axis=1)

In [6]:
df.info(null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 687946 entries, 0 to 687945
Data columns (total 17 columns):
MONTH                  687946 non-null float64
DAY                    687946 non-null float64
DAY_OF_WEEK            687946 non-null float64
AIRLINE                687946 non-null object
ORIGIN_AIRPORT         687946 non-null object
DESTINATION_AIRPORT    687946 non-null object
SCHEDULED_DEPARTURE    687946 non-null float64
DEPARTURE_DELAY        687946 non-null float64
DISTANCE               687946 non-null float64
ARRIVAL_DELAY          687946 non-null float64
CANCELLED              687946 non-null float64
CANCELLATION_REASON    687946 non-null float64
AIR_SYSTEM_DELAY       687946 non-null float64
SECURITY_DELAY         687946 non-null float64
AIRLINE_DELAY          687946 non-null float64
LATE_AIRCRAFT_DELAY    687946 non-null float64
WEATHER_DELAY          687946 non-null float64
dtypes: float64(14), object(3)
memory usage: 89.2+ MB


In [7]:
df = df.drop(df[(df['CANCELLED'] == 1) & (df['DEPARTURE_DELAY'] > 0)].index)

In [8]:
Day_feat = pd.get_dummies(df['DAY_OF_WEEK'])
Month_feat = pd.get_dummies(df['MONTH'])
Airline_feat = pd.get_dummies(df['AIRLINE'])
Day_hour_feat = pd.get_dummies(df['SCHEDULED_DEPARTURE'])
Origin_Airport_feat = pd.get_dummies(df['ORIGIN_AIRPORT'])
Distance_feat = df["DISTANCE"].to_frame()
Reason_feat = df["CANCELLATION_REASON"].to_frame()

In [9]:
X = pd.concat([Month_feat,Day_feat,Day_hour_feat,Airline_feat, Origin_Airport_feat,\
               Reason_feat,Distance_feat],axis=1)
y = df["CANCELLED"]
print(X.shape,y.shape)

(687412, 79) (687412,)


In [10]:
X.head()

Unnamed: 0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,...,LBB,LRD,MAF,MFE,SAT,SJT,SPS,TYR,CANCELLATION_REASON,DISTANCE
0,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,2.0,429.0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.0,1121.0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.0,964.0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.0,308.0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.0,201.0


In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
X_train.head()

Unnamed: 0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,...,LBB,LRD,MAF,MFE,SAT,SJT,SPS,TYR,CANCELLATION_REASON,DISTANCE
335370,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0.0,1121.0
346495,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0.0,1372.0
9811,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.0,248.0
245444,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.0,569.0
219992,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.0,3784.0


In [12]:
y_df = y_train.to_frame()

In [13]:
# concatenate our training data back together
X = pd.concat([X_train, y_df], axis=1)

# separate minority and majority classes
not_cancelled = X[X.CANCELLED==0]
cancelled = X[X.CANCELLED==1]

# upsample minority
cancelled_upsampled = resample(cancelled,
                          replace=True, # sample with replacement
                          n_samples=len(not_cancelled), # match number in majority class
                          random_state=27) # reproducible results

# combine majority and upsampled minority
upsampled = pd.concat([not_cancelled, cancelled_upsampled])

# check new class counts
upsampled.CANCELLED.value_counts()

1.0    505730
0.0    505730
Name: CANCELLED, dtype: int64

In [14]:
clf = RandomForestClassifier(n_estimators=100, max_depth=85,random_state=42, max_features = None,\
                             min_samples_leaf =3)

In [15]:
y_train = upsampled.CANCELLED
X_train = upsampled.drop('CANCELLED', axis=1)


In [16]:
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=85, max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=3, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [17]:
clf.score(X_test, y_test)

0.9942101679924121

In [18]:
from sklearn.metrics import classification_report

In [19]:
predictions = clf.predict(X_test)

In [20]:
print(classification_report(y_test, predictions,target_names=["Not Cancelled", "Cancelled"]))

               precision    recall  f1-score   support

Not Cancelled       1.00      1.00      1.00    168554
    Cancelled       0.80      0.93      0.86      3299

  avg / total       0.99      0.99      0.99    171853



In [21]:
pred_series = pd.Series(predictions)

In [22]:
pred_series.to_csv('cancel_predictions.csv')

  """Entry point for launching an IPython kernel.


In [23]:
y_test.to_csv('cancel_actual.csv')

  """Entry point for launching an IPython kernel.
