In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
airline = pd.read_csv("2008.csv", usecols=["Month", "UniqueCarrier", "Origin", "Dest", 
                                           "DepDelay", "AirTime", "Distance"])
airline = airline.sample(frac=.1)

In [3]:
airline.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 700973 entries, 3887887 to 1406442
Data columns (total 7 columns):
Month            700973 non-null int64
UniqueCarrier    700973 non-null object
AirTime          685612 non-null float64
DepDelay         687522 non-null float64
Origin           700973 non-null object
Dest             700973 non-null object
Distance         700973 non-null int64
dtypes: float64(2), int64(2), object(3)
memory usage: 42.8+ MB


In [4]:
airline.head()

Unnamed: 0,Month,UniqueCarrier,AirTime,DepDelay,Origin,Dest,Distance
3887887,7,US,122.0,39.0,PHL,TPA,920
4833325,8,DL,142.0,2.0,MIA,JFK,1090
6575255,12,XE,118.0,106.0,STL,EWR,872
1292460,3,XE,69.0,85.0,CMH,EWR,462
4420846,8,OO,55.0,5.0,MKE,CLE,328


In [5]:
airline['UniqueCarrier'] = airline['UniqueCarrier'].astype('category')
airline['Origin'] = airline['Origin'].astype('category')
airline['Dest'] = airline['Dest'].astype('category')

In [6]:
airline.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 700973 entries, 3887887 to 1406442
Data columns (total 7 columns):
Month            700973 non-null int64
UniqueCarrier    700973 non-null category
AirTime          685612 non-null float64
DepDelay         687522 non-null float64
Origin           700973 non-null category
Dest             700973 non-null category
Distance         700973 non-null int64
dtypes: category(3), float64(2), int64(2)
memory usage: 30.1 MB


In [7]:
airline['Late'] = airline['DepDelay'] > 30

In [8]:
airline = airline.drop(['DepDelay'], axis=1)

In [9]:
carriers = pd.get_dummies(airline['UniqueCarrier'], prefix='Carrier_')
origins = pd.get_dummies(airline['Origin'], prefix='Origin_')
dests = pd.get_dummies(airline['Dest'], prefix='Dest_')

In [10]:
airline = airline.drop(['UniqueCarrier', 'Origin', 'Dest'], axis=1)

In [11]:
airline = pd.concat([carriers, origins, dests, airline], axis=1)

In [12]:
airtime_mean = airline['AirTime'].mean()
airline['AirTime'] = airline['AirTime'].fillna(airtime_mean)

In [13]:
train, test = train_test_split(airline, test_size=0.25, random_state=42)

In [14]:
feat_cols = airline.drop(['Late'], axis=1).columns

In [17]:
gbc = GradientBoostingClassifier(max_features=100, verbose=True)
Y = train['Late']
X = train[feat_cols]
gbc.fit(X, Y)

      Iter       Train Loss   Remaining Time 
         1           0.7218            1.86m
         2           0.7208            1.83m
         3           0.7197            1.83m
         4           0.7186            1.77m
         5           0.7179            1.77m
         6           0.7172            1.75m
         7           0.7165            1.74m
         8           0.7160            1.71m
         9           0.7155            1.69m
        10           0.7141            1.68m
        20           0.7101            1.49m
        30           0.7066            1.36m
        40           0.7042            1.16m
        50           0.7026           57.55s
        60           0.7010           46.23s
        70           0.6999           34.60s
        80           0.6986           23.23s
        90           0.6979           11.56s
       100           0.6967            0.00s


GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=100, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0,
              verbose=True, warm_start=False)

In [18]:
Y_test = test['Late']
X_test = test[feat_cols]

In [19]:
gbc.score(X_test, Y_test)

0.88217000296729131