In [33]:
from __future__ import division
from statistics import mode
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import math
from scipy.spatial.distance import pdist, squareform
import numbers

### Reading data set

In [34]:
path = os.getcwd()
datasetpath = os.path.join(path,"data")
datasetpath = os.path.join(datasetpath,"2006.csv")
flights = pd.read_csv(datasetpath)
flights["Origin"].value_counts()["SJU"]

25183

In [35]:
l = flights.columns[flights.isnull().any()].tolist()
l

['DepTime',
 'ArrTime',
 'ActualElapsedTime',
 'CRSElapsedTime',
 'AirTime',
 'ArrDelay',
 'DepDelay',
 'CancellationCode']

In [36]:
flights.columns

Index(['Year', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'CRSDepTime',
       'ArrTime', 'CRSArrTime', 'UniqueCarrier', 'FlightNum', 'TailNum',
       'ActualElapsedTime', 'CRSElapsedTime', 'AirTime', 'ArrDelay',
       'DepDelay', 'Origin', 'Dest', 'Distance', 'TaxiIn', 'TaxiOut',
       'Cancelled', 'CancellationCode', 'Diverted', 'CarrierDelay',
       'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay'],
      dtype='object')

In [37]:
from collections import Counter
for col in l[:-1]:
    print(flights[col].isna().sum())

121934
138120
138120
4
138120
138120
121934


In [38]:
flights.iloc[1]

Year                   2006
Month                     1
DayofMonth               11
DayOfWeek                 3
DepTime                1053
CRSDepTime             1053
ArrTime                1313
CRSArrTime             1318
UniqueCarrier            US
FlightNum               613
TailNum              N834AW
ActualElapsedTime       260
CRSElapsedTime          265
AirTime                 214
ArrDelay                 -5
DepDelay                  0
Origin                  ATL
Dest                    PHX
Distance               1587
TaxiIn                   27
TaxiOut                  19
Cancelled                 0
CancellationCode        NaN
Diverted                  0
CarrierDelay              0
WeatherDelay              0
NASDelay                  0
SecurityDelay             0
LateAircraftDelay         0
Name: 1, dtype: object

In [39]:
# dropping column
flightdf = flights.drop("CancellationCode",axis=1)
flightdf = flightdf.dropna()
flightdf = flightdf[flightdf["Origin"] =='SJU']

In [40]:
flightdf = flightdf.drop(["Origin"],axis=1)
flightdf

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,...,Distance,TaxiIn,TaxiOut,Cancelled,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
1373,2006,1,11,3,1635.0,1630,1943.0,1938,US,659,...,1674,7,10,0,0,0,0,0,0,0
1374,2006,1,11,3,1716.0,1720,2008.0,2016,US,791,...,1474,13,15,0,0,0,0,0,0,0
1375,2006,1,11,3,1530.0,1510,1803.0,1806,US,968,...,1474,4,14,0,0,0,0,0,0,0
1376,2006,1,11,3,707.0,700,946.0,1001,US,596,...,1576,7,11,0,0,0,0,0,0,0
1377,2006,1,11,3,1547.0,1530,1906.0,1830,US,1024,...,1576,4,32,0,0,17,0,19,0,0
1378,2006,1,11,3,1354.0,1345,1725.0,1648,US,1427,...,1576,4,13,0,0,9,0,28,0,0
2872,2006,1,12,4,1700.0,1630,1959.0,1938,US,659,...,1674,5,20,0,0,12,0,0,0,9
2873,2006,1,12,4,1717.0,1720,1952.0,2016,US,791,...,1474,4,20,0,0,0,0,0,0,0
2874,2006,1,12,4,1506.0,1510,1738.0,1806,US,968,...,1474,3,10,0,0,0,0,0,0,0
2875,2006,1,12,4,657.0,700,940.0,1001,US,596,...,1576,12,9,0,0,0,0,0,0,0


In [41]:
from sklearn.preprocessing import LabelEncoder
ce = LabelEncoder()

flightdf["UniqueCarrier"] = ce.fit_transform(flightdf["UniqueCarrier"])
flightdf["TailNum"] = ce.fit_transform(flightdf["TailNum"])
#flightdf["Origin"] = ce.fit_transform(flightdf["Origin"])
flightdf["Dest"] = ce.fit_transform(flightdf["Dest"])

In [51]:
print(flightdf.dtypes)
flightdf["DepDelay"]=flightdf["DepDelay"].map(lambda x: 1 if x > 15  else 0)


Year                   int64
Month                  int64
DayofMonth             int64
DayOfWeek              int64
DepTime              float64
CRSDepTime             int64
ArrTime              float64
CRSArrTime             int64
UniqueCarrier          int64
FlightNum              int64
TailNum                int64
ActualElapsedTime    float64
CRSElapsedTime       float64
AirTime              float64
ArrDelay             float64
DepDelay               int64
Dest                   int64
Distance               int64
TaxiIn                 int64
TaxiOut                int64
Cancelled              int64
Diverted               int64
CarrierDelay           int64
WeatherDelay           int64
NASDelay               int64
SecurityDelay          int64
LateAircraftDelay      int64
dtype: object
Year                   int64
Month                  int64
DayofMonth             int64
DayOfWeek              int64
DepTime              float64
CRSDepTime             int64
ArrTime              float64


In [52]:
from sklearn.model_selection import train_test_split
train,test = train_test_split(flightdf,test_size=0.25)
train_labels = train["DepDelay"]
test_labels = test["DepDelay"]

In [53]:
# Naive Bayes

from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(train.drop("DepDelay",axis=1),train_labels)


GaussianNB(priors=None, var_smoothing=1e-09)

In [62]:
gnb_pred = gnb.predict(test.drop("DepDelay",axis=1))

In [63]:
(gnb_pred == test_labels).sum()/len(test_labels)

1.0

In [56]:
from sklearn import tree
dt = tree.DecisionTreeClassifier()
dt.fit(train.drop("DepDelay",axis=1),train_labels)


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [64]:
dt_pred = dt.predict(test.drop("DepDelay",axis=1))

In [65]:
(dt_pred == test_labels).sum()/len(test_labels)

1.0

In [66]:
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier()
clf.fit(train.drop("DepDelay",axis=1),train_labels)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

In [67]:
knnpred = clf.predict(test.drop("DepDelay",axis=1))
(knnpred == test_labels).sum()/len(test_labels)

1.0

In [61]:
# with feature selection
from sklearn.ensemble import ExtraTreesClassifier
clf=ExtraTreesClassifier(n_estimators=10) # selecting the 10 best features
clf.fit(flightdf.drop("ActualElapsedTime",axis=1)[:500000],flightdf["ActualElapsedTime"][:500000])

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [8]:
clf.feature_importances_# feature importance

array([0.        , 0.        , 0.06894343, 0.06190076, 0.06031324,
       0.03270838, 0.0679924 , 0.03359398, 0.01317906, 0.03443357,
       0.0685748 , 0.04411724, 0.10015107, 0.0784251 , 0.06234818,
       0.02555399, 0.02504725, 0.03665974, 0.06695169, 0.07933687,
       0.        , 0.        , 0.01174806, 0.00260269, 0.0151829 ,
       0.0001818 , 0.01005379])

In [7]:
from sklearn.feature_selection import SelectFromModel
model = SelectFromModel(clf,prefit=True)
new=model.transform(flightdf.drop("ActualElapsedTime",axis=1))
new.shape
labels = flightdf["ActualElapsedTime"]
frames = [pd.DataFrame(new),labels]
new_df = pd.concat(frames,axis=1)
new_df




from sklearn.model_selection import train_test_split
train,test = train_test_split(flightdf,test_size=0.25)
train_labels = train["ActualElapsedTime"]
test_labels = test["ActualElapsedTime"]


In [19]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import tree


dt = tree.DecisionTreeRegressor()
dt.fit(train.drop("ActualElapsedTime",axis=1)[:200000],train_labels[:200000])
dt_pred = dt.predict(test.drop("ActualElapsedTime",axis=1)[:200000])
acc = (dt_pred == test_labels[:200000]).sum()/len(test_labels[:200000])
print(acc)

0.566585


In [10]:
gnb = GaussianNB()
gnb.fit(train.drop("ActualElapsedTime",axis=1)[:500000],train_labels[:500000])
gnb_pred = gnb.predict(test.drop("ActualElapsedTime",axis=1)[:500000])
acc = (gnb_pred == test_labels[:500000]).sum()/len(test_labels[:500000])
print(acc)

0.036018


In [16]:
clf = KNeighborsClassifier()
clf.fit(train.drop("ActualElapsedTime",axis=1)[:200000],train_labels[:200000])
knnpred = clf.predict(test.drop("ActualElapsedTime",axis=1)[:200000])
acc =(knnpred == test_labels[:200000]).sum()/len(test_labels[:200000])
print(acc)

0.02624


In [18]:
set(flightdf["CRSElapsedTime"])

{-25.0,
 12.0,
 19.0,
 20.0,
 22.0,
 23.0,
 24.0,
 25.0,
 26.0,
 27.0,
 28.0,
 29.0,
 30.0,
 31.0,
 32.0,
 33.0,
 34.0,
 35.0,
 36.0,
 37.0,
 38.0,
 39.0,
 40.0,
 41.0,
 42.0,
 43.0,
 44.0,
 45.0,
 46.0,
 47.0,
 48.0,
 49.0,
 50.0,
 51.0,
 52.0,
 53.0,
 54.0,
 55.0,
 56.0,
 57.0,
 58.0,
 59.0,
 60.0,
 61.0,
 62.0,
 63.0,
 64.0,
 65.0,
 66.0,
 67.0,
 68.0,
 69.0,
 70.0,
 71.0,
 72.0,
 73.0,
 74.0,
 75.0,
 76.0,
 77.0,
 78.0,
 79.0,
 80.0,
 81.0,
 82.0,
 83.0,
 84.0,
 85.0,
 86.0,
 87.0,
 88.0,
 89.0,
 90.0,
 91.0,
 92.0,
 93.0,
 94.0,
 95.0,
 96.0,
 97.0,
 98.0,
 99.0,
 100.0,
 101.0,
 102.0,
 103.0,
 104.0,
 105.0,
 106.0,
 107.0,
 108.0,
 109.0,
 110.0,
 111.0,
 112.0,
 113.0,
 114.0,
 115.0,
 116.0,
 117.0,
 118.0,
 119.0,
 120.0,
 121.0,
 122.0,
 123.0,
 124.0,
 125.0,
 126.0,
 127.0,
 128.0,
 129.0,
 130.0,
 131.0,
 132.0,
 133.0,
 134.0,
 135.0,
 136.0,
 137.0,
 138.0,
 139.0,
 140.0,
 141.0,
 142.0,
 143.0,
 144.0,
 145.0,
 146.0,
 147.0,
 148.0,
 149.0,
 150.0,
 151.0,
 152.0,
 