All the packages that will be used on this dataset are imported.

In [83]:
import pandas as pd
import numpy as np
import sklearn as skl
import matplotlib as mpl
import matplotlib.pyplot as plt
import scipy as sp
import seaborn as sns
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing
import random
import datetime

In [84]:
airlines = pd.read_csv('airlines.csv' , low_memory=False)
airports = pd.read_csv('airports.csv' , low_memory=False)
flights = pd.read_csv('flights.csv' , low_memory=False)

In [85]:
includedAirports = ["ATL","LAX","ORD","DFW","JFK","DEN","SFO","LAS","CLT","SEA","PHX","MIA","MCO",
                    "IAH","EWR","MSP","BOS","DTW","PHL","LGA","FLL","BWI","DCA","SLC","MDW"]
flights = flights[flights['ORIGIN_AIRPORT'].isin(includedAirports)]
flights = flights[flights['DESTINATION_AIRPORT'].isin(includedAirports)]
airports = airports[airports.IATA_CODE.isin(includedAirports)]

excludedAirlines = ["VX","MQ","HA","EV","US"]
flights = flights[~flights['AIRLINE'].isin(excludedAirlines)]

# flights['DATE'] = pd.to_datetime(flights[['YEAR','MONTH', 'DAY']])


In [86]:
variables_to_remove = ['TAXI_OUT', 'TAXI_IN', 'WHEELS_ON', 'WHEELS_OFF', 'YEAR', 'TAIL_NUMBER']
flights.drop(variables_to_remove, axis = 1, inplace = True)

In [87]:
flights = pd.merge(flights, airlines, left_on='AIRLINE', right_on='IATA_CODE', how='left')
flights.drop('IATA_CODE', axis=1, inplace=True)
flights.rename(columns={'AIRLINE_x': 'AIRLINE_CODE','AIRLINE_y': 'AIRLINE'}, inplace=True)

In [92]:
flights.describe()

In [89]:
flights.dtypes == object

MONTH                  False
DAY                    False
DAY_OF_WEEK            False
AIRLINE_CODE            True
FLIGHT_NUMBER          False
ORIGIN_AIRPORT          True
DESTINATION_AIRPORT     True
SCHEDULED_DEPARTURE    False
DEPARTURE_TIME         False
DEPARTURE_DELAY        False
SCHEDULED_TIME         False
ELAPSED_TIME           False
AIR_TIME               False
DISTANCE               False
SCHEDULED_ARRIVAL      False
ARRIVAL_TIME           False
ARRIVAL_DELAY          False
DIVERTED               False
CANCELLED              False
CANCELLATION_REASON     True
AIR_SYSTEM_DELAY       False
SECURITY_DELAY         False
AIRLINE_DELAY          False
LATE_AIRCRAFT_DELAY    False
WEATHER_DELAY          False
AIRLINE                 True
dtype: bool

In [90]:
airline_rank_v01 = pd.DataFrame({'flight_volume' : flights.groupby(['AIRLINE'])['AIRLINE'].count()}).reset_index()
airline_rank_v01.sort_values("flight_volume", ascending=True, inplace=True)

flight_volume_total = airline_rank_v01['flight_volume'].sum()

airline_rank_v01['flight_pcnt'] = airline_rank_v01['flight_volume']/flight_volume_total

airline_rank_v02 = pd.DataFrame({'cancellation_rate' : flights.groupby(['AIRLINE'])['CANCELLED'].mean()}).reset_index()
airline_rank_v02.sort_values("cancellation_rate", ascending=False, inplace=True)
airline_rank_v03 = pd.DataFrame({'divertion_rate' : flights.groupby(['AIRLINE'])['DIVERTED'].mean()}).reset_index()
airline_rank_v03.sort_values("divertion_rate", ascending=False, inplace=True)
airline_rank_v1 = pd.merge(airline_rank_v01, airline_rank_v02, left_on='AIRLINE', right_on='AIRLINE', how='left')
airline_rank_v1 = pd.merge(airline_rank_v1, airline_rank_v03, left_on='AIRLINE', right_on='AIRLINE', how='left')
airline_rank_v1

Unnamed: 0,AIRLINE,flight_volume,flight_pcnt,cancellation_rate,divertion_rate
0,Frontier Airlines Inc.,41375,0.026367,0.006042,0.002344
1,Alaska Airlines Inc.,44908,0.028619,0.002561,0.001536
2,Skywest Airlines Inc.,62929,0.040103,0.01489,0.002431
3,Spirit Air Lines,62990,0.040142,0.018177,0.001651
4,JetBlue Airways,97298,0.062006,0.017935,0.002631
5,Southwest Airlines Co.,231645,0.147622,0.018205,0.003626
6,United Air Lines Inc.,274050,0.174646,0.015088,0.002864
7,Delta Air Lines Inc.,367014,0.23389,0.00624,0.002289
8,American Airlines Inc.,386966,0.246605,0.015818,0.003036


In [78]:
mpl.rcParams.update(mpl.rcParamsDefault)
mpl.rcParams['hatch.linewidth'] = 2.0  

fig = plt.figure(1, figsize=(11,6))
ax = sns.barplot(x="DEPARTURE_DELAY", y="AIRLINE", data=flights, color="lightskyblue", ci=None)
ax = sns.barplot(x="ARRIVAL_DELAY", y="AIRLINE", data=flights, color="r", hatch = '///',
                 alpha = 0.0, ci=None)
ax.yaxis.label.set_visible(False)
plt.xlabel('Mean delay [min] (@departure: blue, @arrival: hatch lines)',
           fontsize=14, weight = 'bold', labelpad=10);

In [79]:
flights_totals_month = pd.DataFrame({'total_flights_month' : flights.groupby(['MONTH'])['AIRLINE'].count()}).reset_index()
flights_totals_month.sort_values("MONTH", ascending=True, inplace=True)
flights_cancelled_month = pd.DataFrame({'total_cancellations_month' : flights.groupby(['MONTH'])['CANCELLED'].sum()}).reset_index()
flights_cancelled_month.sort_values("MONTH", ascending=True, inplace=True)

In [80]:
flights = pd.get_dummies(flights, columns=['AIRLINE','AIRLINE_CODE','ORIGIN_AIRPORT', 'DESTINATION_AIRPORT'])
# le = preprocessing.LabelEncoder()
# le.fit(flights["AIRLINE"])
# flights["AIRLINE"] = le.transform(flights["AIRLINE"])
# flights["CANCELLATION_REASON"] = le.fit_transform(flights["CANCELLATION_REASON"])
# flights["AIR_SYSTEM_DELAY"] = le.fit_transform(flights["AIR_SYSTEM_DELAY"])
# flights["SECURITY_DELAY"] = le.fit_transform(flights["SECURITY_DELAY"])
# flights["AIRLINE_DELAY"] = le.fit_transform(flights["AIRLINE_DELAY"])
# flights["LATE_AIRCRAFT_DELAY"] = le.fit_transform(flights["LATE_AIRCRAFT_DELAY"])
# flights["WEATHER_DELAY"] = le.fit_transform(flights["WEATHER_DELAY"])
# flights["DESTINATION_AIRPORT"] = le.fit_transform(flights["DESTINATION_AIRPORT"])
# flights["ORIGIN_AIRPORT"] = le.fit_transform(flights["ORIGIN_AIRPORT"])
# flights["TAIL_NUMBER"] = le.fit_transform(flights["TAIL_NUMBER"])

In [81]:
# Labels are the values we want to predict
labels = np.array(flights["ARRIVAL_DELAY"])
# Remove the labels from the features
# axis 1 refers to the columns
features = flights.drop('ARRIVAL_DELAY', axis = 1)
features = flights.drop('CANCELLATION_REASON', axis = 1)
features = flights.drop('CANCELLATION_REASON', axis = 1)


# Saving feature names for later use
feature_list = list(features.columns)
# Convert to numpy array
features = np.array(features)


X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.20, random_state = 1313)
print('Training Features Shape:', X_train.shape)
print('Training Labels Shape:', y_train.shape)
print('Testing Features Shape:', X_test.shape)
print('Testing Labels Shape:', y_test.shape)

Training Features Shape: (1255340, 89)
Training Labels Shape: (1255340,)
Testing Features Shape: (313835, 89)
Testing Labels Shape: (313835,)


In [82]:
# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1000, random_state = 1313)
# Train the model on training data
rf.fit(X_train, y_train);

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [None]:
flights.dtypes == 'object'