# Airline Data Flight Example

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys

pd.options.display.float_format = '{:,.2f}'.format
%matplotlib inline

print(sys.version)
print(pd.__version__)
print(np.__version__)

3.9.13 (main, Aug 25 2022, 18:29:29) 
[Clang 12.0.0 ]
1.4.4
1.21.5


---

### Prepare the dataframe (with all the data wrangling required -- recall the html issue)

In [2]:
# 1. load the raw airline CSV into a dataframe

CSV_PATH = "fresh_start_airline_gn.csv"
df1 = pd.read_csv(CSV_PATH)

In [3]:
# 2. check the df

df1.head(5)

Unnamed: 0.1,Unnamed: 0,Date,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,...,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
0,0,1994-06-05,7,1542.0,1540,1819.0,1815,US,236,,...,,,0,,0,,,,,
1,1,1994-06-06,1,1549.0,1540,1831.0,1815,US,236,,...,,,0,,0,,,,,
2,2,1994-06-07,2,1540.0,1540,1803.0,1815,US,236,,...,,,0,,0,,,,,
3,3,1994-06-08,3,1541.0,1540,1808.0,1815,US,236,,...,,,0,,0,,,,,
4,4,1994-06-09,4,1541.0,1540,1835.0,1815,US,236,,...,,,0,,0,,,,,


In [4]:
# 3. delete the unnamed columns added
#    .copy() makes df an independent frame rather than a slice of df1, so the
#    column assignments made later in this notebook (df['DepTime2'] = ..., etc.)
#    write to df itself and do not raise SettingWithCopyWarning.

df = df1.loc[:, ~df1.columns.str.contains('^Unnamed')].copy()

In [5]:
# 4. check the new df

df.head(5)

Unnamed: 0,Date,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,...,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
0,1994-06-05,7,1542.0,1540,1819.0,1815,US,236,,157.0,...,,,0,,0,,,,,
1,1994-06-06,1,1549.0,1540,1831.0,1815,US,236,,162.0,...,,,0,,0,,,,,
2,1994-06-07,2,1540.0,1540,1803.0,1815,US,236,,143.0,...,,,0,,0,,,,,
3,1994-06-08,3,1541.0,1540,1808.0,1815,US,236,,147.0,...,,,0,,0,,,,,
4,1994-06-09,4,1541.0,1540,1835.0,1815,US,236,,174.0,...,,,0,,0,,,,,


---

# converting numerical to categorical data

In [6]:
df.FlightNum.describe()

count   426,490.00
mean        901.69
std         593.51
min           1.00
25%         407.00
50%         792.00
75%       1,429.00
max       3,219.00
Name: FlightNum, dtype: float64

In [7]:
# the average flight number data is useless

# it does not matter if it is flight 1 or flight 99

        # those are just categories
        # it could just as well be flight G

In [8]:
# pandas thinks about this as numerical data, not categorical data

In [9]:
# sometimes you have relatively continuous data that should be categorical data
### these should be grouped as bins

##### maybe elapsed time is in this category
# where you say, 1-200 is one category

# you could say, 1-100: short hop, 100-200 - medium hop, 200-400: long hop

---

# bins

In [10]:
# departure and arrival times are in military format

# create them into bins

In [11]:
# this converts a continuous variable to a categorical one

In [12]:
# we are looking at actual departure times (not scheduled)

In [13]:
# Bin edges in HHMM clock time: one edge every 600 from 0 through 2400.
ranges = list(range(0, 2401, 600))

In [14]:
# One label per bin, listed in the same order as the bins.
labels = [
    'Early Morning',
    'Morning',
    'Early Afternoon',
    'Evening',
]

## cut

In [15]:
# DepTime: actual
# CRSDepTime: Scheduled

In [16]:
# Bin the actual departure time into the labelled time-of-day categories.
# pd.cut already returns a categorical Series, so no astype('category') is needed.
# include_lowest=True keeps a time of exactly 0 in the first bin instead of NaN.
df['DepTime2'] = pd.cut(df.DepTime, ranges, labels=labels, include_lowest=True)

In [17]:
# Bin the actual arrival time into the labelled time-of-day categories.
# pd.cut already returns a categorical Series, so no astype('category') is needed.
# include_lowest=True keeps a time of exactly 0 in the first bin instead of NaN.
df['ArrTime2'] = pd.cut(df.ArrTime, ranges, labels=labels, include_lowest=True)

---

## average time delays

In [27]:
df.groupby(['DepTime2', 'ArrTime2'])[['DepDelay', 'ArrDelay']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,DepDelay,ArrDelay
DepTime2,ArrTime2,Unnamed: 2_level_1,Unnamed: 3_level_1
Early Morning,Early Morning,43.8,42.07
Early Morning,Morning,2.58,2.11
Early Morning,Early Afternoon,,
Early Morning,Evening,64.0,61.33
Morning,Early Morning,,
Morning,Morning,2.37,0.66
Morning,Early Afternoon,3.77,4.1
Morning,Evening,5.41,12.92
Early Afternoon,Early Morning,8.89,14.45
Early Afternoon,Morning,4.9,2.48


---

# variation in the delays

In [33]:
# Aggregations by name. In a groupby .agg(), np.mean / np.std are mapped to the
# pandas 'mean' / 'std' methods anyway (so std is the sample std, ddof=1), and
# passing the NumPy callables is deprecated in recent pandas. The strings give
# the same numbers and the same 'mean' / 'std' column labels.
transformations = ['mean', 'std']

In [34]:
# Apply the same list of aggregations to both delay columns.
agg_dict = {delay_col: transformations for delay_col in ('ArrDelay', 'DepDelay')}

In [35]:
df.groupby(['DepTime2', 'ArrTime2']).agg(agg_dict)

Unnamed: 0_level_0,Unnamed: 1_level_0,ArrDelay,ArrDelay,DepDelay,DepDelay
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std
DepTime2,ArrTime2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Early Morning,Early Morning,42.07,91.24,43.8,91.84
Early Morning,Morning,2.11,18.44,2.58,16.6
Early Morning,Early Afternoon,,,,
Early Morning,Evening,61.33,5.51,64.0,8.89
Morning,Early Morning,,,,
Morning,Morning,0.66,13.45,2.37,10.22
Morning,Early Afternoon,4.1,17.17,3.77,12.48
Morning,Evening,12.92,24.19,5.41,15.7
Early Afternoon,Early Morning,14.45,42.65,8.89,27.35
Early Afternoon,Morning,2.48,11.08,4.9,8.72


![image.png](attachment:e697056f-b0fc-407c-bfcc-e03c6681bef4.png)

---

# <font color="red">Start Here</font>

# which flights are worst for delays

### rank flights with the worst delay times

In [37]:
df.head()

Unnamed: 0,Date,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,...,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,DepTime2,ArrTime2
0,1994-06-05,7,1542.0,1540,1819.0,1815,US,236,,157.0,...,0,,0,,,,,,Early Afternoon,Evening
1,1994-06-06,1,1549.0,1540,1831.0,1815,US,236,,162.0,...,0,,0,,,,,,Early Afternoon,Evening
2,1994-06-07,2,1540.0,1540,1803.0,1815,US,236,,143.0,...,0,,0,,,,,,Early Afternoon,Evening
3,1994-06-08,3,1541.0,1540,1808.0,1815,US,236,,147.0,...,0,,0,,,,,,Early Afternoon,Evening
4,1994-06-09,4,1541.0,1540,1835.0,1815,US,236,,174.0,...,0,,0,,,,,,Early Afternoon,Evening


In [38]:
# write a function that takes in a df and returns a df with the ranks

In [39]:
# create a column with total delay time

In [40]:
# also create a new column that bins the delay time

In [41]:
# Per-row sum of arrival and departure delay; NaN in either column gives NaN.
df['TotalDelay'] = df['ArrDelay'] + df['DepDelay']

### cut for binning

In [43]:
# Bin the scheduled departure time into the labelled time-of-day categories.
# include_lowest=True puts a scheduled time of exactly 0 into the first bin;
# with the default left-open first bin it would become NaN and silently drop
# out of any groupby on this column.
df['ScheduledDepTime'] = pd.cut(df.CRSDepTime, ranges, labels=labels, include_lowest=True)

In [45]:
df.head()

Unnamed: 0,Date,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,...,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,DepTime2,ArrTime2,TotalDelay,ScheduledDepTime
0,1994-06-05,7,1542.0,1540,1819.0,1815,US,236,,157.0,...,0,,,,,,Early Afternoon,Evening,6.0,Early Afternoon
1,1994-06-06,1,1549.0,1540,1831.0,1815,US,236,,162.0,...,0,,,,,,Early Afternoon,Evening,25.0,Early Afternoon
2,1994-06-07,2,1540.0,1540,1803.0,1815,US,236,,143.0,...,0,,,,,,Early Afternoon,Evening,-12.0,Early Afternoon
3,1994-06-08,3,1541.0,1540,1808.0,1815,US,236,,147.0,...,0,,,,,,Early Afternoon,Evening,-6.0,Early Afternoon
4,1994-06-09,4,1541.0,1540,1835.0,1815,US,236,,174.0,...,0,,,,,,Early Afternoon,Evening,21.0,Early Afternoon


---

In [47]:
# create a new function for average delay

In [49]:
def average_delay(dataframe):
    """Return a copy of ``dataframe`` with an ``AvgFlightDelay`` column added.

    ``AvgFlightDelay`` holds the mean of the ``TotalDelay`` column taken over
    all rows of ``dataframe`` and repeated on every row. When used with
    ``groupby(...).apply`` each group is passed in separately, so the value is
    the per-group mean.

    The input frame is left untouched: pandas documents mutating the object
    passed to a ``groupby.apply`` function as unsupported, so the new column
    is written to a copy.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a ``TotalDelay`` column.

    Returns
    -------
    pandas.DataFrame
        Same rows and index as the input, plus ``AvgFlightDelay``.
    """
    result = dataframe.copy()
    result['AvgFlightDelay'] = dataframe['TotalDelay'].mean()
    return result

---

## approach 1: simple

### execute function on the df dataframe

In [50]:
# Add the per-FlightNum mean delay to every row.
# group_keys=False keeps the original flat row index on the result. On
# pandas >= 2.0 the default (group_keys=True) would instead prepend FlightNum
# as an extra index level, so the setting is pinned explicitly.
avg_delayed = df.groupby('FlightNum', group_keys=False).apply(average_delay)

In [52]:
avg_delayed.head()

Unnamed: 0,Date,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,...,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,DepTime2,ArrTime2,TotalDelay,ScheduledDepTime,AvgFlightDelay
0,1994-06-05,7,1542.0,1540,1819.0,1815,US,236,,157.0,...,,,,,,Early Afternoon,Evening,6.0,Early Afternoon,13.61
1,1994-06-06,1,1549.0,1540,1831.0,1815,US,236,,162.0,...,,,,,,Early Afternoon,Evening,25.0,Early Afternoon,13.61
2,1994-06-07,2,1540.0,1540,1803.0,1815,US,236,,143.0,...,,,,,,Early Afternoon,Evening,-12.0,Early Afternoon,13.61
3,1994-06-08,3,1541.0,1540,1808.0,1815,US,236,,147.0,...,,,,,,Early Afternoon,Evening,-6.0,Early Afternoon,13.61
4,1994-06-09,4,1541.0,1540,1835.0,1815,US,236,,174.0,...,,,,,,Early Afternoon,Evening,21.0,Early Afternoon,13.61


## <font color="red">Sample</font>

In [55]:
avg_delayed.sample()

Unnamed: 0,Date,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,...,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,DepTime2,ArrTime2,TotalDelay,ScheduledDepTime,AvgFlightDelay
47291,1994-06-03,5,1947.0,1945,2014.0,2017,US,1763,,87.0,...,,,,,,Evening,Evening,-1.0,Evening,8.24


In [56]:
avg_delayed.sample(5)

Unnamed: 0,Date,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,...,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,DepTime2,ArrTime2,TotalDelay,ScheduledDepTime,AvgFlightDelay
270360,1994-06-09,4,859.0,855,1028.0,1021,DL,1511,,89.0,...,,,,,,Morning,Morning,11.0,Morning,9.79
228266,1994-06-06,1,1142.0,1140,1230.0,1227,DL,436,,48.0,...,,,,,,Morning,Early Afternoon,5.0,Morning,17.59
374875,1994-06-12,7,1318.0,1320,1458.0,1459,AA,1979,,160.0,...,,,,,,Early Afternoon,Early Afternoon,-3.0,Early Afternoon,19.08
15299,1994-06-22,3,2358.0,2355,557.0,555,US,637,,239.0,...,,,,,,Evening,Early Morning,5.0,Evening,14.36
409430,1994-06-23,4,1750.0,1740,1927.0,1917,CO,1150,,157.0,...,,,,,,Early Afternoon,Evening,20.0,Early Afternoon,10.54


---

## approach 2: more compact

In [60]:
# Mean TotalDelay per flight number. The string 'mean' gives the same result
# as np.mean here, without the deprecation warning recent pandas raises for
# NumPy callables passed to agg.
df2 = df.groupby('FlightNum').agg({'TotalDelay': 'mean'})
df2

Unnamed: 0_level_0,TotalDelay
FlightNum,Unnamed: 1_level_1
1,6.40
2,7.01
3,9.38
4,17.79
5,6.25
...,...
3010,36.74
3197,13.24
3203,3.18
3218,5.56


In [65]:
# Worst average delay first (DataFrame has no .sorted(); sort_values is the method).
df2.sort_values('TotalDelay', ascending=False)

Unnamed: 0_level_0,TotalDelay
FlightNum,Unnamed: 1_level_1
2849,183.33
1428,173.78
2928,150.50
2835,124.33
2252,112.23
...,...
2314,-15.00
2628,-16.25
2280,-18.00
2282,-18.75


![image.png](attachment:7d3ef683-3009-4b6a-a6c8-d5f2678b3dc4.png)

In [78]:
# Same result as the two steps above, chained into a single expression.
# 'mean' replaces np.mean, which is deprecated as an agg argument in recent pandas.
(
    df.groupby('FlightNum')
    .agg({'TotalDelay': 'mean'})
    .sort_values(by=['TotalDelay'], ascending=False)
)

Unnamed: 0_level_0,TotalDelay
FlightNum,Unnamed: 1_level_1
2849,183.33
1428,173.78
2928,150.50
2835,124.33
2252,112.23
...,...
2314,-15.00
2628,-16.25
2280,-18.00
2282,-18.75


---

## ranking function to create a generic way to do this

In [70]:
def ranking(dataframe, column):
    """Return ``dataframe`` sorted by ``column`` (descending) with a rank column.

    The rank column is named ``column + 'Rank'`` and numbers the sorted rows
    1, 2, 3, ... from the largest value down. Tied values still receive
    distinct consecutive ranks, in whatever order the sort leaves them.

    The input frame is not modified. The sort produces a new frame and the
    rank column is added to that, which avoids mutating the caller's frame
    (or a group handed in by ``groupby.apply``).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to rank.
    column : str
        Name of the column to sort by; also the prefix of the rank column.

    Returns
    -------
    pandas.DataFrame
        Sorted copy of the input with the extra rank column.
    """
    ranked = dataframe.sort_values(column, ascending=False)
    ranked[column + 'Rank'] = np.arange(1, len(ranked) + 1)
    return ranked

---

# rank flights by average delay

# new df

In [75]:
# Rank the rows by AvgFlightDelay separately within each ScheduledDepTime bin.
avg_delayed_ranked = (
    avg_delayed
    .groupby('ScheduledDepTime')
    .apply(lambda group: ranking(group, 'AvgFlightDelay'))
)

In [76]:
avg_delayed_ranked.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Date,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,...,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,DepTime2,ArrTime2,TotalDelay,ScheduledDepTime,AvgFlightDelay,AvgFlightDelayRank
ScheduledDepTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Early Morning,269530,1994-06-02,4,51.0,110,109.0,131,DL,1490,,18.0,...,,,,,Early Morning,Early Morning,-41.0,Early Morning,36.88,1
Early Morning,269529,1994-06-01,3,115.0,110,134.0,131,DL,1490,,19.0,...,,,,,Early Morning,Early Morning,8.0,Early Morning,36.88,2
Early Morning,269532,1994-06-04,6,108.0,110,127.0,131,DL,1490,,19.0,...,,,,,Early Morning,Early Morning,-6.0,Early Morning,36.88,3
Early Morning,269533,1994-06-05,7,100.0,110,122.0,131,DL,1490,,22.0,...,,,,,Early Morning,Early Morning,-19.0,Early Morning,36.88,4
Early Morning,269534,1994-06-06,1,58.0,110,113.0,131,DL,1490,,15.0,...,,,,,Early Morning,Early Morning,-30.0,Early Morning,36.88,5


In [74]:
# Keep only the top-ranked row of each ScheduledDepTime group.
is_top_ranked = avg_delayed_ranked['AvgFlightDelayRank'] == 1
avg_delayed_ranked[is_top_ranked]

Unnamed: 0_level_0,Unnamed: 1_level_0,Date,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,...,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,DepTime2,ArrTime2,TotalDelay,ScheduledDepTime,AvgFlightDelay,AvgFlightDelayRank
ScheduledDepTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Early Morning,269530,1994-06-02,4,51.0,110,109.0,131,DL,1490,,18.0,...,,,,,Early Morning,Early Morning,-41.0,Early Morning,36.88,1
Morning,267020,1994-06-28,2,1158.0,1151,1253.0,1250,DL,1428,,55.0,...,,,,,Morning,Early Afternoon,10.0,Morning,173.78,1
Early Afternoon,60742,1994-06-12,7,1605.0,1600,1703.0,1659,US,2849,,58.0,...,,,,,Early Afternoon,Early Afternoon,9.0,Early Afternoon,183.33,1
Evening,266946,1994-06-09,4,1953.0,2005,2022.0,2039,DL,1428,,29.0,...,,,,,Evening,Evening,1411.0,Evening,173.78,1


#### flight 1428 has the worst average delay in both the Morning and Evening scheduled-departure bins

![image.png](attachment:2749b745-7514-4fbb-b090-25625526f696.png)

---

## <font color="red">This format is cool</font>

### <font color="blue">Doing it all in one chained expression -- without intermediate DFs</font>

In [79]:
# Five flight numbers with the highest mean TotalDelay.
# 'mean' replaces np.mean, which is deprecated as an agg argument in recent pandas.
df.groupby('FlightNum')\
.agg({'TotalDelay' : 'mean'})\
.sort_values('TotalDelay', ascending=False)\
.head()

Unnamed: 0_level_0,TotalDelay
FlightNum,Unnamed: 1_level_1
2849,183.33
1428,173.78
2928,150.5
2835,124.33
2252,112.23


In [80]:
# Mean TotalDelay per scheduled-departure bin, worst first.
# 'mean' replaces np.mean, which is deprecated as an agg argument in recent pandas.
df.groupby('ScheduledDepTime')\
.agg({'TotalDelay' : 'mean'})\
.sort_values('TotalDelay', ascending=False)\
.head()

Unnamed: 0_level_0,TotalDelay
ScheduledDepTime,Unnamed: 1_level_1
Evening,24.29
Early Afternoon,16.04
Morning,5.58
Early Morning,4.07
