# Airline Data Flight Example

In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys

pd.options.display.float_format = '{:,.2f}'.format
%matplotlib inline

print(sys.version)
print(pd.__version__)
print(np.__version__)

3.9.13 (main, Aug 25 2022, 18:29:29) 
[Clang 12.0.0 ]
1.4.4
1.21.5


---

### Get the dataframe prepared (with all the data wrangling required -- recall the HTML issue)

In [8]:
# 1. Load the new CSV file into the raw frame

df1 = pd.read_csv(filepath_or_buffer="fresh_start_airline_gn.csv")

In [12]:
# 2. Preview the first five rows of the raw frame (head() defaults to n=5)

df1.head()

Unnamed: 0.1,Unnamed: 0,Date,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,...,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
0,0,1994-06-05,7,1542.0,1540,1819.0,1815,US,236,,...,,,0,,0,,,,,
1,1,1994-06-06,1,1549.0,1540,1831.0,1815,US,236,,...,,,0,,0,,,,,
2,2,1994-06-07,2,1540.0,1540,1803.0,1815,US,236,,...,,,0,,0,,,,,
3,3,1994-06-08,3,1541.0,1540,1808.0,1815,US,236,,...,,,0,,0,,,,,
4,4,1994-06-09,4,1541.0,1540,1835.0,1815,US,236,,...,,,0,,0,,,,,


In [15]:
# 3. Drop the stray "Unnamed: ..." columns that came in with the CSV.
#    The boolean mask keeps every column whose name does NOT start with "Unnamed".
#    .copy() makes df an independent frame: new columns are assigned to df further
#    down, and assigning into a selection of df1 is what pandas flags with
#    SettingWithCopyWarning.

df = df1.loc[:, ~df1.columns.str.contains('^Unnamed')].copy()

In [16]:
# 4. Preview the cleaned frame to confirm the "Unnamed" columns are gone

df.head()

Unnamed: 0,Date,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,...,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
0,1994-06-05,7,1542.0,1540,1819.0,1815,US,236,,157.0,...,,,0,,0,,,,,
1,1994-06-06,1,1549.0,1540,1831.0,1815,US,236,,162.0,...,,,0,,0,,,,,
2,1994-06-07,2,1540.0,1540,1803.0,1815,US,236,,143.0,...,,,0,,0,,,,,
3,1994-06-08,3,1541.0,1540,1808.0,1815,US,236,,147.0,...,,,0,,0,,,,,
4,1994-06-09,4,1541.0,1540,1835.0,1815,US,236,,174.0,...,,,0,,0,,,,,


---

# converting numerical to categorical data

In [17]:
# FlightNum is stored as a number, so describe() reports mean/std/quartiles for it.
df['FlightNum'].describe()

count   426,490.00
mean        901.69
std         593.51
min           1.00
25%         407.00
50%         792.00
75%       1,429.00
max       3,219.00
Name: FlightNum, dtype: float64

In [19]:
# the average flight number is meaningless

# it does not matter if it is flight 1 or flight 99

        # those are just categories
        # it could just as well be flight G

In [20]:
# pandas thinks about this as numerical data, not categorical data

In [23]:
# sometimes you have relatively continuous data that should be categorical data
### these should be grouped as bins

##### maybe elapsed time is in this category
# where you say, 1-200 is one category

# you could say, 1-100: short hop, 101-200: medium hop, 201-400: long hop

---

# bins

In [25]:
# departure and arrival times are in military format

# group them into bins

In [26]:
# this converts a continuous variable to a categorical one

In [27]:
# we are looking at actual departure times (not scheduled)

In [28]:
# Bin edges in HHMM "military" time, one edge every six hours: 0, 600, 1200, 1800, 2400.
ranges = [hour * 100 for hour in (0, 6, 12, 18, 24)]

In [29]:
# One label per six-hour interval between consecutive bin edges, in chronological order.
labels = [
    'Early Morning',    # 0000-0600
    'Morning',          # 0600-1200
    'Early Afternoon',  # 1200-1800
    'Evening',          # 1800-2400
]

## cut

In [30]:
# DepTime: actual
# CRSDepTime: scheduled

In [32]:
# Bucket the actual departure time (HHMM) into the four labelled parts of the day.
# pd.cut uses right-closed intervals by default, so the first bin would be (0, 600]
# and a time encoded as 0 would silently become NaN; include_lowest=True closes the
# first bin on the left so 0 is kept in 'Early Morning'.
# pd.cut already returns an ordered categorical, so no extra astype('category') is needed.
df['DepTime2'] = pd.cut(df['DepTime'], bins=ranges, labels=labels, include_lowest=True)

In [33]:
# Bucket the actual arrival time (HHMM) into the four labelled parts of the day.
# pd.cut uses right-closed intervals by default, so the first bin would be (0, 600]
# and a time encoded as 0 would silently become NaN; include_lowest=True closes the
# first bin on the left so 0 is kept in 'Early Morning'.
# pd.cut already returns an ordered categorical, so no extra astype('category') is needed.
df['ArrTime2'] = pd.cut(df['ArrTime'], bins=ranges, labels=labels, include_lowest=True)

In [34]:
# Peek at the first five binned arrival times (head() defaults to n=5).
df['ArrTime2'].head()

0    Evening
1    Evening
2    Evening
3    Evening
4    Evening
Name: ArrTime2, dtype: category
Categories (4, object): ['Early Morning' < 'Morning' < 'Early Afternoon' < 'Evening']

![image.png](attachment:e0f3337b-8277-4ffa-8235-432994184d10.png)

### the category dtype is ordered (as shown above: Early Morning < Morning < Early Afternoon < Evening)

In [35]:
# Summarise both binned columns: non-null count, number of categories, most frequent
# category and how often it occurs.
df.loc[:, ['DepTime2', 'ArrTime2']].describe()

Unnamed: 0,DepTime2,ArrTime2
count,423805,422641
unique,4,4
top,Morning,Early Afternoon
freq,163514,156806


In [36]:
#most common departure time:  morning

In [37]:
# most common arrival time: early afternoon

### you can get similar info from the value_counts()

In [38]:
# Number of flights per departure bin.
df['DepTime2'].value_counts()

Morning            163514
Early Afternoon    160505
Evening             93679
Early Morning        6107
Name: DepTime2, dtype: int64

In [39]:
# Number of flights per arrival bin.
df['ArrTime2'].value_counts()

Early Afternoon    156806
Evening            137283
Morning            120417
Early Morning        8135
Name: ArrTime2, dtype: int64