## Project 3 Data Cleaning

In [1]:
# Dependencies
import pandas as pd
from pathlib import Path

In [2]:
# Use Pandas to read the data, then preview the first rows.
data_df = pd.read_csv("2023_data.csv")
data_df.head()

Unnamed: 0,year,month,carrier,carrier_name,airport,airport_name,arr_flights,arr_del15,carrier_ct,weather_ct,...,security_ct,late_aircraft_ct,arr_cancelled,arr_diverted,arr_delay,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay
0,2023,12,9E,Endeavor Air Inc.,ABE,"Allentown/Bethlehem/Easton, PA: Lehigh Valley ...",72.0,5.0,2.46,1.0,...,0.0,0.81,0.0,0.0,672.0,61.0,574.0,20.0,0.0,17.0
1,2023,12,9E,Endeavor Air Inc.,AEX,"Alexandria, LA: Alexandria International",62.0,7.0,4.25,0.0,...,0.0,1.75,0.0,0.0,348.0,252.0,0.0,33.0,0.0,63.0
2,2023,12,9E,Endeavor Air Inc.,AGS,"Augusta, GA: Augusta Regional at Bush Field",95.0,10.0,5.94,0.0,...,0.0,3.0,0.0,0.0,859.0,536.0,0.0,47.0,0.0,276.0
3,2023,12,9E,Endeavor Air Inc.,ALB,"Albany, NY: Albany International",23.0,2.0,0.56,0.0,...,0.0,1.44,1.0,0.0,75.0,9.0,0.0,0.0,0.0,66.0
4,2023,12,9E,Endeavor Air Inc.,ATL,"Atlanta, GA: Hartsfield-Jackson Atlanta Intern...",2111.0,256.0,76.88,8.75,...,0.0,117.94,1.0,0.0,21424.0,8906.0,732.0,1487.0,0.0,10299.0


In [3]:
# Count the non-null values in each column; columns with a lower count
# than the others contain missing values.
data_df.count()

year                   22621
month                  22621
carrier                22621
carrier_name           22621
airport                22621
airport_name           22621
arr_flights            22574
arr_del15              22569
carrier_ct             22574
weather_ct             22574
nas_ct                 22574
security_ct            22574
late_aircraft_ct       22574
arr_cancelled          22574
arr_diverted           22574
arr_delay              22574
carrier_delay          22574
weather_delay          22574
nas_delay              22574
security_delay         22574
late_aircraft_delay    22574
dtype: int64

In [4]:
# Inspect every row that is missing a value in at least one column
null_mask = data_df.isna().any(axis="columns")
null_rows = data_df.loc[null_mask]

null_rows

Unnamed: 0,year,month,carrier,carrier_name,airport,airport_name,arr_flights,arr_del15,carrier_ct,weather_ct,...,security_ct,late_aircraft_ct,arr_cancelled,arr_diverted,arr_delay,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay
779,2023,12,UA,United Air Lines Network,BHM,"Birmingham, AL: Birmingham-Shuttlesworth Inter...",,,,,...,,,,,,,,,,
1254,2023,12,G7,GoJet Airlines LLC d/b/a United Express,SBN,"South Bend, IN: South Bend International",,,,,...,,,,,,,,,,
1423,2023,12,YV,Mesa Airlines Inc.,ROA,"Roanoke, VA: Roanoke Blacksburg Regional",,,,,...,,,,,,,,,,
1470,2023,12,YX,Republic Airline,ICT,"Wichita, KS: Wichita Dwight D Eisenhower National",,,,,...,,,,,,,,,,
2278,2023,11,C5,CommuteAir LLC dba CommuteAir,BTV,"Burlington, VT: Burlington International",,,,,...,,,,,,,,,,
2311,2023,11,C5,CommuteAir LLC dba CommuteAir,PVD,"Providence, RI: Rhode Island Tf Green Internat...",,,,,...,,,,,,,,,,
5472,2023,10,YV,Mesa Airlines Inc.,PNS,"Pensacola, FL: Pensacola International",,,,,...,,,,,,,,,,
7901,2023,8,C5,CommuteAir LLC dba CommuteAir,DAY,"Dayton, OH: James M Cox/Dayton International",,,,,...,,,,,,,,,,
7914,2023,8,C5,CommuteAir LLC dba CommuteAir,LBB,"Lubbock, TX: Lubbock Preston Smith International",,,,,...,,,,,,,,,,
7929,2023,8,C5,CommuteAir LLC dba CommuteAir,PWM,"Portland, ME: Portland International Jetport",,,,,...,,,,,,,,,,


#### 47 rows are null (NaN) in every flight-data field (columns arr_flights through late_aircraft_delay). Because these rows contain no flight data at all, deleting them will not have a significant effect on the data set as a whole.

#### 5 rows have a null value in the arr_del15 column only. In all 5 instances this is because there were only cancelled or diverted flights (no flights arrived, so the correct value is NA: there were neither on-time nor late arrivals). Because 5 out of 22,621 rows is negligible, we recommend also removing these 5 rows so that the data set contains no null values.

In [5]:
# Remove every row that has a missing value in any column
data_df = data_df.dropna(axis="index", how="any")

In [6]:
# Verify the null rows are gone: fail loudly if any missing value remains,
# then display the per-column counts, which should now all match.
assert not data_df.isnull().values.any(), "data_df still contains null values"
data_df.count()

year                   22569
month                  22569
carrier                22569
carrier_name           22569
airport                22569
airport_name           22569
arr_flights            22569
arr_del15              22569
carrier_ct             22569
weather_ct             22569
nas_ct                 22569
security_ct            22569
late_aircraft_ct       22569
arr_cancelled          22569
arr_diverted           22569
arr_delay              22569
carrier_delay          22569
weather_delay          22569
nas_delay              22569
security_delay         22569
late_aircraft_delay    22569
dtype: int64

In [7]:
# list the current columns to identify which ones are unnecessary
# (this cell only displays them; nothing is dropped here)
data_df.columns

Index(['year', 'month', 'carrier', 'carrier_name', 'airport', 'airport_name',
       'arr_flights', 'arr_del15', 'carrier_ct', 'weather_ct', 'nas_ct',
       'security_ct', 'late_aircraft_ct', 'arr_cancelled', 'arr_diverted',
       'arr_delay', 'carrier_delay', 'weather_delay', 'nas_delay',
       'security_delay', 'late_aircraft_delay'],
      dtype='object')

In [8]:
# Remove the year and carrier columns, then show the columns that remain.
data_df = data_df.drop(['year', 'carrier'], axis="columns")
data_df.columns

Index(['month', 'carrier_name', 'airport', 'airport_name', 'arr_flights',
       'arr_del15', 'carrier_ct', 'weather_ct', 'nas_ct', 'security_ct',
       'late_aircraft_ct', 'arr_cancelled', 'arr_diverted', 'arr_delay',
       'carrier_delay', 'weather_delay', 'nas_delay', 'security_delay',
       'late_aircraft_delay'],
      dtype='object')

In [9]:
# Give the columns more intuitive names: the nas_* columns are spelled out as
# nat_air_sys_*, and the *_delay columns gain a _min suffix.
data_df = data_df.rename(
    columns=dict(
        arr_flights='total_arrivals',
        arr_del15='total_delays_ct',
        nas_ct='nat_air_sys_ct',
        arr_cancelled='flight_cancelled',
        arr_diverted='flight_diverted',
        arr_delay='total_delays_min',
        carrier_delay='carrier_delay_min',
        weather_delay='weather_delay_min',
        nas_delay='nat_air_sys_delay_min',
        security_delay='security_delay_min',
        late_aircraft_delay='late_aircraft_delay_min',
    )
)
data_df.head()

Unnamed: 0,month,carrier_name,airport,airport_name,total_arrivals,total_delays_ct,carrier_ct,weather_ct,nat_air_sys_ct,security_ct,late_aircraft_ct,flight_cancelled,flight_diverted,total_delays_min,carrier_delay_min,weather_delay_min,nat_air_sys_delay_min,security_delay_min,late_aircraft_delay_min
0,12,Endeavor Air Inc.,ABE,"Allentown/Bethlehem/Easton, PA: Lehigh Valley ...",72.0,5.0,2.46,1.0,0.73,0.0,0.81,0.0,0.0,672.0,61.0,574.0,20.0,0.0,17.0
1,12,Endeavor Air Inc.,AEX,"Alexandria, LA: Alexandria International",62.0,7.0,4.25,0.0,1.0,0.0,1.75,0.0,0.0,348.0,252.0,0.0,33.0,0.0,63.0
2,12,Endeavor Air Inc.,AGS,"Augusta, GA: Augusta Regional at Bush Field",95.0,10.0,5.94,0.0,1.06,0.0,3.0,0.0,0.0,859.0,536.0,0.0,47.0,0.0,276.0
3,12,Endeavor Air Inc.,ALB,"Albany, NY: Albany International",23.0,2.0,0.56,0.0,0.0,0.0,1.44,1.0,0.0,75.0,9.0,0.0,0.0,0.0,66.0
4,12,Endeavor Air Inc.,ATL,"Atlanta, GA: Hartsfield-Jackson Atlanta Intern...",2111.0,256.0,76.88,8.75,52.43,0.0,117.94,1.0,0.0,21424.0,8906.0,732.0,1487.0,0.0,10299.0


In [10]:
# Split airport_name ("City, ST: Airport Name") into separate city and name columns.
# n=1 splits on the first ': ' only, so a value containing a second ': ' still
# yields exactly two columns and the two-column assignment cannot fail.
data_df[['city', 'name']] = data_df['airport_name'].str.split(': ', n=1, expand=True)
data_df.head()

Unnamed: 0,month,carrier_name,airport,airport_name,total_arrivals,total_delays_ct,carrier_ct,weather_ct,nat_air_sys_ct,security_ct,...,flight_cancelled,flight_diverted,total_delays_min,carrier_delay_min,weather_delay_min,nat_air_sys_delay_min,security_delay_min,late_aircraft_delay_min,city,name
0,12,Endeavor Air Inc.,ABE,"Allentown/Bethlehem/Easton, PA: Lehigh Valley ...",72.0,5.0,2.46,1.0,0.73,0.0,...,0.0,0.0,672.0,61.0,574.0,20.0,0.0,17.0,"Allentown/Bethlehem/Easton, PA",Lehigh Valley International
1,12,Endeavor Air Inc.,AEX,"Alexandria, LA: Alexandria International",62.0,7.0,4.25,0.0,1.0,0.0,...,0.0,0.0,348.0,252.0,0.0,33.0,0.0,63.0,"Alexandria, LA",Alexandria International
2,12,Endeavor Air Inc.,AGS,"Augusta, GA: Augusta Regional at Bush Field",95.0,10.0,5.94,0.0,1.06,0.0,...,0.0,0.0,859.0,536.0,0.0,47.0,0.0,276.0,"Augusta, GA",Augusta Regional at Bush Field
3,12,Endeavor Air Inc.,ALB,"Albany, NY: Albany International",23.0,2.0,0.56,0.0,0.0,0.0,...,1.0,0.0,75.0,9.0,0.0,0.0,0.0,66.0,"Albany, NY",Albany International
4,12,Endeavor Air Inc.,ATL,"Atlanta, GA: Hartsfield-Jackson Atlanta Intern...",2111.0,256.0,76.88,8.75,52.43,0.0,...,1.0,0.0,21424.0,8906.0,732.0,1487.0,0.0,10299.0,"Atlanta, GA",Hartsfield-Jackson Atlanta International


In [11]:
# Remove the original airport_name column and the split-off name column,
# keeping only city, then show the columns that remain.
data_df = data_df.drop(['airport_name', 'name'], axis="columns")
data_df.columns

Index(['month', 'carrier_name', 'airport', 'total_arrivals', 'total_delays_ct',
       'carrier_ct', 'weather_ct', 'nat_air_sys_ct', 'security_ct',
       'late_aircraft_ct', 'flight_cancelled', 'flight_diverted',
       'total_delays_min', 'carrier_delay_min', 'weather_delay_min',
       'nat_air_sys_delay_min', 'security_delay_min',
       'late_aircraft_delay_min', 'city'],
      dtype='object')

In [13]:
# Reorder the columns so that city sits directly after airport;
# all other columns keep their existing order.
data_df = data_df.loc[:, [
    'month',
    'carrier_name',
    'airport',
    'city',
    'total_arrivals',
    'total_delays_ct',
    'carrier_ct',
    'weather_ct',
    'nat_air_sys_ct',
    'security_ct',
    'late_aircraft_ct',
    'flight_cancelled',
    'flight_diverted',
    'total_delays_min',
    'carrier_delay_min',
    'weather_delay_min',
    'nat_air_sys_delay_min',
    'security_delay_min',
    'late_aircraft_delay_min',
]]
data_df.head()

Unnamed: 0,month,carrier_name,airport,city,total_arrivals,total_delays_ct,carrier_ct,weather_ct,nat_air_sys_ct,security_ct,late_aircraft_ct,flight_cancelled,flight_diverted,total_delays_min,carrier_delay_min,weather_delay_min,nat_air_sys_delay_min,security_delay_min,late_aircraft_delay_min
0,12,Endeavor Air Inc.,ABE,"Allentown/Bethlehem/Easton, PA",72.0,5.0,2.46,1.0,0.73,0.0,0.81,0.0,0.0,672.0,61.0,574.0,20.0,0.0,17.0
1,12,Endeavor Air Inc.,AEX,"Alexandria, LA",62.0,7.0,4.25,0.0,1.0,0.0,1.75,0.0,0.0,348.0,252.0,0.0,33.0,0.0,63.0
2,12,Endeavor Air Inc.,AGS,"Augusta, GA",95.0,10.0,5.94,0.0,1.06,0.0,3.0,0.0,0.0,859.0,536.0,0.0,47.0,0.0,276.0
3,12,Endeavor Air Inc.,ALB,"Albany, NY",23.0,2.0,0.56,0.0,0.0,0.0,1.44,1.0,0.0,75.0,9.0,0.0,0.0,0.0,66.0
4,12,Endeavor Air Inc.,ATL,"Atlanta, GA",2111.0,256.0,76.88,8.75,52.43,0.0,117.94,1.0,0.0,21424.0,8906.0,732.0,1487.0,0.0,10299.0


In [14]:
# Save the cleaned data to a new CSV file. The header row is written by
# default; index=False leaves the row index out of the file.
data_df.to_csv("2023_data_cleaned.csv", index=False)

#### 2023_data_cleaned.csv will be imported into a SQL database via pgAdmin for further data exploration.