# Project 2: Are we missing JSON on our flight?

In [104]:
import pandas as pd   # to load and transform data
import numpy as np    # for math/stat calculations
import altair as alt

# from file to pandas dataframe
flights = pd.read_json("flights_missing.json")


## 1. Which airport has the worst delays?

In [108]:

prop_total_delays = (flights.groupby('airport_code')
    .agg(total_flights = ('num_of_flights_total', sum),
        total_delays = ('num_of_delays_total', sum),
        total_delay_mins = ('minutes_delayed_total', sum))
    .assign(prop_delays = lambda x: x.total_delays / x.total_flights,
            avg_delay_hours = lambda x: x.total_delay_mins / x.total_delays / 60)
    .sort_values(by = ["prop_delays", "avg_delay_hours"], ascending = False)
)

print(prop_total_delays.to_markdown(floatfmt='.2f'))

| airport_code   |   total_flights |   total_delays |   total_delay_mins |   prop_delays |   avg_delay_hours |
|:---------------|----------------:|---------------:|-------------------:|--------------:|------------------:|
| SFO            |      1630945.00 |      425604.00 |        26550493.00 |          0.26 |              1.04 |
| ORD            |      3597588.00 |      830825.00 |        56356129.00 |          0.23 |              1.13 |
| ATL            |      4430047.00 |      902443.00 |        53983926.00 |          0.20 |              1.00 |
| IAD            |       851571.00 |      168467.00 |        10283478.00 |          0.20 |              1.02 |
| SAN            |       917862.00 |      175132.00 |         8276248.00 |          0.19 |              0.79 |
| DEN            |      2513974.00 |      468519.00 |        25173381.00 |          0.19 |              0.90 |
| SLC            |      1403384.00 |      205160.00 |        10123371.00 |          0.15 |              0.82 |


## 2. What is the best month to fly if you want to avoid delays of any length? 

In [109]:
# Create a new DataFrame without those rows that contains "n/a" under the column 'month'
clear_months = (flights.drop(flights.index[flights["month"] == "n/a"]))

# Create a DataFrame where the rows are grouped by airport_code and month
delays_airport_month = (clear_months.groupby(by=["airport_code", "month"])
        .agg(total_delays_per_month = ("num_of_delays_total",sum))
        .reset_index()
        )

# Create a chart representing how many delays have had each airport for each month
base_chart = (alt.Chart(delays_airport_month)
             .encode(
                x = alt.X("month",
                        title = "Month",
                        sort=["January", "Febuary", "March", "April", "May", "June",
                                    "July", "August", "September", "October", "November", "December"]),
                y = alt.Y("total_delays_per_month",
                        title = "Total of Delays per Airport"),

                color = "airport_code"
            )
            .mark_line()
            .configure_axis(
                    grid=True)
        .properties(title = "Most airports report an increase of delays on June and July"))

base_chart

In [110]:
delays_month_proportion = (clear_months.groupby('month')
    .agg(total_flights = ('num_of_flights_total', sum),
        total_delays = ('num_of_delays_total', sum))
    .assign(prop_delays = lambda x: x.total_delays / x.total_flights *100)
    .reset_index()
)

base_chart2 = (alt.Chart(delays_month_proportion)
                .encode(
                    x = alt.X("month",
                            title = "Month",
                            sort=["January", "Febuary", "March", "April", "May", "June",
                                    "July", "August", "September", "October", "November", "December"]),
                    y = alt.Y("prop_delays",
                            title="Total proportion of delays (%)",
                            scale=alt.Scale(zero=False)))                   
                .mark_bar()
                .configure_axis(
                    grid=True)
                .properties(title = """In proportion, December is the month with most delayed flights"""))

base_chart2

## 3. Create a new column that calculates the total number of flights delayed by weather (both severe and mild). 

In [119]:
# Calculate the average of the column um_of_Delays_late_aircraft
late_aircraft_avg = flights.num_of_delays_late_aircraft.replace(-999,np.nan).mean()

# Copy the orifinal DataFrame
q3 = flights

# Replace all the invalid values -999 with a standard missing value only in the column "num_of_delays_late_aircraft"
q3.num_of_delays_late_aircraft.replace(-999, late_aircraft_avg, inplace = True)

# Calculate how many flights were delayed because of severe and mild weather conditions
weather = q3.assign(
    severe_weather = q3.num_of_delays_weather,
    mild_weather_late_aircraft = .3*q3.num_of_delays_late_aircraft,
    mild_weather_nas = np.where(q3.month.isin(["April","May","June","July","August"]), 
                                    .4*q3.num_of_delays_nas, 
                                    .65*q3.num_of_delays_nas),
    num_delays_weather =  lambda x: x.severe_weather + x.mild_weather_late_aircraft + x.mild_weather_nas

)

print(weather[["airport_code","severe_weather","mild_weather_late_aircraft","mild_weather_nas","num_delays_weather"]].head().to_markdown())

|    | airport_code   |   severe_weather |   mild_weather_late_aircraft |   mild_weather_nas |   num_delays_weather |
|---:|:---------------|-----------------:|-----------------------------:|-------------------:|---------------------:|
|  0 | ATL            |              448 |                      332.731 |            2988.7  |              3769.43 |
|  1 | DEN            |              233 |                      278.4   |             607.75 |              1119.15 |
|  2 | IAD            |               61 |                      317.4   |             581.75 |               960.15 |
|  3 | ORD            |              306 |                      676.5   |            3519.75 |              4502.25 |
|  4 | SAN            |               56 |                      204     |             414.7  |               674.7  |


## 4. Create a barplot showing the proportion of all flights that are delayed by weather at each airport.

In [59]:
# Create a DataFrame that contains the proportion of delays caused by weather conditions grouped by airport
delays_by_weather_by_airport = (weather.groupby('airport_code')
    .agg(total_flights = ('num_of_flights_total'),
        total_delays_weather = ('num_delays_weather', sum))
    .assign(prop_delays_weather = lambda x: x.total_delays_weather / x.total_flights *100 )
    .reset_index()
    
)


delays_by_weather_by_airport

Unnamed: 0,airport_code,total_flights,total_delays_weather,prop_delays_weather
0,ATL,4430047,314800.623982,7.106034
1,DEN,2513974,149106.95,5.931125
2,IAD,851571,50842.65,5.970453
3,ORD,3597588,309954.074887,8.615608
4,SAN,917862,48920.55,5.329837
5,SFO,1630945,159593.7,9.785351
6,SLC,1403384,60345.75,4.300017


In [121]:
base_chart3 = (alt.Chart(delays_by_weather_by_airport)
                    .encode(
                        x = alt.X("airport_code",
                        title="Airport Code"),
                        y = alt.Y("prop_delays_weather",
                        title="Proportion of delays by weather(%)"),
                        color = "airport_code"
                    )
                    .mark_bar()
                    .properties(title="Less than 10% of all the delayed flights were caused by weather conditions")
)

base_chart3

## 5. Fix all of the varied missing data types in the data to be consistent

In [67]:
import json

# Replace all the missing values with NaN
consistent_flights = flights.replace([-999,"1500+","","n/a"], np.nan)
consistent_flights.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 924 entries, 0 to 923
Data columns (total 17 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   airport_code                   924 non-null    object 
 1   airport_name                   868 non-null    object 
 2   month                          897 non-null    object 
 3   year                           901 non-null    float64
 4   num_of_flights_total           924 non-null    int64  
 5   num_of_delays_carrier          851 non-null    object 
 6   num_of_delays_late_aircraft    924 non-null    float64
 7   num_of_delays_nas              924 non-null    int64  
 8   num_of_delays_security         924 non-null    int64  
 9   num_of_delays_weather          924 non-null    int64  
 10  num_of_delays_total            924 non-null    int64  
 11  minutes_delayed_carrier        872 non-null    float64
 12  minutes_delayed_late_aircraft  924 non-null    int

In [93]:
# Create a DataFrame with only those rows that contain NaN values
nan_flights= consistent_flights[consistent_flights.isna().any(axis=1)]

# Using the DataFrame created above, create a json document
json_flights = nan_flights.to_json(orient="records")
json_object = json.loads(json_flights)
json_formatted_str = json.dumps(json_object, indent = 4)
print(json_formatted_str)


[
    {
        "airport_code": "ATL",
        "airport_name": "Atlanta, GA: Hartsfield-Jackson Atlanta International",
        "month": "January",
        "year": 2005.0,
        "num_of_flights_total": 35048,
        "num_of_delays_carrier": null,
        "num_of_delays_late_aircraft": 1109.1040723982,
        "num_of_delays_nas": 4598,
        "num_of_delays_security": 10,
        "num_of_delays_weather": 448,
        "num_of_delays_total": 8355,
        "minutes_delayed_carrier": 116423.0,
        "minutes_delayed_late_aircraft": 104415,
        "minutes_delayed_nas": 207467.0,
        "minutes_delayed_security": 297,
        "minutes_delayed_weather": 36931,
        "minutes_delayed_total": 465533
    },
    {
        "airport_code": "IAD",
        "airport_name": null,
        "month": "January",
        "year": 2005.0,
        "num_of_flights_total": 12381,
        "num_of_delays_carrier": "414",
        "num_of_delays_late_aircraft": 1058.0,
        "num_of_delays_nas": 895,
  