In [None]:
import pandas as pd


def block_break(datas: object, width: int = 25) -> None:
    """
    Print a value followed by a dashed separator line.

    The separator makes it easier to tell the output of one section
    apart from the next when several results are printed in a row.

    :param datas: anything that can be printed
    :param width: number of '-' characters in the separator line (default 25)
    :return: None
    """
    print(datas)
    print("-" * width)


In [None]:
# load the raw flight records from disk
dataset = pd.read_csv('nycflights.csv')
# work on an independent (deep) copy so the raw records stay untouched
data = dataset.copy(deep=True)
# show the available column names for reference in the later cells
block_break(data.columns)

In [None]:
# How many flights were operated by American Airlines and how many by Delta Air Lines?
# tally the rows per carrier code, then look up the two codes of interest by label
carriers = data['carrier'].value_counts()
print(f'# of American Airlines flights: {carriers.loc["AA"]}')
block_break(f'# of Delta Airlines flights: {carriers.loc["DL"]}')


In [None]:
# Identify the top destinations from JFK airport.
# keep only the rows whose origin is JFK
from_JFK = data[data['origin'] == 'JFK']
# tally each (origin, dest) pair; value_counts orders the result from most to least frequent
top_dest = from_JFK.value_counts(subset=['origin', 'dest'])
# show the five most frequent pairs
block_break(top_dest.head(5))

In [None]:
# Count the number of flights departing from each origin airport to different destinations.
# per origin airport, count how many distinct (non-na) destinations appear
num_of_dest = data.groupby('origin')['dest'].agg('nunique')
block_break(num_of_dest)

In [None]:
# How many missing values are there in the column dep_time?
# flag every row whose dep_time is missing, then add up the flags
dep_na = data['dep_time'].isnull().sum()
block_break(f'The number of missing departure times is: {dep_na}')

In [None]:
# Identify the airline with the highest average departure delay.
# step 1: average the dep_delay values separately for every carrier
avg_dep_del = data.groupby('carrier')['dep_delay'].mean()
# step 2: order the averages from largest to smallest
avg_dep_del = avg_dep_del.sort_values(ascending=False)
# step 3: the first entry now holds the worst carrier (index label) and its average (value)
max_delay_carrier = avg_dep_del.index[0]
max_del_avg = avg_dep_del.iloc[0]
print(f'Carrier with the highest average departure delay: {max_delay_carrier}')
block_break(f'\t\t\t\t\t\t\t\t\t   Avg delay: {max_del_avg:.2f} min')

In [None]:
# Count the number of flights departing from each origin airport to different destinations.
# NOTE(review): this cell recomputes the same per-origin distinct-destination count
# that an earlier cell already produced - confirm whether the repetition is intended.
# for every origin, count the distinct destinations (missing values are not counted)
num_of_dest = data.groupby('origin')['dest'].nunique(dropna=True)
block_break(num_of_dest)

In [None]:
# Extract the hour and minutes from the 'dep_time' column and put them into new, appropriately named columns
# copy the original dataset prior to manipulation
dataset2 = dataset.copy()
# dep_time is treated as an HHMM number: the last two digits are the minutes and
# whatever precedes them is the hour. Integer arithmetic is used instead of string
# slicing because slicing breaks when the column is float (e.g. '517.0' or 'nan')
# and yields an empty hour for values below 100 (e.g. 45 -> hour '', minute '45').
dep_time_num = pd.to_numeric(dataset2['dep_time'], errors='coerce')
# the nullable 'Int64' dtype keeps missing departure times as <NA> instead of failing the cast
dep_hour = (dep_time_num // 100).astype('Int64')
dep_min = (dep_time_num % 100).astype('Int64')
# insert the new columns exactly where dep_time currently sits, rather than assuming index 3
dep_time_pos = dataset2.columns.get_loc('dep_time')
dataset2.insert(dep_time_pos, 'dep_hour', dep_hour)
dataset2.insert(dep_time_pos + 1, 'dep_min', dep_min)
# remove the dep_time column now that it has been split
del dataset2['dep_time']
block_break(dataset2)