## Airline Analysis - Project 1 - Adrian Santos  
#### Last updated: 2023-11-01 -- v0.1

### Import the data file, load it into a DataFrame, scrub nulls, recast the date variable, and reorder columns

In [1]:
# Initial set-up - Import libraries
import pandas as pd

In [2]:
# Open the tab-separated airline route file with explicit column names and
# up-front dtypes for the numeric columns.
df = pd.read_table(
    './Resources/flight_edges.tsv',
    names=[
        'Origin Airport',
        'Destination Airport',
        'Origin City',
        'Destination City',
        'Passengers',
        'Seats',
        'Flights',
        'Distance',
        'Fly Date',
        'Origin Population',
        'Destination Population',
    ],
    dtype={
        'Passengers': int,
        'Seats': int,
        'Flights': int,
        'Distance': int,
        'Fly Date': str,
        'Origin Population': int,
        'Destination Population': int,
    },
    # Because chunksize is set, `df` is an iterator that yields DataFrames of
    # up to 1,000,000 rows each, not a DataFrame itself.
    chunksize=1_000_000,
)

In [3]:
# Recast 'Fly Date' from YYYYMM format to separate columns of 'Month' and 'Year'
def flydate_recast(date:str):
    """Split a 'YYYYMM' fly-date string into its month and year parts.

    Args:
        date: Fly date in YYYYMM form, e.g. '200810'.

    Returns:
        A (month, year) tuple of strings: the last two characters and the
        first four characters of `date`, e.g. ('10', '2008').
    """
    # Slice the string directly; there is no need to convert it to a list of
    # characters and join it back together.
    return date[-2:], date[:4]

In [4]:
# Accumulator for the processed chunks (each chunk holds up to 1,000,000 rows)
chunks = list()

In [5]:
# Process the file one chunk at a time
for chunk in df:

    # Drop rows with null values (new frame instead of mutating in place)
    chunk = chunk.dropna()

    # Split 'Fly Date' (YYYYMM) into 'Month' (last two characters) and 'Year'
    # (first four characters). Vectorized string slicing yields the same values
    # as calling flydate_recast on every row, but avoids constructing one
    # pd.Series per row, which is very slow across millions of rows.
    chunk = chunk.assign(
        Month=chunk['Fly Date'].str[-2:],
        Year=chunk['Fly Date'].str[:4],
    )

    # Keep the processed chunk for concatenation later
    chunks.append(chunk)

In [6]:
# Stack the processed chunks row-wise into a single DataFrame
df = pd.concat(chunks, axis=0)

In [7]:
# Display the (row count, column count) tuple of the combined DataFrame
df.shape

(3606803, 13)

In [8]:
# Move 'Month' and 'Year' so they sit directly after 'Fly Date'
column_reorder = [
    'Origin Airport',
    'Destination Airport',
    'Origin City',
    'Destination City',
    'Passengers',
    'Seats',
    'Flights',
    'Distance',
    'Fly Date',
    'Month',
    'Year',
    'Origin Population',
    'Destination Population',
]
df = df.reindex(column_reorder, axis='columns')

In [9]:
# Preview the first five rows of the reordered DataFrame
df.head(n=5)

Unnamed: 0,Origin Airport,Destination Airport,Origin City,Destination City,Passengers,Seats,Flights,Distance,Fly Date,Month,Year,Origin Population,Destination Population
0,MHK,AMW,"Manhattan, KS","Ames, IA",21,30,1,254,200810,10,2008,122049,86219
1,EUG,RDM,"Eugene, OR","Bend, OR",41,396,22,103,199011,11,1990,284093,76034
2,EUG,RDM,"Eugene, OR","Bend, OR",88,342,19,103,199012,12,1990,284093,76034
3,EUG,RDM,"Eugene, OR","Bend, OR",11,72,4,103,199010,10,1990,284093,76034
4,MFR,RDM,"Medford, OR","Bend, OR",0,18,1,156,199002,2,1990,147300,76034


## Scrubbed DataFrame is in place -- begin preliminary analyses

In [10]:
# Step 01 - Perform exploratory analysis by aggregating on various attributes