In [1]:
import pandas as pd
import numpy as np
import glob
import os
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

Bring all the data into a single dataframe.
Sample of the data is located in the Google Drive folder (link below). One zip file for each month for the year of 2017
https://drive.google.com/drive/folders/1lZJhPM24i9PQRW_nOTQj5sQy3FTiwhbY?usp=sharing


In [2]:
path = './Data/AirlineData/2018/'
all_files = glob.glob(os.path.join(path, "*.zip"))     # advisable to use os.path.join as this makes concatenation OS independent

df_from_each_file = (pd.read_csv(f, compression='zip', parse_dates=['FL_DATE']) for f in all_files)
concatenated_df = pd.concat(df_from_each_file)

In [3]:
concatenated_df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7213446 entries, 0 to 520730
Data columns (total 33 columns):
YEAR                   int64
MONTH                  int64
DAY_OF_MONTH           int64
DAY_OF_WEEK            int64
FL_DATE                datetime64[ns]
OP_CARRIER             object
ORIGIN                 object
ORIGIN_CITY_NAME       object
ORIGIN_STATE_ABR       object
DEST                   object
DEST_CITY_NAME         object
DEST_STATE_ABR         object
CRS_DEP_TIME           int64
DEP_TIME               float64
DEP_DELAY              float64
DEP_DELAY_NEW          float64
TAXI_OUT               float64
WHEELS_OFF             float64
WHEELS_ON              float64
TAXI_IN                float64
CRS_ARR_TIME           int64
ARR_TIME               float64
ARR_DELAY              float64
ARR_DELAY_NEW          float64
ACTUAL_ELAPSED_TIME    float64
AIR_TIME               float64
DISTANCE               float64
CARRIER_DELAY          float64
WEATHER_DELAY          float64
N

In [4]:
#define a function to find memory usage 
def mem_usage(pandas_obj):
    if isinstance(pandas_obj,pd.DataFrame):
        usage_b = pandas_obj.memory_usage(deep=True).sum()
    else: # we assume if not a df it's a series
        usage_b = pandas_obj.memory_usage(deep=True)
    usage_mb = usage_b / 1024 ** 2 # convert bytes to megabytes
    return "{:03.2f} MB".format(usage_mb)

# Get all the int-type data

In [None]:
df_int = concatenated_df.select_dtypes(include=['int'])
converted_int = df_int.apply(pd.to_numeric,downcast='unsigned')

print(mem_usage(df_int))
print(mem_usage(converted_int))

385.24 MB
116.95 MB


# Get all the float-type data

In [None]:
df_float = concatenated_df.select_dtypes(include=['float'])
converted_float = df_float.apply(pd.to_numeric,downcast='float')

print(mem_usage(df_float))
print(mem_usage(converted_float))

1100.68 MB
577.86 MB


# Create an optimized copy of df

In [None]:
optimized_df = concatenated_df.copy()

optimized_df[converted_int.columns] = converted_int
optimized_df[converted_float.columns] = converted_float

print(mem_usage(concatenated_df))
print(mem_usage(optimized_df))

4493.73 MB
3702.62 MB


# Get all the object-type data

In [None]:
df_obj = concatenated_df.select_dtypes(include=['object']).copy()

In [None]:
converted_obj = pd.DataFrame()

for col in df_obj.columns:
    num_unique_values = len(df_obj[col].unique())
    num_total_values = len(df_obj[col])
    if num_unique_values / num_total_values < 0.5:
        converted_obj.loc[:,col] = df_obj[col].astype('category')
    else:
        converted_obj.loc[:,col] = df_obj[col]
        
print(mem_usage(df_obj))
print(mem_usage(converted_obj))

3062.84 MB
130.85 MB


In [None]:
optimized_df[converted_obj.columns] = converted_obj
mem_usage(optimized_df)

'770.62 MB'

In [None]:
optimized_df = optimized_df.drop(['Unnamed: 32'], axis=1)

In [None]:
optimized_df.to_csv('./Data/2018.csv', index = False, encoding = 'utf-8')