In [1]:
import pandas as pd
import numpy as np
import glob
import os

Bring all the data into a single dataframe.
Sample of the data is located in the Google Drive folder (link below). One zip file for each month for the year of 2017
https://drive.google.com/drive/folders/1lZJhPM24i9PQRW_nOTQj5sQy3FTiwhbY?usp=sharing


In [2]:
path = './Data/'
all_files = glob.glob(os.path.join(path, "*_2018_*.csv"))     # advisable to use os.path.join as this makes concatenation OS independent

df_from_each_file = (pd.read_csv(f) for f in all_files)
concatenated_df = pd.concat(df_from_each_file)

  """
  """
  """


In [3]:
concatenated_df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7213446 entries, 0 to 611986
Columns: 110 entries, Year to Unnamed: 109
dtypes: float64(71), int64(20), object(19)
memory usage: 12.0 GB


In [4]:
#define a function to find memory usage 
def mem_usage(pandas_obj):
    if isinstance(pandas_obj,pd.DataFrame):
        usage_b = pandas_obj.memory_usage(deep=True).sum()
    else: # we assume if not a df it's a series
        usage_b = pandas_obj.memory_usage(deep=True)
    usage_mb = usage_b / 1024 ** 2 # convert bytes to megabytes
    return "{:03.2f} MB".format(usage_mb)

# Get all the int-type data

In [5]:
df_int = concatenated_df.select_dtypes(include=['int'])
converted_int = df_int.apply(pd.to_numeric,downcast='unsigned')

print(mem_usage(df_int))
print(mem_usage(converted_int))

1155.72 MB
295.81 MB


# Get all the float-type data

In [6]:
df_float = concatenated_df.select_dtypes(include=['float'])
converted_float = df_float.apply(pd.to_numeric,downcast='float')

print(mem_usage(df_float))
print(mem_usage(converted_float))

3962.46 MB
2008.75 MB


# Create an optimized copy of df

In [7]:
optimized_df = concatenated_df.copy()

optimized_df[converted_int.columns] = converted_int
optimized_df[converted_float.columns] = converted_float

print(mem_usage(concatenated_df))
print(mem_usage(optimized_df))

12281.19 MB
9467.56 MB


# Get all the object-type data

In [8]:
df_obj = concatenated_df.select_dtypes(include=['object']).copy()

In [9]:
converted_obj = pd.DataFrame()

for col in df_obj.columns:
    num_unique_values = len(df_obj[col].unique())
    num_total_values = len(df_obj[col])
    if num_unique_values / num_total_values < 0.5:
        converted_obj.loc[:,col] = df_obj[col].astype('category')
    else:
        converted_obj.loc[:,col] = df_obj[col]
        
print(mem_usage(df_obj))
print(mem_usage(converted_obj))

7273.07 MB
241.95 MB


In [10]:
optimized_df[converted_obj.columns] = converted_obj
mem_usage(optimized_df)

'2436.44 MB'

In [11]:
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 200)
pd.set_option('display.width', 100)
pd.set_option('display.max_info_columns', 200)

In [12]:
optimized_df.head()

Unnamed: 0,Year,Quarter,Month,DayofMonth,DayOfWeek,FlightDate,Reporting_Airline,DOT_ID_Reporting_Airline,IATA_CODE_Reporting_Airline,Tail_Number,Flight_Number_Reporting_Airline,OriginAirportID,OriginAirportSeqID,OriginCityMarketID,Origin,OriginCityName,OriginState,OriginStateFips,OriginStateName,OriginWac,DestAirportID,DestAirportSeqID,DestCityMarketID,Dest,DestCityName,DestState,DestStateFips,DestStateName,DestWac,CRSDepTime,DepTime,DepDelay,DepDelayMinutes,DepDel15,DepartureDelayGroups,DepTimeBlk,TaxiOut,WheelsOff,WheelsOn,TaxiIn,CRSArrTime,ArrTime,ArrDelay,ArrDelayMinutes,ArrDel15,ArrivalDelayGroups,ArrTimeBlk,Cancelled,CancellationCode,Diverted,CRSElapsedTime,ActualElapsedTime,AirTime,Flights,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,FirstDepTime,TotalAddGTime,LongestAddGTime,DivAirportLandings,DivReachedDest,DivActualElapsedTime,DivArrDelay,DivDistance,Div1Airport,Div1AirportID,Div1AirportSeqID,Div1WheelsOn,Div1TotalGTime,Div1LongestGTime,Div1WheelsOff,Div1TailNum,Div2Airport,Div2AirportID,Div2AirportSeqID,Div2WheelsOn,Div2TotalGTime,Div2LongestGTime,Div2WheelsOff,Div2TailNum,Div3Airport,Div3AirportID,Div3AirportSeqID,Div3WheelsOn,Div3TotalGTime,Div3LongestGTime,Div3WheelsOff,Div3TailNum,Div4Airport,Div4AirportID,Div4AirportSeqID,Div4WheelsOn,Div4TotalGTime,Div4LongestGTime,Div4WheelsOff,Div4TailNum,Div5Airport,Div5AirportID,Div5AirportSeqID,Div5WheelsOn,Div5TotalGTime,Div5LongestGTime,Div5WheelsOff,Div5TailNum,Unnamed: 109
0,2018,4,10,14,7,2018-10-14,AA,19805,AA,N925UY,1674,12892,1289208,32575,LAX,"Los Angeles, CA",CA,6,California,91,11057,1105703,31057,CLT,"Charlotte, NC",NC,37,North Carolina,36,1123,1127.0,4.0,4.0,0.0,0.0,1100-1159,26.0,1153.0,1845.0,9.0,1910,1854.0,-16.0,0.0,0.0,-2.0,1900-1959,0.0,,0.0,287.0,267.0,232.0,1.0,2125.0,9,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,2018,4,10,15,1,2018-10-15,AA,19805,AA,N583UW,1674,12892,1289208,32575,LAX,"Los Angeles, CA",CA,6,California,91,11057,1105703,31057,CLT,"Charlotte, NC",NC,37,North Carolina,36,1123,1119.0,-4.0,0.0,0.0,-1.0,1100-1159,23.0,1142.0,1837.0,12.0,1910,1849.0,-21.0,0.0,0.0,-2.0,1900-1959,0.0,,0.0,287.0,270.0,235.0,1.0,2125.0,9,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2018,4,10,16,2,2018-10-16,AA,19805,AA,N581UW,1674,12892,1289208,32575,LAX,"Los Angeles, CA",CA,6,California,91,11057,1105703,31057,CLT,"Charlotte, NC",NC,37,North Carolina,36,1123,1119.0,-4.0,0.0,0.0,-1.0,1100-1159,20.0,1139.0,1847.0,9.0,1910,1856.0,-14.0,0.0,0.0,-1.0,1900-1959,0.0,,0.0,287.0,277.0,248.0,1.0,2125.0,9,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,2018,4,10,17,3,2018-10-17,AA,19805,AA,N551UW,1674,12892,1289208,32575,LAX,"Los Angeles, CA",CA,6,California,91,11057,1105703,31057,CLT,"Charlotte, NC",NC,37,North Carolina,36,1123,1120.0,-3.0,0.0,0.0,-1.0,1100-1159,20.0,1140.0,1856.0,6.0,1910,1902.0,-8.0,0.0,0.0,-1.0,1900-1959,0.0,,0.0,287.0,282.0,256.0,1.0,2125.0,9,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,2018,4,10,18,4,2018-10-18,AA,19805,AA,N181UW,1674,12892,1289208,32575,LAX,"Los Angeles, CA",CA,6,California,91,11057,1105703,31057,CLT,"Charlotte, NC",NC,37,North Carolina,36,1123,1125.0,2.0,2.0,0.0,0.0,1100-1159,21.0,1146.0,1855.0,11.0,1910,1906.0,-4.0,0.0,0.0,-1.0,1900-1959,0.0,,0.0,287.0,281.0,249.0,1.0,2125.0,9,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [13]:
optimized_df = optimized_df.drop([
 'DOT_ID_Reporting_Airline',
 'IATA_CODE_Reporting_Airline',
 'OriginStateFips',
 'OriginStateName',
 'OriginWac',
 'DestAirportID',
 'DestAirportSeqID',
 'DestCityMarketID',
 'DestStateFips',
 'DestStateName',
 'DestWac',
 'DepDelayMinutes',
 'DepDel15',
 'ArrDelayMinutes',
 'ArrDel15',
 'ArrivalDelayGroups',
 'Cancelled',
 'CancellationCode',
 'Diverted',
 'FirstDepTime',
 'TotalAddGTime',
 'LongestAddGTime',
 'DivAirportLandings',
 'DivReachedDest',
 'DivActualElapsedTime',
 'DivArrDelay',
 'DivDistance',
 'Div1Airport',
 'Div1AirportID',
 'Div1AirportSeqID',
 'Div1WheelsOn',
 'Div1TotalGTime',
 'Div1LongestGTime',
 'Div1WheelsOff',
 'Div1TailNum',
 'Div2Airport',
 'Div2AirportID',
 'Div2AirportSeqID',
 'Div2WheelsOn',
 'Div2TotalGTime',
 'Div2LongestGTime',
 'Div2WheelsOff',
 'Div2TailNum',
 'Div3Airport',
 'Div3AirportID',
 'Div3AirportSeqID',
 'Div3WheelsOn',
 'Div3TotalGTime',
 'Div3LongestGTime',
 'Div3WheelsOff',
 'Div3TailNum',
 'Div4Airport',
 'Div4AirportID',
 'Div4AirportSeqID',
 'Div4WheelsOn',
 'Div4TotalGTime',
 'Div4LongestGTime',
 'Div4WheelsOff',
 'Div4TailNum',
 'Div5Airport',
 'Div5AirportID',
 'Div5AirportSeqID',
 'Div5WheelsOn',
 'Div5TotalGTime',
 'Div5LongestGTime',
 'Div5WheelsOff',
 'Div5TailNum',
 'Unnamed: 109'], axis=1)

In [14]:
optimized_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7213446 entries, 0 to 611986
Data columns (total 42 columns):
Year                               uint16
Quarter                            uint8
Month                              uint8
DayofMonth                         uint8
DayOfWeek                          uint8
FlightDate                         category
Reporting_Airline                  category
Tail_Number                        category
Flight_Number_Reporting_Airline    uint16
OriginAirportID                    uint16
OriginAirportSeqID                 uint32
OriginCityMarketID                 uint16
Origin                             category
OriginCityName                     category
OriginState                        category
Dest                               category
DestCityName                       category
DestState                          category
CRSDepTime                         uint16
DepTime                            float32
DepDelay                         

In [15]:
optimized_df.to_csv('./Data/2018ALL.csv', index = False, encoding = 'utf-8')