### Author: Chheang
### Title: Data Load and Manipulation Optimization
### Goal: Look at how we can reduce processing time and memory usage when loading and manipulating data

In [23]:
# Smoke test: prints a fixed greeting (presumably to confirm the kernel runs — TODO confirm it is still needed).
print("hello world")

hello world


In [24]:
import pandas as pd 
import numpy as np

import time
import tracemalloc

# Locations of the 2018 combined-flights dataset, one entry per on-disk format
# (CSV first, then Parquet). Paths are relative to the notebook's working directory.
combinedFlights2018CSV, combinedFlights2018Parquet = (
    "archive/Combined_Flights_2018.csv",
    "archive/Combined_Flights_2018.parquet",
)

# Begin tracing Python memory allocations so later cells can query current/peak usage.
tracemalloc.start()


### Loading the data

In [25]:
# #time for full load of csv

# tic = time.process_time()
# full_load = pd.read_csv(combinedFlights2018CSV)
# toc = time.process_time()
# print("Full Load CSV Process Time:", toc-tic)

# #loading only the columns you need is faster and more memory efficient

# Subset of columns handed to the readers below, so only these columns are requested.
# Order is kept exactly as originally declared.
columns_to_use = [
    "Airline", "Origin", "Dest",
    "CRSDepTime", "Distance",
    "Year", "Quarter", "Month", "DayofMonth", "DayOfWeek",
    "DepTimeBlk", "ArrTimeBlk", "DistanceGroup",
    "ArrDelayMinutes",  # rows missing this value are dropped in a later cell
]
# #time for load of csv with select columns

# tic = time.process_time()
# select_load = pd.read_csv(combinedFlights2018CSV, usecols = columns_to_use)
# toc = time.process_time()
# print("Select Columns CSV Process Time:", toc-tic)

# #time for load of csv with select columns running the PyArrow engine
# #pip install pyarrow

# tic = time.process_time()
# select_load = pd.read_csv(combinedFlights2018CSV, usecols = columns_to_use, engine="pyarrow")
# toc = time.process_time()
# print("PyArrow Select Columns CSV Process Time:", toc-tic)

# #time for load of parquet with select columns

# tic = time.process_time()
# select_load = pd.read_parquet(combinedFlights2018Parquet, columns = columns_to_use)
# toc = time.process_time()
# print("Select Columns Parquet Process Time:", toc-tic)

# #time for load of parquet with select columns running the PyArrow engine
# #pip install pyarrow

# tic = time.process_time()
# select_load = pd.read_parquet(combinedFlights2018Parquet, columns = columns_to_use, engine="pyarrow")
# toc = time.process_time()
# print("PyArrow Select Columns Parquet Process Time:", toc-tic)

# #time for load of parquet with select columns running the FastParquet engine
# #pip install fastparquet

# Time the fastparquet load of the selected columns into `data`.
#
# Two clocks are reported because they answer different questions:
#   * time.process_time() is the CPU time (user + system) consumed by this
#     process. It excludes time spent sleeping or waiting on the disk, and it
#     is process-wide, so a multi-threaded reader can show MORE CPU time than
#     the wall-clock time it actually took.
#   * time.perf_counter() is elapsed wall-clock time, i.e. how long the load
#     really takes from the user's point of view.
# Comparing loaders on CPU time alone can therefore rank them incorrectly.
#
# NOTE: tracemalloc was started in the setup cell and is still tracing here;
# tracing adds overhead to every allocation, so both timings are inflated
# relative to an untraced run.
wall_tic = time.perf_counter()
tic = time.process_time()
data = pd.read_parquet(combinedFlights2018Parquet, columns = columns_to_use, engine="fastparquet")
toc = time.process_time()
wall_toc = time.perf_counter()
print("FastParquet Select Columns Parquet Process Time:", toc-tic)
print("FastParquet Select Columns Parquet Wall Time:", wall_toc-wall_tic)


FastParquet Select Columns Parquet Process Time: 1.3014710000000065


### Data Manipulation Techniques

In [26]:
# Memory demonstration: prints tracemalloc's (current_bytes, peak_bytes) tuple
# after each step while large DataFrames are created, concatenated and released.
# The peak value is never reset, so it can only grow from one print to the next.
# NOTE(review): the last step rebinds `data` to an empty frame, so re-running
# this cell on its own would call dropna on that empty frame; re-run the load
# cell first.

# note: do not use dataframe.info with tracemalloc
# data.info(memory_usage = "deep")

# Drop rows with no ArrDelayMinutes value. With inplace=True the call modifies
# `data` itself and returns None, so no second name is bound to the result.
# From the pandas documentation:
# inplace : bool, default False
# If False, return a copy. Otherwise, do operation inplace and return None.
# NOTE(review): the original note here said in-place operations avoid
# duplicating the data; pandas' PDEP-8 lists dropna among the methods that
# cannot truly operate in place, so peak memory may be the same as with
# `data = data.dropna(...)` — TODO confirm by measuring.
data.dropna(subset=['ArrDelayMinutes'], inplace = True)

print(tracemalloc.get_traced_memory())

## Load the same parquet file three times, keeping every copy alive in a list.
data_list = list()
for i in range(3):
    # data_temp is rebound on every pass, but each loaded frame stays alive
    # because data_list holds a reference to it
    data_temp = pd.read_parquet(combinedFlights2018Parquet, columns = columns_to_use, engine="fastparquet")
    data_list.append(data_temp)
    print(tracemalloc.get_traced_memory())
data_temp = pd.DataFrame() #note: this does not improve memory (data_list still references the last frame)
print(tracemalloc.get_traced_memory()) #note: notice that this is the same memory as the last iteration



# Build one frame from the three copies; the inputs and the result are all alive at this point.
combined_data = pd.concat(data_list, ignore_index=True)
# combined_data.info(memory_usage = "deep")
print(tracemalloc.get_traced_memory())
data_list = list() #free the memory: rebinding drops the list's references to the three loaded frames
print(tracemalloc.get_traced_memory())
data = pd.DataFrame() #free the memory from the first test (the frame loaded in the previous cell)
print(tracemalloc.get_traced_memory())

(670537088, 1449183705)
(1353334137, 1449183705)
(2036123166, 2074221769)
(2718912064, 2757010835)
(2718913292, 2757010835)
(4630592875, 4767150737)
(2582354700, 4767150737)
(1911914958, 4767150737)


In [27]:
# Stop tracing Python memory allocations (pairs with the tracemalloc.start() call in the setup cell).
tracemalloc.stop()