In [44]:
#EXAMPLE 1 - LOAD A CSV WITH THE MOST EFFICIENT MEMORY TYPES

import pandas as pd

in_csv = r"C:\Users\dconly\Desktop\Temporary\Replica Select Link Data\I5_AmRiver\trips_wednesday_mar2019-may2019_sacramento_1filters_created11-24-2020\trips_wednesday_mar2019-may2019_sacramento_1filters_created11-24-2020.csv"

df_normal_load = pd.read_csv(in_csv)

# dict to specify dtypes so that most efficient possible are used rather than defaults.
# NOTE that you need to know ahead of time what the dtypes needed are
column_dtypes = {#'activity_id':'int64',
    #'person_id':'int64',
    'mode':'category',
    'travel_purpose':'category',
    'previous_activity_type':'category',
    'start_local_hour':'int8',
    'end_local_hour':'int8',
    'origin_cty':'int32',
    #'origin_bgrp':'int64',
    'destination_cty':'int32'
    # 'destination_bgrp':'int64'
    }

# can load datetime cols as dates rather than strings but need to specify
# which cols you want to do this to.
datetime_cols = ['start_time', 'end_time']

df_optimized_load = pd.read_csv(in_csv, dtype=column_dtypes, parse_dates=datetime_cols) # parse_dates=datetime_cols

In [26]:
normal_load_usage = df_normal_load.memory_usage(deep=True).sum()/1024
optimized_load_usage = df_optimized_load.memory_usage(deep=True).sum()/1024

print(f"non-optimized uses {normal_load_usage} MB of memory")
print(f"optimized uses {optimized_load_usage} MB of memory")

non-optimized uses 40667.2060546875 MB of memory
optimized uses 22575.642578125 MB of memory


In [56]:
# EXAMPLE FUNCTION TO MINIMIZE A DATAFRAME MEMORY CONSUMPTION AFTER LOADING
def df_optimize(in_df):
    # common default data types that can be reduced to more memory-efficient type
    dtype_obj = 'object'
    dtypes_int = ['int64', 'uint64']
    dtypes_float = ['float64']
    
    # category dtype is far more efficient way to store strings if not many unique string values.
    dtype_category = 'category'
    
    for col in in_df.columns:
        start_dtype = in_df[col].dtype
        if start_dtype in dtypes_int:
            in_df[col] = pd.to_numeric(in_df[col], downcast='integer') # sets to biggest size necessary, not biggest size possible
        elif start_dtype in dtypes_float:
            in_df[col] = pd.to_numeric(in_df[col], downcast='float')
        elif start_dtype == dtype_obj:
            # if number of unique string vals is less than 40% of the total number of vals in column,
            # then recode as category instead of string, which will save significant memory
            if len(in_df[col].unique()) / len(in_df[col]) < 0.4:
                in_df[col] = in_df[col].astype('category')
            else:
                continue
        else:
            continue
            
df_optimize(df_optimized_load)

normal_load_usage = df_normal_load.memory_usage(deep=True).sum()/1024
optimized_load_usage = df_optimized_load.memory_usage(deep=True).sum()/1024

print(f"non-optimized uses {normal_load_usage} MB of memory")
print(f"optimized uses {optimized_load_usage} MB of memory")

non-optimized uses 40667.2060546875 MB of memory
optimized uses 14278.455078125 MB of memory
