# Notebook to help figure out optimal column types
Choose column types from:
* https://numpy.org/devdocs/user/basics.types.html
* https://numpy.org/devdocs/reference/arrays.datetime.html

In [1]:
import numpy as np
import pandas as pd

In [2]:
def reduce_mem_usage(df, verbose=True):
    """
    Downcast the numeric columns of a dataframe to the smallest dtype whose
    value range contains the column's minimum and maximum.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified IN PLACE (columns are reassigned on
        the object that was passed in) and the same object is returned.
    verbose : bool, default True
        When true, print the final memory footprint and the relative saving.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with int16/32/64 and float16/32/64 columns
        re-typed where a narrower type can hold their range.

    Notes
    -----
    Only the value *range* is checked for floats, not the precision: a column
    converted to float16 or float32 may have its values rounded.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate targets, narrowest first; the first one that fits wins.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the type's min/max is
                # still representable (e.g. 127 fits in int8).
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            # No candidate matches only when min/max are NaN (empty column);
            # the column is then left untouched.
        else:
            # Falls back to float64 when the range fits neither narrower type
            # or when min/max are NaN (comparisons with NaN are False).
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024 ** 2
    # Guard the ratio so a frame reporting zero bytes cannot divide by zero.
    reduced_mem = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'
              .format(end_mem, reduced_mem))
    return df

In [3]:
def print_min_max_column(df, column):
    """Print the smallest and largest value of one column of ``df``."""
    values = df[column]
    lowest = values.min()
    highest = values.max()
    print(column, "min:", lowest, "max:", highest)
    
def print_min_max(df):
    """Print the min/max line for every column of ``df``, in column order."""
    for col_name in df.columns:
        print_min_max_column(df, column=col_name)

In [4]:
# Base directory that the read_csv calls below prefix to "/raw/<file>.csv";
# relative to the notebook's working directory.
data_dir = "../data"

In [5]:
# Read raw/train.csv, show each column's value range, then downcast the
# numeric columns and list the resulting dtypes.
train_df = pd.read_csv(f"{data_dir}/raw/train.csv")
print_min_max(train_df)
train_df = reduce_mem_usage(train_df)
print(train_df.dtypes)

building_id min: 0 max: 1448
meter min: 0 max: 3
timestamp min: 2016-01-01 00:00:00 max: 2016-12-31 23:00:00
meter_reading min: 0.0 max: 21904700.0
Mem. usage decreased to 289.19 Mb (53.1% reduction)
building_id        int16
meter               int8
timestamp         object
meter_reading    float32
dtype: object


In [6]:
# Read raw/test.csv, show each column's value range, then downcast the
# numeric columns and list the resulting dtypes.
test_df = pd.read_csv(data_dir + "/raw/test.csv")
print_min_max(test_df)
# Assign the result back to test_df: binding it to train_df would replace the
# training frame loaded earlier with the test data.
test_df = reduce_mem_usage(test_df)
print(test_df.dtypes)

row_id min: 0 max: 41697599
building_id min: 0 max: 1448
meter min: 0 max: 3
timestamp min: 2017-01-01 00:00:00 max: 2018-12-31 23:00:00
Mem. usage decreased to 596.49 Mb (53.1% reduction)
row_id          int32
building_id     int16
meter            int8
timestamp      object
dtype: object


In [7]:
# Read raw/weather_train.csv, show each column's value range, then downcast
# the numeric columns and list the resulting dtypes.
weather_train_df = pd.read_csv(f"{data_dir}/raw/weather_train.csv")
print_min_max(weather_train_df)
weather_train_df = reduce_mem_usage(weather_train_df)
print(weather_train_df.dtypes)

site_id min: 0 max: 15
timestamp min: 2016-01-01 00:00:00 max: 2016-12-31 23:00:00
air_temperature min: -28.9 max: 47.2
cloud_coverage min: 0.0 max: 9.0
dew_temperature min: -35.0 max: 26.1
precip_depth_1_hr min: -1.0 max: 343.0
sea_level_pressure min: 968.2 max: 1045.5
wind_direction min: 0.0 max: 360.0
wind_speed min: 0.0 max: 19.0
Mem. usage decreased to  3.07 Mb (68.1% reduction)
site_id                  int8
timestamp              object
air_temperature       float16
cloud_coverage        float16
dew_temperature       float16
precip_depth_1_hr     float16
sea_level_pressure    float16
wind_direction        float16
wind_speed            float16
dtype: object


In [8]:
# Read raw/weather_test.csv, show each column's value range, then downcast
# the numeric columns and list the resulting dtypes.
weather_test_df = pd.read_csv(f"{data_dir}/raw/weather_test.csv")
print_min_max(weather_test_df)
weather_test_df = reduce_mem_usage(weather_test_df)
print(weather_test_df.dtypes)

site_id min: 0 max: 15
timestamp min: 2017-01-01 00:00:00 max: 2018-12-31 23:00:00
air_temperature min: -28.1 max: 48.3
cloud_coverage min: 0.0 max: 9.0
dew_temperature min: -31.6 max: 26.7
precip_depth_1_hr min: -1.0 max: 597.0
sea_level_pressure min: 972.0 max: 1050.1
wind_direction min: 0.0 max: 360.0
wind_speed min: 0.0 max: 24.2
Mem. usage decreased to  6.08 Mb (68.1% reduction)
site_id                  int8
timestamp              object
air_temperature       float16
cloud_coverage        float16
dew_temperature       float16
precip_depth_1_hr     float16
sea_level_pressure    float16
wind_direction        float16
wind_speed            float16
dtype: object


In [9]:
# Read raw/building_metadata.csv, show each column's value range, then
# downcast the numeric columns and list the resulting dtypes.
building_metadata_df = pd.read_csv(f"{data_dir}/raw/building_metadata.csv")
print_min_max(building_metadata_df)
building_metadata_df = reduce_mem_usage(building_metadata_df)
print(building_metadata_df.dtypes)

site_id min: 0 max: 15
building_id min: 0 max: 1448
primary_use min: Education max: Warehouse/storage
square_feet min: 283 max: 875000
year_built min: 1900.0 max: 2017.0
floor_count min: 1.0 max: 26.0
Mem. usage decreased to  0.03 Mb (60.3% reduction)
site_id           int8
building_id      int16
primary_use     object
square_feet      int32
year_built     float16
floor_count    float16
dtype: object
