In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the pickled test frame from the shared data directory.
# NOTE(review): unpickling can execute arbitrary code; only load files
# produced by this project's own pipeline.
test_df = pd.read_pickle("../data/test_converted.pkl")

In [None]:
# Load the pickled weather frame that is joined onto the test frame further down.
# NOTE(review): unpickling can execute arbitrary code; only load files
# produced by this project's own pipeline.
weather_test = pd.read_pickle("../data/weather_test_converted.pkl")

In [None]:
# Rename the weather `timestamp` column to `timestamp_weather_utc`.
# Rebinding the result (instead of inplace=True) keeps the cell chainable and
# avoids mutating a frame that other cells may already hold a reference to.
weather_test = weather_test.rename(columns={'timestamp': 'timestamp_weather_utc'})

In [None]:
# Preview the first five rows of the test frame.
test_df.head(n=5)

In [None]:
# Preview the last five rows of the test frame.
test_df.tail(n=5)

In [None]:
# Preview the first five rows of the weather frame.
weather_test.head(n=5)

In [None]:
# Preview the last rows of the weather frame; the previous cell already shows
# head(), mirroring the head()/tail() pair used for test_df.
weather_test.tail()

In [None]:
# Drop the renamed weather timestamp column ahead of the merge.
# errors="ignore" keeps this cell re-runnable: without it a second execution
# raises KeyError because the column has already been removed.
weather_test = weather_test.drop(columns="timestamp_weather_utc", errors="ignore")

In [None]:
# Left join: every row of test_df is kept and weather columns are attached
# where all five key columns match.
# NOTE(review): if weather_test holds repeated key combinations, the join will
# duplicate test rows -- worth confirming len(test_large) == len(test_df).
test_large = pd.merge(
    test_df,
    weather_test,
    how="left",
    on=["site_id", "timestamp_local", "timezone", "country_code", "location"],
)

In [None]:
def reduce_mem_usage(df, verbose=True, use_float16=True):
    """
    Downcast the numeric columns of a dataframe to the narrowest dtype that
    can hold each column's observed min/max, to reduce memory allocation.

    The dataframe is modified in place (columns are reassigned on ``df``) and
    the same object is returned. Columns whose dtype is not one of the
    int16/32/64 or float16/32/64 numpy dtypes are left untouched. A column is
    never widened: if no strictly narrower dtype fits (including empty or
    all-NaN columns, whose min/max are NaN), its dtype is kept as is.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose numeric columns should be downcast.
    verbose : bool, default True
        Print the resulting memory footprint and the percentage saved.
    use_float16 : bool, default True
        Allow float columns to be stored as float16. float16 keeps only about
        three significant decimal digits, so values are rounded; pass False to
        stop at float32 when precision matters.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with downcast columns.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate target dtypes, narrowest first. The widest dtype of each kind
    # is omitted because casting to it could never save memory.
    int_candidates = (np.int8, np.int16, np.int32)
    float_candidates = (np.float16, np.float32) if use_float16 else (np.float32,)

    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if col_type.kind == 'i':
            candidates, limits = int_candidates, np.iinfo
        else:
            candidates, limits = float_candidates, np.finfo

        for candidate in candidates:
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                # Every remaining candidate is at least as wide as the current
                # dtype; keep the column unchanged rather than widen it.
                break
            bounds = limits(candidate)
            # Inclusive bounds: a value sitting exactly on the dtype limit is
            # still representable. NaN min/max compare False and skip the cast.
            if bounds.min <= c_min and c_max <= bounds.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024 ** 2
    reduced_mem = 100 * (start_mem - end_mem) / start_mem
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'
              .format(end_mem, reduced_mem))
    return df


In [None]:
# Downcast the merged frame's numeric columns and print the memory saving.
test_large = reduce_mem_usage(test_large, verbose=True)

In [None]:
# Persist the merged, downcast test frame as a pickle in the data directory.
test_large.to_pickle(path="../data/test_large_converted.pkl")