In [1]:
from os.path import join as pjoin
import numpy as np
import pandas as pd
from datetime import datetime 
from pytz import timezone
import pytz

In [2]:
# Read the raw test rows and attach the metadata of the building each row
# belongs to. Paths are built with pjoin, as the weather cell does.
# site_info is not read here: the following cell loads the very same file.
input_filepath = "../data/raw"
test_df = pd.read_csv(pjoin(input_filepath, "test.csv"))
building_df = pd.read_csv(pjoin(input_filepath, "building_metadata.csv"))
test_df = test_df.merge(building_df, on="building_id", how="left")

In [3]:
# Per-site lookup table; the file is semicolon-separated.
site_info = pd.read_csv("../data/site_info.csv", sep=";")

In [4]:
# Column dtypes handed to read_csv for the weather file: the site id is a
# small unsigned integer, every measurement column is single precision.
weather_dtypes = {'site_id': np.uint8}
weather_dtypes.update(dict.fromkeys(
    (
        'air_temperature',
        'cloud_coverage',
        'dew_temperature',
        'precip_depth_1_hr',
        'sea_level_pressure',
        'wind_direction',
        'wind_speed',
    ),
    np.float32,
))

In [5]:
# Weather observations for the test period; the timestamp column is parsed
# to datetimes while reading and the other columns use weather_dtypes.
weather_test = pd.read_csv(
    pjoin(input_filepath, 'weather_test.csv'),
    parse_dates=['timestamp'],
    dtype=weather_dtypes,
)

In [6]:
# Replace each timezone name with the pytz timezone object it denotes.
# Not re-runnable as is: a second pass would feed pytz.timezone objects
# instead of strings.
site_info["timezone"] = site_info["timezone"].map(pytz.timezone)

In [7]:
# Give every weather row the site_info columns of its site (left join, so
# weather rows without a matching site are kept).
weather_test = weather_test.merge(site_info, how="left", on="site_id")

In [8]:
# pytz's UTC timezone singleton, used below to tag the weather timestamps
utc = pytz.UTC

In [9]:
# Tag the parsed weather timestamps as UTC. tz_localize attaches the
# timezone to the whole column in one vectorised call instead of calling
# utc.localize once per row from Python; the clock values are unchanged.
weather_test['timestamp'] = weather_test['timestamp'].dt.tz_localize(utc)

In [10]:
# Express every UTC timestamp in the timezone of its site. The two columns
# are walked in lockstep rather than through DataFrame.apply(axis=1), which
# would build a full row Series for each record just to read two fields.
weather_test['timestamp_local'] = [
    stamp.astimezone(zone)
    for stamp, zone in zip(weather_test['timestamp'], weather_test['timezone'])
]

In [11]:
# Persist the converted weather frame (pickle keeps the timezone objects).
weather_test.to_pickle(path="../data/weather_test_converted.pkl")

In [12]:
# Give every test row the site_info columns of its site (left join).
test_df = test_df.merge(site_info, how="left", on="site_id")

In [13]:
def reduce_mem_usage(df, verbose=True):
    """
    Downcast the numeric columns of a dataframe to the narrowest dtype whose
    value range covers the column's observed minimum and maximum.

    Only columns of dtype int16/int32/int64/float16/float32/float64 are
    examined; every other column (int8, unsigned integers, objects,
    datetimes, ...) is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. The columns are replaced on this very object.
    verbose : bool, default True
        Print the resulting memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.

    Notes
    -----
    Range limits are inclusive, so a column spanning exactly -128..127 is
    stored as int8. Float targets are chosen by range only: a column moved
    to float16 or float32 can lose precision.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate targets, narrowest first.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # If nothing fits (e.g. an empty column, whose min/max are NaN)
            # the column keeps its current dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # Comparisons with NaN/inf are False for the narrow types, so
            # such columns fall back to float64.
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    # Guard the percentage against a frame that reports zero bytes.
    reduced_mem = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'
              .format(end_mem, reduced_mem))
    return df


In [14]:
# Downcast the numeric columns of the merged test frame; the summary line
# printed by the helper is kept as the cell output.
test_df = reduce_mem_usage(df=test_df, verbose=True)

Mem. usage decreased to 2545.02 Mb (38.5% reduction)


In [15]:
def adjust_column_types(data_frame):
    """
    Parse the "timestamp" column of the given frame into datetimes.

    The column is replaced on the frame that was passed in, and that same
    frame is handed back to the caller.
    """
    parsed = pd.to_datetime(data_frame["timestamp"])
    data_frame["timestamp"] = parsed
    return data_frame

In [16]:
# Turn the timestamp column of the test frame into datetimes.
test_df = adjust_column_types(data_frame=test_df)

In [17]:
# Timestamp column of the test frame, one entry per test row
ts = test_df["timestamp"]

In [18]:
# Timezone column merged in from site_info, aligned row by row with ts
tz = test_df["timezone"]

In [None]:
# Pair each row's timezone with its timestamp and let the timezone
# localize it; unlike the weather data, no conversion from UTC happens here.
ts_local = [zone.localize(stamp) for zone, stamp in zip(tz, ts)]

In [None]:
# Store the localized timestamps as a column of the test frame.
test_df.loc[:, "timestamp_local"] = ts_local

In [None]:
# Persist the converted test frame (pickle keeps the timezone objects).
test_df.to_pickle(path="../data/test_converted.pkl")