In [18]:
from os.path import join as pjoin
import numpy as np
import pandas as pd
import datetime
# from pytz import timezone
# import pytz
import pendulum

In [2]:
input_filepath = "../data/raw"
train_df = pd.read_csv(input_filepath + "/train.csv")
building_df = pd.read_csv(input_filepath + "/building_metadata.csv")
train_df = train_df.merge(building_df, on = "building_id", how = "left")

In [3]:
site_info = pd.read_csv('../data/site_info.csv', delimiter = ";")
site_info

Unnamed: 0,site_id,timezone,country_code,location
0,0,US/Eastern,US,"Orlando, FL"
1,1,Europe/London,UK,"UK, Southhampton"
2,2,US/Mountain,US,"Tempe, AZ"
3,3,US/Eastern,US,"Washington, WA"
4,4,US/Pacific,US,"San Francisco, CA"
5,5,Europe/London,UK,"UK, London"
6,6,US/Eastern,US,Philadelphia
7,7,Canada/Eastern,CA,Montreal/Ottawa
8,8,US/Eastern,US,"Orlando, FL"
9,9,US/Central,US,"Austin, TX"


In [4]:
site_info.timezone = site_info.timezone.map(lambda x: pendulum.timezone(x))

In [5]:
site_info.timezone[1]

Timezone('Europe/London')

In [6]:
train_df = train_df.merge(site_info, on = "site_id", how = "left")

In [7]:
def reduce_mem_usage(df, verbose=True):
    """
    Takes an dataframe as argument and adjusts the datatypes of the respective
    columns to reduce memory allocation
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if (c_min > np.iinfo(np.int8).min and
                        c_max < np.iinfo(np.int8).max):
                    df[col] = df[col].astype(np.int8)
                elif (c_min > np.iinfo(np.int16).min and
                      c_max < np.iinfo(np.int16).max):
                    df[col] = df[col].astype(np.int16)
                elif (c_min > np.iinfo(np.int32).min and
                      c_max < np.iinfo(np.int32).max):
                    df[col] = df[col].astype(np.int32)
                elif (c_min > np.iinfo(np.int64).min and
                      c_max < np.iinfo(np.int64).max):
                    df[col] = df[col].astype(np.int64)
            else:
                if (c_min > np.finfo(np.float16).min and
                        c_max < np.finfo(np.float16).max):
                    df[col] = df[col].astype(np.float16)
                elif (c_min > np.finfo(np.float32).min and
                      c_max < np.finfo(np.float32).max):
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    reduced_mem = 100 * (start_mem - end_mem) / start_mem
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'
              .format(end_mem, reduced_mem))
    return df


In [8]:
train_df = reduce_mem_usage(train_df)

Mem. usage decreased to 1233.89 Mb (38.5% reduction)


In [9]:
def adjust_column_types(data_frame):
    """
    Takes a data frame and parses certain columns to the desired type.
    """
    data_frame["timestamp"] = pd.to_datetime(data_frame["timestamp"])
    return data_frame

In [10]:
#train_df = adjust_column_types(train_df)

In [24]:
type(train_df['timestamp'][1])

str

In [12]:
ts =  train_df['timestamp']

In [23]:
ts_local = ts.map(lambda x: datetime.datetime.strptime(x, "%Y-%m-%d %H:%M:%S"))

In [25]:
ts_local[0]

Timestamp('2016-01-01 00:00:00')

In [28]:
train_df.timezone[1].convert(ts_local[1])

TypeError: __new__() got an unexpected keyword argument 'fold'

In [None]:
train_df.loc[:,'ts'] = ts_local

In [45]:
train_df.loc[:,'ts'] = ts.map(lambda x: datetime.strptime(x, "%Y-%m-%d %H:%M:%S"))

In [19]:
datetime.datetime.strptime("2016-01-01 00:00:00", "%Y-%m-%d %H:%M:%S")

datetime.datetime(2016, 1, 1, 0, 0)

In [39]:
tz = train_df.timezone[1]

In [41]:
ts = train_df['ts'][1]

In [43]:
tz.convert(datetime.datetime.strptime("2016-01-01 00:00:00", "%Y-%m-%d %H:%M:%S"))

datetime.datetime(2016, 1, 1, 0, 0, tzinfo=Timezone('US/Eastern'))