In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import datetime
import dateutil.parser

In [2]:
elec_path = '../data/processed/temp_open_utc_complete.csv'

In [3]:
meta_path = '../data/raw/meta_open.csv'

In [4]:
weather_path = '../data/external/weather/weather1.csv'

In [5]:
def make_dataset(elec_path, meta_path, weather_path, weather_file, industry):
    '''
    INPUT
    elec_path: path to timeseries data of electricity meter (temp_open_utc_complete.csv)
    meta: path to meta data table (meta_open.csv)
    weather: path to weather data table (weatherX.csv)
    weather_file: name of the weather file
    industry: name of undustry to focus on
    OUTPUT
    A dataframe in which each record represents a building at a certain time
    columns =[
    building_name: name of the building from meta, str
    month: one-hot coded
    day: from elec, int
    day_of_the_week: one-hot coded
    hour: from elec, hour from weather is converted to the nearest :00, int
    area: from meta, float
    primary_space_usage: from meta (primaryspaceuse_abbrev), one-hot coded
    electricity: from elec
    temperature: from weather
    ]
    ------------------------------------------------------------------------------------------------------------------------------------------
    comment:
    -humidity is sometimes missing in weather table
    '''
    #read tables
    elec = pd.read_csv(elec_path)
    meta = pd.read_csv(meta_path)
    weather = pd.read_csv(weather_path)
    #set 'uid' as index in meta
    meta = meta.set_index('uid')
    #parse date
    weather['timestamp'] = weather['timestamp'].apply(dateutil.parser.parse)
    elec['timestamp'] = elec['timestamp'].apply(dateutil.parser.parse)
    #construct the dataframe to return
    buildings = list(meta[(meta['newweatherfilename'] == 'weather1.csv') & (meta['industry']=='Education')].index) #name of the buildings
    df = pd.DataFrame(columns={'building_name', 'timestamp', 'electricity', 'area', 'primary_space_usage'}) #empty dataframe with 3 columns
    for building in buildings:
        subdf = elec[['timestamp', building]]
        subdf.columns = ['timestamp', 'electricity']
        subdf['building_name'] = building
        subdf['area'] = meta.loc[building, 'sqm']
        subdf['primary_space_usage'] = meta.loc[building, 'primaryspaceuse_abbrev']
        df = pd.concat([df, subdf], axis=0, ignore_index=True)
    #df has 'building_name', timestamp, electricity meter, area, primary space usage
    print('OK1')
    weather['rounded_timestamp'] = weather['timestamp'].apply(cutoff_minute) #cutoff_minute is implemented separately
    weather = weather.groupby('rounded_timestamp').first() #only the first observation in each hour is taken
    weather = weather['TemperatureC'] #only need temperature column
    print('OK2')
    df['timestamp'] = df['timestamp'].apply(cutoff_minute)
    df = df.join(weather, on='timestamp', how='inner', lsuffix='elec', rsuffix='weather') #join temperature data from weather table
    #df = add_month_day_hour(df) #to do
    #df = add_day_of_the_week()
    return df.reset_index()

In [6]:
def cutoff_minute(dt):
    '''
    INPUT
    a datetime object has year, month, day, hour, and minute
    OUTPUT
    a datetime object has year, month, day, and hour
    '''
    year = dt.year
    month = dt.month
    day = dt.day
    hour = dt.hour
    return datetime.datetime(year, month, day, hour)

In [7]:
make_dataset(elec_path, meta_path, weather_path, 'weather1.csv', 'Education')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




OK1
OK2


Unnamed: 0,index,area,building_name,electricity,primary_space_usage,timestamp,TemperatureC
0,31429,2927.0,PrimClass_Jolie,0.700000,PrimClass,2014-12-01 00:00:00,7.0
1,72369,4373.0,PrimClass_Jaylin,1.100000,PrimClass,2014-12-01 00:00:00,7.0
2,113309,2132.0,PrimClass_Jayla,2.000000,PrimClass,2014-12-01 00:00:00,7.0
3,154249,2937.0,PrimClass_Janiya,2.300000,PrimClass,2014-12-01 00:00:00,7.0
4,195189,2152.0,PrimClass_Janice,1.400000,PrimClass,2014-12-01 00:00:00,7.0
5,236129,2572.0,PrimClass_Jaden,2.763000,PrimClass,2014-12-01 00:00:00,7.0
6,277069,2154.0,PrimClass_Jermaine,2.200000,PrimClass,2014-12-01 00:00:00,7.0
7,318009,3984.0,PrimClass_Josephine,4.700000,PrimClass,2014-12-01 00:00:00,7.0
8,358949,1877.0,PrimClass_Javier,2.900000,PrimClass,2014-12-01 00:00:00,7.0
9,399889,2939.0,PrimClass_Jaylinn,3.900000,PrimClass,2014-12-01 00:00:00,7.0
