In [1]:
import datapackage
import yaml
import glob
import json
import os
import pprint as pp

In [2]:
# Create the tabular Data Package and attach its top-level metadata.
# The entries are applied in order: name, title, description.

dp = datapackage.DataPackage(schema='tabular')
for key, value in (
    ('name', 'refit-electrical-load-measurements'),
    ('title', 'REFIT: Electrical Load Measurements'),
    ('description', 'Collection of this dataset was supported by the Engineering and Physical Sciences Research Council (EPSRC) via the project entitled Personalised Retrofit Decision Support Tools for UK Homes using Smart Home Technology (REFIT), which is a collaboration among the Universities of Strathclyde, Loughborough and East Anglia. The dataset includes data from 20 households from the Loughborough area over the period 2013 - 2015. Additional information about REFIT is available from www.refitsmarthomes.org.'),
):
    dp.descriptor[key] = value



In [3]:

# Preview the first lines of one house's CSV to see the column layout
!head data/House_1.csv

Time,Unix,Aggregate,Appliance1,Appliance2,Appliance3,Appliance4,Appliance5,Appliance6,Appliance7,Appliance8,Appliance9
2013-10-09 13:06:17,1381323977,523,74,0,69,0,0,0,0,0,1
2013-10-09 13:06:31,1381323991,526,75,0,69,0,0,0,0,0,1
2013-10-09 13:06:46,1381324006,540,74,0,68,0,0,0,0,0,1
2013-10-09 13:07:01,1381324021,532,74,0,68,0,0,0,0,0,1
2013-10-09 13:07:15,1381324035,540,74,0,69,0,0,0,0,0,1
2013-10-09 13:07:18,1381324038,539,74,0,69,0,0,0,0,0,1
2013-10-09 13:07:30,1381324050,537,74,0,69,0,0,0,0,0,1
2013-10-09 13:07:32,1381324052,537,74,0,69,0,0,0,0,0,1
2013-10-09 13:07:44,1381324064,548,74,0,69,0,0,0,0,0,1


In [4]:
# Show schema.yml: the field definitions (name, type, format) common to all House_*.csv files

!cat schema.yml

- name: Time
  type: datetime
  format: "fmt:%Y-%m-%d %H:%M:%S"
- name: Unix
  type: integer
- name: Aggregate
  type: integer
- name: Appliance1
  type: integer
- name: Appliance2
  type: integer
- name: Appliance3
  type: integer
- name: Appliance4
  type: integer
- name: Appliance5
  type: integer
- name: Appliance6
  type: integer
- name: Appliance7
  type: integer
- name: Appliance8
  type: integer
- name: Appliance9
  type: integer

In [5]:
# Preview extra_fields.yml: per-file extra fields (keyed by CSV file name)
# that get merged with the common fields

!head extra_fields.yml

House_1.csv:
  descriptions:
    - Aggregate
    - Fridge
    - Chest Freezer
    - Upright Freezer
    - Tumble Dryer
    - Washing Machine
    - Dishwasher
    - Computer Site


In [6]:
def descriptions(file):
    """Yield one ``{'description': ...}`` dict per column of a CSV file.

    The first two dicts carry the fixed labels ``'DateTime'`` and
    ``'UnixTime'``; the remaining ones come, in order, from the
    ``descriptions`` list stored in ``extra_fields.yml`` under the file's
    base name (e.g. ``House_1.csv``).

    Args:
        file: Path to a CSV file; only its base name is used for the lookup.

    Yields:
        dict: ``{'description': <label>}`` for each label.

    Raises:
        KeyError: On first iteration, if ``extra_fields.yml`` has no entry
            for the base name or the entry lacks a ``descriptions`` key.
    """
    with open('extra_fields.yml', 'r') as f:
        # safe_load parses plain-data YAML only. yaml.load() without an
        # explicit Loader is deprecated since PyYAML 5.1 and raises a
        # TypeError in PyYAML 6.
        extra_fields = yaml.safe_load(f)
    labels = ['DateTime', 'UnixTime'] + extra_fields[os.path.basename(file)]['descriptions']
    for label in labels:
        yield {'description': label}

# Load the field definitions shared by every CSV file.
# safe_load parses plain-data YAML only; yaml.load() without an explicit
# Loader is deprecated since PyYAML 5.1 and raises a TypeError in PyYAML 6.
with open('schema.yml', 'r') as f:
    schema = yaml.safe_load(f)

# glob.glob() returns matches in filesystem-dependent order; sorting keeps
# the resource order in the descriptor reproducible across runs and machines.
files = sorted(glob.glob('data/*.csv'))

# One resource per CSV file. Each common field is merged with the per-file
# description at the same position (keys from the description win on clash).
# NOTE: zip() stops at the shorter sequence, so a file with fewer
# descriptions than schema fields silently loses the trailing fields.
dp.descriptor['resources'] = [
    {
        'path': file,
        'schema': {
            'fields': [
                {**field, **extra}
                for field, extra in zip(schema, descriptions(file))
            ]
        },
        'name': os.path.basename(file).lower(),
    }
    for file in files
]
                               

In [7]:
# Pretty-print the assembled descriptor for inspection
pp.pprint(dp.descriptor)

{'description': 'Collection of this dataset was supported by the Engineering '
                'and Physical Sciences Research Council (EPSRC) via the '
                'project entitled Personalised Retrofit Decision Support Tools '
                'for UK Homes using Smart Home Technology (REFIT), which is a '
                'collaboration among the Universities of Strathclyde, '
                'Loughborough and East Anglia. The dataset includes data from '
                '20 households from the Loughborough area over the period 2013 '
                '- 2015. Additional information about REFIT is available from '
                'www.refitsmarthomes.org.',
 'name': 'refit-electrical-load-measurements',
 'resources': [{'name': 'house_1.csv',
                'path': 'data/House_1.csv',
                'schema': {'fields': [{'description': 'DateTime',
                                       'format': 'fmt:%Y-%m-%d %H:%M:%S',
                                       'name': 'Time',
    

In [8]:
# Validate that the Data Package is formatted correctly with respect to the specs.

dp.validate()

In [9]:
# Serialize the descriptor to datapackage.json in the current directory

with open('datapackage.json', 'w') as out:
    json.dump(dp.descriptor, out, indent=2)