In [0]:
# List the entries under /FileStore/tables to see which uploads are available.
dbutils.fs.ls("/FileStore/tables")

In [0]:
# Create the /FileStore/tables/small/ directory.
dbutils.fs.mkdirs("/FileStore/tables/small/")

In [0]:
# NOTE(review): this repeats the mkdirs call from the previous cell with the same
# path; presumably redundant once the directory exists -- confirm and consider removing.
dbutils.fs.mkdirs("/FileStore/tables/small/")

In [0]:
# Copy the test file within /tmp (same operation as the `%fs cp` shorthand).
dbutils.fs.cp("/tmp/test_dbfs.txt", "/tmp/file_b.txt")

In [0]:
from pyspark.sql.types import StringType, StructType, StructField, FloatType, TimestampType

In [0]:
# Explicit schema for the meter-reading CSV files; all three columns are nullable.
df_schema = (
    StructType()
    .add('LCLid', StringType(), True)
    .add('tstp', TimestampType(), True)
    .add('energy(kWh/hh)', FloatType(), True)
)

In [0]:
# Read the CSV data under multicsvfile using the explicit schema; the first row is a header.
df = (
    spark.read
    .schema(df_schema)
    .option('header', True)
    .csv("dbfs:/FileStore/tables/multicsvfile")
)

In [0]:
# Render up to 5 rows of the raw readings as a quick sanity check of the schema.
display(df.limit(5))

LCLid,tstp,energy(kWh/hh)
MAC000048,2011-12-08T12:30:00.000+0000,0.229
MAC000048,2011-12-08T13:00:00.000+0000,0.213
MAC000048,2011-12-08T13:30:00.000+0000,0.272
MAC000048,2011-12-08T14:00:00.000+0000,0.576
MAC000048,2011-12-08T14:30:00.000+0000,0.194


In [0]:
# Household information table: header row present, column types inferred by Spark.
df_demo = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv("/FileStore/tables/demographic_info/informations_households.csv")
)

In [0]:
# Render up to 5 rows of the household information table.
display(df_demo.limit(5))

LCLid,stdorToU,Acorn,Acorn_grouped,file
MAC005492,ToU,ACORN-,ACORN-,block_0
MAC001074,ToU,ACORN-,ACORN-,block_0
MAC000002,Std,ACORN-A,Affluent,block_0
MAC003613,Std,ACORN-A,Affluent,block_0
MAC003597,Std,ACORN-A,Affluent,block_0


In [0]:
# ACORN details table: header row present, column types inferred by Spark.
df_acorn = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv("/FileStore/tables/demographic_info/acorn_details.csv")
)

In [0]:
# Render up to 5 rows of the ACORN details table.
display(df_acorn.limit(5))

MAIN CATEGORIES,CATEGORIES,REFERENCE,ACORN-A,ACORN-B,ACORN-C,ACORN-D,ACORN-E,ACORN-F,ACORN-G,ACORN-H,ACORN-I,ACORN-J,ACORN-K,ACORN-L,ACORN-M,ACORN-N,ACORN-O,ACORN-P,ACORN-Q
POPULATION,Age,Age 0-4,77.0,83.0,72.0,100.0,120.0,77.0,97.0,97.0,63.0,119.0,67.0,114.0,113.0,89.0,123.0,138.0,133.0
POPULATION,Age,Age 5-17,117.0,109.0,87.0,69.0,94.0,95.0,102.0,106.0,67.0,95.0,64.0,108.0,116.0,86.0,89.0,136.0,106.0
POPULATION,Age,Age 18-24,64.0,73.0,67.0,107.0,100.0,71.0,83.0,89.0,62.0,104.0,459.0,97.0,96.0,86.0,117.0,109.0,110.0
POPULATION,Age,Age 25-34,52.0,63.0,62.0,197.0,151.0,66.0,90.0,88.0,63.0,132.0,145.0,109.0,96.0,90.0,140.0,120.0,120.0
POPULATION,Age,Age 35-49,102.0,105.0,91.0,124.0,118.0,93.0,102.0,103.0,76.0,111.0,67.0,99.0,98.0,90.0,102.0,103.0,100.0


In [0]:
# Attach household attributes to every reading; inner join on the shared LCLid column.
df_join = df.join(df_demo, on='LCLid', how='inner')

In [0]:
# Render up to 5 rows of the readings joined with household information.
display(df_join.limit(5))

LCLid,tstp,energy(kWh/hh),stdorToU,Acorn,Acorn_grouped,file
MAC000048,2011-12-08T12:30:00.000+0000,0.229,ToU,ACORN-E,Affluent,block_20
MAC000048,2011-12-08T13:00:00.000+0000,0.213,ToU,ACORN-E,Affluent,block_20
MAC000048,2011-12-08T13:30:00.000+0000,0.272,ToU,ACORN-E,Affluent,block_20
MAC000048,2011-12-08T14:00:00.000+0000,0.576,ToU,ACORN-E,Affluent,block_20
MAC000048,2011-12-08T14:30:00.000+0000,0.194,ToU,ACORN-E,Affluent,block_20


In [0]:
# Mean energy reading per Acorn value.
df_avg_group = (
    df_join
    .select('LCLid', 'tstp', 'Acorn', 'file', 'energy(kWh/hh)')
    .groupBy('Acorn')
    .agg({'energy(kWh/hh)': 'avg'})
)

In [0]:
# Render up to 20 rows of the per-Acorn average energy.
display(df_avg_group.limit(20))

Acorn,avg(energy(kWh/hh))
ACORN-E,0.2165012154761052
ACORN-F,0.1922656915408575
ACORN-Q,0.1608302613311604
ACORN-P,0.1383498876394149
ACORN-D,0.2839873804665139
ACORN-K,0.2094371546716786
ACORN-H,0.2302132806846571
ACORN-J,0.2374668036828509
ACORN-N,0.1929419959531186
ACORN-A,0.3989048712841816


In [0]:
from pyspark.sql.functions import unix_timestamp, from_unixtime, date_format

In [0]:
# Split the reading timestamp into separate Date and Time string columns.
#
# tstp is read as TimestampType (see df_schema), so it can be formatted directly.
# The previous unix_timestamp -> from_unixtime -> date_format round trip was
# redundant: the parse pattern passed to unix_timestamp is not applied to a
# column that is already a timestamp, and going through an intermediate
# local-time string can be ambiguous around daylight-saving transitions.
df_time = df_join.select(
    'LCLid',
    'tstp',
    'Acorn',
    'file',
    date_format('tstp', 'd/M/yyyy').alias('Date'),   # e.g. 8/12/2011
    date_format('tstp', 'h:m:s a').alias('Time'),    # e.g. 12:30:0 PM
)

In [0]:
# Render up to 20 rows to check the derived Date and Time columns against tstp.
display(df_time.limit(20))

LCLid,tstp,Acorn,file,Date,Time
MAC000048,2011-12-08T12:30:00.000+0000,ACORN-E,block_20,8/12/2011,12:30:0 PM
MAC000048,2011-12-08T13:00:00.000+0000,ACORN-E,block_20,8/12/2011,1:0:0 PM
MAC000048,2011-12-08T13:30:00.000+0000,ACORN-E,block_20,8/12/2011,1:30:0 PM
MAC000048,2011-12-08T14:00:00.000+0000,ACORN-E,block_20,8/12/2011,2:0:0 PM
MAC000048,2011-12-08T14:30:00.000+0000,ACORN-E,block_20,8/12/2011,2:30:0 PM
MAC000048,2011-12-08T15:00:00.000+0000,ACORN-E,block_20,8/12/2011,3:0:0 PM
MAC000048,2011-12-08T15:30:00.000+0000,ACORN-E,block_20,8/12/2011,3:30:0 PM
MAC000048,2011-12-08T16:00:00.000+0000,ACORN-E,block_20,8/12/2011,4:0:0 PM
MAC000048,2011-12-08T16:30:00.000+0000,ACORN-E,block_20,8/12/2011,4:30:0 PM
MAC000048,2011-12-08T17:00:00.000+0000,ACORN-E,block_20,8/12/2011,5:0:0 PM


In [0]:
df_energy = df.select('tstp', 'energy(kWh/hh)')

In [0]:
df_datetime = df_time.join(df_energy, 'tstp').drop('tstp')

In [0]:
# Render up to 20 rows of the Date/Time frame with the energy column attached.
display(df_datetime.limit(20))

LCLid,Acorn,file,Date,Time,energy(kWh/hh)
MAC000145,ACORN-F,block_47,24/11/2011,11:0:0 AM,0.225
MAC000145,ACORN-F,block_47,24/11/2011,11:0:0 AM,0.049
MAC000145,ACORN-F,block_47,24/11/2011,11:0:0 AM,0.066
MAC000145,ACORN-F,block_47,24/11/2011,11:0:0 AM,0.19
MAC000145,ACORN-F,block_47,24/11/2011,11:0:0 AM,0.174
MAC000145,ACORN-F,block_47,24/11/2011,11:0:0 AM,0.29
MAC000145,ACORN-F,block_47,24/11/2011,11:0:0 AM,0.054
MAC000145,ACORN-F,block_47,24/11/2011,11:0:0 AM,0.474
MAC000145,ACORN-F,block_47,24/11/2011,11:0:0 AM,0.39
MAC000145,ACORN-F,block_47,24/11/2011,11:0:0 AM,0.286


In [0]:
# Row count of the joined frame; useful for checking that the join did not multiply rows.
df_datetime.count()

In [0]:
# Total energy per (Date, Acorn) pair.
df_sum_group = (
    df_datetime
    .select('LCLid', 'Date', 'Acorn', 'file', 'energy(kWh/hh)')
    .groupBy('Date', 'Acorn')
    .agg({'energy(kWh/hh)': 'sum'})
)

In [0]:
# Render up to 20 rows of the per-date, per-Acorn energy totals.
display(df_sum_group.limit(20))