# 電力データを変換する

In [33]:
import random
from pyspark.sql.functions import udf
from pyspark.sql.types import *
from pyspark.sql.functions import to_timestamp, to_date, explode
from datetime import datetime, timedelta

In [9]:
# Input CSV holding the sample power-grid readings.
data_file = 'data/power_grid_sample.csv'
# Base path (no extension) for the randomized outputs written at the end.
randomized_file = 'data/power_grid_dummydata'

In [10]:
# Load the source CSV, using its first line as the header and letting Spark
# infer the column types.
df1 = (
    spark.read.format('csv')
    .options(header='true', inferSchema='true')
    .load(data_file)
)

In [11]:
# Inspect the schema Spark inferred from the CSV.
df1.schema

StructType(List(StructField(DATETIME,StringType,true),StructField(Measured,IntegerType,true),StructField(Predicted,IntegerType,true),StructField(UseRate,IntegerType,true),StructField(EstimatedSupplyCapacity,IntegerType,true)))

In [12]:
# Parse the DATETIME string column into a real timestamp column (placed
# first) while keeping every original column.
df2 = df1.select(
    to_timestamp(df1['DATETIME'], 'yyyy/M/d H:mm').alias('timestamp'),
    '*',
)

In [40]:
# Maximum absolute offset applied by the `randamize` UDF: each value is
# shifted by a random integer in [-randomize_width, randomize_width].
randomize_width = 30

In [49]:
@udf(IntegerType())
def randamize(col):
    """Return ``col`` shifted by a random integer offset.

    The offset is drawn uniformly from
    ``[-randomize_width, randomize_width]`` using the module-level
    ``randomize_width`` setting.

    Args:
        col: Integer cell value, or ``None`` for a SQL NULL.

    Returns:
        The jittered integer, or ``None`` when the input is NULL.
    """
    # Nullable columns hand None to Python UDFs; ``None + int`` would raise
    # TypeError on the worker and fail the whole job, so propagate the NULL.
    if col is None:
        return None
    return col + random.randint(-randomize_width, randomize_width)


# Spark treats UDFs as deterministic unless told otherwise and may then
# duplicate or eliminate invocations; this UDF is random, so flag it.
randamize = randamize.asNondeterministic()

In [14]:
# Add a jittered copy of each numeric source column, named with an "R"
# prefix, ahead of all existing columns.
df3 = df2.select(
    *[
        randamize(df2[name]).alias('R' + name)
        for name in ('Measured', 'Predicted', 'EstimatedSupplyCapacity')
    ],
    '*',
)

In [19]:
# Recompute the use rate from the jittered values so it stays consistent
# with them.
# NOTE(review): the source UseRate column was inferred as an integer, while
# this ratio is a fractional double — confirm which scale consumers expect.
df4 = df3.select(
    (df3['RMeasured'] / df3['REstimatedSupplyCapacity']).alias('RUseRate'),
    '*',
)

In [20]:
# Keep only the parsed timestamp and the randomized columns; the original
# source columns are dropped here.
df5 = df4.select(
    'timestamp',
    'RMeasured',
    'RPredicted',
    'RUseRate',
    'REstimatedSupplyCapacity',
)

In [34]:
@udf(ArrayType(TimestampType()))
def transform_timestamp(timestamp):
    """Expand one timestamp into consecutive one-minute timestamps.

    Args:
        timestamp: ``datetime`` marking the start of the interval, or
            ``None`` for a SQL NULL.

    Returns:
        A list of ``num_records`` (60) datetimes running from ``timestamp``
        through ``timestamp + 59 minutes``, or ``None`` when the input is
        NULL.
    """
    # A NULL timestamp arrives as None; ``None + timedelta`` would raise
    # TypeError on the worker, so propagate the NULL instead.
    if timestamp is None:
        return None

    num_records = 60
    # Offsets run 0..num_records-1 so exactly ``num_records`` values are
    # produced. Seeding the list with ``timestamp`` and then adding offsets
    # 1..60 yielded 61 values, the last of which (``+60 min``) collides with
    # the first minute of the following row when input rows are hourly.
    return [timestamp + timedelta(minutes=offset) for offset in range(num_records)]

In [35]:
# Fan each row out into one row per generated minute: the UDF returns an
# array of timestamps and explode() turns each element into its own row.
df6 = df5.select(
    explode(transform_timestamp(df5['timestamp'])).alias('timestamp_min'),
    '*',
)

In [36]:
# Preview the per-minute rows produced by the explode step.
df6.show()

+-------------------+-------------------+---------+----------+------------------+------------------------+
|      timestamp_min|          timestamp|RMeasured|RPredicted|          RUseRate|REstimatedSupplyCapacity|
+-------------------+-------------------+---------+----------+------------------+------------------------+
|2021-01-05 00:00:00|2021-01-05 00:00:00|     1485|      1496|0.7869634340222575|                    1887|
|2021-01-05 00:01:00|2021-01-05 00:00:00|     1485|      1496|0.7869634340222575|                    1887|
|2021-01-05 00:02:00|2021-01-05 00:00:00|     1485|      1496|0.7869634340222575|                    1887|
|2021-01-05 00:03:00|2021-01-05 00:00:00|     1485|      1496|0.7869634340222575|                    1887|
|2021-01-05 00:04:00|2021-01-05 00:00:00|     1485|      1496|0.7869634340222575|                    1887|
|2021-01-05 00:05:00|2021-01-05 00:00:00|     1485|      1496|0.7869634340222575|                    1887|
|2021-01-05 00:06:00|2021-01-05 00:00

In [50]:
# Narrow the jitter range before the second randomization pass below.
# NOTE(review): whether an already-used `randamize` UDF picks up this new
# value depends on when PySpark serializes the function — confirm, or
# re-define the UDF after changing this setting.
randomize_width = 10

In [53]:
# Jitter the already-randomized measurement once more, now per exploded row,
# and expose it as RRMeasured ahead of the existing columns.
df7 = df6.select(
    randamize(df6['RMeasured']).alias('RRMeasured'),
    '*',
)

In [54]:
# Preview the rows with the second-pass RRMeasured column.
df7.show()

+----------+-------------------+-------------------+---------+----------+----------------+------------------------+
|RRMeasured|      timestamp_min|          timestamp|RMeasured|RPredicted|        RUseRate|REstimatedSupplyCapacity|
+----------+-------------------+-------------------+---------+----------+----------------+------------------------+
|      1503|2021-01-05 00:00:00|2021-01-05 00:00:00|     1493|      1526|0.80010718113612|                    1866|
|      1488|2021-01-05 00:01:00|2021-01-05 00:00:00|     1493|      1526|0.80010718113612|                    1866|
|      1500|2021-01-05 00:02:00|2021-01-05 00:00:00|     1493|      1526|0.80010718113612|                    1866|
|      1498|2021-01-05 00:03:00|2021-01-05 00:00:00|     1493|      1526|0.80010718113612|                    1866|
|      1502|2021-01-05 00:04:00|2021-01-05 00:00:00|     1493|      1526|0.80010718113612|                    1866|
|      1483|2021-01-05 00:05:00|2021-01-05 00:00:00|     1493|      1526

In [55]:
# Derive the per-row use rate from the second-pass measurement and the
# randomized supply capacity.
df8 = df7.select(
    (df7['RRMeasured'] / df7['REstimatedSupplyCapacity']).alias('RRUseRate'),
    '*',
)

In [56]:
# Preview the rows with the recomputed RRUseRate column.
df8.show()

+------------------+----------+-------------------+-------------------+---------+----------+------------------+------------------------+
|         RRUseRate|RRMeasured|      timestamp_min|          timestamp|RMeasured|RPredicted|          RUseRate|REstimatedSupplyCapacity|
+------------------+----------+-------------------+-------------------+---------+----------+------------------+------------------------+
|0.7995678011885468|      1480|2021-01-05 00:00:00|2021-01-05 00:00:00|     1487|      1496|0.8033495407887629|                    1851|
|0.8060507833603457|      1492|2021-01-05 00:01:00|2021-01-05 00:00:00|     1487|      1496|0.8033495407887629|                    1851|
|0.8001080497028633|      1481|2021-01-05 00:02:00|2021-01-05 00:00:00|     1487|      1496|0.8033495407887629|                    1851|
| 0.807131280388979|      1494|2021-01-05 00:03:00|2021-01-05 00:00:00|     1487|      1496|0.8033495407887629|                    1851|
| 0.804430037817396|      1489|2021-01-05

In [59]:
# Project the final randomized columns back onto the original column names
# (with the per-minute timestamp taking the place of `timestamp`).
df9 = df8.select(
    *[
        df8[source].alias(target)
        for source, target in (
            ('timestamp_min', 'timestamp'),
            ('RRMeasured', 'Measured'),
            ('RPredicted', 'Predicted'),
            ('RRUseRate', 'UseRate'),
            ('REstimatedSupplyCapacity', 'EstimatedSupplyCapacity'),
        )
    ]
)

In [60]:
# Preview the final dataset before writing it out.
df9.show()

+-------------------+--------+---------+------------------+-----------------------+
|          timestamp|Measured|Predicted|           UseRate|EstimatedSupplyCapacity|
+-------------------+--------+---------+------------------+-----------------------+
|2021-01-05 00:00:00|    1488|     1488|0.7910685805422647|                   1881|
|2021-01-05 00:01:00|    1490|     1488|0.7921318447634237|                   1881|
|2021-01-05 00:02:00|    1496|     1488|0.7953216374269005|                   1881|
|2021-01-05 00:03:00|    1498|     1488|0.7963849016480595|                   1881|
|2021-01-05 00:04:00|    1495|     1488|0.7947900053163212|                   1881|
|2021-01-05 00:05:00|    1498|     1488|0.7963849016480595|                   1881|
|2021-01-05 00:06:00|    1494|     1488|0.7942583732057417|                   1881|
|2021-01-05 00:07:00|    1494|     1488|0.7942583732057417|                   1881|
|2021-01-05 00:08:00|    1504|     1488|0.7995746943115364|                 

In [63]:
# Persist the randomized dataset twice: as tab-separated text and as a Delta
# table, both under the `randomized_file` base path.
# The separator must be a real tab character; the raw string r'/t' is the
# two characters "/" and "t", not a tab.
df9.write.format('csv').option('header', 'true').option('sep', '\t').save(randomized_file + '.tsv')
df9.write.format('delta').save(randomized_file + '.delta')

                                                                                