# 電力データを変換する

In [48]:
import random
from pyspark.sql.functions import udf
from pyspark.sql.types import *
from pyspark.sql.functions import to_timestamp, to_date
from datetime import datetime

In [71]:
# Input: sample power-grid CSV that the loader cell reads.
data_file = "data/power_grid_sample.csv"
# Output: path the randomized dummy data is saved to.
randomized_file = "data/power_grid_dummydata"

In [13]:
# Load the sample CSV, treating the first row as a header and letting Spark
# infer each column's type.
df1 = (
    spark.read
    .options(header='true', inferSchema='true')
    .csv(data_file)
)

In [15]:
# Display the schema Spark inferred for the loaded CSV columns.
df1.schema

StructType(List(StructField(DATETIME,StringType,true),StructField(Measured,IntegerType,true),StructField(Predicted,IntegerType,true),StructField(UseRate,IntegerType,true),StructField(EstimatedSupplyCapacity,IntegerType,true)))

In [45]:
# Parse the DATETIME string (pattern 'yyyy/M/d H:mm') into a 'timestamp'
# column placed first; '*' keeps every original column after it.
df2 = df1.select(
    to_timestamp(df1.DATETIME, 'yyyy/M/d H:mm').alias('timestamp'),
    '*',
)

In [60]:
def _randamize(col):
    """Return ``col`` shifted by a random integer offset in [-30, 30].

    Args:
        col: Integer cell value handed in by Spark, or None for a null cell.

    Returns:
        The jittered integer, or None when the input is null (adding an
        offset to None would raise TypeError inside the Spark worker).
    """
    if col is None:
        return None
    return col + random.randint(-30, 30)


# Spark treats UDFs as deterministic by default, so the optimizer may invoke
# the function more or fewer times than it appears in the query. Because this
# UDF draws random numbers, it is flagged non-deterministic so that columns
# derived from its output stay consistent with the jittered values.
randamize = udf(_randamize, IntegerType()).asNondeterministic()

In [61]:
# Add a jittered copy of each numeric column, named with an 'R' prefix,
# ahead of all existing columns ('*').
df3 = df2.select(
    *[
        randamize(df2[name]).alias('R' + name)
        for name in ('Measured', 'Predicted', 'EstimatedSupplyCapacity')
    ],
    '*',
)

In [65]:
# Ratio of randomized demand to randomized supply capacity.
# The division is parenthesized so that alias() names the whole ratio;
# without the parentheses, alias() binds only to the denominator and the
# resulting column is not called 'RUseRate'.
# NOTE(review): the source UseRate column is an integer (possibly a
# percentage); this yields a fractional ratio -- confirm the intended scale.
df4 = df3.select(
    (df3['RMeasured'] / df3['REstimatedSupplyCapacity']).alias('RUseRate'),
    '*',
)

In [67]:
# Keep only the timestamp and the three randomized measurement columns.
# NOTE(review): the ratio column computed into df4 is not selected here --
# confirm that dropping it is intentional.
df5 = df4.select(
    'timestamp',
    'RMeasured',
    'RPredicted',
    'REstimatedSupplyCapacity',
)

In [68]:
# Preview the randomized result (show() prints the first 20 rows by default).
df5.show()

+-------------------+---------+----------+------------------------+
|          timestamp|RMeasured|RPredicted|REstimatedSupplyCapacity|
+-------------------+---------+----------+------------------------+
|2021-01-05 00:00:00|     1470|      1503|                    1854|
|2021-01-05 01:00:00|     1459|      1449|                    1850|
|2021-01-05 02:00:00|     1470|      1405|                    1791|
|2021-01-05 03:00:00|     1423|      1372|                    1793|
|2021-01-05 04:00:00|     1390|      1399|                    1825|
|2021-01-05 05:00:00|     1407|      1353|                    1867|
|2021-01-05 06:00:00|     1575|      1524|                    1978|
|2021-01-05 07:00:00|     1827|      1860|                    2088|
|2021-01-05 08:00:00|     2127|      2049|                    2287|
|2021-01-05 09:00:00|     2260|      2103|                    2359|
|2021-01-05 10:00:00|     2267|      2109|                    2315|
|2021-01-05 11:00:00|     2185|      2072|      

In [72]:
# Persist the randomized data in Delta format at the output path.
# No save mode is given, so the writer's default mode applies.
df5.write.save(randomized_file, format='delta')

                                                                                