# 電力データを変換する

In [48]:
import random
from pyspark.sql.functions import udf
from pyspark.sql.types import *
from pyspark.sql.functions import to_timestamp, to_date
from datetime import datetime

In [71]:
data_file = 'data/power_grid_sample.csv'
randomized_file = 'data/power_grid_dummydata'

In [13]:
df1 = spark.read.format('csv').option('header', 'true').option('inferSchema', 'true').load(data_file)

In [14]:
df1.show()

+--------------+--------+---------+-------+-----------------------+
|      DATETIME|Measured|Predicted|UseRate|EstimatedSupplyCapacity|
+--------------+--------+---------+-------+-----------------------+
| 2021/1/5 0:00|    1495|     1510|     80|                   1864|
| 2021/1/5 1:00|    1446|     1440|     78|                   1832|
| 2021/1/5 2:00|    1450|     1410|     79|                   1817|
| 2021/1/5 3:00|    1440|     1400|     79|                   1820|
| 2021/1/5 4:00|    1409|     1370|     76|                   1848|
| 2021/1/5 5:00|    1419|     1360|     75|                   1884|
| 2021/1/5 6:00|    1577|     1540|     80|                   1958|
| 2021/1/5 7:00|    1822|     1840|     87|                   2071|
| 2021/1/5 8:00|    2115|     2070|     91|                   2310|
| 2021/1/5 9:00|    2255|     2120|     95|                   2353|
|2021/1/5 10:00|    2242|     2090|     96|                   2333|
|2021/1/5 11:00|    2202|     2050|     95|     

In [15]:
df1.schema

StructType(List(StructField(DATETIME,StringType,true),StructField(Measured,IntegerType,true),StructField(Predicted,IntegerType,true),StructField(UseRate,IntegerType,true),StructField(EstimatedSupplyCapacity,IntegerType,true)))

In [45]:
df2 = df1.select(to_timestamp(df1['DATETIME'], 'yyyy/M/d H:mm').alias('timestamp'), '*')

In [46]:
df2.show()

+-------------------+--------------+--------+---------+-------+-----------------------+
|          timestamp|      DATETIME|Measured|Predicted|UseRate|EstimatedSupplyCapacity|
+-------------------+--------------+--------+---------+-------+-----------------------+
|2021-01-05 00:00:00| 2021/1/5 0:00|    1495|     1510|     80|                   1864|
|2021-01-05 01:00:00| 2021/1/5 1:00|    1446|     1440|     78|                   1832|
|2021-01-05 02:00:00| 2021/1/5 2:00|    1450|     1410|     79|                   1817|
|2021-01-05 03:00:00| 2021/1/5 3:00|    1440|     1400|     79|                   1820|
|2021-01-05 04:00:00| 2021/1/5 4:00|    1409|     1370|     76|                   1848|
|2021-01-05 05:00:00| 2021/1/5 5:00|    1419|     1360|     75|                   1884|
|2021-01-05 06:00:00| 2021/1/5 6:00|    1577|     1540|     80|                   1958|
|2021-01-05 07:00:00| 2021/1/5 7:00|    1822|     1840|     87|                   2071|
|2021-01-05 08:00:00| 2021/1/5 8

In [60]:
@udf(IntegerType())
def randamize(col):
    diff = random.randint(-30, 30)
    return col + diff

In [61]:
df3 = df2.select(randamize(df2['Measured']).alias('RMeasured'),
                randamize(df2['Predicted']).alias('RPredicted'),
                randamize(df2['EstimatedSupplyCapacity']).alias('REstimatedSupplyCapacity'), '*')

In [62]:
df3.show()

+---------+----------+------------------------+-------------------+--------------+--------+---------+-------+-----------------------+
|RMeasured|RPredicted|REstimatedSupplyCapacity|          timestamp|      DATETIME|Measured|Predicted|UseRate|EstimatedSupplyCapacity|
+---------+----------+------------------------+-------------------+--------------+--------+---------+-------+-----------------------+
|     1522|      1499|                    1854|2021-01-05 00:00:00| 2021/1/5 0:00|    1495|     1510|     80|                   1864|
|     1471|      1431|                    1843|2021-01-05 01:00:00| 2021/1/5 1:00|    1446|     1440|     78|                   1832|
|     1476|      1411|                    1793|2021-01-05 02:00:00| 2021/1/5 2:00|    1450|     1410|     79|                   1817|
|     1449|      1380|                    1803|2021-01-05 03:00:00| 2021/1/5 3:00|    1440|     1400|     79|                   1820|
|     1411|      1387|                    1856|2021-01-05 04:0

In [65]:
df4 = df3.select(df3['RMeasured']/df3['REstimatedSupplyCapacity'].alias('RUseRate'), '*')

In [66]:
df4.show()

+----------------------------------------------------+---------+----------+------------------------+-------------------+--------------+--------+---------+-------+-----------------------+
|(RMeasured / REstimatedSupplyCapacity AS `RUseRate`)|RMeasured|RPredicted|REstimatedSupplyCapacity|          timestamp|      DATETIME|Measured|Predicted|UseRate|EstimatedSupplyCapacity|
+----------------------------------------------------+---------+----------+------------------------+-------------------+--------------+--------+---------+-------+-----------------------+
|                                  0.7876857749469215|     1484|      1517|                    1884|2021-01-05 00:00:00| 2021/1/5 0:00|    1495|     1510|     80|                   1864|
|                                   0.771382463690156|     1434|      1452|                    1859|2021-01-05 01:00:00| 2021/1/5 1:00|    1446|     1440|     78|                   1832|
|                                  0.8059048660470203|     1474| 

In [67]:
df5 = df4.select('timestamp', 'RMeasured', 'RPredicted', 'REstimatedSupplyCapacity')

In [68]:
df5.show()

+-------------------+---------+----------+------------------------+
|          timestamp|RMeasured|RPredicted|REstimatedSupplyCapacity|
+-------------------+---------+----------+------------------------+
|2021-01-05 00:00:00|     1470|      1503|                    1854|
|2021-01-05 01:00:00|     1459|      1449|                    1850|
|2021-01-05 02:00:00|     1470|      1405|                    1791|
|2021-01-05 03:00:00|     1423|      1372|                    1793|
|2021-01-05 04:00:00|     1390|      1399|                    1825|
|2021-01-05 05:00:00|     1407|      1353|                    1867|
|2021-01-05 06:00:00|     1575|      1524|                    1978|
|2021-01-05 07:00:00|     1827|      1860|                    2088|
|2021-01-05 08:00:00|     2127|      2049|                    2287|
|2021-01-05 09:00:00|     2260|      2103|                    2359|
|2021-01-05 10:00:00|     2267|      2109|                    2315|
|2021-01-05 11:00:00|     2185|      2072|      

In [72]:
df5.write.format('delta').save(randomized_file)

                                                                                