In [1]:
import json
from datetime import date, timedelta
from functools import reduce
from pyspark.sql import SparkSession
from pyspark.sql import DataFrame
from pyspark.sql.functions import lpad, concat, col, lit, to_timestamp, udf, create_map
from pyspark.sql.types import IntegerType
from itertools import chain
import pandas as pd
import os
 

In [2]:
#rename columns
def renameDataframe(df):
    """Rename the source meter columns to their English names.

    The renames are applied one after another, in this order:
    "Time-Kvarter" -> "Time", "MålerVærdier" -> "MeterValue",
    "MålerEgenskab" -> "MeterType", "MålerArtBeskrivelse" -> "MeterDescribe".

    Args:
        df: Frame exposing ``withColumnRenamed(existing, new)``.

    Returns:
        The frame produced by the last ``withColumnRenamed`` call.
    """
    column_translations = (
        ("Time-Kvarter", "Time"),
        ("MålerVærdier", "MeterValue"),
        ("MålerEgenskab", "MeterType"),
        ("MålerArtBeskrivelse", "MeterDescribe"),
    )
    renamed = df
    for old_name, new_name in column_translations:
        renamed = renamed.withColumnRenamed(old_name, new_name)
    return renamed

#preprocess dataframe
def processDataframe(df):
    """Cast, pad and combine the raw meter columns, then drop unused ones.

    Steps, in the order they are applied:
      1. "MålerVærdier" is cast to float.
      2. "Time-Kvarter" is left-padded with "0" to four characters.
      3. "Datetime" is built as "<Dato>-<Time-Kvarter>" (using the padded
         value from step 2).
      4. "FullAdresses" is built as
         "<InstallationAdresse> <InstallationPostNr>".
      5. "Datetime" is parsed into a timestamp with "yyyy-MM-dd-HHmm".
      6. "Dato" is parsed into a timestamp with "yyyy-MM-dd".
      7. "Dataset", "InstallationAdresse", "InstallationPostNr" and
         "MeterRegister" are dropped. "Dato" is kept.

    Args:
        df: Spark DataFrame holding the columns referenced above.

    Returns:
        The transformed Spark DataFrame.
    """
    # Column expressions; the col()-based ones are resolved by name at the
    # point where they are attached below.
    value_as_float = df["MålerVærdier"].cast("float")
    padded_time = lpad(df["Time-Kvarter"], 4, "0")
    datetime_text = concat(col("Dato"), lit("-"), col("Time-Kvarter"))
    full_address = concat(
        col("InstallationAdresse"), lit(" "), col("InstallationPostNr")
    )

    staged = df.withColumn("MålerVærdier", value_as_float)
    staged = staged.withColumn("Time-Kvarter", padded_time)
    staged = staged.withColumn("Datetime", datetime_text)
    staged = staged.withColumn("FullAdresses", full_address)
    staged = staged.withColumn(
        "Datetime", to_timestamp("Datetime", "yyyy-MM-dd-HHmm")
    )
    staged = staged.withColumn("Dato", to_timestamp("Dato", "yyyy-MM-dd"))

    # The address parts are dropped only after FullAdresses has been built.
    for unused in (
        "Dataset",
        "InstallationAdresse",
        "InstallationPostNr",
        "MeterRegister",
    ):
        staged = staged.drop(unused)
    return staged

In [14]:
# Create (or reuse) the Spark session named "Tref", running on a local master.
# NOTE(review): with master("local") the work runs inside the driver process,
# so "spark.executor.memory" may have no effect here -- confirm whether
# "spark.driver.memory" was the intended setting.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Tref")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)


In [20]:
# Clean each monthly parquet extract and store the result as a CSV.
path = 'data/2020/monthly/'

# Only pick up parquet inputs. The CSVs produced below are written into this
# same directory, so an unfiltered listing would hand them (and any other
# stray entry) to spark.read.parquet on the next run of this cell.
# Sorted so the processing order is deterministic.
files = sorted(name for name in os.listdir(path) if name.endswith('.parquet'))

# Values of InstallationsID to keep.
with open('data/address500.json') as f:
    addressList = json.load(f)

for file in files:
    print(file)
    # Spark side: read, preprocess, rename, keep only the listed
    # installations, then collect into a pandas DataFrame.
    df = (
        renameDataframe(
            processDataframe(spark.read.parquet(os.path.join(path, file)))
        )
        .filter(col('InstallationsID').isin(addressList))
        .toPandas()
    )
    # Literal replacement; regex=False makes this independent of the pandas
    # version's default for the `regex` argument.
    df['FullAdresses'] = df['FullAdresses'].str.replace(',', ' ', regex=False)
    # file[-14:-8] is the six characters right before the 8-character
    # '.parquet' suffix; they become the CSV's base name.
    df.to_csv(os.path.join(path, file[-14:-8] + '.csv'))