In [0]:
import json
import time
import uuid
from datetime import datetime

import requests
from pyspark.sql.functions import col
from pyspark.sql.types import DateType
from pyspark.sql.types import TimestampType

# Log maintenance

In [0]:
class LogTable:
    """Append-only audit log of data-load events, stored as a Delta table.

    Every call to `load` appends one row (load type, target table, status,
    timestamps, author) to `<schemaName>.<tableName>`. Both methods use the
    notebook-global `spark` session.
    """

    # Schema (database) that holds the log table.
    schemaName = 'FACT'
    # Name of the log table inside `schemaName`.
    tableName = 'log_table'

    @staticmethod
    def load(load_type, status, load_to_table_name: str, comments='', created_by='Gokarna Adhikari'):
        """Append one log row and return its generated id.

        Args:
            load_type: Label for the kind of load being logged (e.g. 'CLEAN').
            status: Status string for this event (e.g. 'STARTED', 'COMPLETED').
            load_to_table_name: Table name recorded in the `table_name` column.
            comments: Free-text note stored with the row.
            created_by: Author recorded on the row.

        Returns:
            The UUID4 string assigned to the new log row.
        """
        run_id = str(uuid.uuid4())
        # One timestamp for the whole record: the row describes a single
        # point-in-time event, so all time columns should agree exactly.
        now = str(datetime.now())
        LogTable.load_table(
            LogTable.schemaName,
            # Previously the schema name was passed here a second time.
            LogTable.tableName,
            id=run_id,
            load_type=load_type,
            table_name=load_to_table_name,
            process_start_time=now,
            process_end_time=now,
            status=status,
            comments=comments,
            start_date_time=now,
            end_date_time=now,
            created_on=now,
            created_by=created_by,
        )
        return run_id

    @staticmethod
    def load_table(schema: str, table: str, **kwargs):
        """Append `kwargs` as a single row to the Delta table `schema.table`.

        Creates the schema if it does not exist, casts the known timestamp
        columns from string to timestamp, and appends with schema merging on.

        Args:
            schema: Target schema (database) name.
            table: Target table name inside `schema`.
            **kwargs: Column name -> value pairs for the row. Must contain
                `id` and every column listed in `timestamp_cols` below.

        Returns:
            The value of `kwargs['id']`.

        Raises:
            KeyError: If `id` is not present in `kwargs`.
        """
        spark.sql(f"CREATE SCHEMA IF NOT EXISTS {schema};")

        # Serialise to real JSON before handing the record to the JSON reader.
        # A raw dict would be stringified as a Python repr, which is not valid
        # JSON for values such as None/True or strings with awkward quoting.
        # `default=str` keeps non-JSON-native values (e.g. datetime) loadable.
        json_rdd = spark.sparkContext.parallelize([json.dumps(kwargs, default=str)])
        df = spark.read.json(json_rdd)

        # converting to compatible type
        timestamp_cols = ["process_start_time", "process_end_time", "start_date_time", "end_date_time", "created_on"]
        for col_name in timestamp_cols:
            df = df.withColumn(col_name, col(col_name).cast(TimestampType()))

        # Honour the `table` argument instead of silently ignoring it.
        df.write.format('delta').mode('append').option("mergeSchema", "true").saveAsTable(f"{schema}.{table}")

        return kwargs['id']


# Function -> Loading clean data into table

In [0]:
# Take the five raw weather rows with the latest `created_on` values.
raw_weather = spark.table('raw.raw_weather')
df = raw_weather.orderBy(raw_weather["created_on"].desc()).limit(5)

# Projection spec: (source column path, output column name, cast type or None).
# Order here is the column order of `df_clean`.
_clean_column_specs = [
    ("data.dt", "dt", None),
    ("data.id", "id", "int"),
    ("data.city", "city", None),
    ("data.timezone", "timezone", "int"),
    ("data.sys.country", "country", None),
    ("data.coord.lat", "lat", None),
    ("data.coord.lon", "lon", None),
    ("data.main.temp", "temp", None),
    ("data.main.temp_min", "temp_min", None),
    ("data.main.temp_max", "temp_max", None),
    ("data.main.pressure", "pressure", None),
    ("data.main.humidity", "humidity", None),
    ("data.visibility", "visibility", None),
    ("data.wind.speed", "speed", None),
    ("data.wind.deg", "deg", None),
    ("data.wind.gust", "gust", None),
    ("id", "load_run_id", None),  # root-level id, not the one nested under `data`
    ("created_on", "created_on", None),
    ("created_by", "created_by", None),
]

# Flatten the nested `data` struct into top-level columns, casting where a
# type is given, and carry the root-level bookkeeping columns along.
df_clean = df.select(
    *[
        (col(source_path).cast(cast_type) if cast_type is not None else col(source_path)).alias(output_name)
        for source_path, output_name, cast_type in _clean_column_specs
    ]
)

In [0]:
def Clean_Data_Load(source_df=None):
    """Append cleaned weather rows to `CLEAN.clean_weather_data` and log the run.

    Writes a STARTED log row, creates the `CLEAN` schema if needed, appends
    the frame as Delta with schema merging, then writes a COMPLETED log row.
    If the schema creation or the write fails, a FAILED log row carrying the
    error text is written and the original exception is re-raised, so a run
    never leaves only a dangling STARTED entry.

    Args:
        source_df: DataFrame to append. Defaults to the notebook-level
            `df_clean`, which keeps existing no-argument calls working.

    Raises:
        Exception: Whatever the schema creation or table write raised.
    """
    table_name = 'clean_weather_data'
    schema = 'CLEAN'
    frame = df_clean if source_df is None else source_df

    # NOTE(review): the log rows record 'raw.raw_weather' (the source) in the
    # `load_to_table_name` argument, while the rows are written to
    # CLEAN.clean_weather_data. Left unchanged in case existing log queries
    # filter on this value; confirm which table name is intended.
    LogTable.load('CLEAN', 'STARTED', 'raw.raw_weather')
    try:
        spark.sql(f'CREATE SCHEMA IF NOT EXISTS {schema}')
        frame.write.format('delta').mode('append').option("mergeSchema", "true").saveAsTable(f"{schema}.{table_name}")
    except Exception as exc:
        # Cap the message so a very long stack/error text does not bloat the log row.
        failure_note = f"{type(exc).__name__}: {exc}"[:1000]
        LogTable.load('CLEAN', 'FAILED', 'raw.raw_weather', comments=failure_note)
        raise
    LogTable.load('CLEAN', 'COMPLETED', 'raw.raw_weather')

In [0]:
# Clean_Data_Load();

In [0]:
# %sql
# -- SELECT * FROM clean.clean_weather_data