# Project: Lunar Events

- Author:   Edgar Rios
- Date:     2025-01-19
- Version:  1.0

In [0]:
# ******************************************************************************************************************
# || DESCRIPTION
# || -------------------------------------------------------------------------------------------------------------
# || PROJECT       	: Lunar Events
# || FILE        	: data_process.ipynb
# || SOURCE         : 
# || TARGET         : /
# || OBJECTIVE		: Process data of lunar events
# || Reprocess      : Yes
# || NOTES      	: TBD
# || SCHEDULER		: TBD
# || JOB			: TBD
# || VERSION  DEVELOPER	        PROVIDER              DATE			 DESCRIPTION
# || -------------------------------------------------------------------------------------------------------------
# || 	1	  EDGAR RIOS        SYNTHETIC       	  2025-01-19	Create synthetic data of lunar events
# || 	2	  EDGAR RIOS        SYNTHETIC       	  2025-01-19	change name dataset
# ******************************************************************************************************************


## Import libraries

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import uuid
from datetime import datetime, timedelta
import random

## Process Lunar Events

In [0]:
# load Delta Table
def load_data(metastore, spark=None):
    """Read a metastore table into a Spark DataFrame.

    Args:
        metastore: Table identifier handed to ``spark.read.table``
            (for example ``"default.astronomical_events"``).
        spark: Optional session object used for the read. When omitted,
            the session is obtained with
            ``SparkSession.builder.getOrCreate()`` so the function does not
            depend on a ``spark`` variable existing at module level.

    Returns:
        Whatever ``spark.read.table(metastore)`` returns.
    """
    if spark is None:
        # Resolve the session here instead of relying on a global `spark`,
        # which is only present when the runtime injects one.
        spark = SparkSession.builder.getOrCreate()
    return spark.read.table(metastore)

def process_lunar_events(df):
    """Count lunar events per calendar year.

    Args:
        df: DataFrame exposing an ``event_type`` column and a ``timestamp``
            column.

    Returns:
        A DataFrame with one row per ``year`` and the number of matching
        rows in ``total_lunar_events``.
    """
    # Keep only rows whose event_type is exactly "lunar".
    is_lunar = col("event_type") == "lunar"

    return (
        df.filter(is_lunar)
        # Derive the year from the event timestamp.
        .withColumn("year", year("timestamp"))
        # One output row per year with the row count.
        .groupBy("year")
        .agg(count("*").alias("total_lunar_events"))
    )

def main():
    """Build the yearly lunar-event summary and store it as a Delta table.

    Reads ``default.astronomical_events``, aggregates it with
    ``process_lunar_events`` and overwrites ``default.lunar_events``,
    partitioned by ``year``.
    """
    # Make sure a Spark session is available before any table is touched.
    spark = SparkSession.builder.getOrCreate()

    # Source table holding the raw events.
    source_table = "default.astronomical_events"
    events = load_data(source_table)

    # Per-year count of lunar events.
    lunar_per_year = process_lunar_events(events)

    # Persist as a Delta table, replacing whatever the target held before.
    writer = (
        lunar_per_year.write
        .mode("overwrite")
        .partitionBy("year")
        .format("delta")
        .option("mergeSchema", "true")
    )
    writer.saveAsTable("default.lunar_events")

# Entry point: run the pipeline only when this code is executed directly
# (i.e. __name__ is "__main__"), not when it is imported as a module.
if __name__ == "__main__":
    main()
