# Project: Lunar Events

- Author:   Edgar Rios
- Date:     2025-01-17
- Version:  1.0

In [0]:
# ******************************************************************************************************************
# || DESCRIPTION
# || -------------------------------------------------------------------------------------------------------------
# || PROJECT       	: Lunar Events
# || FILE        	: data_simulation.ipynb
# || SOURCE         : 
# || TARGET         : /
# || OBJECTIVE		: Create synthetic data of astronomical events
# || Reprocess      : Yes
# || NOTES      	: TBD
# || SCHEDULER		: TBD
# || JOB			: TBD
# || VERSION  DEVELOPER	        PROVIDER              DATE			 DESCRIPTION
# || -------------------------------------------------------------------------------------------------------------
# || 	1	  EDGAR RIOS        SYNTHETIC       	  2025-01-17	Create synthetic data of lunar events
# || 	2	  EDGAR RIOS        SYNTHETIC       	  2025-01-19	Update number of rows
# ******************************************************************************************************************


## Import libraries

In [0]:
import random
import uuid
from datetime import datetime, timedelta

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import StringType, StructField, StructType, TimestampType

## Synthetic Data

In [0]:
# Build a single synthetic event record.
def generate_row():
    """Return one randomly generated astronomical event as a plain dict.

    Keys, in insertion order:
        event_id:   random UUID4 rendered as a string.
        event_type: one of the five event kinds listed below.
        timestamp:  ``datetime.now()`` shifted back by 0 to 1095 whole days.
        location:   one of the six regions listed below.
        details:    ``"Event description <n>"`` with ``n`` in 1000..9999.
    """
    kinds = ["lunar", "solar", "asteroid", "meteor", "comet"]
    regions = ["North_America", "South_America", "Europe", "Asia", "Africa", "Oceania"]

    # The random draws are made in the same order as the dict keys so that a
    # seeded ``random`` module yields the same record.
    identifier = str(uuid.uuid4())
    kind = random.choice(kinds)
    generated_at = datetime.now()
    days_back = random.randint(0, 1095)
    region = random.choice(regions)
    serial = random.randint(1000, 9999)

    record = {}
    record["event_id"] = identifier
    record["event_type"] = kind
    record["timestamp"] = generated_at - timedelta(days=days_back)
    record["location"] = region
    record["details"] = f"Event description {serial}"
    return record

# create synthetic data
def generate_dataset(spark, num_rows=1000000):
    """Build a Spark DataFrame of ``num_rows`` synthetic astronomical events.

    Rows are produced by mapping ``generate_row`` over an RDD of ``num_rows``
    integers, so generation is distributed across the cluster.

    Args:
        spark: Active ``SparkSession`` used to create the RDD and the DataFrame.
        num_rows: Number of events to generate; must be >= 0.

    Returns:
        A DataFrame with string columns ``details``, ``event_id``,
        ``event_type``, ``location`` and a timestamp column ``timestamp``.

    Raises:
        ValueError: If ``num_rows`` is negative.
    """
    if num_rows < 0:
        raise ValueError(f"num_rows must be non-negative, got {num_rows}")

    # An explicit schema avoids the extra Spark job that schema inference runs
    # to fetch the first row, and lets ``num_rows=0`` return an empty DataFrame
    # instead of failing. Columns are listed alphabetically to keep the layout
    # that inference from dict rows is expected to have produced.
    schema = StructType([
        StructField("details", StringType(), True),
        StructField("event_id", StringType(), True),
        StructField("event_type", StringType(), True),
        StructField("location", StringType(), True),
        StructField("timestamp", TimestampType(), True),
    ])

    # Generate the rows in parallel using an RDD; the integer seed value is unused.
    rdd = spark.sparkContext.parallelize(range(num_rows)) \
        .map(lambda _: generate_row())

    # convert to DataFrame
    return spark.createDataFrame(rdd, schema=schema)


def main(num_rows=500000, table_name="default.astronomical_events"):
    """Generate the synthetic events and store them as a Delta table.

    Args:
        num_rows: Number of synthetic events to generate. The original target
            was about 50 GB (roughly 500000000 rows); the default is reduced
            because of Databricks Community Edition limits.
        table_name: Name of the Delta table to overwrite.

    Note:
        The table is partitioned by ``event_date``, a date column derived from
        ``timestamp``. Partitioning by the raw timestamp (microsecond
        precision) creates roughly one partition directory per row.
    """
    # create session
    spark = SparkSession.builder.getOrCreate()

    # generate the synthetic data
    df = generate_dataset(spark, num_rows=num_rows)

    # Derive a low-cardinality partition key: one value per calendar day.
    df_partitioned = df.withColumn("event_date", to_date(col("timestamp")))

    # Save as a Delta table. ``overwriteSchema`` lets the overwrite replace a
    # table written earlier with a different schema or partitioning.
    (
        df_partitioned.write.mode("overwrite")
        .option("overwriteSchema", "true")
        .partitionBy("event_date")
        .format("delta")
        .saveAsTable(table_name)
    )

# Entry point: run the generation job only when this code is executed as the
# main module; it is skipped when the file is imported.
if __name__ == "__main__":
    main()
