# Synthetic Events

In [None]:
from components.ingestion.synthetic.event_synthetic import EventSynthetic

## Init Component

In [None]:
# Base directory of the development environment; both config paths hang off it.
root_path = "/opt/dev"

# Shared settings first, then the settings specific to the synthetic-event component.
general_config = root_path + "/pipe_configs/configurations/general_config.ini"
component_config = root_path + "/pipe_configs/configurations/event/event_synthetic.ini"

In [None]:
# Start Component
# Build the synthetic-event component from the general and component-specific config files.
event_synthetic = EventSynthetic(general_config, component_config)
# Bare expression as the last line of the cell, so the notebook displays the
# component's `spark` attribute (presumably the Spark session — confirm in EventSynthetic).
event_synthetic.spark

## Main

In [None]:
from core.data_objects.bronze.bronze_event_data_object import BronzeEventDataObject
import random

# MVP - Synthetic
# Build a small in-memory table of random events (n_agents * n_events rows)
# and load it into a Spark DataFrame using the bronze event schema.
spark = event_synthetic.spark
schema = event_synthetic.output_data_objects[BronzeEventDataObject.ID].SCHEMA

n_agents = event_synthetic.config.getint('EventSynthetic', 'n_agents')
n_events = event_synthetic.config.getint('EventSynthetic', 'n_events')
seed = event_synthetic.config.getint('EventSynthetic', 'seed')

# The date string is sliced positionally: 4 chars of year, 2 of month, 2 of day.
sdate = event_synthetic.config.get('EventSynthetic', 'dates_string')
year = sdate[:4]
month = sdate[4:6]
day = sdate[6:8]

# Seed a dedicated generator exactly once, outside the loops, so the output is
# reproducible for a given configured seed. Re-seeding inside the agent loop
# would hand every agent the same user_id and the same events.
rng = random.Random(seed)

# Location fields are not generated; they are left empty on every row.
latitude = None
longitude = None
loc_error = None

data = []
for _ in range(n_agents):
    # One random 32-byte identifier per agent, shared by all of that agent's events
    user_id = rng.randbytes(32)

    for _ in range(n_events):
        # Random time of day on the configured date, formatted as YYYY-MM-DDTHH:MM:SS
        random_date = (
            f"{year}-{month}-{day}"
            f"T{rng.randint(0, 23):02}:{rng.randint(0, 59):02}:{rng.randint(0, 59):02}"
        )
        # random mcc
        random_mcc = rng.randint(0, 999)
        # random cell_id: zero-padded mcc followed by 12 random digits
        cell_id = f"{random_mcc:03}" + "".join(str(rng.randint(0, 9)) for _ in range(12))

        # Values are positional and must follow the field order of `schema`
        row = [user_id, random_date, random_mcc, cell_id, latitude, longitude, loc_error]
        data.append(row)


df = spark.createDataFrame(data, schema)
df.show()

In [None]:
# Work out where the generated events go: the bronze event data object's
# default path, with the configured date string as the last path segment.
sdate = event_synthetic.config.get('EventSynthetic', 'dates_string')
event_do = event_synthetic.output_data_objects[BronzeEventDataObject.ID]
path = "{}/{}".format(event_do.default_path, sdate)
print(path)

In [None]:
# Attach the generated DataFrame to the bronze event data object, then call its
# write method with the target path.
event_do.df = df
event_do.write(path)

# Event cleaning

In [None]:
from components.execution.event_cleaning.event_cleaning import EventCleaning
from core.data_objects.bronze.bronze_event_data_object import BronzeEventDataObject
from core.data_objects.silver.silver_event_data_object import SilverEventDataObject

In [None]:
# Base directory of the development environment; both config paths hang off it.
root_path = "/opt/dev"

# Shared settings first, then the settings specific to the event-cleaning component.
general_config = root_path + "/pipe_configs/configurations/general_config.ini"
component_config = root_path + "/pipe_configs/configurations/event/event_cleaning.ini"

In [None]:
# Build the event-cleaning component from the general and component-specific config files.
event_cleaning = EventCleaning(general_config, component_config)

In [None]:
# Work out where the bronze events are read from: the input data object's
# default path, with the configured date string as the last path segment.
sdate = event_cleaning.config.get(EventCleaning.COMPONENT_ID, 'dates_string')
do = event_cleaning.input_data_objects[BronzeEventDataObject.ID]
path = "{}/{}".format(do.default_path, sdate)
print(path)

In [None]:
# Read the bronze events from `path` (presumably this populates `do.df` — confirm
# in the data object's read method), then preview the resulting DataFrame.
do.read(path)
df = do.df
df.show()

In [None]:
# Inspect column names and types of the DataFrame as it was read.
df.printSchema()

In [None]:
import pyspark.sql.functions as psf
# Timestamps are expected as strings like "2023-01-01T13:45:07".
# In a Spark datetime pattern the literal "T" must be wrapped in single quotes;
# unquoted it is read as a pattern letter, which Spark does not recognise.
date_format = "yyyy-MM-dd'T'HH:mm:ss"

# Parse the string column into a timestamp using the explicit pattern.
# NOTE: values that do not match the pattern will not parse (null, or an error
# depending on Spark's parser settings).
df = df.withColumn('timestamp', psf.to_timestamp('timestamp', date_format))

# Extract year, month and day from the parsed timestamp
df = df.withColumns({
    'year': psf.year('timestamp'),
    'month': psf.month('timestamp'),
    'day': psf.dayofmonth('timestamp'),
})

In [None]:
# Inspect the schema again to check the timestamp type and the added year/month/day columns.
df.printSchema()

In [None]:
# Fetch the silver event output data object and attach the transformed DataFrame to it.
do_silver = event_cleaning.output_data_objects[SilverEventDataObject.ID]
do_silver.df = df

In [None]:
# Run the component's write step; presumably this persists the output data
# objects populated above — confirm in EventCleaning.write.
event_cleaning.write()