In [0]:
%pip install Faker
%restart_python

In [0]:
import pandas as pd
import random
from faker import Faker
from pyspark.sql import SparkSession

# Initialize Spark session
# getOrCreate() hands back the already-running session when there is one
# (e.g. a notebook-provided `spark`), so calling it unconditionally is safe
# and lets this cell run on a fresh kernel as well.
spark = SparkSession.builder.getOrCreate()

# Read customer ids from the table
# Sorted explicitly: without an ordering Spark may return rows in any order,
# which would make "every 10th customer" pick different rows on each run.
customers_df = (
    spark.table("finance_summit.ingestion.banking_customers")
    .select("customer_id")
    .orderBy("customer_id")
)

# Collect customer IDs to driver (assuming the dataset is not huge)
customer_ids = [row.customer_id for row in customers_df.collect()]

# Select every 10th customer
selected_customer_ids = customer_ids[9::10]  # 0-based index, so 9 is the 10th

# Initialize Faker and campaign details
fake = Faker()  # NOTE(review): not referenced in this cell; kept in case other cells use it
campaign_names = ["Email", "Mobile", "Facebook", "Instagram", "Google Ads"]
outcomes = ["no interaction", "clicked", "accepted offer", "rejected offer"]

# Private, seeded RNG so re-running the cell regenerates the same synthetic
# rows for the same input, without touching the global `random` state.
CAMPAIGN_SEED = 42
rng = random.Random(CAMPAIGN_SEED)

# Generate campaign data: one row per selected customer, with a sequential
# campaign_id starting at 1000000 and a uniformly random channel and outcome.
campaign_data = []
for idx, customer_id in enumerate(selected_customer_ids):
    # Occasional progress line; assumes customer_id is numeric — TODO confirm
    if customer_id % 1000 == 0:
        print(f"Generating data for customer {customer_id}") 
    campaign_data.append({
        "campaign_id": 1000000 + idx,
        "customer_id": customer_id,
        "campaign_name": rng.choice(campaign_names),
        "outcome": rng.choice(outcomes),
    })

# Materialise the generated records as a pandas frame first ...
campaign_df = pd.DataFrame(campaign_data)

# ... then hand that frame to Spark so it can be saved as a table.
campaign_spark_df = spark.createDataFrame(campaign_df)

# Persist as a Delta table, replacing whatever the table held before.
# NOTE: a later cell in this notebook overwrites this same table.
campaign_writer = campaign_spark_df.write.format("delta").mode("overwrite")
campaign_writer.saveAsTable("finance_summit.ingestion.campaign")


In [0]:
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import row_number, monotonically_increasing_id, hash, udf
from pyspark.sql.types import StringType, LongType
import random

# Initialize Spark session
spark = SparkSession.builder.getOrCreate()

# Read customer ids from the table
customers_df = spark.table("finance_summit.ingestion.banking_customers").select("customer_id")

# Add a synthetic partition column
num_partitions = 100  # Adjust as needed for your cluster size
# hash() can be negative and Spark's `%` keeps the sign of the dividend, so a
# bare `hash % n` yields values in (-n, n). Folding the remainder back into
# [0, n) keeps the number of buckets at exactly `num_partitions`.
bucket_id = ((hash("customer_id") % num_partitions) + num_partitions) % num_partitions
customers_df = customers_df.withColumn("partition_id", bucket_id.cast("int"))

# Assign row numbers within each partition
# Ordered by customer_id (rather than an arbitrary generated id) so the same
# customers are numbered the same way, and hence sampled, on every run.
window_spec = Window.partitionBy("partition_id").orderBy("customer_id")
customers_df = customers_df.withColumn("row_num", row_number().over(window_spec))

# Filter every 10th customer (within each partition)
filtered_df = customers_df.filter(customers_df.row_num % 10 == 0)

# UDFs for campaign_name and outcome
campaign_names = ["Email", "Mobile", "Facebook", "Instagram", "Google Ads"]
outcomes = ["no interaction", "clicked", "accepted offer", "rejected offer"]


def _pick_campaign_name():
    """Return one entry of `campaign_names`, chosen uniformly at random."""
    return random.choice(campaign_names)


def _pick_outcome():
    """Return one entry of `outcomes`, chosen uniformly at random."""
    return random.choice(outcomes)


# Spark assumes UDFs are deterministic and may therefore eliminate duplicate
# calls or invoke a UDF more often than it appears in the query. These UDFs
# return a fresh random value on every call, so they are flagged as
# non-deterministic to get exactly one evaluation per row.
random_campaign_name = udf(_pick_campaign_name, StringType()).asNondeterministic()
random_outcome = udf(_pick_outcome, StringType()).asNondeterministic()

# Add campaign_id, campaign_name, and outcome columns
# `row_num` restarts at 1 inside every partition_id bucket, so deriving the id
# from it would repeat the same ids (1000010, 1000020, ...) in each bucket.
# Number the sampled rows globally instead, giving unique, consecutive ids
# starting at 1000000.
# Trade-off: a window without partitionBy pulls the sampled rows into a single
# partition; only the filtered subset is affected.
campaign_seq = row_number().over(Window.orderBy("customer_id"))

df_with_campaign = filtered_df.withColumn(
    "campaign_id", (campaign_seq - 1 + 1000000).cast(LongType())
).withColumn(
    "campaign_name", random_campaign_name()
).withColumn(
    "outcome", random_outcome()
).select(
    "campaign_id", "customer_id", "campaign_name", "outcome"
)

# Write the final dataset to the target Delta table
df_with_campaign.write.mode("overwrite").format("delta").saveAsTable("finance_summit.ingestion.campaign")
