Create small and big versions of the toy example data

# 1. Simpson's Paradox
- Create toy data where the paradox is evident
- Fit a naïve regression where the relation is not intuitive
- Then show what happens when a confounder is added (direction is changed)
- Throwing all the variables into the model kind of works, but specifying a causal DAG yields the correct estimates

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import rand, when, lit, col
import pyspark.sql.functions as F

In [None]:


# Build a small synthetic dataset for the Simpson's Paradox example.
# Dependency structure encoded below:
#   Winter_Ind -> Rain_Ind -> Speed_KMpH -> Fuel_Consumption_LpKM
#                 Rain_Ind ----------------> Fuel_Consumption_LpKM
# Rain_Ind influences both speed and fuel consumption, so it acts as the
# confounder of the speed/fuel relationship.

# Initialize Spark session
spark = SparkSession.builder.appName("ToyExamplePySpark").getOrCreate()

# Base random seed. SparkContext exposes no setRandomSeed() method, so the
# seed is passed explicitly to every rand()/randn() call instead. Each column
# gets its own seed: reusing a single seed would make the columns draw the
# same random stream and become spuriously correlated.
# NOTE: seeded draws are repeatable only for a fixed partitioning of `df`.
seed = 853210

# Create a DataFrame with the desired number of samples
n_samples = 2000
df = spark.range(n_samples)

# Generate toy data
# Winter indicator: 1 with probability 0.24.
df = df.withColumn(
    "Winter_Ind",
    F.when(F.rand(seed=seed) < 0.24, 1).otherwise(0),
)

# Rain indicator: probability 0.2 outside winter, 0.5 in winter.
df = df.withColumn(
    "Rain_Ind",
    F.when(F.rand(seed=seed + 1) < (0.2 + col("Winter_Ind") * 0.3), 1).otherwise(0),
)

# Speed: Gaussian noise (sd 0.7) around 60, lowered by 0.9 when it rains.
df = df.withColumn(
    "Speed_KMpH",
    F.randn(seed=seed + 2) * 0.7 + (60 - col("Rain_Ind") * 0.9),
)

# Fuel consumption: Gaussian noise (sd 0.5) around 50 + speed / 4,
# raised by 2.1 when it rains.
df = df.withColumn(
    "Fuel_Consumption_LpKM",
    F.randn(seed=seed + 3) * 0.5
    + (50 + col("Speed_KMpH") / 4 + col("Rain_Ind") * 2.1),
)

# Select only the columns we need
toy_example = df.select("Winter_Ind", "Rain_Ind", "Speed_KMpH", "Fuel_Consumption_LpKM")

# Show the first few rows
toy_example.show(5)

# Save toy_example as Parquet file locally.
# "overwrite" lets the cell be re-run; the default mode raises an error when
# the output path already exists.
toy_example.write.mode("overwrite").parquet("toy_example.parquet")

# To use the saved Parquet file later, you can read it like this:
# loaded_toy_example = spark.read.parquet("toy_example.parquet")

# Stop the Spark session
spark.stop()