In [7]:
# Number of baskets to generate (one million by default); change this value to
# produce a smaller or larger test dataset.
number_of_baskets = 10 ** 6

In [2]:
import csv
from decimal import Decimal
from pyspark.sql.types import *
from pyspark.sql import SparkSession
# Get (or create, if none exists yet) the Spark session with Hive support enabled.
spark = (
    SparkSession.builder
    .enableHiveSupport()
    .getOrCreate()
)
################################################################################
#Read file distribution_of_number_of_products_in_a_basket.csv that
# provides for each number N the tally of baskets that contain N products, that
# tally as a fraction of the whole, and the cumulative fraction.
# Each row is parsed as [int, int, Decimal, Decimal].
# The file is opened with newline="" as the csv module documentation requires,
# and blank lines (which csv.reader yields as empty rows) are skipped rather
# than failing with IndexError.
################################################################################
with open("distribution_of_number_of_products_in_a_basket.csv", newline="") as f:
    reader = csv.reader(f)
    distribution_of_number_of_products_in_a_basket = [
        [int(row[0]), int(row[1]), Decimal(row[2]), Decimal(row[3])]
        for row in reader
        if row  # an empty list means a blank line in the file
    ]

################################################################################
#Convert the parsed rows to a Spark dataframe
# The schema has one nullable column per CSV field: two integer tallies followed
# by two DecimalType(20, 19) fractions.
################################################################################
schema = (
    StructType()
    .add("tally_of_products_per_basket", IntegerType(), True)
    .add("tally_of_baskets_containing_products_tally", IntegerType(), True)
    .add("fraction_of_baskets_containing_products_tally", DecimalType(20, 19), True)
    .add("cumulative_fraction_of_baskets_containing_products_tally", DecimalType(20, 19), True)
)
products_in_basket_df = spark.createDataFrame(
    distribution_of_number_of_products_in_a_basket,
    schema
)
#To inspect products_in_basket_df, uncomment the next line
#products_in_basket_df.toPandas()

In [9]:
from pyspark.sql import Window
from pyspark.sql.functions import rand, lag, col
################################################################################
#Calculate lower & upper cumulative fraction boundary for each basket size
# lower_bound is the cumulative fraction of the previous basket size (ordered by
# tally_of_products_per_basket; 0 for the first row) and upper_bound is the
# row's own cumulative fraction.
################################################################################
window = Window.orderBy('tally_of_products_per_basket')
basket_size_fraction_boundary_df = (
    products_in_basket_df
    .withColumn(
        'lower_bound',
        lag('cumulative_fraction_of_baskets_containing_products_tally', 1, 0).over(window)
    )
    .withColumnRenamed(
        'cumulative_fraction_of_baskets_containing_products_tally',
        'upper_bound'
    )
    .select('tally_of_products_per_basket', 'lower_bound', 'upper_bound')
)
################################################################################
#Choose a basket size for each basket based on basket size distribution
# Every basket id gets a random draw 'rand' and is matched to the basket size
# whose interval lower_bound <= rand < upper_bound contains that draw.
# NOTE(review): the join is an inner join, so a basket whose draw falls in no
# interval is dropped; this presumably relies on the last cumulative fraction
# being 1 -- confirm against the CSV.
################################################################################
#Seed for the random draw. None keeps the previous behaviour (a different
# sample on every run); set an integer to make the draw repeatable (for an
# unchanged partitioning of the data).
basket_size_random_seed = None
baskets_df = (
    spark.range(0, number_of_baskets)
    .withColumn('rand', rand(basket_size_random_seed))
    .join(
        basket_size_fraction_boundary_df,
        # 'rand' exists only on the left side and the bounds only on the right,
        # so the unqualified column names are unambiguous
        (col('rand') >= col('lower_bound')) & (col('rand') < col('upper_bound'))
    )
    .select(col('id').alias('basket_id'), 'tally_of_products_per_basket')
)