In [1]:
from pyspark.sql import SparkSession
from time import process_time
import pyspark.sql.functions as F
from pyspark.sql.functions import col

In [2]:
# Build (or reuse) the notebook's SparkSession.
# Both the worker-side and driver-side Python interpreter settings point at the
# same binary, and the session runs in Spark's 'local' master mode.
spark = (
    SparkSession.builder
    .config('spark.pyspark.python', '/usr/local/bin/python3.8')
    .config('spark.pyspark.driver.python', '/usr/local/bin/python3.8')
    .appName('dz10')
    .master('local')
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/01 19:11:01 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
import pandas as pd
from sdv.single_table import GaussianCopulaSynthesizer
from sdv.metadata import SingleTableMetadata

def generate_data(spark: SparkSession, csv_path: str, n, sep=','):
    """Sample synthetic rows modelled on a CSV file and return them as a Spark DataFrame.

    The CSV is loaded with pandas, table metadata is detected from the loaded
    frame, a ``GaussianCopulaSynthesizer`` is fitted on it, and ``n`` rows are
    sampled from the fitted synthesizer.

    Args:
        spark: session used to convert the sampled pandas frame.
        csv_path: path of the CSV file to learn from.
        n: number of synthetic rows to sample.
        sep: field separator passed to ``pandas.read_csv``.

    Returns:
        The result of ``spark.createDataFrame`` applied to the sampled rows.
    """
    source_frame = pd.read_csv(csv_path, sep=sep)

    # Describe the table from the real data, then fit the synthesizer on it.
    table_metadata = SingleTableMetadata()
    table_metadata.detect_from_dataframe(source_frame)
    copula = GaussianCopulaSynthesizer(table_metadata)
    copula.fit(source_frame)

    return spark.createDataFrame(copula.sample(num_rows=n))


24/12/01 19:11:16 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [16]:
import time

# Build the synthetic dataset once, outside the timed region: fitting the
# synthesizer and sampling the rows is the same work for both runs and would
# otherwise dominate the measurement and hide the effect of caching.
data = generate_data(spark, 'electronic_devices.csv', 1000000)


def do(data, caching):
    """Run the min/max aggregations over the filtered frame and print each result.

    Args:
        data: Spark DataFrame as returned by ``generate_data``.
        caching: when true, the filtered frame is marked with ``cache()`` before
            the aggregations and released with ``unpersist()`` afterwards.
    """
    dt = data \
        .filter(col('purchase_date') == '2024-09-03') \
        .filter(F.size(F.split(col('addons'), ',')) == 1)
    if caching:
        dt = dt.cache()
    try:
        # Six independent actions over the same filtered frame: three groupings
        # times two price columns.
        for group in (['gender'], ['age'], ['gender', 'age']):
            for column in ('total_price', 'unit_price'):
                result = dt.groupBy(group) \
                    .agg(F.min(column), F.max(column)) \
                    .collect()
                print(result)
    finally:
        # Release the cached blocks so they do not linger after the benchmark.
        if caching:
            dt.unpersist()


for caching in (False, True):
    # perf_counter() measures elapsed wall-clock time. process_time() only
    # counts CPU time of this Python process, which excludes the time spent
    # waiting on Spark's JVM to execute the jobs.
    begin = time.perf_counter()
    do(data, caching)
    total = time.perf_counter() - begin
    print(f"processed in {total} caching={caching}")


24/12/01 19:35:13 WARN TaskSetManager: Stage 93 contains a task of very large size (71514 KiB). The maximum recommended task size is 1000 KiB.
24/12/01 19:35:17 WARN TaskSetManager: Stage 96 contains a task of very large size (71514 KiB). The maximum recommended task size is 1000 KiB.


[Row(gender='Female', min(total_price)=25.88, max(total_price)=11396.8), Row(gender='Male', min(total_price)=21.96, max(total_price)=11396.8)]


                                                                                

[Row(gender='Female', min(unit_price)=123.81, max(unit_price)=1139.68), Row(gender='Male', min(unit_price)=123.85, max(unit_price)=1139.67)]


24/12/01 19:35:21 WARN TaskSetManager: Stage 99 contains a task of very large size (71514 KiB). The maximum recommended task size is 1000 KiB.
24/12/01 19:35:25 WARN TaskSetManager: Stage 102 contains a task of very large size (71514 KiB). The maximum recommended task size is 1000 KiB.


[Row(age=26, min(total_price)=754.6, max(total_price)=9026.8), Row(age=29, min(total_price)=407.94, max(total_price)=7546.81), Row(age=65, min(total_price)=87.32, max(total_price)=6887.02), Row(age=19, min(total_price)=105.46, max(total_price)=6611.77), Row(age=54, min(total_price)=286.38, max(total_price)=9822.92), Row(age=22, min(total_price)=27.63, max(total_price)=8519.85), Row(age=77, min(total_price)=51.63, max(total_price)=7630.14), Row(age=34, min(total_price)=105.25, max(total_price)=8927.99), Row(age=50, min(total_price)=155.61, max(total_price)=8839.47), Row(age=57, min(total_price)=190.88, max(total_price)=9969.88), Row(age=32, min(total_price)=28.36, max(total_price)=9601.93), Row(age=43, min(total_price)=463.07, max(total_price)=9534.8), Row(age=31, min(total_price)=36.98, max(total_price)=8218.8), Row(age=39, min(total_price)=202.87, max(total_price)=7198.22), Row(age=25, min(total_price)=114.31, max(total_price)=7538.19), Row(age=71, min(total_price)=265.0, max(total_pr

24/12/01 19:35:28 WARN TaskSetManager: Stage 105 contains a task of very large size (71514 KiB). The maximum recommended task size is 1000 KiB.


[Row(age=26, min(unit_price)=179.1, max(unit_price)=1139.67), Row(age=29, min(unit_price)=299.32, max(unit_price)=1118.98), Row(age=65, min(unit_price)=124.12, max(unit_price)=1133.13), Row(age=19, min(unit_price)=870.61, max(unit_price)=1078.74), Row(age=54, min(unit_price)=147.64, max(unit_price)=1131.14), Row(age=22, min(unit_price)=136.76, max(unit_price)=1128.1), Row(age=77, min(unit_price)=124.15, max(unit_price)=1135.67), Row(age=34, min(unit_price)=138.58, max(unit_price)=1134.0), Row(age=50, min(unit_price)=145.28, max(unit_price)=1108.37), Row(age=57, min(unit_price)=143.32, max(unit_price)=1134.28), Row(age=32, min(unit_price)=144.28, max(unit_price)=1132.76), Row(age=43, min(unit_price)=124.7, max(unit_price)=1133.83), Row(age=31, min(unit_price)=127.56, max(unit_price)=1135.71), Row(age=39, min(unit_price)=142.65, max(unit_price)=1133.57), Row(age=25, min(unit_price)=223.14, max(unit_price)=1132.29), Row(age=71, min(unit_price)=206.53, max(unit_price)=1137.83), Row(age=68,

                                                                                

[Row(gender='Female', age=32, min(total_price)=28.36, max(total_price)=8219.16), Row(gender='Male', age=78, min(total_price)=62.88, max(total_price)=9987.69), Row(gender='Female', age=44, min(total_price)=65.17, max(total_price)=9001.28), Row(gender='Male', age=74, min(total_price)=155.29, max(total_price)=8158.27), Row(gender='Female', age=39, min(total_price)=482.05, max(total_price)=6893.91), Row(gender='Male', age=61, min(total_price)=68.76, max(total_price)=10495.26), Row(gender='Male', age=59, min(total_price)=141.69, max(total_price)=7939.94), Row(gender='Female', age=68, min(total_price)=345.0, max(total_price)=11396.8), Row(gender='Female', age=50, min(total_price)=155.61, max(total_price)=7667.14), Row(gender='Female', age=27, min(total_price)=824.7, max(total_price)=6777.59), Row(gender='Female', age=51, min(total_price)=27.15, max(total_price)=8545.02), Row(gender='Female', age=77, min(total_price)=93.28, max(total_price)=7630.14), Row(gender='Female', age=36, min(total_pri

24/12/01 19:35:32 WARN TaskSetManager: Stage 108 contains a task of very large size (71514 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

[Row(gender='Female', age=32, min(unit_price)=144.28, max(unit_price)=1122.25), Row(gender='Male', age=78, min(unit_price)=235.41, max(unit_price)=1126.38), Row(gender='Female', age=44, min(unit_price)=141.23, max(unit_price)=1061.25), Row(gender='Male', age=74, min(unit_price)=200.06, max(unit_price)=1105.6), Row(gender='Female', age=39, min(unit_price)=145.19, max(unit_price)=1123.76), Row(gender='Male', age=61, min(unit_price)=141.21, max(unit_price)=1127.81), Row(gender='Male', age=59, min(unit_price)=127.49, max(unit_price)=1128.31), Row(gender='Female', age=68, min(unit_price)=142.93, max(unit_price)=1139.64), Row(gender='Female', age=50, min(unit_price)=145.28, max(unit_price)=1068.29), Row(gender='Female', age=27, min(unit_price)=165.97, max(unit_price)=1107.36), Row(gender='Female', age=51, min(unit_price)=123.81, max(unit_price)=1137.41), Row(gender='Female', age=77, min(unit_price)=129.15, max(unit_price)=1135.67), Row(gender='Female', age=36, min(unit_price)=124.65, max(uni

24/12/01 19:38:47 WARN TaskSetManager: Stage 111 contains a task of very large size (71514 KiB). The maximum recommended task size is 1000 KiB.
24/12/01 19:38:50 WARN TaskSetManager: Stage 112 contains a task of very large size (71514 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

[Row(gender='Female', min(total_price)=25.88, max(total_price)=11396.8), Row(gender='Male', min(total_price)=21.96, max(total_price)=11396.8)]


24/12/01 19:38:50 WARN TaskSetManager: Stage 115 contains a task of very large size (71514 KiB). The maximum recommended task size is 1000 KiB.


[Row(gender='Female', min(unit_price)=123.81, max(unit_price)=1139.68), Row(gender='Male', min(unit_price)=123.85, max(unit_price)=1139.67)]


24/12/01 19:38:51 WARN TaskSetManager: Stage 118 contains a task of very large size (71514 KiB). The maximum recommended task size is 1000 KiB.


[Row(age=26, min(total_price)=754.6, max(total_price)=9026.8), Row(age=29, min(total_price)=407.94, max(total_price)=7546.81), Row(age=65, min(total_price)=87.32, max(total_price)=6887.02), Row(age=19, min(total_price)=105.46, max(total_price)=6611.77), Row(age=54, min(total_price)=286.38, max(total_price)=9822.92), Row(age=22, min(total_price)=27.63, max(total_price)=8519.85), Row(age=77, min(total_price)=51.63, max(total_price)=7630.14), Row(age=34, min(total_price)=105.25, max(total_price)=8927.99), Row(age=50, min(total_price)=155.61, max(total_price)=8839.47), Row(age=57, min(total_price)=190.88, max(total_price)=9969.88), Row(age=32, min(total_price)=28.36, max(total_price)=9601.93), Row(age=43, min(total_price)=463.07, max(total_price)=9534.8), Row(age=31, min(total_price)=36.98, max(total_price)=8218.8), Row(age=39, min(total_price)=202.87, max(total_price)=7198.22), Row(age=25, min(total_price)=114.31, max(total_price)=7538.19), Row(age=71, min(total_price)=265.0, max(total_pr

24/12/01 19:38:51 WARN TaskSetManager: Stage 121 contains a task of very large size (71514 KiB). The maximum recommended task size is 1000 KiB.


[Row(age=26, min(unit_price)=179.1, max(unit_price)=1139.67), Row(age=29, min(unit_price)=299.32, max(unit_price)=1118.98), Row(age=65, min(unit_price)=124.12, max(unit_price)=1133.13), Row(age=19, min(unit_price)=870.61, max(unit_price)=1078.74), Row(age=54, min(unit_price)=147.64, max(unit_price)=1131.14), Row(age=22, min(unit_price)=136.76, max(unit_price)=1128.1), Row(age=77, min(unit_price)=124.15, max(unit_price)=1135.67), Row(age=34, min(unit_price)=138.58, max(unit_price)=1134.0), Row(age=50, min(unit_price)=145.28, max(unit_price)=1108.37), Row(age=57, min(unit_price)=143.32, max(unit_price)=1134.28), Row(age=32, min(unit_price)=144.28, max(unit_price)=1132.76), Row(age=43, min(unit_price)=124.7, max(unit_price)=1133.83), Row(age=31, min(unit_price)=127.56, max(unit_price)=1135.71), Row(age=39, min(unit_price)=142.65, max(unit_price)=1133.57), Row(age=25, min(unit_price)=223.14, max(unit_price)=1132.29), Row(age=71, min(unit_price)=206.53, max(unit_price)=1137.83), Row(age=68,

24/12/01 19:38:52 WARN TaskSetManager: Stage 124 contains a task of very large size (71514 KiB). The maximum recommended task size is 1000 KiB.


[Row(gender='Female', age=32, min(total_price)=28.36, max(total_price)=8219.16), Row(gender='Male', age=78, min(total_price)=62.88, max(total_price)=9987.69), Row(gender='Female', age=44, min(total_price)=65.17, max(total_price)=9001.28), Row(gender='Male', age=74, min(total_price)=155.29, max(total_price)=8158.27), Row(gender='Female', age=39, min(total_price)=482.05, max(total_price)=6893.91), Row(gender='Male', age=61, min(total_price)=68.76, max(total_price)=10495.26), Row(gender='Male', age=59, min(total_price)=141.69, max(total_price)=7939.94), Row(gender='Female', age=68, min(total_price)=345.0, max(total_price)=11396.8), Row(gender='Female', age=50, min(total_price)=155.61, max(total_price)=7667.14), Row(gender='Female', age=27, min(total_price)=824.7, max(total_price)=6777.59), Row(gender='Female', age=51, min(total_price)=27.15, max(total_price)=8545.02), Row(gender='Female', age=77, min(total_price)=93.28, max(total_price)=7630.14), Row(gender='Female', age=36, min(total_pri

24/12/01 19:38:52 WARN TaskSetManager: Stage 127 contains a task of very large size (71514 KiB). The maximum recommended task size is 1000 KiB.


[Row(gender='Female', age=32, min(unit_price)=144.28, max(unit_price)=1122.25), Row(gender='Male', age=78, min(unit_price)=235.41, max(unit_price)=1126.38), Row(gender='Female', age=44, min(unit_price)=141.23, max(unit_price)=1061.25), Row(gender='Male', age=74, min(unit_price)=200.06, max(unit_price)=1105.6), Row(gender='Female', age=39, min(unit_price)=145.19, max(unit_price)=1123.76), Row(gender='Male', age=61, min(unit_price)=141.21, max(unit_price)=1127.81), Row(gender='Male', age=59, min(unit_price)=127.49, max(unit_price)=1128.31), Row(gender='Female', age=68, min(unit_price)=142.93, max(unit_price)=1139.64), Row(gender='Female', age=50, min(unit_price)=145.28, max(unit_price)=1068.29), Row(gender='Female', age=27, min(unit_price)=165.97, max(unit_price)=1107.36), Row(gender='Female', age=51, min(unit_price)=123.81, max(unit_price)=1137.41), Row(gender='Female', age=77, min(unit_price)=129.15, max(unit_price)=1135.67), Row(gender='Female', age=36, min(unit_price)=124.65, max(uni

In [91]:
# Stop the SparkSession created at the top of the notebook.
spark.stop()