# Dollar Bars (quoteQty)

*Dollar bars are the most stable of the 4 types.*

## Imports

In [1]:
import os
import yaml
import pandas as pd

import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, BooleanType, IntegerType
from pyspark.sql.functions import col, sum, date_add, to_date, date_format, count, first, last, max, round, min, desc, row_number, last
from pyspark.sql.window import Window

pd.options.display.float_format = '{:,.2f}'.format

## Settings & variables

In [2]:
# Open the YAML file and load its contents into a dictionary
with open('../../../references/config_notebook.yaml', 'r') as f:
    config = yaml.safe_load(f)

# Access the variables in the dictionary
my_vars = config

In [14]:
# Data location
source_folder_path = my_vars['TEST']

##  Load data from Parquet file

In [34]:
# Read the Parquet file into a DataFrame
df = pd.read_parquet(f'{source_folder_path}/BTCUSDT.parquet', engine='fastparquet')

# Set the timestamp column as the index
df.set_index('timestamp', inplace=True)

df.head()

Unnamed: 0_level_0,id,price,qty,quoteQty,makerBuy,bestPrice,zipname
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2023-03-29 00:00:00,3061645923,27261.06,0.0,42.53,True,True,20230329
2023-03-29 00:00:00,3061645924,27261.06,0.0,22.35,True,True,20230329
2023-03-29 00:00:00,3061645925,27261.07,0.0,92.69,False,True,20230329
2023-03-29 00:00:00,3061645926,27261.06,0.0,118.86,True,True,20230329
2023-03-29 00:00:00,3061645927,27261.06,0.0,41.16,True,True,20230329


## 1/50

*It is suggested that using 1/50 of the average daily dollar value, would result in more desirable statistical properties*


In [6]:
DAILY_AVG = df['quoteQty'].sum() / 50

NameError: name 'df' is not defined

## Aggregation

In [5]:
def add_unique_numbers(df, column_name, limit):
    """
    Adds unique numbers and cumulative sum results to a DataFrame based on a given limit.

    Parameters:
        df (pandas.DataFrame): The DataFrame to which unique numbers and results will be added.
        column_name (str): The name of the column containing the values for cumulative sum.
        limit (float): The limit at which the cumulative sum resets and a new unique number is assigned.

    Returns:
        pandas.DataFrame: A DataFrame with two additional columns:
            'Unique_Number': Contains unique numbers assigned based on the limit.
            'Resultat': Contains the cumulative sum results.
    """
    unique_number = 1
    unique_number_list = []
    resultat = []
    current_sum = 0
    
    for value in df[column_name]:
        current_sum += value
        unique_number_list.append(unique_number)
        resultat.append(current_sum)
        if current_sum >= limit:
            current_sum = 0
            unique_number += 1
    
    # Add columns to DataFrame
    df['Unique_Number'] = unique_number_list
    df['Resultat'] = resultat
    
    return df

In [37]:
result_df = add_unique_numbers(df, 'quoteQty', 2000)
result_df

Unnamed: 0_level_0,id,price,qty,quoteQty,makerBuy,bestPrice,zipname,Unique_Number,Resultat
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2023-03-29 00:00:00,3061645923,27261.06,0.00,42.53,True,True,20230329,1,42.53
2023-03-29 00:00:00,3061645924,27261.06,0.00,22.35,True,True,20230329,1,64.88
2023-03-29 00:00:00,3061645925,27261.07,0.00,92.69,False,True,20230329,1,157.57
2023-03-29 00:00:00,3061645926,27261.06,0.00,118.86,True,True,20230329,1,276.43
2023-03-29 00:00:00,3061645927,27261.06,0.00,41.16,True,True,20230329,1,317.59
...,...,...,...,...,...,...,...,...,...
2023-03-29 23:59:59,3063350135,28348.60,0.00,72.57,True,True,20230329,390178,1882.35
2023-03-29 23:59:59,3063350136,28348.61,0.00,55.28,False,True,20230329,390178,1937.63
2023-03-29 23:59:59,3063350137,28348.61,0.00,32.60,False,True,20230329,390178,1970.23
2023-03-29 23:59:59,3063350138,28348.61,0.00,53.30,False,True,20230329,390178,2023.52


In [150]:
result_df_50 = add_unique_numbers(df, 'quoteQty', DAILY_AVG)
result_df_50

Unnamed: 0_level_0,id,price,qty,quoteQty,makerBuy,bestPrice,zipname,Unique_Number,Resultat
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2023-03-29 00:00:00,3061645923,27261.06,0.00,42.53,True,True,20230329,1,42.53
2023-03-29 00:00:00,3061645924,27261.06,0.00,22.35,True,True,20230329,1,64.88
2023-03-29 00:00:00,3061645925,27261.07,0.00,92.69,False,True,20230329,1,157.57
2023-03-29 00:00:00,3061645926,27261.06,0.00,118.86,True,True,20230329,1,276.43
2023-03-29 00:00:00,3061645927,27261.06,0.00,41.16,True,True,20230329,1,317.59
...,...,...,...,...,...,...,...,...,...
2023-03-29 23:59:59,3063350135,28348.60,0.00,72.57,True,True,20230329,50,50140247.45
2023-03-29 23:59:59,3063350136,28348.61,0.00,55.28,False,True,20230329,50,50140302.73
2023-03-29 23:59:59,3063350137,28348.61,0.00,32.60,False,True,20230329,50,50140335.33
2023-03-29 23:59:59,3063350138,28348.61,0.00,53.30,False,True,20230329,50,50140388.63


## Database with defined average daily dollar value function

In [3]:
spark = SparkSession.builder.master("local[*]").appName('Dollars bars Database Creation')\
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")\
        .config("spark.driver.memory","4g") \
        .config("spark.executor.memory", "4g") \
        .config("spark.sql.session.timeZone", "UTC") \
        .config("spark.kryoserializer.buffer.max", "512m") \
        .getOrCreate()

24/05/05 17:59:44 WARN Utils: Your hostname, skynet resolves to a loopback address: 127.0.1.1; using 192.168.1.28 instead (on interface enxa44cc8c105af)
24/05/05 17:59:44 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/05 17:59:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
df = spark.read.parquet(f'{source_folder_path}/BTCUSDT.parquet').orderBy('id')
df.show()

NameError: name 'source_folder_path' is not defined

In [3]:
sum_value = df.agg(sum(col('quoteQty').cast("float"))).collect()[0][0]
DAILY_AVG = sum_value / 50

NameError: name 'df' is not defined

In [8]:
def add_unique_numbers(df, column_name, limit):
    """
    Adds unique numbers to a DataFrame based on a given limit. It will be used for further windowing processing.

    Parameters:
        df (pyspark.sql.DataFrame): The DataFrame to which unique numbers will be added.
        column_name (str): The name of the column containing the values for cumulative sum.
        limit (float): The limit at which the cumulative sum resets and a new unique number is assigned.

    Returns:
        pyspark.sql.DataFrame: A DataFrame with an additional column:
            'unique_number': Contains unique numbers assigned based on the limit.
    """

    # Initialize variables
    unique_number = 1
    unique_number_list = []
    current_sum = 0
    
    # Collect rows with 'id' and the specified column
    rows_looped = df.select('id', column_name).collect()

    # Loop through the collected rows
    for row in rows_looped:
        value = row[column_name]
        current_sum += value
        unique_number_list.append(unique_number)
        # Check if the cumulative sum exceeds the limit
        if current_sum >= limit:
            current_sum = 0
            unique_number += 1

    # Create DataFrame with unique numbers and 'id'
    group_data = list(zip(unique_number_list, [row['id'] for row in rows_looped]))
    schema = StructType([
        StructField("unique_number", IntegerType(), True),
        StructField("id", StringType(), True)
    ])
    group_df = spark.createDataFrame(group_data, schema)

    # Join the original DataFrame with the DataFrame containing unique numbers
    df = df.join(group_df, on="id").orderBy('id')

    return df

In [7]:
result_df = add_unique_numbers(df, "quoteQty", DAILY_AVG)
result_df.show()

+----------+--------+-------+------------+-------------------+--------+---------+--------+-------------+
|        id|   price|    qty|    quoteQty|          timestamp|makerBuy|bestPrice| zipname|unique_number|
+----------+--------+-------+------------+-------------------+--------+---------+--------+-------------+
|3061645923|27261.06|0.00156|  42.5272536|2023-03-29 07:00:41|    true|     true|20230329|            1|
|3061645924|27261.06| 8.2E-4|  22.3540692|2023-03-29 18:01:06|    true|     true|20230329|            1|
|3061645925|27261.07| 0.0034|   92.687638|2023-03-29 18:01:17|   false|     true|20230329|            1|
|3061645926|27261.06|0.00436| 118.8582216|2023-03-29 18:02:12|    true|     true|20230329|            2|
|3061645927|27261.06|0.00151|  41.1642006|2023-03-29 18:03:01|    true|     true|20230329|            3|
|3061645928|27261.07|0.00467| 127.3091969|2023-03-29 18:03:26|   false|     true|20230329|            3|
|3061645929|27261.07|0.00197|  53.7043079|2023-03-29 18

                                                                                

# dollars volume frame

In [5]:
def process_partition(partition_df, name_frame):

        window_spec = Window.partitionBy("Unique_Number").orderBy('id')

        w_desc = Window.partitionBy("Unique_Number").orderBy(desc('transactions_count'))

        processed_df = partition_df \
                .withColumn("timestamp_interval", last(col("timestamp")).over(window_spec)) \
                .withColumn("open", first(col("price")).over(window_spec)) \
                .withColumn("close", last(col("price")).over(window_spec)) \
                .withColumn("high", max(col("price")).over(window_spec)) \
                .withColumn("low", min(col("price")).over(window_spec))\
                .withColumn("qty_sum", sum(col("qty")).over(window_spec)) \
                .withColumn("quoteQty_sum", sum(col("quoteQty")).over(window_spec)) \
                .withColumn("transactions_count", count(col("id")).over(window_spec)) \
                .withColumn("max_quoteQty_sum", max(col("quoteQty")).over(window_spec)) \
                .withColumn("percentage_of_biggest_transaction", round(col("max_quoteQty_sum") / col("quoteQty_sum") * 100, 2)) \
                .withColumn(f"price_{name_frame}", round(col("quoteQty_sum") / col("qty_sum"), 2)) \
                .withColumn("rn_desc", row_number().over(w_desc)) \
                .filter("rn_desc == 1") \
                .withColumn("last_id", last(col("id")).over(window_spec)) \
                .select(
                        col("timestamp_interval").alias("timestamp"),
                        col("open"),
                        col("close"),
                        col("high"),
                        col("low"),
                        col("qty_sum").alias("volume"),
                        col(f"price_{name_frame}").alias("price"),
                        col("quoteQty_sum").alias("quoteQty"),
                        col("transactions_count"),
                        col("max_quoteQty_sum"),
                        col("percentage_of_biggest_transaction"),
                        col("last_id"),
                        col("zipname"),
                        ) \
                .dropDuplicates() \
                .withColumn("zipname", col("zipname").cast("string")) 
        return processed_df

In [9]:
process_partition(result_df, 'dollars bars').orderBy('timestamp').show()

+-------------------+--------+--------+--------+--------+--------------------+--------+------------------+------------------+----------------+---------------------------------+----------+--------+
|          timestamp|    open|   close|    high|     low|              volume|   price|          quoteQty|transactions_count|max_quoteQty_sum|percentage_of_biggest_transaction|   last_id| zipname|
+-------------------+--------+--------+--------+--------+--------------------+--------+------------------+------------------+----------------+---------------------------------+----------+--------+
|2023-03-29 18:01:17|27261.06|27261.07|27261.07|27261.06|             0.00578|27261.07|       157.5689608|                 3|       92.687638|                            58.82|3061645925|20230329|
|2023-03-29 18:02:12|27261.06|27261.06|27261.06|27261.06|             0.00436|27261.06|       118.8582216|                 1|     118.8582216|                            100.0|3061645926|20230329|
|2023-03-29 18:

In [11]:
partitions = df.select("zipname").distinct().collect()

# Define schema for unprocessed DataFrame
unprocess_schema = StructType([
                                StructField("id", StringType(), True),
                                StructField("price", DoubleType(), True),
                                StructField("qty", DoubleType(), True),
                                StructField("quoteQty", DoubleType(), True),
                                StructField("timestamp", StringType(), True),
                                StructField("makerBuy", BooleanType(), True),
                                StructField("bestPrice", BooleanType(), True),
                                StructField("zipname", IntegerType(), True)
                                ])
# Create an empty DataFrame with the specified schema
unprocess_df = spark.createDataFrame([], schema=unprocess_schema)
name_frame='dollars_2'
avg_value = DAILY_AVG

In [14]:
for partition in partitions[0]:
        if unprocess_df.count() == 0:
                # Calculate daily average
                sum_value = df.filter(col("zipname") == partition) \
                                        .agg(sum(col('quoteQty').cast("float"))).collect()[0][0]
                daily_avg = DAILY_AVG # sum_value / avg_value

                # Process partition DataFrame
                partition_df = df.filter(col("zipname") == partition)
                partition_df = add_unique_numbers(partition_df, "quoteQty", daily_avg)
                processed_df = process_partition(partition_df, name_frame)
                processed_df.orderBy('timestamp').show()

+-------------------+--------+--------+--------+--------+--------------------+--------+------------------+------------------+----------------+---------------------------------+----------+--------+
|          timestamp|    open|   close|    high|     low|              volume|   price|          quoteQty|transactions_count|max_quoteQty_sum|percentage_of_biggest_transaction|   last_id| zipname|
+-------------------+--------+--------+--------+--------+--------------------+--------+------------------+------------------+----------------+---------------------------------+----------+--------+
|2023-03-29 18:01:17|27261.06|27261.07|27261.07|27261.06|             0.00578|27261.07|       157.5689608|                 3|       92.687638|                            58.82|3061645925|20230329|
|2023-03-29 18:02:12|27261.06|27261.06|27261.06|27261.06|             0.00436|27261.06|       118.8582216|                 1|     118.8582216|                            100.0|3061645926|20230329|
|2023-03-29 18:

In [15]:
# Get the last id transaction included in thas dollars bars
last_id = processed_df.filter(processed_df['quoteQty'] > daily_avg).orderBy(processed_df['last_id'].desc()).select('last_id').first()[0]

# Delete unprocced transactions in a the main dataframe partition 
processed_df = processed_df.filter((col('id') <= last_id) & (col('zipname')== partition)) 


# Transform zipname column to date format, add a day, and cast back to integer
unprocess_df = df.filter((col('id') > last_id) & (col('zipname')== partition) ) \
.withColumn("zipname", to_date(col("zipname").cast(StringType()), 'yyyyMMdd')) \
.withColumn("zipname", date_add(col("zipname"), 1)) \
.withColumn("zipname", date_format(col("zipname"), "yyyyMMdd").cast(IntegerType())) \
.orderBy("id")

print(last_id,unprocess_df.show())

+----------+--------+-------+----------+-------------------+--------+---------+--------+
|        id|   price|    qty|  quoteQty|          timestamp|makerBuy|bestPrice| zipname|
+----------+--------+-------+----------+-------------------+--------+---------+--------+
|3061645933|27261.06|0.00222|60.5195532|2023-03-29 18:05:05|    true|     true|20230330|
+----------+--------+-------+----------+-------------------+--------+---------+--------+

3061645932 None


In [16]:
for partition in partitions[1]:
        if unprocess_df.count() == 0:
                # Calculate daily average
                sum_value = df.filter(col("zipname") == partition) \
                                        .agg(sum(col('quoteQty').cast("float"))).collect()[0][0]
                daily_avg = avg_value # sum_value / avg_value

                # Process partition DataFrame
                partition_df = df.filter(col("zipname") == partition)
                partition_df = add_unique_numbers(partition_df, "quoteQty", daily_avg)
                processed_df = process_partition(partition_df, name_frame)
                processed_df.orderBy('timestamp').show()

        else :
                # Calculate daily average
                sum_value = df.filter(col("zipname") == partition) \
                                        .agg(sum(col('quoteQty').cast("float"))).collect()[0][0]
                daily_avg = avg_value # sum_value / avg_value
                
                # Union unprocessed DataFrame with partition DataFrame
                partition_df = unprocess_df.union(df.filter(col("zipname") == partition))
                partition_df = add_unique_numbers(partition_df, "quoteQty", daily_avg)
                processed_df = process_partition(partition_df, name_frame)

                # Clear unprocessed DataFrame
                unprocess_df = unprocess_df.limit(0)

                # Get the last id transaction included in thas dollars bars
                last_id = processed_df.filter(processed_df['quoteQty'] > daily_avg).orderBy(processed_df['last_id'].desc()).select('last_id').first()[0]

                # Delete unprocced transactions in a the main dataframe partition 
                processed_df = processed_df.filter((col('id') <= last_id) & (col('zipname')== partition)) 

                # Transform zipname column to date format, add a day, and cast back to integer
                unprocess_df = df.filter((col('id') > last_id) & (col('zipname')== partition) ) \
                .withColumn("zipname", to_date(col("zipname").cast(StringType()), 'yyyyMMdd')) \
                .withColumn("zipname", date_add(col("zipname"), 1)) \
                .withColumn("zipname", date_format(col("zipname"), "yyyyMMdd").cast(IntegerType())) \
                .orderBy("id")

In [17]:
print(last_id,unprocess_df.show())

+----------+--------+-------+----------+-------------------+--------+---------+--------+
|        id|   price|    qty|  quoteQty|          timestamp|makerBuy|bestPrice| zipname|
+----------+--------+-------+----------+-------------------+--------+---------+--------+
|3061645941|27261.07| 3.7E-4|10.0865959|2023-03-30 18:45:51|   false|     true|20230331|
|3061645942|27261.07|0.00239|65.1539573|2023-03-30 18:46:49|   false|     true|20230331|
+----------+--------+-------+----------+-------------------+--------+---------+--------+

3061645940 None


In [18]:
processed_df.orderBy('timestamp').show()

+-------------------+--------+--------+--------+--------+-------+--------+------------------+------------------+----------------+---------------------------------+----------+--------+
|          timestamp|    open|   close|    high|     low| volume|   price|          quoteQty|transactions_count|max_quoteQty_sum|percentage_of_biggest_transaction|   last_id| zipname|
+-------------------+--------+--------+--------+--------+-------+--------+------------------+------------------+----------------+---------------------------------+----------+--------+
|2023-03-30 18:41:50|27261.06|27261.07|27261.07|27261.06|0.00521|27261.07|142.03015249999999|                 2|      81.5105993|                            57.39|3061645934|20230330|
|2023-03-30 18:42:12|27261.07|27261.07|27261.07|27261.07|0.15165|27261.07|      4134.1412655|                 1|    4134.1412655|                            100.0|3061645935|20230330|
|2023-03-30 18:42:59|27261.07|27261.07|27261.07|27261.07|0.11999|27261.07|      

In [19]:
df.show()

+----------+--------+-------+------------+-------------------+--------+---------+--------+
|        id|   price|    qty|    quoteQty|          timestamp|makerBuy|bestPrice| zipname|
+----------+--------+-------+------------+-------------------+--------+---------+--------+
|3061645923|27261.06|0.00156|  42.5272536|2023-03-29 07:00:41|    true|     true|20230329|
|3061645924|27261.06| 8.2E-4|  22.3540692|2023-03-29 18:01:06|    true|     true|20230329|
|3061645925|27261.07| 0.0034|   92.687638|2023-03-29 18:01:17|   false|     true|20230329|
|3061645926|27261.06|0.00436| 118.8582216|2023-03-29 18:02:12|    true|     true|20230329|
|3061645927|27261.06|0.00151|  41.1642006|2023-03-29 18:03:01|    true|     true|20230329|
|3061645928|27261.07|0.00467| 127.3091969|2023-03-29 18:03:26|   false|     true|20230329|
|3061645929|27261.07|0.00197|  53.7043079|2023-03-29 18:03:36|   false|     true|20230329|
|3061645930|27261.06|0.00277|  75.5131362|2023-03-29 18:03:42|    true|     true|20230329|

In [10]:
def database_dollars_bars(source_folder_path, output_folder_path, name_frame, avg_value):
    """
    Process dollar bars data and save them to Parquet files.

    Args:
        source_folder_path (str): Path to the folder containing the input Parquet files.
        output_folder_path (str): Path to the folder where the processed Parquet files will be saved.
        name_frame (str): Name of the frame.
        avg_value (float): Daily binning value.

    Returns:
        None
    """
    # Initialize Spark session
    """spark = SparkSession.builder.appName("process_data")\
                .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")\
                .config("spark.driver.memory","4g") \
                .config("spark.executor.memory", "4g") \
                .config("spark.sql.session.timeZone", "UTC") \
                .getOrCreate()"""
    
    # Read input Parquet files
    df = spark.read.parquet(os.path.join(source_folder_path, "BTCUSDT.parquet"))

    # Get unique partitions based on 'zipname'
    partitions = df.select("zipname").orderBy("zipname").distinct().collect()
    partitions = sorted(partitions, key=lambda x: x.zipname)

    # Define schema for unprocessed DataFrame
    unprocess_schema = StructType([
                                    StructField("id", StringType(), True),
                                    StructField("price", DoubleType(), True),
                                    StructField("qty", DoubleType(), True),
                                    StructField("quoteQty", DoubleType(), True),
                                    StructField("timestamp", StringType(), True),
                                    StructField("makerBuy", BooleanType(), True),
                                    StructField("bestPrice", BooleanType(), True),
                                    StructField("zipname", IntegerType(), True)
                                    ])
    
    # Create an empty DataFrame with defined schema for unprocessed data
    unprocess_df = spark.createDataFrame([], schema=unprocess_schema)

    # Iterate over partitions
    for partition in partitions:
        # Process only if unprocessed DataFrame is empty
        if unprocess_df.count() == 0:
            # Calculate daily average
            sum_value = df.filter(col("zipname") == partition.zipname) \
                                  .agg(sum(col('quoteQty').cast("float"))).collect()[0][0]
            daily_avg = sum_value / avg_value

            # Filter partition DataFrame based on 'zipname' and add unique numbers
            partition_df = df.filter(col("zipname") == partition.zipname)
            partition_df = add_unique_numbers(partition_df, "quoteQty", daily_avg)
            processed_df = process_partition(partition_df, name_frame)

            # Get the last 'id' transaction included in thas dollars bars where 'quoteQty' > daily_avg
            last_id = processed_df.filter((processed_df['quoteQty'] > daily_avg)).orderBy(processed_df['last_id'].desc()).select('last_id').first()[0]
            
            # Filter processed DataFrame based on conditions, delete unprocced transactions in a the main dataframe partition 
            processed_df = processed_df.filter((col('id') <= last_id) & (col('zipname')== partition.zipname)) 
            
            # Filter unprocessed DataFrame and prepare for next iteration
            unprocess_df = df.filter((col('id') > last_id) & (col('zipname') == partition.zipname) ) \
            .withColumn("zipname", to_date(col("zipname").cast(StringType()), 'yyyyMMdd')) \
            .withColumn("zipname", date_add(col("zipname"), 1)) \
            .withColumn("zipname", date_format(col("zipname"), "yyyyMMdd").cast(IntegerType())) \
            .orderBy("id")


        else :
            # Process if unprocessed DataFrame is not empty
            # Calculate daily average
            sum_value = df.filter(col("zipname") == partition.zipname) \
                                  .agg(sum(col('quoteQty').cast("float"))).collect()[0][0]
            daily_avg = sum_value / avg_value

            # Union unprocessed DataFrame with partition DataFrame
            partition_df = unprocess_df.union(df.filter(col("zipname") == partition.zipname))
            partition_df = add_unique_numbers(partition_df, "quoteQty", daily_avg)
            processed_df = process_partition(partition_df, name_frame)

            # Clear unprocessed DataFrame
            unprocess_df = unprocess_df.limit(0)

            # Get the last 'id' transaction included in thas dollars bars where 'quoteQty' > daily_avg
            last_id = processed_df.filter((processed_df['quoteQty'] > daily_avg)).orderBy(processed_df['last_id'].desc()).select('last_id').first()[0]

            # Filter processed DataFrame based on conditions, delete unprocced transactions in a the main dataframe partition 
            processed_df = processed_df.filter((col('id') <= last_id) & (col('zipname')== partition.zipname)) 

            # Filter unprocessed DataFrame and prepare for next iteration
            unprocess_df = df.filter((col('id') > last_id) & (col('zipname')== partition.zipname) ) \
            .withColumn("zipname", to_date(col("zipname").cast(StringType()), 'yyyyMMdd')) \
            .withColumn("zipname", date_add(col("zipname"), 1)) \
            .withColumn("zipname", date_format(col("zipname"), "yyyyMMdd").cast(IntegerType())) \
            .orderBy("id")

        # Save the data as a partitioned Parquet file based on the zip filename
        output_path = os.path.join(output_folder_path, f"BTCUSDT_{name_frame.replace(' ', '_')}.parquet")

        # Save processed data to output location
        processed_df.repartition(1).write \
            .partitionBy("zipname") \
            .mode("append") \
            .option("compression", "gzip") \
            .option("blockSize", "256m") \
            .parquet(output_path)

        # Stop Spark session
        # spark.stop()

In [11]:
source_folder_path = my_vars['DATA']['external']
output_folder_path = my_vars['DATA']['interim']
df = spark.read.parquet(os.path.join(source_folder_path, "BTCUSDT.parquet"))
partitions = df.select("zipname").orderBy("zipname").distinct().collect()
print(partitions)



[Row(zipname=20221109), Row(zipname=20230221), Row(zipname=20230317), Row(zipname=20230310), Row(zipname=20230222), Row(zipname=20230321), Row(zipname=20230313), Row(zipname=20230217), Row(zipname=20230216), Row(zipname=20230314), Row(zipname=20230320), Row(zipname=20221110), Row(zipname=20221108), Row(zipname=20230315), Row(zipname=20230223), Row(zipname=20230224), Row(zipname=20230316), Row(zipname=20221111), Row(zipname=20230311), Row(zipname=20230227), Row(zipname=20230318), Row(zipname=20230309), Row(zipname=20230209), Row(zipname=20230220), Row(zipname=20221114), Row(zipname=20230312), Row(zipname=20230319), Row(zipname=20221104), Row(zipname=20220819), Row(zipname=20230202), Row(zipname=20230301), Row(zipname=20230112), Row(zipname=20230114), Row(zipname=20220927), Row(zipname=20221026), Row(zipname=20220913), Row(zipname=20230215), Row(zipname=20220811), Row(zipname=20230228), Row(zipname=20221107), Row(zipname=20230125), Row(zipname=20210519), Row(zipname=20230303), Row(zipnam

                                                                                

In [13]:
for partition in partitions:
    print(partition.zipname)


20210301
20210302
20210303
20210304
20210305
20210306
20210307
20210308
20210309
20210310
20210311
20210312
20210313
20210314
20210315
20210316
20210317
20210318
20210319
20210320
20210321
20210322
20210323
20210324
20210325
20210326
20210327
20210328
20210329
20210330
20210331
20210401
20210402
20210403
20210404
20210405
20210406
20210407
20210408
20210409
20210410
20210411
20210412
20210413
20210414
20210415
20210416
20210417
20210418
20210419
20210420
20210421
20210422
20210423
20210424
20210425
20210426
20210427
20210428
20210429
20210430
20210501
20210502
20210503
20210504
20210505
20210506
20210507
20210508
20210509
20210510
20210511
20210512
20210513
20210514
20210515
20210516
20210517
20210518
20210519
20210520
20210521
20210522
20210523
20210524
20210525
20210526
20210527
20210528
20210529
20210530
20210531
20210601
20210602
20210603
20210604
20210605
20210606
20210607
20210608
20210609
20210610
20210611
20210612
20210613
20210614
20210615
20210616
20210617
20210618
20210619
2

In [11]:
source_folder_path = my_vars['DATA']['external']
output_folder_path = my_vars['DATA']['interim']
name_frame='dollars_bars_50'
avg_value = 50
database_dollars_bars(source_folder_path, output_folder_path, name_frame, avg_value)

24/05/05 18:10:02 WARN TaskSetManager: Stage 96 contains a task of very large size (4222 KiB). The maximum recommended task size is 1000 KiB.
24/05/05 18:10:08 WARN TaskSetManager: Stage 119 contains a task of very large size (4222 KiB). The maximum recommended task size is 1000 KiB.
24/05/05 18:10:30 WARN TaskSetManager: Stage 162 contains a task of very large size (3723 KiB). The maximum recommended task size is 1000 KiB.
24/05/05 18:10:39 WARN TaskSetManager: Stage 194 contains a task of very large size (3723 KiB). The maximum recommended task size is 1000 KiB.
24/05/05 18:10:59 WARN TaskSetManager: Stage 245 contains a task of very large size (4415 KiB). The maximum recommended task size is 1000 KiB.
24/05/05 18:11:07 WARN TaskSetManager: Stage 277 contains a task of very large size (4415 KiB). The maximum recommended task size is 1000 KiB.
24/05/05 18:11:30 WARN TaskSetManager: Stage 326 contains a task of very large size (4512 KiB). The maximum recommended task size is 1000 KiB.


# Controls

In [12]:
df = spark.read.parquet(f'{output_folder_path}/BTCUSDT_{name_frame}.parquet')
df.orderBy('timestamp').show()



+-------------------+--------+--------+--------+--------+------------------+--------+-------------------+------------------+----------------+---------------------------------+---------+--------+
|          timestamp|    open|   close|    high|     low|            volume|   price|           quoteQty|transactions_count|max_quoteQty_sum|percentage_of_biggest_transaction|  last_id| zipname|
+-------------------+--------+--------+--------+--------+------------------+--------+-------------------+------------------+----------------+---------------------------------+---------+--------+
|2021-03-01 00:18:56|45134.11|46164.34|46241.29|44950.53|1782.7989460000176|45549.75|8.120604375540431E7|             38358|   382149.400104|                             0.47|676231931|20210301|
|2021-03-01 00:37:58|46164.34| 46300.0| 46571.3|46031.02|1753.2660939999928|46297.14|8.117120081979156E7|             34941|        463996.2|                             0.57|676266872|20210301|
|2021-03-01 01:06:27|4629

                                                                                

In [13]:
df.printSchema()

root
 |-- timestamp: string (nullable = true)
 |-- open: double (nullable = true)
 |-- close: double (nullable = true)
 |-- high: double (nullable = true)
 |-- low: double (nullable = true)
 |-- volume: double (nullable = true)
 |-- price: double (nullable = true)
 |-- quoteQty: double (nullable = true)
 |-- transactions_count: long (nullable = true)
 |-- max_quoteQty_sum: double (nullable = true)
 |-- percentage_of_biggest_transaction: double (nullable = true)
 |-- last_id: string (nullable = true)
 |-- zipname: integer (nullable = true)



In [14]:
df.describe().show()



+-------+-------------------+------------------+------------------+------------------+------------------+------------------+------------------+--------------------+------------------+------------------+---------------------------------+--------------------+-------------------+
|summary|          timestamp|              open|             close|              high|               low|            volume|             price|            quoteQty|transactions_count|  max_quoteQty_sum|percentage_of_biggest_transaction|             last_id|            zipname|
+-------+-------------------+------------------+------------------+------------------+------------------+------------------+------------------+--------------------+------------------+------------------+---------------------------------+--------------------+-------------------+
|  count|              56363|             56363|             56363|             56363|             56363|             56363|             56363|               56363|  

                                                                                

In [15]:
df.orderBy('timestamp', ascending = False).show(5)



+-------------------+--------+--------+--------+--------+------------------+--------+--------------------+------------------+----------------+---------------------------------+----------+--------+
|          timestamp|    open|   close|    high|     low|            volume|   price|            quoteQty|transactions_count|max_quoteQty_sum|percentage_of_biggest_transaction|   last_id| zipname|
+-------------------+--------+--------+--------+--------+------------------+--------+--------------------+------------------+----------------+---------------------------------+----------+--------+
|2024-03-31 23:57:28| 71330.0| 71350.0| 71350.0|71165.51| 383.7699900000063|71270.64|2.7351533974578552E7|             15250|    613746.27868|                             2.24|3523982993|20240331|
|2024-03-31 23:39:27| 71213.5| 71330.0| 71330.0|71207.32| 384.0481200000013|71287.11|2.7377679785620373E7|             10043|     716299.1072|                             2.62|3523967743|20240331|
|2024-03-31 23:

                                                                                

In [16]:
spark.stop()