# Timeframe Database Creation

In [1]:
import os
import yaml

import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, round, count, sum, max, window, min, last, first
from pyspark.sql.window import Window

## Initialise Spark Session 

In [2]:
# Local Spark session using every available core, Kryo serialization,
# 4g of driver/executor memory and UTC as the SQL session time zone.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('1min Database Creation')
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "4g")
    .config("spark.sql.session.timeZone", "UTC")
    .getOrCreate()
)

24/04/07 13:07:50 WARN Utils: Your hostname, skynet resolves to a loopback address: 127.0.1.1; using 192.168.1.28 instead (on interface enxa44cc8c105af)
24/04/07 13:07:50 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/04/07 13:07:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Variables

In [3]:
# Parse the notebook settings from the shared YAML configuration file
with open('../../references/config_notebook.yaml', 'r') as config_file:
    config = yaml.safe_load(config_file)

# Later cells look settings up through `my_vars`, so keep it as an alias of `config`
my_vars = config

## Function to build a database for a defined timeframe

In [4]:
def process_partition(partition_df, timeframe):
    """
    Aggregate raw rows into one summary row per time interval.

    Each row of ``partition_df`` is assigned to the tumbling interval of length
    ``timeframe`` that contains its ``timestamp``. Per-interval statistics are
    computed with window functions, then duplicate rows are dropped so that a
    single row remains per interval and ``zipname`` value.

    Args:
        partition_df (DataFrame): Input rows. The columns read here are
            ``id``, ``timestamp``, ``price``, ``qty``, ``quoteQty`` and ``zipname``.
        timeframe (str): Interval duration accepted by
            ``pyspark.sql.functions.window`` (for example ``'1 second'``).

    Returns:
        DataFrame: One row per interval with the columns
            ``timestamp`` (interval start), ``open`` / ``close`` (price of the
            lowest / highest ``id`` in the interval), ``high`` / ``low``
            (max / min price), ``volume`` (sum of ``qty``), ``price``
            (sum of ``quoteQty`` divided by sum of ``qty``, rounded to 2 decimals),
            ``quoteQty`` (sum), ``transactions_count`` (count of ``id``),
            ``max_quoteQty_sum`` (largest single ``quoteQty``),
            ``percentage_of_biggest_transaction`` (largest ``quoteQty`` as a
            percentage of the interval sum, rounded to 2 decimals) and
            ``zipname`` cast to string.
    """

    # All rows of one interval share this window; used for order-independent aggregates
    interval_window = Window.partitionBy("timestamp_interval")

    # first()/last() depend on row order, and a DataFrame-level orderBy does not
    # guarantee the order of rows inside a window partition. Order explicitly by id
    # (assumes id increases with trade time -- TODO confirm) and span the whole
    # partition, otherwise last() would only see rows up to the current one.
    ordered_window = (
        interval_window
        .orderBy("id")
        .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
    )

    # Temporary column name for the average price; the input already has a "price" column
    avg_price_col = f"price_{timeframe}"

    processed_df = (
        partition_df
        .withColumn("timestamp_interval", window("timestamp", timeframe).start)
        .withColumn("open", first(col("price")).over(ordered_window))
        .withColumn("close", last(col("price")).over(ordered_window))
        .withColumn("high", max(col("price")).over(interval_window))
        .withColumn("low", min(col("price")).over(interval_window))
        .withColumn("qty_sum", sum(col("qty")).over(interval_window))
        .withColumn("quoteQty_sum", sum(col("quoteQty")).over(interval_window))
        .withColumn("transactions_count", count(col("id")).over(interval_window))
        .withColumn("max_quoteQty_sum", max(col("quoteQty")).over(interval_window))
        .withColumn(
            "percentage_of_biggest_transaction",
            round(col("max_quoteQty_sum") / col("quoteQty_sum") * 100, 2),
        )
        .withColumn(avg_price_col, round(col("quoteQty_sum") / col("qty_sum"), 2))
        .select(
            col("timestamp_interval").alias("timestamp"),
            col("open"),
            col("close"),
            col("high"),
            col("low"),
            col("qty_sum").alias("volume"),
            col(avg_price_col).alias("price"),
            col("quoteQty_sum").alias("quoteQty"),
            col("transactions_count"),
            col("max_quoteQty_sum"),
            col("percentage_of_biggest_transaction"),
            col("zipname"),
        )
        # Every input row now carries identical per-interval values; keep one of each
        .dropDuplicates()
        .withColumn("zipname", col("zipname").cast("string"))
    )
    return processed_df

In [5]:
def database_timeframe(source_folder_path, output_folder_path, timeframe):
    """
    Build the aggregated BTCUSDT dataset for one timeframe and write it as Parquet.

    Reads ``BTCUSDT.parquet`` from ``source_folder_path``, aggregates it one
    ``zipname`` value at a time with ``process_partition`` and writes the result to
    ``BTCUSDT_<timeframe>.parquet`` (spaces in ``timeframe`` replaced by
    underscores) inside ``output_folder_path``, partitioned by ``zipname``.

    Each ``zipname`` partition is overwritten rather than appended to, so running
    the function again for the same inputs does not duplicate rows.

    Relies on the notebook-level ``spark`` session created in an earlier cell.

    Parameters:
        source_folder_path (str): Folder containing ``BTCUSDT.parquet``.
        output_folder_path (str): Folder in which the output dataset is written.
        timeframe (str): Interval duration understood by the PySpark ``window``
            function, for example ``'1 second'``.

    Returns:
        None
    """
    # Read data from source location
    source_df = spark.read.parquet(os.path.join(source_folder_path, "BTCUSDT.parquet"))

    # The destination is the same for every partition, so build it once
    output_path = os.path.join(output_folder_path, f"BTCUSDT_{timeframe.replace(' ', '_')}.parquet")

    # Distinct partition values are collected to the driver and processed one by one
    zipnames = [row.zipname for row in source_df.select("zipname").distinct().collect()]

    for zipname in zipnames:
        partition_df = source_df.filter(col("zipname") == zipname)
        processed_df = process_partition(partition_df, timeframe)

        # "overwrite" combined with dynamic partition overwrite replaces only the
        # zipname partition present in processed_df and leaves the partitions
        # written by earlier iterations untouched.
        processed_df.write \
            .partitionBy("zipname") \
            .mode("overwrite") \
            .option("partitionOverwriteMode", "dynamic") \
            .option("compression", "gzip") \
            .option("blockSize", "256m") \
            .parquet(output_path)


## Generate the database with a defined timeframe

In [6]:
# Input and output both live in the folder configured under the "TEST" key
source_folder_path = output_folder_path = my_vars["TEST"]
timeframe = "1 second"

database_timeframe(
    source_folder_path=source_folder_path,
    output_folder_path=output_folder_path,
    timeframe=timeframe,
)

                                                                                

# Controls

In [7]:
# Read back the dataset written by database_timeframe. The file name is derived
# from `timeframe` exactly as the writer builds it, so this check keeps pointing
# at the right dataset when the timeframe in the run cell is changed.
df = spark.read.parquet(
    os.path.join(output_folder_path, f"BTCUSDT_{timeframe.replace(' ', '_')}.parquet")
)
df.show()

+-------------------+--------+--------+--------+--------+--------------------+--------+------------------+------------------+----------------+---------------------------------+--------+
|          timestamp|    open|   close|    high|     low|              volume|   price|          quoteQty|transactions_count|max_quoteQty_sum|percentage_of_biggest_transaction| zipname|
+-------------------+--------+--------+--------+--------+--------------------+--------+------------------+------------------+----------------+---------------------------------+--------+
|2023-03-29 00:00:05|27262.38|27262.38|27262.38|27262.37| 0.04384000000000001|27262.38|1195.1826605999997|                17|     183.7484412|                            15.37|20230329|
|2023-03-29 00:00:06|27262.37|27262.38|27262.38|27262.37| 0.41868999999999995|27262.37|     11414.4832293|                21|     2472.696959|                            21.66|20230329|
|2023-03-29 00:00:08|27266.64|27266.63|27266.64|27266.63| 0.1169600000

In [8]:
# Show column names and types of the dataset read back from Parquet.
# NOTE(review): zipname is cast to string before writing but is listed as integer
# in the output below -- presumably partition-column type inference on read; confirm.
df.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- open: double (nullable = true)
 |-- close: double (nullable = true)
 |-- high: double (nullable = true)
 |-- low: double (nullable = true)
 |-- volume: double (nullable = true)
 |-- price: double (nullable = true)
 |-- quoteQty: double (nullable = true)
 |-- transactions_count: long (nullable = true)
 |-- max_quoteQty_sum: double (nullable = true)
 |-- percentage_of_biggest_transaction: double (nullable = true)
 |-- zipname: integer (nullable = true)



In [9]:
# Summary statistics (count, mean, ...) of the columns in the dataset read back above
df.describe().show()

+-------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+---------------------------------+-----------+
|summary|              open|             close|              high|               low|            volume|             price|          quoteQty|transactions_count|  max_quoteQty_sum|percentage_of_biggest_transaction|    zipname|
+-------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+---------------------------------+-----------+
|  count|             83171|             83171|             83171|             83171|             83171|             83171|             83171|             83171|             83171|                            83171|      83171|
|   mean|28098.613981916747|28098.626424354577| 28099.00647713739|28098.232973392136|1.07592

In [10]:
# Shut down the Spark session created at the top of the notebook
spark.stop()