# 1 minute Database Creation

In [1]:
import os
import yaml

import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, date_format, round, count, sum, max, window
from pyspark.sql.window import Window

## Initialise Spark Session 

In [2]:
# Create (or reuse) the local Spark session shared by every cell below.
# The session time zone is pinned to UTC so timestamp bucketing and date
# formatting do not depend on the machine's local time zone.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('1min Database Creation')
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "4g")
    .config("spark.sql.session.timeZone", "UTC")
    .getOrCreate()
)

24/04/06 12:56:31 WARN Utils: Your hostname, skynet resolves to a loopback address: 127.0.1.1; using 192.168.1.12 instead (on interface wlp2s0)
24/04/06 12:56:31 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/04/06 12:56:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Variables

In [3]:
# Load the notebook configuration (a YAML mapping) from the shared references folder.
config_path = '../../references/config_notebook.yaml'
with open(config_path, 'r') as config_file:
    # `config` and `my_vars` are two names for the same dictionary;
    # the cells below look values up through `my_vars`.
    my_vars = config = yaml.safe_load(config_file)

## Function to build the database for a given timeframe

In [4]:
def process_partition(partition_df, timeframe):
    """
    Aggregate raw trades into one row per time interval.

    Each row of ``partition_df`` is assigned to the tumbling window of length
    ``timeframe`` that contains its ``timestamp``; rows are then grouped by the
    window start and reduced to a single summary row.

    Note: ``sum``, ``max``, ``round`` and ``count`` used below are the
    ``pyspark.sql.functions`` versions imported at the top of the notebook,
    not the Python builtins.

    Parameters:
        partition_df (DataFrame): Trades with at least the columns
            ``timestamp``, ``qty``, ``quoteQty`` and ``id``.
        timeframe (str): Window duration understood by
            ``pyspark.sql.functions.window`` (e.g. ``'1 minute'``, ``'1 second'``).

    Returns:
        DataFrame: One row per interval, ordered by ``timestamp``, with columns
            ``price`` (quoteQty sum / qty sum, rounded to 2 decimals),
            ``qty`` (sum of qty),
            ``quoteQty`` (sum of quoteQty),
            ``timestamp`` (interval start),
            ``transactions_count`` (number of non-null ``id`` values),
            ``max_quoteQty_sum`` (largest single quoteQty in the interval),
            ``percentage_of_biggest_transaction`` (largest quoteQty as a
            percentage of the interval's quoteQty sum, rounded to 2 decimals),
            ``zipname`` (interval start formatted as ``yyyyMMdd``).
    """
    # Tag every trade with the start of the interval it belongs to.
    bucketed_df = partition_df.withColumn(
        "timestamp_interval", window("timestamp", timeframe).start
    )

    # Reduce each interval to a single row. A plain group-by yields exactly one
    # row per interval, so no window functions or de-duplication are needed.
    aggregated_df = bucketed_df.groupBy("timestamp_interval").agg(
        sum(col("qty")).alias("qty_sum"),
        sum(col("quoteQty")).alias("quoteQty_sum"),
        count(col("id")).alias("transactions_count"),
        max(col("quoteQty")).alias("max_quoteQty_sum"),
    )

    # Derive the ratio columns and expose the final column names and order.
    processed_df = (
        aggregated_df
        .select(
            round(col("quoteQty_sum") / col("qty_sum"), 2).alias("price"),
            col("qty_sum").alias("qty"),
            col("quoteQty_sum").alias("quoteQty"),
            col("timestamp_interval").alias("timestamp"),
            col("transactions_count"),
            col("max_quoteQty_sum"),
            round(col("max_quoteQty_sum") / col("quoteQty_sum") * 100, 2)
            .alias("percentage_of_biggest_transaction"),
        )
        # Day key used by the caller as the output partition column.
        .withColumn("zipname", date_format(col("timestamp"), "yyyyMMdd"))
        .orderBy("timestamp")
    )

    return processed_df



def database_timeframe(source_folder_path, output_folder_path, timeframe):
    """
    Build the aggregated BTCUSDT database for one timeframe.

    Reads ``BTCUSDT.parquet`` from ``source_folder_path``, aggregates it one
    ``zipname`` value at a time with ``process_partition``, and appends each
    result to ``BTCUSDT_<timeframe>.parquet`` inside ``output_folder_path``
    (spaces in ``timeframe`` are replaced by underscores in the file name).

    The function uses the module-level ``spark`` session created earlier in
    the notebook; it neither creates nor stops a session itself.

    Parameters:
        source_folder_path (str): Folder containing the input ``BTCUSDT.parquet``.
        output_folder_path (str): Folder in which the aggregated dataset is written.
        timeframe (str): Window duration accepted by
            ``pyspark.sql.functions.window`` (e.g. ``'1 minute'``, ``'1 second'``).

    Returns:
        None

    Note:
        Data is written in ``append`` mode, so calling this function again with
        the same output folder and timeframe adds the rows a second time.
        Remove the existing output first when regenerating.
    """
    # Read data from source location
    source_df = spark.read.parquet(os.path.join(source_folder_path, "BTCUSDT.parquet"))

    # The destination is the same for every partition, so compute it once.
    output_path = os.path.join(output_folder_path, f"BTCUSDT_{timeframe.replace(' ', '_')}.parquet")

    # Distinct values of the source partition column; each one is processed
    # and written separately.
    zipnames = [row.zipname for row in source_df.select("zipname").distinct().collect()]

    for zipname in zipnames:
        partition_df = source_df.filter(col("zipname") == zipname)
        processed_df = process_partition(partition_df, timeframe)

        # Append this partition's aggregates, partitioned on disk by zipname.
        # NOTE(review): "blockSize" does not look like a documented Parquet
        # writer option (the Parquet key is "parquet.block.size") - confirm
        # whether it has any effect.
        processed_df.write \
            .partitionBy("zipname") \
            .mode("append") \
            .option("compression", "gzip") \
            .option("blockSize", "256m") \
            .parquet(output_path)


## Generate the database with a defined timeframe

In [6]:
# Input and output currently share the same "external" data folder.
source_folder_path = my_vars["DATA"]["external"]
output_folder_path = my_vars["DATA"]["external"]

# This notebook builds the 1-minute database, so the timeframe must be
# '1 minute' (any duration accepted by pyspark.sql.functions.window works).
timeframe = '1 minute'

database_timeframe(source_folder_path, output_folder_path, timeframe)

                                                                                

# Controls

In [7]:
# Read back the dataset generated for `timeframe`. The file name is derived
# with the same rule the writer uses (spaces replaced by underscores), so this
# control always inspects what the generation cell produced.
control_path = os.path.join(output_folder_path, f"BTCUSDT_{timeframe.replace(' ', '_')}.parquet")
df = spark.read.parquet(control_path)
df.show()

                                                                                

+--------+------------------+------------------+-------------------+------------------+----------------+---------------------------------+--------+
|   price|               qty|          quoteQty|          timestamp|transactions_count|max_quoteQty_sum|percentage_of_biggest_transaction| zipname|
+--------+------------------+------------------+-------------------+------------------+----------------+---------------------------------+--------+
|33354.74| 74.62069999999994|2488954.2309403503|2021-06-09 00:00:00|              1588|  54981.37727429|                             2.21|20210609|
|33490.19| 90.64387200000004| 3035680.930924956|2021-06-09 00:01:00|              1717|  74799.75750877|                             2.46|20210609|
|33467.67|55.995943999999994|1874053.9825789297|2021-06-09 00:02:00|              1003|     106405.4811|                             5.68|20210609|
|33481.16| 39.98078699999992|1338602.9591662413|2021-06-09 00:03:00|               963|   54876.3919872|        

In [8]:
# Print the column names and data types of the DataFrame read back above.
df.printSchema()

root
 |-- price: double (nullable = true)
 |-- qty: double (nullable = true)
 |-- quoteQty: double (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- transactions_count: long (nullable = true)
 |-- max_quoteQty_sum: double (nullable = true)
 |-- percentage_of_biggest_transaction: double (nullable = true)
 |-- zipname: integer (nullable = true)



In [9]:
# Summary statistics for the columns of the generated dataset, as a sanity
# check on the value ranges.
summary_df = df.describe()
summary_df.show()



+-------+------------------+------------------+--------------------+------------------+------------------+---------------------------------+------------------+
|summary|             price|               qty|            quoteQty|transactions_count|  max_quoteQty_sum|percentage_of_biggest_transaction|           zipname|
+-------+------------------+------------------+--------------------+------------------+------------------+---------------------------------+------------------+
|  count|           1621814|           1621814|             1621814|           1621814|           1621814|                          1621814|           1621814|
|   mean| 36020.10351938636| 70.25208871442777|  2020457.4650399673|1755.9275107996355| 76785.02199562339|                5.948698068952428|2.02228028347776E7|
| stddev|13590.425674918739|120.81812867913322|  2985708.5204539527|2308.3714501456234|130965.68276975716|               5.0777179637939245| 9241.306183481804|
|    min|          15511.42|            

                                                                                

In [9]:
# Shut down the Spark session; re-run the session cell before using Spark again.
spark.stop()