In [1]:
# typical
import os
import re
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt

# pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, desc
from pyspark.sql.functions import col, lag
from pyspark.sql import functions as F
from pyspark.sql.window import Window

In [2]:
# Configure a Spark session in local mode ('local[*]' master), then create it
# or reuse one that is already running in this kernel.
session_builder = SparkSession.builder.master('local[*]')
spark = session_builder.getOrCreate()

Note: Check the `lab2.py` file for proof of concept.

# Executive Summary

# Introduction

# EDA

**Loading the File**

In [3]:
# Folder holding the Binance history; every parquet file found there is read
# into a single Spark DataFrame.
directory = "/mnt/data/public/binance-full-history"
binance = spark.read.parquet(directory)

Print the schema to see the column names and types that will be referenced once all the parquet files are combined into a single DataFrame.

In [4]:
# Print each column's name, type and nullability for the combined DataFrame.
binance.printSchema()

root
 |-- open: float (nullable = true)
 |-- high: float (nullable = true)
 |-- low: float (nullable = true)
 |-- close: float (nullable = true)
 |-- volume: float (nullable = true)
 |-- quote_asset_volume: float (nullable = true)
 |-- number_of_trades: integer (nullable = true)
 |-- taker_buy_base_asset_volume: float (nullable = true)
 |-- taker_buy_quote_asset_volume: float (nullable = true)
 |-- open_time: timestamp_ntz (nullable = true)



In [5]:
# Keep only the parquet files, in a deterministic (sorted) order.
# os.listdir returns entries in arbitrary order and can include entries that
# are not "<BASE>-<QUOTE>.parquet" data files; those would break the filename
# parsing done when the files are classified by base asset.
files = sorted(
    entry for entry in os.listdir(directory) if entry.endswith('.parquet')
)

**Filtering the Files According to Coin Classification**

First, all the files were classified into one of three coin types: cryptocurrency, stablecoin, and fiat-backed. Note that a file such as `BTC-USDT.parquet` is classified by its *base asset*, which is the first coin in the filename (BTC); the second coin (USDT) is the quote asset in which the price is expressed. So, the given example would be classified as a cryptocurrency.

In [6]:
# Base classifications: known base-asset tickers for each coin type.
cryptos = [
    'BTC','ETH','BNB','XRP','ADA','DOGE','SOL','MATIC','TRX','DOT','LTC','EOS','BCH','LINK','XLM','ATOM','ALGO','VET','FIL','NEO','IOTA','XTZ','ETC'
]
stablecoins = ['USDT','USDC','BUSD','TUSD','DAI','PAX','BIDR','IDRT']
fiats = ['EUR','GBP','AUD','TRY','BRL','RUB','NGN','UAH']

# Output buckets: coin type -> parquet filenames whose base asset has that type.
classified = {
    "cryptocurrency": [],
    "stablecoin": [],
    "fiat_backed": []
}

# Ticker -> coin type lookup. Later entries win on duplicate keys, so the lists
# are merged in reverse priority (cryptos last) to keep the precedence
# cryptocurrency > stablecoin > fiat_backed.
category_by_base = {
    **dict.fromkeys(fiats, "fiat_backed"),
    **dict.fromkeys(stablecoins, "stablecoin"),
    **dict.fromkeys(cryptos, "cryptocurrency"),
}

parquet_suffix = '.parquet'

for f in files:
    # Skip directory entries that are not parquet data files.
    if not f.endswith(parquet_suffix):
        continue

    # Strip only the trailing extension (str.replace would also remove the
    # substring if it appeared elsewhere in the name).
    pair = f[:-len(parquet_suffix)]

    # partition() never raises: a name without a dash yields an empty quote,
    # and a name with several dashes is split on the first one only.
    base, _sep, quote = pair.partition('-')

    # Base assets that appear in none of the lists default to "cryptocurrency".
    classified[category_by_base.get(base, "cryptocurrency")].append(f)

In [7]:
# Invert the buckets into a flat lookup: trading pair (filename without the
# ".parquet" text) -> coin type.
classification = {
    filename.replace(".parquet", ""): coin_type
    for coin_type, filenames in classified.items()
    for filename in filenames
}

After the files were classified by coin type, all the parquet files were combined into a single DataFrame, which was then saved as both a parquet copy and a CSV copy.

In [None]:
# 1) Tag every row with its trading pair ("stock"), taken from the name of the
#    parquet file the row was read from (e.g. ".../BTC-USDT.parquet" -> "BTC-USDT").
binance_with_stock = binance.withColumn(
    "stock",
    F.regexp_extract(F.input_file_name(), r'([^/]+)\.parquet$', 1)
)

# 2) Build the (stock, classification) lookup DataFrame.
#    `classification` maps pair -> coin type, so each item is already one row.
#    Iterating over the *value* would walk the characters of the coin-type
#    string and produce rows that never match in the join below.
class_rows = [
    (pair, coin_type)
    for pair, coin_type in classification.items()
]
# An explicit schema keeps this working when class_rows is empty, where
# column names alone would not allow type inference.
class_df = spark.createDataFrame(
    class_rows, "stock string, classification string"
)

# 3) Per-pair window ordered by time, used to fetch the previous close.
w = Window.partitionBy("stock").orderBy("open_time")

final_spark_df = (
    binance_with_stock
    # Left join so that rows are kept even if a pair has no classification.
    .join(class_df, on="stock", how="left")

    # Split the pair into base + quote currency.
    .withColumn("base_currency", F.split(F.col("stock"), "-")[0])
    .withColumn("quote_currency", F.split(F.col("stock"), "-")[1])

    # Close of the previous row for the same pair (null on each pair's first row).
    .withColumn("prev_close", F.lag("close").over(w))

    # Simple returns.
    .withColumn("return", (F.col("close") - F.col("prev_close")) / F.col("prev_close"))
    .withColumn("abs_return", F.abs(F.col("return")))

    # Log returns.
    .withColumn("log_return", F.log(F.col("close") / F.col("prev_close")))
    .withColumn("abs_log_return", F.abs(F.col("log_return")))

    # Drop rows where either return could not be computed
    # (for example the first row of each pair, which has no previous close).
    .dropna(subset=["return", "log_return"])

    # Keep the output columns only; the helper columns "stock" and
    # "prev_close" are not selected and are therefore dropped here.
    .select(
        "base_currency",
        "quote_currency",
        "classification",
        "open",
        "high",
        "low",
        "close",
        "volume",
        "quote_asset_volume",
        "number_of_trades",
        "taker_buy_base_asset_volume",
        "taker_buy_quote_asset_volume",
        "open_time",
        "return",
        "abs_return",
        "log_return",
        "abs_log_return"
    )
)

# Quick visual check of the result.
final_spark_df.show(5)

# Persist the combined dataset: a parquet copy, plus a CSV copy with a header
# row (coalesce(1) reduces the output to a single partition before writing).
final_spark_df.write.mode("overwrite").parquet("/mnt/data/final_output.parquet")
final_spark_df.coalesce(1).write.mode("overwrite").option("header", True)\
    .csv("/mnt/data/final_output_csv")