# Feature Engineering

### load data 

In [1]:
from pyspark.sql import SparkSession

# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("BinancePipeline")
    .getOrCreate()
)

# Load the bronze-layer Parquet file with the BTC minute data.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
bronze_path = '/mnt/c/Users/user/Desktop/Quant-AI-Project/ml/data/btc_minute_data.parquet'
df_bronze = spark.read.parquet(bronze_path)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
26/01/20 11:55:21 WARN Utils: Your hostname, DESKTOP-Q0IAP8C, resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
26/01/20 11:55:21 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/01/20 11:55:43 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

### Creating new columns 

In [2]:
# List the column names of the freshly loaded bronze DataFrame.
df_bronze.columns

['open_time',
 'open',
 'high',
 'low',
 'close',
 'volume',
 'close_time',
 'quote_asset_volume',
 'number_of_trades',
 'taker_buy_base_volume',
 'taker_buy_quote_volume',
 'ignore']

### Creating target column

In [4]:
from pyspark.sql.window import Window
from pyspark.sql import functions as F

# Single, unpartitioned window that orders every row by open_time.
window = Window.orderBy("open_time")

# Target column: the close value found 10 rows later in that ordering
# (null for the last 10 rows, which have no such row).
# NOTE(review): 10 rows equals 10 minutes only if the data has exactly one row
# per minute with no gaps — confirm.
close_in_10_rows = F.lead("close", 10).over(window)
df_bronze = df_bronze.withColumn("close_t_plus_10", close_in_10_rows)


In [6]:
# Preview the first 20 rows, now including the close_t_plus_10 column.
df_bronze.show(20)

26/01/20 12:27:45 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/20 12:27:45 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/20 12:27:45 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/20 12:27:48 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/20 12:27:48 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+-------------------+--------+--------+--------+--------+--------+--------------------+------------------+----------------+---------------------+----------------------+------+---------------+
|          open_time|    open|    high|     low|   close|  volume|          close_time|quote_asset_volume|number_of_trades|taker_buy_base_volume|taker_buy_quote_volume|ignore|close_t_plus_10|
+-------------------+--------+--------+--------+--------+--------+--------------------+------------------+----------------+---------------------+----------------------+------+---------------+
|2026-01-19 05:24:00| 92628.0|92634.77|92595.62|92613.99|11.19022|2026-01-19 05:24:...|   1036457.1385852|            2100|              4.05786|        375821.0803997|     0|       92610.45|
|2026-01-19 05:25:00|92613.99|92613.99|92613.98|92613.99| 1.81514|2026-01-19 05:25:...|    168107.3438968|             196|              0.42396|         39264.6272004|     0|       92626.37|
|2026-01-19 05:26:00|92613.99|92627.42|9

                                                                                

### Creating features values 

###  -- Price variations (returns)

In [None]:
# One-row simple return: (close - previous close) / previous close, where
# "previous" is the preceding row in the open_time-ordered window.
# The first row has no predecessor, so its return is null.
previous_close = F.lag("close", 1).over(window)
df_bronze = df_bronze.withColumn(
    "return",
    (F.col("close") - previous_close) / previous_close,
)

### -- Moving averages (5, 10 minutes)

In [8]:

# Trailing, row-based frames ordered by open_time that end at the current row:
# the current row plus the 4 (resp. 9) rows before it.
# Note: near the start of the data the frame is truncated, so the first rows
# average fewer than 5 (resp. 10) close values instead of being null.
window_5 = Window.orderBy("open_time").rowsBetween(-4, Window.currentRow)
window_10 = Window.orderBy("open_time").rowsBetween(-9, Window.currentRow)

# Moving averages of the close price over those frames.
df_bronze = (
    df_bronze
    .withColumn("MA_5", F.avg("close").over(window_5))
    .withColumn("MA_10", F.avg("close").over(window_10))
)

###  -- Volume and trading intensity

In [9]:


# Taker-buy share of the bar's base volume: taker_buy_base_volume / volume.
# The division is wrapped in a WHEN guard so that bars with zero volume yield
# null. An unguarded division raises DIVIDE_BY_ZERO when Spark runs with ANSI
# mode enabled; with ANSI mode off the result is the same null as before.
df_bronze = df_bronze.withColumn(
    "taker_ratio",
    F.when(
        F.col("volume") != 0,
        F.col("taker_buy_base_volume") / F.col("volume"),
    ),
)

# Print the schema and preview 20 rows to check the columns added so far.
df_bronze.printSchema()
df_bronze.show(20)

root
 |-- open_time: timestamp_ntz (nullable = true)
 |-- open: double (nullable = true)
 |-- high: double (nullable = true)
 |-- low: double (nullable = true)
 |-- close: double (nullable = true)
 |-- volume: double (nullable = true)
 |-- close_time: timestamp_ntz (nullable = true)
 |-- quote_asset_volume: double (nullable = true)
 |-- number_of_trades: long (nullable = true)
 |-- taker_buy_base_volume: double (nullable = true)
 |-- taker_buy_quote_volume: double (nullable = true)
 |-- ignore: string (nullable = true)
 |-- close_t_plus_10: double (nullable = true)
 |-- return: double (nullable = true)
 |-- MA_5: double (nullable = true)
 |-- MA_10: double (nullable = true)
 |-- taker_ratio: double (nullable = true)



26/01/20 12:34:07 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/20 12:34:07 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/20 12:34:07 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/20 12:34:07 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/20 12:34:07 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+-------------------+--------+--------+--------+--------+--------+--------------------+------------------+----------------+---------------------+----------------------+------+---------------+--------------------+-----------------+-----------------+--------------------+
|          open_time|    open|    high|     low|   close|  volume|          close_time|quote_asset_volume|number_of_trades|taker_buy_base_volume|taker_buy_quote_volume|ignore|close_t_plus_10|              return|             MA_5|            MA_10|         taker_ratio|
+-------------------+--------+--------+--------+--------+--------+--------------------+------------------+----------------+---------------------+----------------------+------+---------------+--------------------+-----------------+-----------------+--------------------+
|2026-01-19 05:24:00| 92628.0|92634.77|92595.62|92613.99|11.19022|2026-01-19 05:24:...|   1036457.1385852|            2100|              4.05786|        375821.0803997|     0|       92610.45

### Inspection of new columns 

In [None]:
# Check for nulls
from pyspark.sql.functions import col, sum as spark_sum, when

# One aggregate per column: sum a 1/0 flag marking null entries, so the single
# result row holds the number of nulls found in every column of df_bronze.
null_counts = df_bronze.select(*(
    spark_sum(when(col(name).isNull(), 1).otherwise(0)).alias(name)
    for name in df_bronze.columns
))
null_counts.show()

26/01/20 12:57:57 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/20 12:57:57 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/20 12:57:57 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/20 12:57:57 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+---------+----+----+---+-----+------+----------+------------------+----------------+---------------------+----------------------+------+---------------+------+----+-----+-----------+
|open_time|open|high|low|close|volume|close_time|quote_asset_volume|number_of_trades|taker_buy_base_volume|taker_buy_quote_volume|ignore|close_t_plus_10|return|MA_5|MA_10|taker_ratio|
+---------+----+----+---+-----+------+----------+------------------+----------------+---------------------+----------------------+------+---------------+------+----+-----+-----------+
|        0|   0|   0|  0|    0|     0|         0|                 0|               0|                    0|                     0|     0|             10|     1|   0|    0|          0|
+---------+----+----+---+-----+------+----------+------------------+----------------+---------------------+----------------------+------+---------------+------+----+-----+-----------+



### Filter null values in close_t_plus_10 and return

In [11]:
# Keep only the rows where both the 10-row-ahead target and the one-row return
# are present, i.e. drop the rows where lead/lag reached past the data edges.
has_target = F.col("close_t_plus_10").isNotNull()
has_return = F.col("return").isNotNull()
df_clean = df_bronze.where(has_target & has_return)

### Inspection

In [14]:
# Check for nulls
from pyspark.sql.functions import col, sum as spark_sum, when

# Same per-column null count as before, now on the filtered DataFrame:
# each output column is the sum of a 1/0 flag marking null entries.
null_counts = df_clean.select(*(
    spark_sum(when(col(name).isNull(), 1).otherwise(0)).alias(name)
    for name in df_clean.columns
))
null_counts.show()

26/01/20 14:54:49 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/20 14:54:49 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/20 14:54:49 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/20 14:54:49 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+---------+----+----+---+-----+------+----------+------------------+----------------+---------------------+----------------------+------+---------------+------+----+-----+-----------+
|open_time|open|high|low|close|volume|close_time|quote_asset_volume|number_of_trades|taker_buy_base_volume|taker_buy_quote_volume|ignore|close_t_plus_10|return|MA_5|MA_10|taker_ratio|
+---------+----+----+---+-----+------+----------+------------------+----------------+---------------------+----------------------+------+---------------+------+----+-----+-----------+
|        0|   0|   0|  0|    0|     0|         0|                 0|               0|                    0|                     0|     0|              0|     0|   0|    0|          0|
+---------+----+----+---+-----+------+----------+------------------+----------------+---------------------+----------------------+------+---------------+------+----+-----+-----------+



In [None]:
# Summary statistics (count, mean, stddev, min, max) for three of the columns.
summary_columns = ["return", "volume", "taker_ratio"]
df_clean.select(*summary_columns).describe().show()

26/01/20 14:58:33 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/20 14:58:33 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/20 14:58:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/20 14:58:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+-------+--------------------+-----------------+--------------------+
|summary|              return|           volume|         taker_ratio|
+-------+--------------------+-----------------+--------------------+
|  count|                 589|              589|                 589|
|   mean|6.322452823649601E-6|7.631501375212217|  0.4735568467761747|
| stddev|3.301872357716786E-4|9.187781875007202|  0.2635065544475721|
|    min|-0.00119641588735...|          0.34639|0.010339518841469436|
|    max|0.001675970198852...|        114.47682|  0.9824354637429719|
+-------+--------------------+-----------------+--------------------+



In [22]:
# Tail quantiles (0.1%, 1%, 99%, 99.9%) of the return and activity columns.
# The result is one list of four values per column, in the order given here.
quantile_columns = [
    "return",
    "number_of_trades",
    "volume",
    "quote_asset_volume",
    "taker_buy_base_volume",
    "taker_buy_quote_volume",
]
tail_probabilities = [0.001, 0.01, 0.99, 0.999]

# A relative error of 0.0 requests exact quantiles.
df_clean.approxQuantile(quantile_columns, tail_probabilities, 0.0)

26/01/20 16:25:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/20 16:25:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/20 16:25:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/20 16:25:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


[[-0.0011964158873544453,
  -0.000838974226931051,
  0.0009138941718540535,
  0.0016759701988522955],
 [140.0, 159.0, 7481.0, 10588.0],
 [0.34639, 0.64855, 47.64365, 114.47682],
 [32220.7112985, 60332.9231809, 4419511.6002973, 10649349.5676361],
 [0.05203, 0.16601, 23.77074, 47.7156],
 [4841.0483547, 15373.1435572, 2216626.4374976, 4446268.7158743]]

In [20]:
# List the column names of the filtered DataFrame.
df_clean.columns

['open_time',
 'open',
 'high',
 'low',
 'close',
 'volume',
 'close_time',
 'quote_asset_volume',
 'number_of_trades',
 'taker_buy_base_volume',
 'taker_buy_quote_volume',
 'ignore',
 'close_t_plus_10',
 'return',
 'MA_5',
 'MA_10',
 'taker_ratio']

In [23]:
# Build the silver DataFrame by removing the 'ignore' column.
df_silver = df_clean.drop('ignore')

### save data to database 