# Load data from database

### create session

In [1]:

from pyspark.sql import SparkSession

jdbc_path = "/mnt/c/Users/user/Desktop/Quant-AI-Project/postgresql-42.7.1.jar"
spark = SparkSession.builder \
    .appName("ETA_Model_Training") \
    .config("spark.driver.memory", "4g") \
    .config("spark.jars", jdbc_path) \
    .getOrCreate()
print("‚úÖ Spark Session cr√©√©e")

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
26/01/21 12:22:48 WARN Utils: Your hostname, DESKTOP-Q0IAP8C, resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
26/01/21 12:22:48 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
26/01/21 12:22:50 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/01/21 12:22:58 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


‚úÖ Spark Session cr√©√©e


### load data

In [2]:

import os
from dotenv import load_dotenv

load_dotenv()

jdbc_url = f"jdbc:postgresql://localhost:5433/{os.getenv('DATABASE_NAME')}"
connection_properties = {
    "user": os.getenv('DATABASE_USER'),
    "password": os.getenv('DATABASE_PASSWORD'),
    "driver": "org.postgresql.Driver"
}
print("üì• Chargement des donn√©es Silver...")
df = spark.read.jdbc(
    url=jdbc_url,
    table="silver_table",
    properties=connection_properties
)

print(f"‚úÖ {df.count()} lignes charg√©es")
print(f"üìä Colonnes disponibles: {df.columns}")

# Afficher un aper√ßu
df.show(5)
df.printSchema()


üì• Chargement des donn√©es Silver...


                                                                                

‚úÖ 589 lignes charg√©es
üìä Colonnes disponibles: ['open_time', 'open', 'high', 'low', 'close', 'volume', 'close_time', 'quote_asset_volume', 'number_of_trades', 'taker_buy_base_volume', 'taker_buy_quote_volume', 'close_t_plus_10', 'return', 'MA_5', 'MA_10', 'taker_ratio']


                                                                                

+-------------------+--------+--------+--------+--------+--------+--------------------+------------------+----------------+---------------------+----------------------+---------------+--------------------+-----------------+-----------------+-------------------+
|          open_time|    open|    high|     low|   close|  volume|          close_time|quote_asset_volume|number_of_trades|taker_buy_base_volume|taker_buy_quote_volume|close_t_plus_10|              return|             MA_5|            MA_10|        taker_ratio|
+-------------------+--------+--------+--------+--------+--------+--------------------+------------------+----------------+---------------------+----------------------+---------------+--------------------+-----------------+-----------------+-------------------+
|2026-01-19 12:45:00|93159.02| 93172.0|93138.57| 93172.0| 8.92988|2026-01-19 12:45:...|    831779.0943744|            2330|              4.95472|        461507.3373702|       93057.17|1.392242920520014...|        9

### columns to drop 

In [3]:
df_silver = df.drop('open_time','close_time')

In [4]:
df_silver.columns

['open',
 'high',
 'low',
 'close',
 'volume',
 'quote_asset_volume',
 'number_of_trades',
 'taker_buy_base_volume',
 'taker_buy_quote_volume',
 'close_t_plus_10',
 'return',
 'MA_5',
 'MA_10',
 'taker_ratio']

In [5]:
# Define feature columns
feature_cols = [
    'open', 'high', 'low', 'close',              
    'volume', 'quote_asset_volume',               
    'number_of_trades',                           
    'taker_buy_base_volume',
      'taker_buy_quote_volume',  
    'return',                                    
      'MA_5', 
      'MA_10',                             
    'taker_ratio'                                 
]

target_col = 'close_t_plus_10'



In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.regression import RandomForestRegressor, GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline


total_rows = df_silver.count()
train_size = int(0.8 * total_rows)

# Create sequential ID for splitting
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number

windowSpec = Window.orderBy(col("open"))  # Order by any column to maintain sequence
df_indexed = df_silver.withColumn("row_id", row_number().over(windowSpec))

train_df = df_indexed.filter(col("row_id") <= train_size)
test_df = df_indexed.filter(col("row_id") > train_size)

print(f"üìä Train set: {train_df.count():,} rows")
print(f"üìä Test set:  {test_df.count():,} rows")

# ============================================================================
# 4. FEATURE ENGINEERING PIPELINE
# ============================================================================

print("\nüîß Building ML Pipeline...")

# Step 1: Assemble features into a vector
assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol="features_raw",
    handleInvalid="skip"  # Skip rows with invalid values
)

# Step 2: Standardize features (zero mean, unit variance)
# This is crucial for convergence and feature importance
scaler = StandardScaler(
    inputCol="features_raw",
    outputCol="features",
    withStd=True,   # Scale to unit standard deviation
    withMean=True   # Center to zero mean
)

# Step 3: Choose a model
# Starting with Random Forest (good baseline for tabular data)
rf_model = RandomForestRegressor(
    featuresCol="features",
    labelCol=target_col,
    numTrees=100,           # Number of trees in the forest
    maxDepth=10,            # Maximum depth of each tree
    minInstancesPerNode=5,  # Minimum samples per leaf
    seed=42
)

# Alternative: Gradient Boosted Trees (often better but slower)
# gbt_model = GBTRegressor(
#     featuresCol="features",
#     labelCol=target_col,
#     maxIter=100,
#     maxDepth=5,
#     seed=42
# )

# Create the pipeline
pipeline = Pipeline(stages=[assembler, scaler, rf_model])

# ============================================================================
# 5. TRAIN THE MODEL
# ============================================================================

print("\nüöÄ Training the model...")
print("‚è≥ This may take a few minutes...")

model = pipeline.fit(train_df)

print("‚úÖ Model training complete!")

# ============================================================================
# 6. EVALUATE ON TEST SET
# ============================================================================

print("\nüìà Evaluating model performance...")

# Make predictions
train_predictions = model.transform(train_df)
test_predictions = model.transform(test_df)

# Define evaluators
rmse_evaluator = RegressionEvaluator(
    labelCol=target_col,
    predictionCol="prediction",
    metricName="rmse"
)

mae_evaluator = RegressionEvaluator(
    labelCol=target_col,
    predictionCol="prediction",
    metricName="mae"
)

r2_evaluator = RegressionEvaluator(
    labelCol=target_col,
    predictionCol="prediction",
    metricName="r2"
)

# Calculate metrics
train_rmse = rmse_evaluator.evaluate(train_predictions)
test_rmse = rmse_evaluator.evaluate(test_predictions)

train_mae = mae_evaluator.evaluate(train_predictions)
test_mae = mae_evaluator.evaluate(test_predictions)

train_r2 = r2_evaluator.evaluate(train_predictions)
test_r2 = r2_evaluator.evaluate(test_predictions)

print("\n" + "="*60)
print(" MODEL PERFORMANCE METRICS")
print("="*60)
print(f"\n{'Metric':<20} {'Train':<20} {'Test':<20}")
print("-"*60)
print(f"{'RMSE':<20} {train_rmse:<20.4f} {test_rmse:<20.4f}")
print(f"{'MAE':<20} {train_mae:<20.4f} {test_mae:<20.4f}")
print(f"{'R¬≤ Score':<20} {train_r2:<20.4f} {test_r2:<20.4f}")
print("="*60)



üìÖ Creating time-based train/test split...


26/01/21 12:35:02 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/21 12:35:02 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


üìä Train set: 471 rows


26/01/21 12:35:02 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/21 12:35:02 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/21 12:35:02 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/21 12:35:02 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


üìä Test set:  118 rows

üîß Building ML Pipeline...

üöÄ Training the model...
‚è≥ This may take a few minutes...


26/01/21 12:35:04 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/21 12:35:04 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/21 12:35:04 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
26/01/21 12:35:06 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/21 12:35:06 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/21 12:35:06 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance

‚úÖ Model training complete!

üìà Evaluating model performance...


26/01/21 12:35:18 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/21 12:35:18 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/21 12:35:18 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/21 12:35:18 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/21 12:35:18 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/21 12:35:18 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/21 1


üìä MODEL PERFORMANCE METRICS

Metric               Train                Test                
------------------------------------------------------------
RMSE                 48.9192              82.0583             
MAE                  36.5886              68.0081             
R¬≤ Score             0.9424               -0.1179             

üîç Feature Importance:
------------------------------------------------------------
MA_5                           0.2295 ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
close                          0.2129 ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
MA_10                          0.1970 ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
low                            0.1760 ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
open                           0.0730 ‚ñà‚ñà‚ñà
high                           0.0519 ‚ñà‚ñà
taker_buy_quote_volume         0.0119 
taker_buy_base_volume          0.0102 
quote_asset_volume             0.0083 
return                         0.0076 
number_of_trades               0.0075 
taker_ra

26/01/21 12:35:20 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/21 12:35:20 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/21 12:35:20 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/21 12:35:20 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/21 12:35:20 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+---------------+-----------------+
|close_t_plus_10|prediction       |
+---------------+-----------------+
|93032.68       |93099.52197944446|
|93018.59       |93089.69963562881|
|93233.99       |93120.6952963889 |
|93260.19       |93131.68674531745|
|93086.01       |93070.64936676902|
|93118.0        |93118.67330940478|
|93050.0        |93113.75958492066|
|93208.77       |93106.63857865082|
|93016.78       |93067.81523588445|
|93087.15       |93118.3391849603 |
+---------------+-----------------+


üíæ Saving model to: /mnt/c/Users/user/Desktop/Quant-AI-Project/ml/models/btc_price_predictor


                                                                                

‚úÖ Model saved successfully!

üéâ TRAINING COMPLETE!

üìù Next steps:
   1. Analyze feature importance to understand what drives predictions
   2. Try GBTRegressor for potentially better performance
   3. Tune hyperparameters using cross-validation
   4. Consider adding more technical indicators
   5. Implement walk-forward validation for more robust evaluation


In [None]:
# ============================================================================
# 9. SAVE THE MODEL
# ============================================================================

model_path = "/mnt/c/Users/user/Desktop/Quant-AI-Project/ml/models/btc_price_predictor"
print(f"\nüíæ Saving model to: {model_path}")

model.write().overwrite().save(model_path)
print("‚úÖ Model saved successfully!")
