# Load data from database

### create session

In [1]:

from pyspark.sql import SparkSession

import os  # imported here because this cell runs before the cell that imports os

# Location of the PostgreSQL JDBC driver jar handed to Spark via "spark.jars".
# The POSTGRES_JDBC_JAR environment variable overrides the default so the
# notebook is not tied to one machine's directory layout. It is read from the
# process environment only: load_dotenv() is not called until a later cell.
jdbc_path = os.getenv(
    "POSTGRES_JDBC_JAR",
    "/mnt/c/Users/user/Desktop/Quant-AI-Project/postgresql-42.7.1.jar",
)

# getOrCreate() returns the already-running session if there is one, so
# re-running this cell does not start a second session.
spark = (
    SparkSession.builder
    .appName("ETA_Model_Training")
    .config("spark.driver.memory", "4g")
    .config("spark.jars", jdbc_path)
    .getOrCreate()
)
print("✅ Spark Session créée")

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
26/01/21 16:26:51 WARN Utils: Your hostname, DESKTOP-Q0IAP8C, resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
26/01/21 16:26:51 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
26/01/21 16:26:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


✅ Spark Session créée


### load data

In [2]:

import os
from dotenv import load_dotenv

# Load the DATABASE_* settings from a local .env file into the environment.
load_dotenv()

# Fail fast with a readable message: os.getenv() returns None for an unset
# variable, which would otherwise put the literal text "None" into the JDBC URL
# or pass None as a connection property.
_required_env = ("DATABASE_NAME", "DATABASE_USER", "DATABASE_PASSWORD")
_missing_env = [name for name in _required_env if os.getenv(name) is None]
if _missing_env:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_env)
    )

jdbc_url = f"jdbc:postgresql://localhost:5433/{os.getenv('DATABASE_NAME')}"
connection_properties = {
    "user": os.getenv('DATABASE_USER'),
    "password": os.getenv('DATABASE_PASSWORD'),
    "driver": "org.postgresql.Driver"
}
print("📥 Chargement des données Silver...")
df = spark.read.jdbc(
    url=jdbc_url,
    table="silver_table",
    properties=connection_properties
)

print(f"✅ {df.count()} lignes chargées")
print(f"📊 Colonnes disponibles: {df.columns}")

# Show a preview of the first rows and the schema
df.show(5)
df.printSchema()


📥 Chargement des données Silver...


                                                                                

✅ 589 lignes chargées
📊 Colonnes disponibles: ['open_time', 'open', 'high', 'low', 'close', 'volume', 'close_time', 'quote_asset_volume', 'number_of_trades', 'taker_buy_base_volume', 'taker_buy_quote_volume', 'close_t_plus_10', 'return', 'MA_5', 'MA_10', 'taker_ratio']
+-------------------+--------+--------+--------+--------+--------+--------------------+------------------+----------------+---------------------+----------------------+---------------+--------------------+-----------------+-----------------+-------------------+
|          open_time|    open|    high|     low|   close|  volume|          close_time|quote_asset_volume|number_of_trades|taker_buy_base_volume|taker_buy_quote_volume|close_t_plus_10|              return|             MA_5|            MA_10|        taker_ratio|
+-------------------+--------+--------+--------+--------+--------+--------------------+------------------+----------------+---------------------+----------------------+---------------+----------------

In [3]:
# Input columns for the model, listed in the order they are assembled.
feature_cols = (
    ["open", "high", "low", "close"]
    + ["volume", "quote_asset_volume"]
    + ["number_of_trades"]
    + ["taker_buy_base_volume", "taker_buy_quote_volume"]
    + ["return", "MA_5", "MA_10"]
    + ["taker_ratio"]
)

# Column used as the regression label.
target_col = "close_t_plus_10"



In [None]:
from pyspark.sql.window import Window
from pyspark.sql.functions import col, row_number
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline

# --- Chronological 80/20 split -------------------------------------------
# Rows are numbered by open_time; the first 80% of row numbers form the
# training set and the remainder the test set (no shuffling).
df = df.orderBy("open_time")

total_rows = df.count()
train_size = int(total_rows * 0.8)

# The window has no partitionBy, so Spark moves every row to a single
# partition to number them (this is what the WindowExec warning reports).
window = Window.orderBy("open_time")
df = df.withColumn("row_id", row_number().over(window))

# cache(): Spark evaluates lazily, so without it every later action (fit and
# each evaluate call) would re-run the window numbering above.
train_df = (
    df.filter(col("row_id") <= train_size)
      .drop("row_id", "open_time", "close_time")
      .cache()
)

test_df = (
    df.filter(col("row_id") > train_size)
      .drop("row_id", "open_time", "close_time")
      .cache()
)

# --- Pipeline: assemble the feature vector, then fit a random forest -------
# handleInvalid="skip" drops rows whose feature columns hold invalid values
# instead of raising.
assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol="features",
    handleInvalid="skip"
)

rf_model = RandomForestRegressor(
    featuresCol="features",
    labelCol=target_col,
    numTrees=200,
    maxDepth=10,
    minInstancesPerNode=5,
    seed=42
)

pipeline = Pipeline(stages=[
    assembler,
    rf_model
])
model = pipeline.fit(train_df)

# Score both splits. train_pred was referenced by the report below but never
# assigned, which raised NameError on a fresh kernel.
train_pred = model.transform(train_df)
test_pred = model.transform(test_df)

# --- Evaluation -----------------------------------------------------------
# One evaluator per metric, all comparing the label with "prediction".
rmse, mae, r2 = (
    RegressionEvaluator(
        labelCol=target_col,
        predictionCol="prediction",
        metricName=metric_name,
    )
    for metric_name in ("rmse", "mae", "r2")
)

print("\n" + "=" * 65)
print(" MODEL PERFORMANCE")
print("=" * 65)

# Train/test pair per metric; labels are left-padded to 5 characters so the
# colons line up.
for metric_label, evaluator in (("RMSE", rmse), ("MAE", mae), ("R²", r2)):
    print(f"Train {metric_label:<5}: {evaluator.evaluate(train_pred):.6f}")
    print(f"Test  {metric_label:<5}: {evaluator.evaluate(test_pred):.6f}")

print("=" * 65)

🚀 Training model...


26/01/21 16:33:31 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/21 16:33:31 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/21 16:33:31 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/21 16:33:31 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/21 16:33:31 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/21 16:33:31 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/21 1

✅ Training completed

 MODEL PERFORMANCE


26/01/21 16:33:36 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/21 16:33:36 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/21 16:33:36 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/21 16:33:36 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/21 16:33:36 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/21 16:33:36 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


Train RMSE : 45.128437


26/01/21 16:33:36 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/21 16:33:36 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/21 16:33:36 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/21 16:33:37 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/21 16:33:37 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/21 16:33:37 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/21 1

Test  RMSE : 94.674201


26/01/21 16:33:37 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/21 16:33:37 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/21 16:33:37 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/21 16:33:37 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/21 16:33:37 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/21 16:33:37 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


Train MAE  : 32.535088


26/01/21 16:33:38 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/21 16:33:38 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/21 16:33:38 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/21 16:33:38 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/21 16:33:38 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/21 16:33:38 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/21 1

Test  MAE  : 78.609769


26/01/21 16:33:38 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/21 16:33:38 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/21 16:33:38 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/21 16:33:39 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/21 16:33:39 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/21 16:33:39 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


Train R²   : 0.963716


26/01/21 16:33:39 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/21 16:33:39 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/21 16:33:39 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/21 16:33:39 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/21 16:33:39 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/21 16:33:39 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/21 1

Test  R²   : -0.019829


In [None]:
# Destination for the fitted pipeline. The MODEL_OUTPUT_PATH environment
# variable overrides the default so the notebook is not tied to one machine's
# directory layout.
model_path = os.getenv(
    "MODEL_OUTPUT_PATH",
    "/mnt/c/Users/user/Desktop/Quant-AI-Project/ml/models/btc_price_predictor",
)
print(f"\n💾 Saving model to: {model_path}")

# overwrite() replaces any model already stored at model_path.
model.write().overwrite().save(model_path)
print("✅ Model saved successfully!")
