In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier, DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [2]:
# Start (or reuse, if one is already running) a Spark session in local mode;
# 'local[*]' asks Spark to use all cores available on this machine.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('load')
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/11/09 16:15:21 WARN Utils: Your hostname, Giordano, resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/11/09 16:15:21 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/09 16:15:22 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Read the train and test Parquet datasets from the serving area of the
# data lake (paths are relative to this notebook's working directory).
train_path = "../../datalake/serving/train_bovespa_plano_real.parquet"
test_path = "../../datalake/serving/test_bovespa_plano_real.parquet"

train_df = spark.read.parquet(train_path)
test_df = spark.read.parquet(test_path)

                                                                                

In [None]:
# Train three classifiers on the training set and compare their accuracy on
# the test set.

# The estimators and the evaluator below are configured with labelCol="label",
# so expose the indexed market type column under that name.
train_df = train_df.withColumnRenamed("Market_Type_idx", "label")
test_df = test_df.withColumnRenamed("Market_Type_idx", "label")

# Fixed seed so the random forest (which samples rows and features at random)
# produces the same model on every Restart & Run All.
SEED = 42

lr = LogisticRegression(featuresCol="features", labelCol="label")
rf = RandomForestClassifier(
    featuresCol="features", labelCol="label", numTrees=100, seed=SEED
)
# NOTE(review): Market_Type_idx appears to have more than two classes (a
# binary-only classifier rejected label 2.0 in an earlier run of this cell).
# Spark's GBTClassifier supports binary labels only, so a single decision
# tree is used as the third model instead.
dt = DecisionTreeClassifier(featuresCol="features", labelCol="label", maxDepth=10)

lr_model = lr.fit(train_df)
rf_model = rf.fit(train_df)
dt_model = dt.fit(train_df)

# transform() appends the "prediction" column that the evaluator reads.
lr_preds = lr_model.transform(test_df)
rf_preds = rf_model.transform(test_df)
dt_preds = dt_model.transform(test_df)

evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy"
)
results = {
    "LogisticRegression": evaluator.evaluate(lr_preds),
    "RandomForest": evaluator.evaluate(rf_preds),
    # Previously reported under the key "GBT", although these predictions
    # come from the DecisionTreeClassifier.
    "DecisionTree": evaluator.evaluate(dt_preds),
}

for model, acc in results.items():
    print(f"{model} Accuracy: {acc:.4f}")

25/11/09 16:15:29 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
25/11/09 16:15:34 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
25/11/09 16:15:50 WARN MemoryStore: Not enough space to cache rdd_213_3 in memory! (computed 49.8 MiB so far)
25/11/09 16:15:50 WARN BlockManager: Persisting block rdd_213_3 to disk instead.
25/11/09 16:15:50 WARN MemoryStore: Not enough space to cache rdd_213_0 in memory! (computed 74.7 MiB so far)
25/11/09 16:15:50 WARN BlockManager: Persisting block rdd_213_0 to disk instead.
25/11/09 16:15:50 WARN MemoryStore: Not enough space to cache rdd_213_4 in memory! (computed 74.7 MiB so far)
25/11/09 16:15:50 WARN BlockManager: Persisting block rdd_213_4 to disk instead.
25/11/09 16:15:50 WARN MemoryStore: Not enough space to cache rdd_213_5 in memory! (computed 74.7 MiB so far)
25/11/09 16:15:50 WAR

SparkRuntimeException: [USER_RAISED_EXCEPTION] Labels MUST be in {0, 1}, but got 2.0 SQLSTATE: P0001

In [None]:
# Persist each fitted model; overwrite() replaces whatever an earlier run
# left at the same location.
# NOTE(review): the "bovespa_gbt" directory receives dt_model, which is the
# decision-tree model. The path is kept unchanged here; confirm whether any
# consumer loads from it before renaming.
for fitted_model, target_dir in (
    (lr_model, "./models/bovespa_lr"),
    (rf_model, "./models/bovespa_rf"),
    (dt_model, "./models/bovespa_gbt"),
):
    fitted_model.write().overwrite().save(target_dir)