### Classifiers

In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import col
from pyspark.ml.feature import StringIndexer 
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import RegressionEvaluator


spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext
df = spark.read.csv("KO.csv", header=True, inferSchema=True) 

23/12/03 16:21:57 WARN Utils: Your hostname, Gias-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.0.0.100 instead (on interface en0)
23/12/03 16:21:57 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/12/03 16:21:57 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### Buy - Hold - Sell
https://www.investopedia.com/articles/technical/073001.asp

Using the stochastics "D" line formula:
$D= 100 \bigg (\frac{H3}{L3} \bigg)$ 

Where, H3 is the Highest of the three previous stock prices and L3 is the lowest price in the same three day period.

In [2]:
df.take(1)

[Row(Date=datetime.date(1962, 1, 2), Open=0.263021, High=0.270182, Low=0.263021, Close=0.263021, Adj Close=0.048528, Volume=806400)]

In [3]:
windowSpec = Window().orderBy("Date")

# Calculate H3 and L3 using the window functions
df = df.withColumn("H3", F.max("Adj Close").over(windowSpec.rowsBetween(-2, 0)))
df = df.withColumn("L3", F.min("Adj Close").over(windowSpec.rowsBetween(-2, 0)))

# Calculate the D line
df = df.withColumn("D", 100 * (F.col("H3") / F.col("L3")))

# Create a binary class column based on the D line buy = 0, sell = 1
df = df.withColumn("Class", F.when(F.col("D") > 102, 1).otherwise(0))

# Show the resulting DataFrame
#df.show()

### Naive Bayes

In [4]:
from pyspark.sql import SparkSession
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql import functions as F


# Drop unnecessary columns
nb_df = df.drop("Date", "D", "label", "L3", "H3")

outcome = "Class"

# Compile all columns that are not the outcome
feature_cols = [col for col in nb_df.columns if col not in [outcome, "D", "label", "L3", "H3"]]

# Convert the "Class" column to a numeric type using StringIndexer
indexer = StringIndexer(inputCol=outcome, outputCol="label")
nb_df = indexer.fit(nb_df).transform(nb_df)

# Create a vector assembler
vector_assembler = VectorAssembler(inputCols=feature_cols, outputCol="nb_features")

# Transform the DataFrame using the vector assembler
nb_df = vector_assembler.transform(nb_df)

# Split the data into training and testing sets
train_data, test_data = nb_df.randomSplit([0.8, 0.2], seed=42)

# Calculate class weights based on the class distribution
class_counts = train_data.groupBy("label").count()
total_count = train_data.count()

# Calculate class weights based on the class distribution
train_data = train_data.join(class_counts.withColumn("classWeight", F.col("count") / total_count), "label")

# Create a Naive Bayes model
naive_bayes = NaiveBayes(featuresCol="nb_features", labelCol="label", smoothing=0.001, weightCol="classWeight")

# Train the model
naive_bayes_model = naive_bayes.fit(train_data)

# Make predictions on the test set
predictions = naive_bayes_model.transform(test_data)

# Evaluate the model using BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="rawPrediction", metricName="areaUnderROC")
area_under_roc = evaluator.evaluate(predictions)

# Print the area under ROC
print(f"Area under ROC: {area_under_roc}")
predictions.select("label", "prediction").show()


23/12/03 16:22:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/03 16:22:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/03 16:22:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/03 16:22:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/03 16:22:01 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/03 16:22:01 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/03 1

Area under ROC: 0.5375634792536022
+-----+----------+
|label|prediction|
+-----+----------+
|  1.0|       1.0|
|  0.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  0.0|       0.0|
|  0.0|       1.0|
|  0.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  0.0|       1.0|
|  1.0|       1.0|
|  0.0|       1.0|
|  0.0|       1.0|
|  1.0|       1.0|
|  0.0|       1.0|
|  0.0|       1.0|
|  0.0|       1.0|
|  0.0|       1.0|
|  0.0|       1.0|
|  0.0|       1.0|
+-----+----------+
only showing top 20 rows



23/12/03 16:22:03 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/03 16:22:03 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/03 16:22:03 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/03 16:22:03 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/03 16:22:03 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


In [5]:
from pyspark.ml.classification import LinearSVC

svm_df = spark.read.csv("KO_class.csv", header=True, inferSchema=True) 
# Drop unnecessary columns
svm_df = svm_df.drop("label","Date", "H3", "L3", "D")


# Define outcome and feature columns
outcome = "Class"
feature_cols = [col for col in svm_df.columns if col not in outcome]

# Vector assembler for feature transformation
vector_assembler = VectorAssembler(inputCols=feature_cols, outputCol="svm_features")
svm_df = vector_assembler.transform(svm_df)

# Split data into training and testing sets
train_data, test_data = svm_df.randomSplit([0.8, 0.2], seed=42)

# Linear SVM model
svm_classifier = LinearSVC(featuresCol="svm_features", labelCol="Class", maxIter=10)
svm_model = svm_classifier.fit(train_data)

# Make predictions on the test set
svm_predictions = svm_model.transform(test_data)

# Evaluate the model
evaluator = MulticlassClassificationEvaluator(labelCol="Class", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(svm_predictions)

# Print the accuracy
print(f"SVM Accuracy: {accuracy}")

# View predictions
svm_predictions.select("Class", "prediction").show(1)

SVM Accuracy: 0.6820412168792934
+-----+----------+
|Class|prediction|
+-----+----------+
|    1|       0.0|
+-----+----------+
only showing top 1 row



23/12/03 16:22:15 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
