<a href="https://colab.research.google.com/github/emmetorior/CN7030-/blob/main/Multifeature_Test_Classifier_from_original_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pyspark pandas numpy



In [1]:
from pyspark.sql import SparkSession
# Importing package
from pyspark.sql.functions import (
    expr, col, lag, lead, window, stddev, mean, first, last,
    when, isnan, count, dayofmonth, month, year, to_date, udf
)
import pyspark.sql.functions as F
from pyspark.ml.linalg import Vectors
from pyspark.sql.window import Window
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.types import StructType, StructField, DoubleType, StringType
from pyspark.sql.functions import udf
import pyspark.sql.functions as F
import pandas as pd
import numpy as np


In [2]:

# Create Spark Session
spark = SparkSession.builder.appName("CN7030 Assn 1 - Stock Price Classifier").getOrCreate()

# Define an explicit schema to ensure consistent types
schema = StructType([
    StructField("open", DoubleType(), True),
    StructField("close", DoubleType(), True),
    StructField("high", DoubleType(), True),
    StructField("low", DoubleType(), True),
    StructField("volume", DoubleType(), True),
    StructField("price_change_percentage", DoubleType(), True)
])

# Sample stock data with consistent double types
data = [
    (100.0, 102.0, 103.0, 99.0, 1000000.0, 2.5),
    (95.0, 93.0, 96.0, 92.0, 800000.0, -3.0),
    (110.0, 108.0, 112.0, 107.0, 1200000.0, -1.0),
    (105.0, 107.0, 108.0, 104.0, 950000.0, 1.5)
]


# Create DataFrame
#df = spark.createDataFrame(data)
df = spark.createDataFrame(data, schema=schema)

# Create label column
df = df.withColumn('price_change_label',
    when(col('price_change_percentage') < -2, 'low')
    .when((col('price_change_percentage') >= -2) & (col('price_change_percentage') < 2), 'medium')
    .otherwise('high')
)


In [None]:

# Prepare features
feature_columns = ['open', 'close', 'high', 'low', 'volume']
assembler = VectorAssembler(
    inputCols=feature_columns,
    outputCol='features'
)
prepared_df = assembler.transform(df)

# Index labels
label_indexer = StringIndexer(
    inputCol='price_change_label',
    outputCol='label'
)
prepared_df = label_indexer.fit(prepared_df).transform(prepared_df)

# Split data
(train_data, test_data) = prepared_df.select('features', 'label').randomSplit([0.7, 0.3], seed=42)

# Train the Logistic Regression with max 10 iterations
lr = LogisticRegression(
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
    labelCol='label',
    featuresCol='features'
)
model = lr.fit(train_data)

# Evaluate the model
predictions = model.transform(test_data)
evaluator = MulticlassClassificationEvaluator(
    labelCol='label',
    predictionCol='prediction',
    metricName='accuracy'
)
accuracy = evaluator.evaluate(predictions)
print(f"Model Accuracy: {accuracy}")




Model Accuracy: 1.0


In [None]:
# replace original columns here - like in the previous one. I just wrote this app with reduced features to make it easier to create.
prepared_df = prepared_df.withColumn('open', col('open')) \
    .withColumn('close', col('close')) \
    .withColumn('price_change_percentage', col('price_change_percentage')) \
    .withColumn('price_change_label', col('price_change_label'))

# Split into training and test sets
(train_data, test_data) = prepared_df.randomSplit([0.7, 0.3], seed=42)

# Training the log reg
lr = LogisticRegression(
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
    labelCol='label',
    featuresCol='features'
)
model = lr.fit(train_data)

# predict
predictions = model.transform(test_data)

# Defines the values - change this later, because we want a different type of label...
def map_prediction_to_label(prediction):
    if prediction == 0:
        return 'low'
    elif prediction == 1:
        return 'medium'
    else:
        return 'high'

# map prediction - user defined func ..
map_prediction_udf = udf(map_prediction_to_label, StringType())

# Use the UDF in the DataFrame
results_df = predictions.select(
    'open',
    'close',
    #
    'price_change_percentage',
    'price_change_label',
    # put prediction here?
    'prediction',
    map_prediction_udf(col('prediction')).alias('predicted_label')
)

# Display results
results_df.show()

+-----+-----+-----------------------+------------------+----------+---------------+
| open|close|price_change_percentage|price_change_label|prediction|predicted_label|
+-----+-----+-----------------------+------------------+----------+---------------+
|105.0|107.0|                    1.5|            medium|       0.0|            low|
+-----+-----+-----------------------+------------------+----------+---------------+



In [None]:

# Define the mapping function
def map_prediction_to_label(prediction):
    if prediction == 0:
        return 'low'
    elif prediction == 1:
        return 'medium'
    else:
        return 'high'

# Register the function as a UDF
map_prediction_udf = udf(map_prediction_to_label, StringType())

# Use the UDF in the DataFrame
results_df = predictions.select(
    'open',
    'close',
    'price_change_percentage',
    'price_change_label',
    'prediction',
    map_prediction_udf(col('prediction')).alias('predicted_label')
)

# Display results
results_df.show()

+-----+-----+-----------------------+------------------+----------+---------------+
| open|close|price_change_percentage|price_change_label|prediction|predicted_label|
+-----+-----+-----------------------+------------------+----------+---------------+
|105.0|107.0|                    1.5|            medium|       0.0|            low|
+-----+-----+-----------------------+------------------+----------+---------------+

