<a href="https://colab.research.google.com/github/emmetorior/CN7030-/blob/main/original_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install pyspark pandas numpy


from pyspark.sql import SparkSession
# Importing package
from pyspark.sql.functions import (
    expr, col, lag, lead, window, stddev, mean, first, last,
    when, isnan, count
)
from pyspark.ml.linalg import Vectors
from pyspark.sql.window import Window
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
import pandas as pd
import numpy as np

# Obtain the Spark session for this notebook, reusing one if it already exists.
spark = (
    SparkSession.builder
    .appName("AAPL Stock Prediction")
    .getOrCreate()
)

# Sample AAPL stock data. This function is based on a generic snippet found via Google and only contains the bog-standard columns, so no citation is needed. It will be deleted once we read from the real dataset.
def create_sample_stock_data(start='2023-01-01', end='2023-12-31', seed=88,
                             path='AAPL.csv'):
    """Generate simulated daily stock data, write it to CSV and return it.

    FAKE DATA: ``High``, ``Low`` and ``Close`` are three independent random
    walks (starting near 250, 200 and 145 respectively), so the values are
    not mutually consistent and must not be read as real prices.

    Args:
        start: First calendar date of the range (anything ``pd.date_range``
            accepts).
        end: Last calendar date of the range.
        seed: Seed for the random draws; the same seed gives the same data.
        path: Destination of the CSV file (written without the index).

    Returns:
        pandas.DataFrame with columns ``Date``, ``Volume``, ``High``,
        ``Low`` and ``Close``, one row per business day in the range.
    """
    # freq='B' keeps business days (Mon-Fri) only.
    dates = pd.date_range(start=start, end=end, freq='B')
    n_days = len(dates)

    # A private RandomState produces the same stream as np.random.seed(seed)
    # followed by the module-level functions, but leaves NumPy's global
    # random state untouched for the rest of the program.
    rng = np.random.RandomState(seed)

    # The draw order (Volume, High, Low, Close) determines the values for a
    # given seed, so keep it stable.
    data = {
        'Date': dates,
        'Volume': rng.randint(1000000, 5000000, n_days),
        'High': 250 + np.cumsum(rng.normal(0.1, 1, n_days)),
        'Low': 200 + np.cumsum(rng.normal(-0.1, 1, n_days)),
        'Close': 145 + np.cumsum(rng.normal(0, 1, n_days)),
    }

    df = pd.DataFrame(data)
    df.to_csv(path, index=False)  # temporary: remove once the real dataset is read
    return df

# Write the simulated CSV to disk so it can be read back through Spark.
create_sample_stock_data()

# Load the CSV with an inferred schema (for the moment), then make sure the
# Date column is a timestamp. Explicit column formats are still to be set once
# the final list of columns is known.
df = (
    spark.read
    .csv('AAPL.csv', inferSchema=True, header=True)
    .withColumn("Date", col("Date").cast("timestamp"))
)


# Trailing, date-ordered row windows that end on the current row. Neither uses
# partitionBy, so each one runs over the whole DataFrame in Date order.
window_spec = Window.orderBy("Date").rowsBetween(-8, 0)   # up to 9 rows
ema_window = Window.orderBy("Date").rowsBetween(-24, 0)   # up to 25 rows

df = (
    df
    # 1. 9-day moving average of Close (current row included).
    .withColumn("9_Day_MA", mean("Close").over(window_spec))
    # 2. Trading range: High minus Low.
    .withColumn("Trading_Range", col("High") - col("Low"))
    # 3. Close of the preceding row in Date order (null on the first row).
    .withColumn("Prev_Close", lag("Close", 1).over(Window.orderBy("Date")))
    # 4. Simulated QQQ proxy; real QQQ data would be loaded in a real scenario.
    #    NOTE(review): this scales the same-day Close, not Prev_Close - confirm
    #    that is intended given the column name.
    .withColumn("QQQ_Prev_Close", col("Close") * 1.05)
    # 5a. NOTE(review): despite the name, this is a plain mean over the
    #     trailing window, not an exponentially weighted average.
    .withColumn("25_Day_EMA", mean("Close").over(ema_window))
    # 5b. Standard deviation, over the same window, of the absolute gap
    #     between Close and the 25-day average.
    .withColumn(
        "EMA_Distance_StdDev",
        stddev(expr("abs(Close - `25_Day_EMA`)")).over(ema_window),
    )
)

# Discard every row that contains a null in any column.
df = df.dropna()

# Print the DataFrame's repr, then preview its rows.
print(df)
df.show()

# Prepare features and label.
# Columns fed to the model as the input feature vector.
feature_columns = [
    "9_Day_MA",
    "Trading_Range",
    "Prev_Close",
    "QQQ_Prev_Close",
    "EMA_Distance_StdDev",
]

# Target: the next row's closing price in Date order.
df = df.withColumn("Next_Close", lead("Close", 1).over(Window.orderBy("Date")))

# The last row has no following day, so lead() leaves its label null. Remove
# it here, before the train/test split: a null label that lands in the
# training split makes the regression fit fail, and one in the test split
# breaks evaluation.
df = df.na.drop(subset=["Next_Close"])

# Assemble the feature columns into a single vector column named "features".
assembler = VectorAssembler(
    inputCols=feature_columns,
    outputCol="features",
)
df = assembler.transform(df)

# Split the rows roughly 80/20 at random, with a fixed seed for repeatability.
# NOTE(review): the split is random rather than chronological, so rows dated
# after some test rows can end up in the training set - confirm this is
# acceptable for a next-day prediction task.
train_data, test_data = df.randomSplit([0.8, 0.2], seed=42)

# Fit a linear regression of Next_Close on the assembled feature vector.
lr = LinearRegression(
    labelCol="Next_Close",
    featuresCol="features",
    predictionCol="predicted_close",
)
model = lr.fit(train_data)

# Score the held-out rows and show the resulting schema.
predictions = model.transform(test_data)
predictions.printSchema()

# Rows without a Next_Close label cannot be scored against a prediction
# (they previously caused an error during evaluation), so remove them first.
predictions = predictions.dropna(subset=["Next_Close"])

# Root-mean-squared error between the label and the predicted close.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="Next_Close",
    predictionCol="predicted_close",
)
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error a.k.a RMSE: {rmse}")

# Show a sample of actual versus predicted closes.
sample_view = predictions.select("Date", "Close", "Next_Close", "predicted_close")
sample_view.show()


DataFrame[Date: timestamp, Volume: int, High: double, Low: double, Close: double, 9_Day_MA: double, Trading_Range: double, Prev_Close: double, QQQ_Prev_Close: double, 25_Day_EMA: double, EMA_Distance_StdDev: double]
+-------------------+-------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+
|               Date| Volume|              High|               Low|             Close|          9_Day_MA|     Trading_Range|        Prev_Close|    QQQ_Prev_Close|        25_Day_EMA|EMA_Distance_StdDev|
+-------------------+-------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+
|2023-01-03 00:00:00|4250464|251.29751895683012|202.72325240823918|145.34681487398973|145.55648869524066| 48.57426654859094| 145.7661625164916|152.61415561768922|145.556488695240