In [None]:
from matplotlib import pyplot as plt
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler, MinMaxScaler
from pyspark.ml.regression import LinearRegression
from pyspark.sql import SparkSession
from pyspark.sql.functions import count, when, isnan, col, lag, to_date, current_date, date_sub
from pyspark.sql.types import StructType, StructField, StringType, LongType, DoubleType
from pyspark.sql.window import Window
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [None]:
# Reuse the active Spark session if one exists, otherwise start a new one
spark = SparkSession.builder.getOrCreate()

# constants
HDFS_PATH = 'hdfs://10.84.129.52:9000/trab/g05'  # root folder for input data and saved models
TICKERS = ['AAPL', 'MSFT', 'GOOG', 'AMZN', 'V']  # stock symbols processed by this notebook

# Explicit schema applied when reading the per-ticker CSV files.
# 'Date' is read as a plain string and parsed into a date during preprocessing.
SCHEMA = StructType([
    StructField(column_name, column_type, True)  # True -> column is nullable
    for column_name, column_type in (
        ("Date", StringType()),
        ("Low", DoubleType()),
        ("Open", DoubleType()),
        ("Volume", LongType()),
        ("High", DoubleType()),
        ("Close", DoubleType()),
        ("Adjusted_Close", DoubleType()),
    )
])

In [None]:
# Raw DataFrame per ticker, filled by the loading loop and read by preprocess_data
dataframes = {}


def check_missing_values(df, ticker):
    """Print, for every column of ``df``, how many values are missing.

    A value counts as missing when it is NULL; for floating-point columns
    (Spark types ``double`` / ``float``) NaN counts as missing as well.

    ``isnan`` is only applied to floating-point columns: it is not defined
    for other types, so calling it on string, integer or date columns either
    depends on implicit casts or fails during analysis.

    Args:
        df: Spark DataFrame to inspect.
        ticker: Label used in the printed header.

    Returns:
        None. The per-column counts are printed with ``DataFrame.show()``.
    """
    floating_point_types = ("double", "float")

    missing_counts = []
    # df.dtypes yields (column name, Spark type name) pairs
    for column_name, type_name in df.dtypes:
        is_missing = col(column_name).isNull()
        if type_name in floating_point_types:
            is_missing = is_missing | isnan(col(column_name))
        # when() without otherwise() yields NULL for non-matching rows,
        # and count() skips NULLs, so only the missing rows are counted.
        missing_counts.append(count(when(is_missing, column_name)).alias(column_name))

    missing_values = df.select(missing_counts)
    print(f"Missing values for {ticker}:")
    missing_values.show()


# Load every CSV file found under each ticker's data directory, keep the
# resulting DataFrame in `dataframes`, and report its missing values
for ticker in TICKERS:
    ticker_df = spark.read.csv(
        f'{HDFS_PATH}/data/{ticker}/*.csv',
        header=True,
        schema=SCHEMA,
    )
    dataframes[ticker] = ticker_df
    check_missing_values(ticker_df, ticker)

In [None]:
def split_data(df, train_fraction=0.75):
    """Split ``df`` into a leading training part and the remaining test part.

    The training set is the first ``int(df.count() * train_fraction)`` rows
    returned by ``df.limit``; the test set is ``df`` minus those rows.

    Notes:
        * ``limit`` takes rows in whatever order ``df`` produces them, so the
          caller must order ``df`` beforehand (e.g. by date) for the split to
          be chronological.
        * ``subtract`` is a set difference (EXCEPT DISTINCT per the PySpark
          docs): the test set carries no ordering guarantee and duplicate
          rows are collapsed. Sort it again if order matters.

    Args:
        df: DataFrame to split.
        train_fraction: Share of rows assigned to the training set; must be
            strictly between 0 and 1. Defaults to 0.75.

    Returns:
        Tuple ``(train_set, test_set)``.

    Raises:
        ValueError: If ``train_fraction`` is not strictly between 0 and 1.
    """
    if not 0.0 < train_fraction < 1.0:
        raise ValueError(
            f"train_fraction must be strictly between 0 and 1, got {train_fraction!r}"
        )

    train_size = int(df.count() * train_fraction)
    train_set = df.limit(train_size)
    test_set = df.subtract(train_set)
    return train_set, test_set


# Clean the raw data of one ticker, engineer features, split into train/test
# sets, scale the features and persist the fitted scaler
def preprocess_data(ticker):
    """Turn the raw DataFrame of ``ticker`` into scaled train and test sets.

    Steps, in order: parse 'Date' (format dd-MM-yyyy), keep rows dated within
    the last 365 * 10 days, sort by date, drop 'Adjusted_Close', add
    'Close_Open_Diff' (Close - Open) and 'Prev_Close' (Close of the preceding
    row by date), drop rows containing nulls, assemble the feature vector,
    split with ``split_data``, fit a MinMaxScaler on the training set only and
    apply it to both sets. The fitted scaler is written under
    ``{HDFS_PATH}/models/{ticker}/{ticker}_scaler``, overwriting any previous one.

    Args:
        ticker: Key into the module-level ``dataframes`` dict.

    Returns:
        Tuple ``(scaled_train_set, scaled_test_set)``; both carry the raw
        'features' column and the normalised 'scaledFeatures' column.
    """
    # NOTE(review): 'Open' + 'Close_Open_Diff' equals 'Close', which is the
    # label used downstream -- the target looks recoverable from the features.
    # Confirm this is intended.
    feature_columns = ["Low", "Open", "Volume", "High", "Close_Open_Diff", "Prev_Close"]
    oldest_allowed_date = date_sub(current_date(), 365 * 10)

    prepared = (
        dataframes[ticker]
        .withColumn("Date", to_date(col("Date"), 'dd-MM-yyyy'))
        .filter(col("Date") >= oldest_allowed_date)
        .orderBy("Date")
        .drop('Adjusted_Close')
        .withColumn('Close_Open_Diff', col('Close') - col('Open'))
        # No partitionBy: the window spans the whole ticker history
        .withColumn("Prev_Close", lag("Close").over(Window.orderBy("Date")))
        # Removes any row with a null, e.g. the earliest row, which has no previous close
        .na.drop()
    )

    # Pack the feature columns into a single vector column
    assembled = VectorAssembler(
        inputCols=feature_columns,
        outputCol="features",
    ).transform(prepared)

    train_set, test_set = split_data(assembled)

    # Fit the scaler on the training data only, then apply it to both sets
    scaler_model = MinMaxScaler(inputCol="features", outputCol="scaledFeatures").fit(train_set)
    scaled_train_set, scaled_test_set = (
        scaler_model.transform(subset) for subset in (train_set, test_set)
    )

    # Persist the fitted scaler next to the ticker's model
    scaler_model.write().overwrite().save(f'{HDFS_PATH}/models/{ticker}/{ticker}_scaler')

    return scaled_train_set, scaled_test_set

In [None]:
# Test-set predictions per ticker, filled by train_predict and read by plot_predictions
predictions_dict = {}


def cross_validate_model(model, train_set, num_folds=10, seed=42):
    """Tune ``model`` with k-fold cross-validation over a fixed parameter grid.

    The grid covers ``regParam`` in {0.1, 0.01} and ``elasticNetParam`` in
    {0.0, 0.5, 1.0}. Candidates are scored with a ``RegressionEvaluator`` on
    label 'Close' using its default metric.

    A fixed ``seed`` is passed to ``CrossValidator`` so that fold assignment,
    and therefore the selected model, is reproducible between runs.

    NOTE(review): CrossValidator assigns rows to folds at random; for
    time-ordered data this lets validation folds contain rows older than the
    training rows. Confirm this is acceptable.

    Args:
        model: Estimator exposing ``regParam`` and ``elasticNetParam``.
        train_set: Training DataFrame containing the model's feature column
            and the 'Close' label.
        num_folds: Number of cross-validation folds. Defaults to 10.
        seed: Random seed for the fold split. Defaults to 42.

    Returns:
        The fitted ``CrossValidatorModel``.
    """
    # Candidate hyper-parameter combinations (2 x 3 = 6)
    param_grid = (
        ParamGridBuilder()
        .addGrid(model.regParam, [0.1, 0.01])
        .addGrid(model.elasticNetParam, [0.0, 0.5, 1.0])
        .build()
    )

    evaluator = RegressionEvaluator(labelCol="Close", predictionCol="prediction")

    crossval = CrossValidator(
        estimator=model,
        estimatorParamMaps=param_grid,
        evaluator=evaluator,
        numFolds=num_folds,
        seed=seed,
    )

    # Fit every candidate on every fold and keep the best one
    return crossval.fit(train_set)


# Fit a cross-validated linear regression, predict on the test set, report
# the metrics and persist the best model
def train_predict(train_set, test_set, ticker):
    """Train, evaluate and save the regression model for one ticker.

    Side effects: stores the test-set predictions in ``predictions_dict``
    under ``ticker``, prints RMSE, MAE and R2, and writes the best model to
    ``{HDFS_PATH}/models/{ticker}/{ticker}_model`` (overwriting any existing one).

    Args:
        train_set: DataFrame with 'scaledFeatures' and 'Close' used for fitting.
        test_set: DataFrame with the same columns used for evaluation.
        ticker: Symbol used as dictionary key, in the printed report and in
            the save path.

    Returns:
        None.
    """
    regression = LinearRegression(featuresCol='scaledFeatures', labelCol='Close')
    cv_model = cross_validate_model(regression, train_set)

    # Keep the test-set predictions around for plotting later
    predictions = cv_model.transform(test_set)
    predictions_dict[ticker] = predictions

    # Score the predictions once per metric, overriding metricName per call
    evaluator = RegressionEvaluator(labelCol="Close", predictionCol="prediction")
    scores = {}
    for metric in ("rmse", "mae", "r2"):
        scores[metric] = evaluator.evaluate(predictions, {evaluator.metricName: metric})

    print(f"Evaluation metrics for {ticker}")
    print(f"RMSE: {scores['rmse']}")
    print(f"MAE: {scores['mae']}")
    print(f"R2: {scores['r2']}\n")

    # Only the winning model of the grid search is saved
    cv_model.bestModel.write().overwrite().save(f'{HDFS_PATH}/models/{ticker}/{ticker}_model')

In [None]:
def plot_predictions(ticker):
    """Plot actual vs. predicted 'Close' prices of ``ticker`` over time.

    Reads the predictions stored in ``predictions_dict[ticker]``, collects the
    'Date', 'Close' and 'prediction' columns to pandas and draws both series
    against the date.

    The rows are sorted by date before plotting: the predictions DataFrame
    gives no ordering guarantee, and a line plot over unsorted dates would
    zig-zag back and forth in time.

    Args:
        ticker: Key into ``predictions_dict``.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    predictions = predictions_dict[ticker]

    # Collect only the plotted columns, index by date and order chronologically
    predictions_pd = (
        predictions.select("Date", "Close", "prediction")
        .toPandas()
        .set_index('Date')
        .sort_index()
    )

    # Plot actual vs predicted values
    plt.figure(figsize=(12, 6))
    plt.plot(predictions_pd['Close'], label='Actual')
    plt.plot(predictions_pd['prediction'], label='Predicted')
    plt.title(f'Actual vs Predicted Close Prices for {ticker}')
    plt.xlabel('Date')
    plt.ylabel('Price')
    plt.legend()
    plt.show()

In [None]:
# Run the whole pipeline (preprocess -> train/evaluate -> plot) for each ticker
for ticker in TICKERS:
    scaled_splits = preprocess_data(ticker)  # (scaled train set, scaled test set)
    train_predict(*scaled_splits, ticker)
    plot_predictions(ticker)

In [None]:
# Shut down the SparkSession obtained at the top of the notebook
spark.stop()