In [None]:
import pandas as pd
from matplotlib import pyplot as plt
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler, MinMaxScaler
from pyspark.ml.regression import GeneralizedLinearRegression, DecisionTreeRegressor, RandomForestRegressor, \
    GBTRegressor
from pyspark.ml.regression import LinearRegression
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.sql import SparkSession
from pyspark.sql.functions import count, when, isnan, col, lag, to_date, current_date, date_sub, row_number
from pyspark.sql.types import StructType, StructField, StringType, LongType, DoubleType
from pyspark.sql.window import Window

In [None]:
# Obtain the active SparkSession, starting one if none exists yet
spark = SparkSession.builder.getOrCreate()

# constants
HDFS_PATH = 'hdfs://10.84.129.52:9000/trab/g05'
TICKERS = ['AAPL', 'MSFT', 'GOOG', 'AMZN', 'V']

# Column layout applied when reading the raw CSV files; every field is nullable.
SCHEMA = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("Date", StringType()),
        ("Low", DoubleType()),
        ("Open", DoubleType()),
        ("Volume", LongType()),
        ("High", DoubleType()),
        ("Close", DoubleType()),
        ("Adjusted_Close", DoubleType()),
    )
])

# Candidate regressors keyed by their class name. All of them read the
# 'scaledFeatures' vector column and are trained against 'Next_Day_Close'.
MODELS = {
    estimator_cls.__name__: estimator_cls(featuresCol='scaledFeatures', labelCol='Next_Day_Close')
    for estimator_cls in (
        LinearRegression,
        GeneralizedLinearRegression,
        DecisionTreeRegressor,
        RandomForestRegressor,
        GBTRegressor,
    )
}

In [None]:
# Spark DataFrames keyed by ticker symbol: filled by the loading loop and
# later replaced with the preprocessed frame by preprocess_data.
dataframes = {}


def check_missing_values(df, ticker):
    """Print, for every column of ``df``, how many values are NaN or null.

    Args:
        df: Spark DataFrame to inspect.
        ticker: Label used in the printed heading.
    """
    def _missing_count(column_name):
        # A value counts as missing when it is NaN or null.
        is_missing = isnan(column_name) | col(column_name).isNull()
        return count(when(is_missing, column_name)).alias(column_name)

    report = df.select([_missing_count(name) for name in df.columns])
    print(f"Missing values for {ticker}:")
    report.show()

# Load every CSV file found under each ticker's data directory using the fixed
# SCHEMA, keep the result in `dataframes`, and report its missing values.
for ticker in TICKERS:
    csv_glob = f"{HDFS_PATH}/data/{ticker}/*.csv"
    raw_df = spark.read.csv(csv_glob, header=True, schema=SCHEMA)
    dataframes[ticker] = raw_df
    check_missing_values(raw_df, ticker)

In [None]:
def preprocess_data(ticker):
    """Turn the raw price table of ``ticker`` into a model-ready DataFrame.

    Reads ``dataframes[ticker]`` and stores the processed frame back under the
    same key. The processing steps are:

    1. parse ``Date`` (``dd-MM-yyyy`` strings) into a date column;
    2. keep only rows dated within the last ``365 * 10`` days;
    3. drop ``Adjusted_Close``;
    4. add ``Next_Day_Close``: the ``Close`` of the following row in ``Date`` order;
    5. drop rows with missing values (this removes the final row, which has no
       following day);
    6. assemble ``Low``, ``Open``, ``Volume``, ``High`` and ``Close`` into
       ``features`` and min-max scale them into ``scaledFeatures``.

    Calling the function again for a ticker that has already been processed is
    a no-op, so the driver cell can be re-run without reloading the raw data.

    Args:
        ticker: Key into ``dataframes``.
    """
    df = dataframes[ticker]

    # Idempotence guard: the processed frame overwrites the raw one below, and
    # VectorAssembler refuses to write to an output column that already exists,
    # so a second pass over an already-processed frame would raise.
    if "scaledFeatures" in df.columns:
        return

    # Convert 'Date' column to datetime
    df = df.withColumn("Date", to_date(df["Date"], 'dd-MM-yyyy'))

    # Filter data from the last 10 years (approximated as 365 * 10 days)
    df = df.filter(df["Date"] >= date_sub(current_date(), 365 * 10))

    # Drop 'Adjusted_Close' column
    df = df.drop('Adjusted_Close')

    # Shift the 'Close' column up by one row to make the model predict for the next day
    # (lag with a negative offset looks ahead instead of behind).
    window = Window.orderBy(df['Date'])
    df = df.withColumn('Next_Day_Close', lag(df['Close'], -1).over(window))

    # Drop the rows with null values that result from the shift
    df = df.dropna()

    # Assemble the features into a feature vector
    assembler = VectorAssembler(
        inputCols=["Low", "Open", "Volume", "High", "Close"],
        outputCol="features"
    )
    df = assembler.transform(df)

    # Normalize the features with MinMaxScaler
    # NOTE(review): the scaler is fitted on every row, including those that
    # split_data later assigns to the test set, and it is not saved together
    # with the model - consider fitting it on the training split only.
    scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")
    scalerModel = scaler.fit(df)
    df = scalerModel.transform(df)

    # Update the DataFrame in the dictionary
    dataframes[ticker] = df

In [None]:
# Per-ticker results filled in by train_predict: the winning fitted model and
# that model's predictions on the test split.
best_models = {}
predictions_dict = {}


def save_model(model, ticker):
    """Persist ``model`` under ``HDFS_PATH``'s ``models`` directory.

    The target is ``<HDFS_PATH>/models/<ticker>_model``; an existing model at
    that location is overwritten.

    Args:
        model: Fitted model exposing ``write()``.
        ticker: Ticker symbol used to build the target path.
    """
    destination = f"{HDFS_PATH}/models/{ticker}_model"
    writer = model.write()
    writer.overwrite().save(destination)

# Split the data into training and testing sets
def split_data(df, order_col='Date', train_fraction=0.75):
    """Split ``df`` chronologically into a training and a testing DataFrame.

    Rows are numbered in ascending ``order_col`` order; the first
    ``int(count * train_fraction)`` rows form the training set and the
    remaining rows the test set. This replaces ``limit()`` + ``subtract()``:
    ``limit()`` on an unordered DataFrame does not promise which rows it
    returns, and ``subtract()`` is an EXCEPT DISTINCT that de-duplicates and
    shuffles the hold-out rows.

    Args:
        df: Spark DataFrame containing ``order_col``.
        order_col: Column defining the chronological order (default ``'Date'``).
        train_fraction: Share of rows assigned to the training set (default 0.75).

    Returns:
        Tuple ``(train_data, test_data)``.

    Note:
        Assumes ``order_col`` values are unique (one row per trading day);
        with ties the row numbering at the boundary is not deterministic.
    """
    train_size = int(df.count() * train_fraction)

    # Temporary 1-based position of each row in chronological order.
    position_col = '_split_row_number'
    chronological = Window.orderBy(col(order_col))
    numbered = df.withColumn(position_col, row_number().over(chronological))

    train_data = numbered.filter(col(position_col) <= train_size).drop(position_col)
    test_data = numbered.filter(col(position_col) > train_size).drop(position_col)
    return train_data, test_data


# Train the models and make predictions
def train_predict(ticker):
    """Fit every candidate in ``MODELS`` for ``ticker`` and keep the best one.

    The preprocessed frame in ``dataframes[ticker]`` is split with
    ``split_data``. Each candidate is fitted through a 10-fold
    ``CrossValidator`` with an empty parameter grid and scored by RMSE on the
    test split. The model with the lowest RMSE is stored in
    ``best_models[ticker]``, its test predictions in
    ``predictions_dict[ticker]``, its RMSE / MAE / R2 are printed, and the
    model is saved with ``save_model``.

    Args:
        ticker: Key into ``dataframes``.

    Raises:
        RuntimeError: If no candidate produced an RMSE below infinity (for
            example when the metric is NaN), so no best model could be chosen.
    """
    df = dataframes[ticker]

    train_data, test_data = split_data(df)

    # One evaluator serves both cross-validation and the hold-out scoring;
    # evaluate() with a param map works on a copy, so sharing it is safe.
    evaluator = RegressionEvaluator(labelCol="Next_Day_Close", predictionCol="prediction")

    # The winner is tracked in locals and only published to the module-level
    # dictionaries once selection has finished, so a failed run cannot leave a
    # half-updated or stale entry behind.
    best_model_name = None
    best_model = None
    best_predictions = None
    best_rmse = float('inf')

    for model_name, model in MODELS.items():
        # Define cross-validation
        crossval = CrossValidator(estimator=model,
                                  estimatorParamMaps=ParamGridBuilder().build(),  # empty parameter grid
                                  evaluator=evaluator,
                                  numFolds=10)  # 10-fold cross-validation

        # Run cross-validation; with an empty grid the best model is simply
        # the estimator refitted on all of train_data
        cvModel = crossval.fit(train_data)

        # Make predictions on the test data
        predictions = cvModel.transform(test_data)

        # Evaluate the model
        rmse = evaluator.evaluate(predictions, {evaluator.metricName: "rmse"})

        # NOTE(review): the winner is picked on the same hold-out set whose
        # metrics are printed below, so those figures are optimistic.
        if rmse < best_rmse:
            best_rmse = rmse
            best_model_name = model_name
            best_model = cvModel.bestModel
            best_predictions = predictions

    if best_model is None:
        raise RuntimeError(
            f"No model produced a usable RMSE for {ticker}; "
            "check that the train and test splits are not empty."
        )

    # Publish the winner and its predictions on the test data
    best_models[ticker] = best_model
    predictions_dict[ticker] = best_predictions

    # Evaluate the best model (its RMSE was already computed during selection)
    rmse = best_rmse
    mae = evaluator.evaluate(best_predictions, {evaluator.metricName: "mae"})
    r2 = evaluator.evaluate(best_predictions, {evaluator.metricName: "r2"})

    print(f"Best model for {ticker} is {best_model_name} with performance:")
    print(f"RMSE: {rmse}")
    print(f"MAE: {mae}")
    print(f"R2: {r2}\n")

    # Save the best model
    save_model(best_model, ticker)

In [None]:
def plot_predictions(ticker):
    """Plot actual versus predicted next-day close prices for ``ticker``.

    Takes the stored predictions from ``predictions_dict[ticker]``, brings the
    ``Date``, ``Next_Day_Close`` and ``prediction`` columns to pandas, orders
    them by date and draws both series on one figure.

    Args:
        ticker: Key into ``predictions_dict``.
    """
    # Get the predictions for the ticker
    predictions = predictions_dict[ticker]

    # Convert to Pandas DataFrame
    predictions_pd = predictions.select("Date", "Next_Day_Close", "prediction").toPandas()

    # Convert 'Date' column to datetime format
    predictions_pd['Date'] = pd.to_datetime(predictions_pd['Date'])

    # Index by date and sort: Spark does not guarantee the order in which rows
    # are returned, and an unsorted index makes the line jump back and forth.
    predictions_pd = predictions_pd.set_index('Date').sort_index()

    # Plot actual vs predicted values
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(predictions_pd.index, predictions_pd['Next_Day_Close'], label='Actual')
    ax.plot(predictions_pd.index, predictions_pd['prediction'], label='Predicted')
    ax.set_title(f'Actual vs Predicted Close Prices for {ticker}')
    ax.set_xlabel('Date')
    ax.set_ylabel('Price')
    ax.legend()
    plt.show()

In [None]:
# Run the full pipeline one ticker at a time: preprocess the data, train and
# evaluate the candidate models, then plot the winning model's predictions.
for ticker in TICKERS:
    for stage in (preprocess_data, train_predict, plot_predictions):
        stage(ticker)

In [None]:
# Stop the Spark session created at the top of the notebook now that every
# ticker has been processed; `spark` must be re-created before further use.
spark.stop()