In [303]:
import csv
import numpy as np
from sklearn.preprocessing import MinMaxScaler 
from sklearn.linear_model import LinearRegression
from tensorflow import keras
from sklearn.metrics import mean_squared_error


### Data formatting

In [304]:
# Containers filled by the two CSV passes below.
sp500_data = []  # raw rows of sp500wiki.csv, one dict per row
data = {}        # symbol -> list of feature rows, in the order they appear in data.csv
date = {}        # date string -> row index assigned while reading data.csv

# newline='' is the csv-module-recommended way to open files so that the reader
# does its own newline handling. The `with` block closes the file on exit, so no
# explicit close() is needed.
with open('sp500wiki.csv', 'r', newline='') as file:
    for row in csv.DictReader(file):
        sp500_data.append(row)
        data[row['Symbol']] = []  # start an empty history for every listed symbol

# Columns copied from data.csv into each feature row, in this order.
columnNames = ['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'News - All News Volume', 'News - Volume']

with open('data.csv', 'r', newline='') as file:
    for row in csv.DictReader(file):
        history = data[row['Symbol']]
        # Replace the date string with the row's position within its symbol's history.
        # NOTE(review): `date` is shared by all symbols, so the index kept for a date
        # is the one from whichever symbol was read last - confirm that every symbol
        # covers the same dates.
        index = len(history)
        date[row['Date']] = index
        row['Date'] = index
        history.append([row[col] for col in columnNames])

# remove rows with any empty values
# Each list is rebuilt in place with slice assignment. Calling remove() on a list
# while iterating over it skips the element that follows every removal, so some
# incomplete rows would survive. columnNames is deliberately left untouched: it is
# used afterwards to look up column positions by name.
for symbol_data in data.values():
    symbol_data[:] = [row for row in symbol_data if '' not in row]

# calculate 5-day moving average values for 4 features
def calculate_moving_average(feature, data):
    """Append a 5-row trailing mean of `feature` to each row that has five predecessors.

    For every list in `data.values()`, each row at position 5 or later gets one
    extra element appended: the mean of `feature` over the five rows immediately
    before it (the row itself is not included). Rows at positions 0-4 are left
    unchanged. The column of `feature` is looked up in the module-level
    `columnNames` list. Rows are modified in place; nothing is returned.
    """
    for history in data.values():
        for position, current_row in enumerate(history):
            if position < 5:
                # fewer than five earlier rows: no average can be formed
                continue
            column = columnNames.index(feature)
            previous_values = [
                float(history[position - back][column]) for back in range(1, 6)
            ]
            current_row.append(sum(previous_values) / 5)

# Append the four moving-average columns. The order Open, High, Low, Close fixes
# their positions at the end of every row.
for feature in ('Open', 'High', 'Low', 'Close'):
    calculate_moving_average(feature, data)

# removing rows without 5-day moving average values (the first five of each symbol)
for symbol_data in data.values():
    symbol_data[:5] = []

# changing string to float, cell by cell and in place
for symbol_data in data.values():
    for row in symbol_data:
        for i, cell in enumerate(row):
            row[i] = float(cell)

In [309]:
# making X dataset and y dataset for specific symbol and 4 features(open, high, low, close)
def createData(symbol, value, data):
    """Build lagged feature rows and targets for one symbol and one price column.

    Each row of ``data[symbol]`` is expected to start with the date index followed
    by Open, High, Low, Close (columns 0-4) and to end with the four 5-day moving
    averages in Open, High, Low, Close order (columns -4 to -1).

    Args:
        symbol: key into `data`; an unknown symbol yields two empty lists.
        value: one of 'Open', 'High', 'Low', 'Close' - the column to predict.
        data: mapping of symbol -> list of numeric rows.

    Returns:
        (xData, yData), both ordered from the last row backwards. For row i,
        the x entry is the first eight columns of row i-1 plus the moving
        average of `value` stored on row i; the y entry is `value` on row i.

    Raises:
        ValueError: if `value` is not one of the four supported columns.
    """
    # Column 0 holds the date index, so the price columns start at 1.
    target_columns = {'Open': 1, 'High': 2, 'Low': 3, 'Close': 4}
    if value not in target_columns:
        raise ValueError(
            f"value must be one of {sorted(target_columns)}, got {value!r}"
        )
    target_col = target_columns[value]
    # The matching moving average sits in the last four columns: -4 (Open) .. -1 (Close).
    moving_avg_col = target_col - 5

    symbolData = data.get(symbol, [])

    xData = []
    yData = []
    for i in range(len(symbolData) - 1, 0, -1):
        # 1-day lag of the leading columns plus the 5-day moving average of the target.
        # NOTE(review): [0:8] stops before the ninth loaded column ('News - Volume');
        # confirm whether leaving it out is intentional.
        xRow = symbolData[i - 1][0:8] + [symbolData[i][moving_avg_col]]
        # current-day value of the requested price column
        yVal = symbolData[i][target_col]
        xData.append(xRow)
        yData.append(yVal)

    return (xData, yData)

### Linear Regression

In [343]:
# Scaler for the regression target. It stays at module level because mlr_predict
# uses it to map predictions back to the original scale.
MLRscaler = MinMaxScaler()

def linear_regression(symbol, value, data):
    """Fit a linear regression on the lagged features of one symbol.

    The samples returned by createData are split 80/20 in their given order,
    features and target are min-max scaled (each with its own scaler, fitted on
    the training part only), the model is fitted, and the RMSE on the held-out
    part is printed in scaled units.

    Args:
        symbol: symbol whose rows are used.
        value: price column to predict, passed through to createData.
        data: mapping of symbol -> prepared rows.

    Returns:
        The fitted LinearRegression. The fitted feature scaler is attached to it
        as ``feature_scaler_`` so that prediction code can scale new inputs the
        same way; the target scaler is the module-level MLRscaler.
    """
    xData, yData = createData(symbol, value, data)

    # training data 0.8, testing data 0.2
    split = int(len(xData) * 0.8)
    x_train, x_test = xData[:split], xData[split:]
    y_train, y_test = yData[:split], yData[split:]

    # Features get their own scaler. Reusing MLRscaler for both X and y would
    # discard the feature scaling as soon as the target is fitted.
    feature_scaler = MinMaxScaler()
    x_train = feature_scaler.fit_transform(x_train)
    x_test = feature_scaler.transform(x_test)

    # The scaler expects 2-D input, so the targets become a single column.
    y_train = MLRscaler.fit_transform(np.array(y_train).reshape(-1, 1))
    y_test = MLRscaler.transform(np.array(y_test).reshape(-1, 1))

    model = LinearRegression()
    model.fit(x_train, y_train)
    model.feature_scaler_ = feature_scaler

    y_pred = model.predict(x_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f'Root Mean Squared Error (RMSE): {rmse}')

    return model



In [341]:

def mlr_predict(symbol, desired_date, value, model):
    """Predict `value` for `symbol` from the feature row tied to `desired_date`.

    The feature row used is the one whose lagged (previous-row) date index equals
    the index stored for `desired_date` in the module-level `date` map.

    Args:
        symbol: symbol to predict for.
        desired_date: key of the module-level `date` map.
        value: price column, passed through to createData.
        model: fitted regressor. If it carries a ``feature_scaler_`` attribute,
            that scaler is used for the input features.

    Returns:
        2-D array holding the prediction mapped back through MLRscaler.

    Raises:
        KeyError: if `desired_date` is not in `date`.
        ValueError: if no feature row matches the date for this symbol.
    """
    desired_index = date[desired_date]

    x, _ = createData(symbol, value, data)
    desired_data = next((row for row in x if row[0] == desired_index), None)
    if desired_data is None:
        raise ValueError(
            f'no feature row for symbol {symbol!r} on date {desired_date!r}'
        )

    features = np.array(desired_data, dtype=float)
    feature_scaler = getattr(model, 'feature_scaler_', None)
    if feature_scaler is not None:
        # One sample with all features in a row, scaled column by column exactly
        # as the training features were.
        desired_data_normalized = feature_scaler.transform(features.reshape(1, -1))
    else:
        # Fallback for models without their own feature scaler: scale with the
        # module-level scaler, one value per row, then restore the row shape.
        desired_data_normalized = MLRscaler.transform(
            features.reshape(-1, 1)
        ).reshape(1, -1)

    # Use the trained linear regression model to make predictions
    predicted_normalized_value = model.predict(desired_data_normalized)

    # Undo the target scaling. The reshape guards against a 1-D prediction array.
    predicted_original_scale = MLRscaler.inverse_transform(
        np.asarray(predicted_normalized_value).reshape(-1, 1)
    )

    return predicted_original_scale



In [345]:
def predict_using_linear_regression(company_to_predict, value_to_predict, date_to_predict):
    """Train a linear regression for the company/value pair and print its prediction.

    Fits the model on the module-level `data`, then prints what mlr_predict
    returns for the requested date. Returns None.
    """
    fitted_model = linear_regression(company_to_predict, value_to_predict, data)
    prediction = mlr_predict(
        company_to_predict, date_to_predict, value_to_predict, fitted_model
    )
    print(prediction)

### LSTM

In [None]:
# Scaler for the LSTM target. It stays at module level because lstm_predict uses
# it to map predictions back to the original scale.
LSTMscaler = MinMaxScaler()

def lstm(symbol, value, data):
    """Train a two-layer LSTM on the lagged features of one symbol.

    The samples returned by createData are split 80/20 in their given order,
    features and target are min-max scaled (each with its own scaler, fitted on
    the training part only), and the network is trained for 5 epochs. The test
    loss before and after training and the RMSE on the held-out part are printed,
    all in scaled units.

    Args:
        symbol: symbol whose rows are used.
        value: price column to predict, passed through to createData.
        data: mapping of symbol -> prepared rows.

    Returns:
        The trained keras model. The fitted feature scaler is attached to it as
        ``feature_scaler_``; the target scaler is the module-level LSTMscaler.
    """
    xData, yData = createData(symbol, value, data)

    # training data 0.8, testing data 0.2
    split = int(len(xData) * 0.8)
    x_train, x_test = xData[:split], xData[split:]
    y_train, y_test = yData[:split], yData[split:]

    # Features get their own scaler. Reusing LSTMscaler for both X and y would
    # discard the feature scaling as soon as the target is fitted.
    feature_scaler = MinMaxScaler()
    x_train = feature_scaler.fit_transform(x_train)
    x_test = feature_scaler.transform(x_test)

    # The network is declared with input_shape (n_columns, 1), i.e. every input
    # column is one timestep carrying a single feature. Add that trailing axis
    # explicitly instead of relying on implicit rank expansion.
    x_train = x_train[..., np.newaxis]
    x_test = x_test[..., np.newaxis]

    # The scaler expects 2-D input, so the targets become a single column.
    y_train = LSTMscaler.fit_transform(np.array(y_train).reshape(-1, 1))
    y_test = LSTMscaler.transform(np.array(y_test).reshape(-1, 1))

    # Create a model
    model = keras.Sequential()
    model.add(keras.layers.LSTM(100, return_sequences=True, input_shape=(x_train.shape[1], 1)))
    model.add(keras.layers.LSTM(100, return_sequences=False))
    model.add(keras.layers.Dense(25))
    model.add(keras.layers.Dense(1))
    model.summary()

    # Compile the model and get initial mean_squared_error before training
    model.compile(optimizer='adam', loss='mean_squared_error')
    test_loss = model.evaluate(x_test, y_test)
    print(f'Test loss before training: {test_loss}')

    # Train the model
    model.fit(x_train, y_train, batch_size= 2, epochs=5, validation_data=(x_test, y_test))

    # Get mean_squared_error after training
    test_loss = model.evaluate(x_test, y_test)
    print(f'Test loss after training: {test_loss}')

    y_pred = model.predict(x_test)  # Make predictions on the test data

    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    print(f'Root Mean Squared Error (RMSE): {rmse}')

    # Keep the feature scaling together with the model that was trained on it.
    model.feature_scaler_ = feature_scaler

    return model




In [334]:
def lstm_predict(symbol, desired_date, value, model):
    """Predict `value` for `symbol` with a trained LSTM.

    The feature row used is the one whose lagged (previous-row) date index equals
    the index stored for `desired_date` in the module-level `date` map.

    Args:
        symbol: symbol to predict for.
        desired_date: key of the module-level `date` map.
        value: price column, passed through to createData.
        model: trained keras model. If it carries a ``feature_scaler_``
            attribute, that scaler is used for the input features.

    Returns:
        2-D array holding the prediction mapped back through LSTMscaler.

    Raises:
        KeyError: if `desired_date` is not in `date`.
        ValueError: if no feature row matches the date for this symbol.
    """
    desired_index = date[desired_date]

    x, _ = createData(symbol, value, data)
    desired_data = next((row for row in x if row[0] == desired_index), None)
    if desired_data is None:
        raise ValueError(
            f'no feature row for symbol {symbol!r} on date {desired_date!r}'
        )

    features = np.array(desired_data, dtype=float)
    feature_scaler = getattr(model, 'feature_scaler_', None)
    if feature_scaler is not None:
        # One sample with all features in a row, scaled column by column exactly
        # as the training features were.
        scaled = feature_scaler.transform(features.reshape(1, -1))
    else:
        # Fallback for models without their own feature scaler: scale with the
        # module-level scaler, one value per row.
        scaled = LSTMscaler.transform(features.reshape(-1, 1))

    # A single sequence: (1 sample, n_columns timesteps, 1 feature). Passing the
    # values as (n_columns, 1) would be read as n_columns separate samples.
    desired_data_normalized = scaled.reshape(1, -1, 1)

    # Use the trained LSTM model to make predictions
    predicted_normalized_value = model.predict(desired_data_normalized)

    # Undo the target scaling. The reshape keeps the input 2-D for the scaler.
    predicted_original_scale = LSTMscaler.inverse_transform(
        np.asarray(predicted_normalized_value).reshape(-1, 1)
    )

    return predicted_original_scale



In [346]:
def predict_using_lstm(company_to_predict, value_to_predict, date_to_predict):
    """Train an LSTM for the company/value pair and print its prediction.

    The model is trained on the same price column that is then predicted;
    training on a fixed column would make the requested `value_to_predict`
    inconsistent with the model. Returns None.
    """
    lstm_model = lstm(company_to_predict, value_to_predict, data)
    print(lstm_predict(company_to_predict, date_to_predict, value_to_predict, lstm_model))
