<a href="https://colab.research.google.com/github/faseenafarz/machine-learning/blob/main/stock_price_forecasting_lstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Import necessary libraries
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy.stats import ttest_ind, f_oneway
import os

In [5]:
# Load dataset
df = pd.read_csv('/content/yahoo_stock.csv')
df.head()

Unnamed: 0,Date,High,Low,Open,Close,Volume,Adj Close
0,2015-11-23,2095.610107,2081.389893,2089.409912,2086.590088,3587980000.0,2086.590088
1,2015-11-24,2094.120117,2070.290039,2084.419922,2089.139893,3884930000.0,2089.139893
2,2015-11-25,2093.0,2086.300049,2089.300049,2088.870117,2852940000.0,2088.870117
3,2015-11-26,2093.0,2086.300049,2089.300049,2088.870117,2852940000.0,2088.870117
4,2015-11-27,2093.290039,2084.129883,2088.820068,2090.110107,1466840000.0,2090.110107


In [6]:
#  Convert Date column to datetime
df['Date'] = pd.to_datetime(df['Date'])

In [7]:
# Set Date as index
df.set_index('Date', inplace=True)

In [8]:
#  Check for missing values
df.isnull().sum()

Unnamed: 0,0
High,0
Low,0
Open,0
Close,0
Volume,0
Adj Close,0


In [9]:
# EDA: Composition
# Plotting closing price over time
fig = px.line(df, x=df.index, y='Close', title='Closing Price Over Time')
fig.update_layout(title_text='Closing Price Over Time', title_x=0.5, template='plotly_dark')
fig.show()

In [10]:
# EDA: Distribution
fig = make_subplots(rows=3, cols=2, subplot_titles=('Close', 'Volume', 'Open', 'High', 'Low', 'Adj Close'))
fig.add_trace(go.Histogram(x=df['Close'], name='Close'), row=1, col=1)
fig.add_trace(go.Histogram(x=df['Volume'], name='Volume'), row=1, col=2)
fig.add_trace(go.Histogram(x=df['Open'], name='Open'), row=2, col=1)
fig.add_trace(go.Histogram(x=df['High'], name='High'), row=2, col=2)
fig.add_trace(go.Histogram(x=df['Low'], name='Low'), row=3, col=1)
fig.add_trace(go.Histogram(x=df['Adj Close'], name='Adj Close'), row=3, col=2)
fig.update_layout(title_text='Distribution of Features', title_x=0.5, template='plotly_dark')
fig.show()

In [11]:
# EDA: Relationship
fig = px.scatter_matrix(df, dimensions=['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'], title='Scatter Matrix')
fig.update_layout(title_text='Scatter Matrix', title_x=0.5, template='plotly_dark')
fig.show()

In [12]:
# EDA: Comparison
# Correlation matrix
correlation = df.corr()
fig = px.imshow(correlation, text_auto=True, title='Correlation Matrix')
fig.update_layout(title_text='Correlation Matrix', title_x=0.5, template='plotly_dark')
fig.show()

In [13]:
# Statistical Tests
# T-test comparing 'High' and 'Low' prices
t_stat, p_value = ttest_ind(df['High'], df['Low'])
t_test_result = {
    'Statistic': [t_stat],
    'p-value': [p_value]
}
t_test_df = pd.DataFrame(t_test_result)
t_test_df

Unnamed: 0,Statistic,p-value
0,2.070802,0.038447


In [14]:
# Display T-test result
fig = px.bar(t_test_df, x=t_test_df.index, y=['Statistic', 'p-value'], barmode='group', title='T-test Result: High vs Low Prices')
fig.update_layout(title_text='T-test Result: High vs Low Prices', title_x=0.5, template='plotly_dark')
fig.show()

In [15]:
# ANOVA test for 'Open', 'High', 'Low', 'Close' prices
anova_stat, anova_p_value = f_oneway(df['Open'], df['High'], df['Low'], df['Close'])
anova_result = {
    'Statistic': [anova_stat],
    'p-value': [anova_p_value]
}
anova_df = pd.DataFrame(anova_result)
anova_df

Unnamed: 0,Statistic,p-value
0,1.432399,0.231203


In [16]:
# Display ANOVA result
fig = px.bar(anova_df, x=anova_df.index, y=['Statistic', 'p-value'], barmode='group', title='ANOVA Test Result: Open, High, Low, Close Prices')
fig.update_layout(title_text='ANOVA Test Result: Open, High, Low, Close Prices', title_x=0.5, template='plotly_dark')
fig.show()

In [17]:
# Data preprocessing: Feature scaling
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data = scaler.fit_transform(df['Close'].values.reshape(-1, 1))

In [18]:
# Prepare the data for LSTM
def create_dataset(data, time_step=1):
    X, y = [], []
    for i in range(len(data) - time_step - 1):
        a = data[i:(i + time_step), 0]
        X.append(a)
        y.append(data[i + time_step, 0])
    return np.array(X), np.array(y)

# Define time step
time_step = 60

In [19]:
# Create training and test sets
train_size = int(len(scaled_data) * 0.8)
test_size = len(scaled_data) - train_size
train_data, test_data = scaled_data[0:train_size, :], scaled_data[train_size:len(scaled_data), :]

In [22]:
# Create dataset for LSTM
X_train, y_train = create_dataset(train_data, time_step)
X_test, y_test = create_dataset(test_data, time_step)

In [23]:
# Reshape input to be [samples, time steps, features] which is required for LSTM
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

In [25]:
from tensorflow.keras.layers import Input

In [26]:
# Build the LSTM model

model = Sequential()
model.add(Input(shape=(time_step, 1)))  # Define input shape here
model.add(LSTM(50, return_sequences=True)) # Remove input_shape from here

model.add(Dropout(0.2))
model.add(LSTM(50, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(50))
model.add(Dropout(0.2))
model.add(Dense(25))
model.add(Dense(1))

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

In [27]:
# Train the model
history = model.fit(X_train, y_train, batch_size=1, epochs=10)

Epoch 1/10
[1m1399/1399[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 45ms/step - loss: 0.0077
Epoch 2/10
[1m1399/1399[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 43ms/step - loss: 0.0017
Epoch 3/10
[1m1399/1399[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 42ms/step - loss: 0.0014
Epoch 4/10
[1m1399/1399[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 42ms/step - loss: 0.0012
Epoch 5/10
[1m1399/1399[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 41ms/step - loss: 0.0014
Epoch 6/10
[1m1399/1399[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 42ms/step - loss: 0.0011
Epoch 7/10
[1m1399/1399[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 41ms/step - loss: 0.0011
Epoch 8/10
[1m1399/1399[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 41ms/step - loss: 9.5460e-04
Epoch 9/10
[1m1399/1399[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 41ms/step - loss: 0.0010
Epoch 10/10
[1m1399/1399[0m [32m━━━━━━━━━━━━━━━━

In [28]:
# Predictions
train_predict = model.predict(X_train)
test_predict = model.predict(X_test)

[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 33ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step


In [29]:
# Inverse transform to get actual values
train_predict = scaler.inverse_transform(train_predict)
test_predict = scaler.inverse_transform(test_predict)
y_train = scaler.inverse_transform([y_train])
y_test = scaler.inverse_transform([y_test])

In [30]:
# Calculate RMSE and MAE
train_rmse = np.sqrt(mean_squared_error(y_train[0], train_predict[:, 0]))
train_mae = mean_absolute_error(y_train[0], train_predict[:, 0])
test_rmse = np.sqrt(mean_squared_error(y_test[0], test_predict[:, 0]))
test_mae = mean_absolute_error(y_test[0], test_predict[:, 0])

print(f'Train RMSE: {train_rmse}, Train MAE: {train_mae}')
print(f'Test RMSE: {test_rmse}, Test MAE: {test_mae}')

Train RMSE: 25.827833337857943, Train MAE: 20.047682810545485
Test RMSE: 75.82993678871858, Test MAE: 60.52136792634663


In [31]:
# Plotting the results
train_data_len = len(train_data)

train_plot = np.empty_like(scaled_data)
train_plot[:, :] = np.nan
train_plot[time_step:len(train_predict)+time_step, :] = train_predict

test_plot = np.empty_like(scaled_data)
test_plot[:, :] = np.nan
test_plot[len(train_predict)+(time_step*2)+1:len(scaled_data)-1, :] = test_predict

In [32]:
# Create plot
fig = go.Figure()
fig.add_trace(go.Scatter(x=df.index, y=df['Close'], mode='lines', name='Actual Price', line=dict(color='blue')))
fig.add_trace(go.Scatter(x=df.index, y=train_plot[:, 0], mode='lines', name='Train Predict', line=dict(color='green')))
fig.add_trace(go.Scatter(x=df.index, y=test_plot[:, 0], mode='lines', name='Test Predict', line=dict(color='red')))
fig.update_layout(title='Stock Price Prediction',
                   xaxis_title='Date',
                   yaxis_title='Stock Price',
                   template='plotly_dark')
fig.show()

In [33]:
def predict_future(model, data, scaler, time_step=60, future_steps=30):
    # Prepare the input data for future prediction
    last_data = data[-time_step:]
    last_data = last_data.reshape(1, time_step, 1)

    # Predict future prices
    future_predictions = []
    for _ in range(future_steps):
        next_pred = model.predict(last_data)
        future_predictions.append(next_pred[0, 0])
        last_data = np.append(last_data[:, 1:, :], [[[next_pred[0, 0]]]], axis=1)

    # Inverse transform the predictions to get actual values
    future_predictions = np.array(future_predictions).reshape(-1, 1)
    future_predictions = scaler.inverse_transform(future_predictions)

    return future_predictions

# Example usage
future_steps = 30
future_predictions = predict_future(model, scaled_data, scaler, time_step, future_steps)

# Create future dates for plotting
last_date = df.index[-1]
future_dates = pd.date_range(start=last_date, periods=future_steps + 1, inclusive='right')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43

In [34]:
# Plotting future predictions
fig = go.Figure()
fig.add_trace(go.Scatter(x=df.index, y=df['Close'], mode='lines', name='Actual Price', line=dict(color='blue')))
fig.add_trace(go.Scatter(x=future_dates, y=future_predictions[:, 0], mode='lines', name='Future Predict', line=dict(color='orange')))
fig.update_layout(title='Future Stock Price Prediction',
                   xaxis_title='Date',
                   yaxis_title='Stock Price',
                   template='plotly_dark')
fig.show()