In [103]:
import numpy as np
import pandas as pd
from pymongo import MongoClient

In [104]:
# Function to pull data from MongoDB and convert it to DataFrames
def pulling_data_db(host='localhost', port=27017,
                    db_name='financeStockData',
                    collection_name='AAPL_stock_data'):
    """Load every document of one MongoDB collection into a DataFrame.

    Parameters
    ----------
    host, port : MongoDB server address (defaults match the local instance).
    db_name : name of the database to read from.
    collection_name : name of the collection whose documents are fetched.

    Returns
    -------
    pandas.DataFrame
        One row per document; MongoDB's ``_id`` field is kept as a column.
    """
    # The context manager closes the client when the block exits, so
    # re-running this cell does not leave connections open.
    with MongoClient(host, port) as client:
        # Materialize the cursor while the connection is still open.
        documents = list(client[db_name][collection_name].find())

    # Convert the list of documents (dictionaries) to a DataFrame.
    return pd.DataFrame(documents)

In [105]:
# Load the stock documents from MongoDB into a DataFrame and preview the first rows.
fin_data = pulling_data_db()
fin_data.head()

Unnamed: 0,_id,Date,Open,High,Low,Close,Adj Close,Volume
0,671cb50b7e1d0aa4a14f6c7c,2023-10-27,166.910004,168.960007,166.830002,168.220001,167.363586,58499100
1,671cb50b7e1d0aa4a14f6c7d,2023-10-30,169.020004,171.169998,168.869995,170.289993,169.423035,51131000
2,671cb50b7e1d0aa4a14f6c7e,2023-10-31,169.350006,170.899994,167.899994,170.770004,169.900604,44846000
3,671cb50b7e1d0aa4a14f6c7f,2023-11-01,171.0,174.229996,170.119995,173.970001,173.08432,56934900
4,671cb50b7e1d0aa4a14f6c80,2023-11-02,175.520004,177.779999,175.460007,177.570007,176.665985,77334800


In [106]:
# Inspect column dtypes and non-null counts of the loaded frame.
fin_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 252 entries, 0 to 251
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   _id        252 non-null    object 
 1   Date       252 non-null    object 
 2   Open       252 non-null    float64
 3   High       252 non-null    float64
 4   Low        252 non-null    float64
 5   Close      252 non-null    float64
 6   Adj Close  252 non-null    float64
 7   Volume     252 non-null    int64  
dtypes: float64(5), int64(1), object(2)
memory usage: 15.9+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 252 entries, 0 to 251
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   _id        252 non-null    object 
 1   Date       252 non-null    object 
 2   Open       252 non-null    float64
 3   High       252 non-null    float64
 4   Low        252 non-null    float64
 5   Close      252 non-null    float64
 6   Adj 

In [107]:
# Keep only the columns used downstream. The explicit .copy() makes this an
# independent frame, so later column assignments on fin_data do not raise
# pandas' SettingWithCopyWarning.
fin_data = fin_data[["Date", "Adj Close"]].copy()
fin_data.plot(kind='line', title = "checking for stationarity");

In [108]:
# Build the DatetimeIndex from the actual trading dates. Converting the
# existing integer RangeIndex would interpret 0..N as nanoseconds since the
# epoch, putting every row on 1970-01-01 and collapsing any later resample
# into a single bucket. Going through to_numpy() keeps the index unnamed so it
# does not clash with the 'Date' column that is still present.
fin_data.index = pd.to_datetime(fin_data["Date"].to_numpy())

In [109]:
# Average stock prices per month, week and day
# Mean adjusted close per calendar bucket (week / month / day).
daily_price = fin_data.resample('D')['Adj Close'].mean()
weekly_price = fin_data.resample('W')['Adj Close'].mean()
monthly_price = fin_data.resample('M')['Adj Close'].mean()

print(f"Weekly average price of stock is {weekly_price}")
print(f"Monthly average price of stock is {monthly_price}")
print(f"Daily average price of stock is {daily_price}")

In [110]:
# Parse 'Date' and derive calendar features. .assign returns a new frame
# instead of writing into fin_data item by item, which avoids
# SettingWithCopyWarning when fin_data was derived from another frame.
fin_data = fin_data.assign(Date=pd.to_datetime(fin_data['Date']))
fin_data = fin_data.assign(
    month=fin_data['Date'].dt.month,
    day=fin_data['Date'].dt.day,
    year=fin_data['Date'].dt.year,
)
fin_data.head()

In [111]:
# The raw 'Date' column is no longer needed once month/day/year exist.
fin_data = fin_data.drop(columns='Date')
fin_data.head()

In [112]:
# Count rows that are exact duplicates of an earlier row across all columns.
fin_data.duplicated().sum()

In [113]:
# Summary statistics of the adjusted close price.
fin_data['Adj Close'].describe()

In [114]:
# Use only the price series for the stationarity test. Flattening the whole
# frame would interleave 'Adj Close' with the month/day/year columns and feed
# the ADF test a meaningless mixed sequence.
df = fin_data['Adj Close'].to_numpy()

In [115]:
from statsmodels.tsa.stattools import adfuller

def adf_test(timeseries):
    """Run the Augmented Dickey-Fuller test and print a readable report.

    Prints the test statistic, p-value, lags used, observation count and the
    critical values returned by ``adfuller`` (with ``autolag='AIC'``), followed
    by a conclusion based on a 0.05 p-value threshold. Returns nothing.
    """
    print('Results of Dickey-Fuller Test:')
    result = adfuller(timeseries, autolag='AIC')

    # First four entries of the result are the headline numbers.
    headline_labels = ['Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used']
    report = dict(zip(headline_labels, result[0:4]))

    # Fifth entry maps significance level -> critical value.
    for level, threshold in result[4].items():
        report['Critical Value (%s)' % level] = threshold

    adf_output = pd.Series(report)
    print(adf_output)

    # Interpret the p-value.
    is_stationary = adf_output['p-value'] < 0.05
    if is_stationary:
        print("\nConclusion: Reject the null hypothesis. The time series is stationary.")
    else:
        print("\nConclusion: Cannot reject the null hypothesis. The time series is non-stationary.")



In [116]:
# Run the ADF stationarity test on `df`.
adf_test(df)

In [117]:
# Lag (in rows) used for differencing; adjust season_length accordingly.
season_length = 3
df_diff = (
    fin_data['Adj Close']
    .diff(season_length)
    .dropna()
)
df_diff

In [118]:
# Line plot of the lag-differenced series.
df_diff.plot.line()

In [119]:
# Repeat the ADF stationarity test on the lag-differenced series.
adf_test(df_diff)

In [120]:
# 2-row rolling mean of the differenced series (adjust window size); the
# leading NaN produced by the incomplete first window is dropped.
df_smoothed = df_diff.rolling(window=2).mean().dropna()

In [121]:
# Checking for trend in the time series data
from statsmodels.tsa.seasonal import seasonal_decompose
import matplotlib.pyplot as plt

# Additive decomposition of the smoothed series.
# NOTE(review): period=12 looks like a placeholder (see "Adjust 'period'") — confirm the cycle length for this data.
decomposition = seasonal_decompose(df_smoothed, model='additive', period=12)  # Adjust 'period'

# Plot the decomposed components
decomposition.plot()
plt.show()

In [122]:
# Display the trend component estimated by the decomposition.
decomposition.trend

In [123]:
# Visual check of the smoothed, differenced series.
df_smoothed.plot.line(title="Stationarity plot");

In [124]:
# Standardizing data (StandardScaler: zero mean, unit variance per feature)
from sklearn.preprocessing import StandardScaler

def scaling_data(data):
    """Standardize `data` with a freshly fitted scikit-learn StandardScaler.

    The scaler is fit on `data` itself and discarded afterwards, so only the
    transformed values are returned (the fitted scaler is not available for
    inverse-transforming later).
    """
    return StandardScaler().fit_transform(data)

In [125]:
# StandardScaler expects a 2-D array, so the series becomes a single column.
# NOTE(review): the scaler is fit on the full series before any train/test
# split, so test-period statistics influence the training data — confirm this is intended.
reshap = df_smoothed.to_numpy().reshape(-1, 1)
scalled = scaling_data(reshap)

df = pd.DataFrame(data=scalled)

In [126]:
# Preview the scaled values.
df.head()

In [127]:
# Chronological hold-out split: first 200 rows for training, the rest for testing.
# Positional iloc slices are end-exclusive, so no row is shared between the
# two sets. The previous label-based .loc slices are end-inclusive, which put
# row 199 in both X_train and X_test.
X_train = df.iloc[:200]
X_test = df.iloc[200:]

In [128]:
# Sanity-check the sizes of the two partitions.
print(X_train.shape)
print(X_test.shape)

In [129]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM

In [130]:
# Using multi-step forecasting method

# Number of consecutive rows that form one model input window.
sequence_length = 30
# Number of rows following each window that form the prediction target.
forecast_horizon = 7

def create_sequence(data, sequence_length, horizon):
    """Cut a series into sliding (input window, target window) pairs.

    Parameters
    ----------
    data : array-like (1-D, or 2-D with one row per time step).
    sequence_length : number of consecutive rows in each input window.
    horizon : number of rows directly after the window used as the target.

    Returns
    -------
    (X, y) : tuple of numpy arrays
        ``X[i]`` is ``data[i : i + sequence_length]`` and ``y[i]`` is the
        ``horizon`` rows immediately following it. Both are empty when
        ``data`` is shorter than ``sequence_length + horizon``.
    """
    # Convert once up front so DataFrames, Series and arrays all slice by row.
    values = np.asarray(data)

    X, y = [], []
    for start in range(len(values) - sequence_length - horizon + 1):
        window_end = start + sequence_length
        X.append(values[start:window_end])
        # The target begins where the input window ends. The slice end must be
        # derived from sequence_length (not the unrelated global
        # season_length), otherwise it falls before the start and is empty.
        y.append(values[window_end:window_end + horizon])

    return np.array(X), np.array(y)

In [131]:
# Assigning data to X and Y
# Build sliding input/target windows from the training partition only.
X, y = create_sequence(X_train, sequence_length, forecast_horizon)

In [134]:
# Splitting the data into train and test using 80% for training and 20% for testing
# Chronological 80/20 split of the windowed samples (no shuffling).
split = int(0.8 * len(X))
X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]

In [135]:
# Two stacked LSTM layers followed by a dense head that emits one value per
# forecast step.
model = Sequential([
    # return_sequences=True so the second LSTM receives the full sequence.
    LSTM(units=50, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])),
    LSTM(units=50),
    Dense(y_train.shape[1]),  # Output units match the target dimension
])

model.compile(optimizer='adam', loss='mean_squared_error')

model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.1,
    verbose=1,
)