In [1]:
# Notebook-level imports
import numpy as np
import pandas as pd
import yfinance as yf
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras



from sklearn.preprocessing import MinMaxScaler

# reproducibility: seed NumPy's and TensorFlow's global RNGs with one shared value
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [2]:
# 1. Download historical OHLCV
TICKER    = "GOOG"
START     = "2012-01-01"
END       = "2022-12-21"

# `auto_adjust` is pinned explicitly: yfinance changed its default to True, so
# relying on the default makes the price columns (and every downstream number)
# depend on the installed library version. Passing it also silences the
# "changed argument auto_adjust default" notice.
df = yf.download(TICKER, start=START, end=END, auto_adjust=True)

# Fail fast with a clear message instead of letting an empty frame surface
# later as an obscure rolling-window / scaler error.
if df is None or df.empty:
    raise RuntimeError(f"No price data returned for {TICKER} between {START} and {END}")

# Turn the date index into a regular column so later cells can select df["Date"].
df = df.reset_index()

YF.download() has changed argument auto_adjust default to True


[*********************100%***********************]  1 of 1 completed


In [3]:
# 2. Compute moving averages
# One simple moving average of the close per window length; each column is
# named MA_<window>. The first (window - 1) rows of each column are NaN.
MA_WINDOWS = (10, 50, 100, 200)
for window in MA_WINDOWS:
    ma_column = f"MA_{window}"
    df[ma_column] = df["Close"].rolling(window=window).mean()


In [4]:
# 3. Compute RSI (14-day)
# Simple-moving-average variant: average gain and average loss are plain
# rolling means of the positive / negative parts of the daily close change.
RSI_WINDOW = 14

delta = df["Close"].diff()                            # day-over-day change
gain, loss = delta.clip(lower=0), -delta.clip(upper=0)  # up-moves / down-moves as magnitudes

avg_gain = gain.rolling(RSI_WINDOW).mean()
avg_loss = loss.rolling(RSI_WINDOW).mean()

# Relative strength, then the usual mapping onto the 0-100 scale.
rs = avg_gain / avg_loss
df["RSI_14"] = 100 - (100 / (1 + rs))


In [6]:
# 4. Drop NaNs
# The rolling indicators are NaN during their warm-up rows; drop those rows.
# (The previous mid-cell `df.head()` was removed: it was not the last
# expression of the cell, so its result was computed and silently discarded.)
df.dropna(inplace=True)

# keep only features + target ("Close" is column 0 and doubles as the target)
features = ["Close", "MA_10", "MA_50", "MA_100", "MA_200", "RSI_14"]
data     = df[features].copy()

# Number of training rows prepended to the test slice so the first test
# window has a full history. Must equal the sequence length (N_STEPS) used
# when the windows are built.
SEQ_OVERLAP = 100

# 80/20 chronological split
split_idx = int(len(data) * 0.8)
if split_idx < SEQ_OVERLAP:
    # A negative slice start would silently wrap around and select the wrong rows.
    raise ValueError(
        f"Training split has {split_idx} rows; need at least {SEQ_OVERLAP} "
        "to seed the first test sequence."
    )

train_df = data.iloc[:split_idx]
test_df  = data.iloc[split_idx - SEQ_OVERLAP:]   # overlap last SEQ_OVERLAP rows for sequences


In [7]:
# scale all features to [0,1]
# The scaler learns per-column min/max from the training split only; the test
# split is transformed with those same statistics (so it may fall outside [0,1]).
scaler = MinMaxScaler()
scaler.fit(train_df)
train_scaled, test_scaled = scaler.transform(train_df), scaler.transform(test_df)
def make_sequences(data_array, n_steps=100, target_col=0):
    """Slice a 2-D time series into overlapping look-back windows and next-step targets.

    For every row index ``i >= n_steps`` one sample is produced: the input is
    rows ``i - n_steps .. i - 1`` (all columns) and the target is the value of
    column ``target_col`` at row ``i``.

    Parameters
    ----------
    data_array : array-like, shape (n_rows, n_features)
        Time-ordered observations, oldest first.
    n_steps : int, default 100
        Length of each look-back window; must be at least 1.
    target_col : int, default 0
        Column whose next-step value is the prediction target.

    Returns
    -------
    X : np.ndarray, shape (n_samples, n_steps, n_features)
    y : np.ndarray, shape (n_samples,)
        ``n_samples`` is ``max(n_rows - n_steps, 0)``. When there are too few
        rows for a single window, correctly shaped empty arrays are returned
        rather than a shapeless ``(0,)`` array.

    Raises
    ------
    ValueError
        If ``n_steps`` is smaller than 1.
    """
    if n_steps < 1:
        raise ValueError(f"n_steps must be >= 1, got {n_steps}")

    arr = np.asarray(data_array)
    n_rows = len(arr)

    # Not enough history for even one window: keep the trailing dimensions so
    # callers can still inspect X.shape[1:] without special-casing.
    if n_rows <= n_steps:
        empty_X = np.empty((0, n_steps) + arr.shape[1:], dtype=arr.dtype)
        empty_y = np.empty((0,), dtype=arr.dtype)
        return empty_X, empty_y

    X = np.stack([arr[end - n_steps:end] for end in range(n_steps, n_rows)])
    y = arr[n_steps:, target_col].copy()   # value right after each window
    return X, y

# Look-back window length shared by the training and test sequences.
N_STEPS = 100

X_train, y_train = make_sequences(train_scaled, n_steps=N_STEPS)
X_test, y_test = make_sequences(test_scaled, n_steps=N_STEPS)

# Report (samples, steps, features) for the inputs and (samples,) for the targets.
array_shapes = [arr.shape for arr in (X_train, y_train, X_test, y_test)]
print("Shapes:", *array_shapes)

Shapes: (1949, 100, 6) (1949,) (513, 100, 6) (513,)


In [8]:
# Stacked-LSTM regressor: two recurrent layers followed by a small dense head
# that emits one value per window (the scaled next-step Close).
model = keras.models.Sequential([
    # Declare the input explicitly. Passing `input_shape=` to the first layer
    # of a Sequential model is deprecated in Keras 3 and emits a UserWarning.
    keras.Input(shape=(N_STEPS, X_train.shape[2])),

    # First LSTM returns the full sequence so the second LSTM can consume it.
    keras.layers.LSTM(64, activation="tanh", return_sequences=True),
    keras.layers.Dropout(0.2),

    # Second LSTM collapses the sequence to its final hidden state.
    keras.layers.LSTM(128, activation="tanh", return_sequences=False),
    keras.layers.Dropout(0.3),

    keras.layers.Dense(32, activation="relu"),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(1, activation="linear")
])

model.compile(
    optimizer="adam",
    loss="mean_squared_error",
    metrics=["mean_absolute_error"]
)

# callbacks: early stop + best-model checkpoint (both track validation loss)
es = keras.callbacks.EarlyStopping(monitor="val_loss", patience=10, restore_best_weights=True)
ck = keras.callbacks.ModelCheckpoint("best_stock_model.h5", save_best_only=True, monitor="val_loss")

history = model.fit(
    X_train, y_train,
    validation_split=0.1,   # Keras holds out the last 10% of the samples passed in
    epochs=100,
    batch_size=32,
    callbacks=[es, ck],
    verbose=1
)

model.summary()


Epoch 1/100


  super().__init__(**kwargs)


[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 117ms/step - loss: 0.0281 - mean_absolute_error: 0.1114



[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 135ms/step - loss: 0.0278 - mean_absolute_error: 0.1107 - val_loss: 0.0132 - val_mean_absolute_error: 0.1010
Epoch 2/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 116ms/step - loss: 0.0056 - mean_absolute_error: 0.0524



[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 127ms/step - loss: 0.0056 - mean_absolute_error: 0.0523 - val_loss: 0.0053 - val_mean_absolute_error: 0.0583
Epoch 3/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 104ms/step - loss: 0.0047 - mean_absolute_error: 0.0478



[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 110ms/step - loss: 0.0047 - mean_absolute_error: 0.0478 - val_loss: 0.0038 - val_mean_absolute_error: 0.0489
Epoch 4/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 98ms/step - loss: 0.0036 - mean_absolute_error: 0.0428 - val_loss: 0.0057 - val_mean_absolute_error: 0.0661
Epoch 5/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 102ms/step - loss: 0.0039 - mean_absolute_error: 0.0436 - val_loss: 0.0039 - val_mean_absolute_error: 0.0511
Epoch 6/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 94ms/step - loss: 0.0030 - mean_absolute_error: 0.0381



[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 101ms/step - loss: 0.0030 - mean_absolute_error: 0.0381 - val_loss: 0.0029 - val_mean_absolute_error: 0.0388
Epoch 7/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 100ms/step - loss: 0.0030 - mean_absolute_error: 0.0376 - val_loss: 0.0037 - val_mean_absolute_error: 0.0507
Epoch 8/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 122ms/step - loss: 0.0028 - mean_absolute_error: 0.0368 - val_loss: 0.0031 - val_mean_absolute_error: 0.0463
Epoch 9/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 139ms/step - loss: 0.0027 - mean_absolute_error: 0.0363 - val_loss: 0.0057 - val_mean_absolute_error: 0.0665
Epoch 10/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 145ms/step - loss: 0.0027 - mean_absolute_error: 0.0352 - val_loss: 0.0079 - val_mean_absolute_error: 0.0797
Epoch 11/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 146ms/ste



[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 168ms/step - loss: 0.0022 - mean_absolute_error: 0.0323 - val_loss: 0.0023 - val_mean_absolute_error: 0.0344
Epoch 14/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 175ms/step - loss: 0.0022 - mean_absolute_error: 0.0340 - val_loss: 0.0068 - val_mean_absolute_error: 0.0731
Epoch 15/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 181ms/step - loss: 0.0020 - mean_absolute_error: 0.0308 - val_loss: 0.0034 - val_mean_absolute_error: 0.0495
Epoch 16/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 126ms/step - loss: 0.0018 - mean_absolute_error: 0.0295 - val_loss: 0.0023 - val_mean_absolute_error: 0.0357
Epoch 17/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 128ms/step - loss: 0.0017 - mean_absolute_error: 0.0288 - val_loss: 0.0070 - val_mean_absolute_error: 0.0742
Epoch 18/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 112m



[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 113ms/step - loss: 0.0016 - mean_absolute_error: 0.0286 - val_loss: 0.0022 - val_mean_absolute_error: 0.0361
Epoch 22/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 108ms/step - loss: 0.0017 - mean_absolute_error: 0.0306 - val_loss: 0.0024 - val_mean_absolute_error: 0.0406
Epoch 23/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 109ms/step - loss: 0.0014 - mean_absolute_error: 0.0274 - val_loss: 0.0045 - val_mean_absolute_error: 0.0578
Epoch 24/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 108ms/step - loss: 0.0014 - mean_absolute_error: 0.0270 - val_loss: 0.0034 - val_mean_absolute_error: 0.0488
Epoch 25/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 102ms/step - loss: 0.0015 - mean_absolute_error: 0.0274



[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 109ms/step - loss: 0.0015 - mean_absolute_error: 0.0273 - val_loss: 0.0022 - val_mean_absolute_error: 0.0370
Epoch 26/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 109ms/step - loss: 0.0013 - mean_absolute_error: 0.0260 - val_loss: 0.0067 - val_mean_absolute_error: 0.0732
Epoch 27/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 108ms/step - loss: 0.0014 - mean_absolute_error: 0.0265 - val_loss: 0.0040 - val_mean_absolute_error: 0.0549
Epoch 28/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 107ms/step - loss: 0.0013 - mean_absolute_error: 0.0260 - val_loss: 0.0023 - val_mean_absolute_error: 0.0388
Epoch 29/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 111ms/step - loss: 0.0013 - mean_absolute_error: 0.0262 - val_loss: 0.0032 - val_mean_absolute_error: 0.0485
Epoch 30/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 108ms/



[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 113ms/step - loss: 0.0012 - mean_absolute_error: 0.0247 - val_loss: 0.0018 - val_mean_absolute_error: 0.0329
Epoch 36/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 108ms/step - loss: 0.0014 - mean_absolute_error: 0.0263 - val_loss: 0.0035 - val_mean_absolute_error: 0.0511
Epoch 37/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 105ms/step - loss: 0.0012 - mean_absolute_error: 0.0255



[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 112ms/step - loss: 0.0012 - mean_absolute_error: 0.0255 - val_loss: 0.0016 - val_mean_absolute_error: 0.0302
Epoch 38/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 109ms/step - loss: 0.0013 - mean_absolute_error: 0.0269 - val_loss: 0.0018 - val_mean_absolute_error: 0.0329
Epoch 39/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 108ms/step - loss: 0.0014 - mean_absolute_error: 0.0277 - val_loss: 0.0018 - val_mean_absolute_error: 0.0329
Epoch 40/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 112ms/step - loss: 0.0012 - mean_absolute_error: 0.0254 - val_loss: 0.0021 - val_mean_absolute_error: 0.0375
Epoch 41/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 110ms/step - loss: 0.0011 - mean_absolute_error: 0.0244 - val_loss: 0.0020 - val_mean_absolute_error: 0.0358
Epoch 42/100
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 113ms/

In [9]:
# ——— 7. Load Best Model & Predict on Test Set ———
# Reload the weights that ModelCheckpoint wrote to "best_stock_model.h5".
model.load_weights("best_stock_model.h5")

# 1. Predict (outputs are still in the scaler's units); shape (n_samples, 1)
y_pred_scaled = model.predict(X_test)

# 2. Invert scaling for the 'Close' feature (index 0)
#    Only one column needs un-scaling, so apply x * range + min by hand using
#    the statistics the MinMaxScaler learned from the training data.
data_min, data_range = scaler.data_min_[0], scaler.data_range_[0]

y_pred   = y_pred_scaled.flatten() * data_range + data_min
y_actual = y_test * data_range + data_min

# 3. Compute evaluation metrics (in price units)
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

mse = mean_squared_error(y_actual, y_pred)
mae = mean_absolute_error(y_actual, y_pred)
r2  = r2_score(y_actual, y_pred)

for metric_label, metric_value in (("MSE", mse), ("MAE", mae), ("R² ", r2)):
    print(f"Test {metric_label}: {metric_value:.4f}")

# ——— 8. Prepare Predictions DataFrame for Streamlit ———
# Map the predictions back to dates.
# The test slice starts 100 rows before position split_idx and every window
# consumes N_STEPS (=100) rows, so the first predicted target sits at
# position split_idx; the dates are therefore df["Date"].iloc[split_idx:].
pred_dates = df["Date"].iloc[split_idx:].reset_index(drop=True)

results = pd.DataFrame(
    dict(
        Date=pred_dates,
        Actual_Close=y_actual,
        Predicted_Close=y_pred,
    )
)

# Peek at the first few rows
results.head()


[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 87ms/step
Test MSE: 81.9145
Test MAE: 7.4690
Test R² : 0.7546


Unnamed: 0,Date,Actual_Close,Predicted_Close
0,2020-12-08,90.498856,88.9323
1,2020-12-09,88.785965,89.089884
2,2020-12-10,88.348038,89.079639
3,2020-12-11,88.668526,88.868651
4,2020-12-14,87.588142,88.879111
