In [None]:
# Paths assume the working directory is the repo root or its notebooks/ subdirectory.
from pathlib import Path
import warnings
warnings.filterwarnings("ignore")

# --- Project paths
# When the kernel is started inside notebooks/, the project root is one level
# up; otherwise the current working directory is taken as the root.
if Path.cwd().name == "notebooks":
    ROOT = Path.cwd().resolve().parent
else:
    ROOT = Path.cwd()

# Input data locations.
DATA = ROOT / "data"
DATA_RAW, DATA_PROC = DATA / "raw", DATA / "processed"

# Output locations; every artifact written by this notebook lands under result/.
RESULT = ROOT / "result"
FIG_DIR, FINAL_DIR, CSV_DIR = (RESULT / sub for sub in ("figures", "final", "csv"))

# Create the output folders up front so later save calls cannot fail on a missing directory.
for out_dir in (FIG_DIR, FINAL_DIR, CSV_DIR):
    out_dir.mkdir(parents=True, exist_ok=True)

for label, location in (
    ("Root:", ROOT),
    ("Data:", DATA),
    ("Saving figures to:", FIG_DIR),
    ("Saving final artifacts to:", FINAL_DIR),
    ("Saving CSVs to:", CSV_DIR),
):
    print(label, location)

# --- Packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
from datetime import timedelta

from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import MinMaxScaler

import pmdarima as pm
from statsmodels.tsa.statespace.sarimax import SARIMAX

import yfinance as yf
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM
# Apply seaborn's "whitegrid" style and "muted" palette as the plotting defaults for this notebook.
sns.set(style="whitegrid", palette="muted")


In [None]:
# --- Instrument and modelling target
TARGET_COL = "Close"
TICKER = "TSLA"

# --- Forecast horizon
# FORECAST_MONTHS is the knob to turn (e.g. 6 or 12); H is the resulting number
# of forecast steps at 21 trading days per month.
TRADING_DAYS_PER_MONTH = 21
FORECAST_MONTHS = 12
H = FORECAST_MONTHS * TRADING_DAYS_PER_MONTH

# --- LSTM hyperparameters
BATCH_SIZE = 32
EPOCHS = 20
TIME_STEP = 60  # length of each input window, in observations

print(f"Forecasting {FORECAST_MONTHS} months (~{H} trading days)")


In [None]:
def load_tsla():
    """Load the TSLA price history, preferring local CSVs over a download.

    Lookup order:
      1. ``DATA_PROC / "TSLA_clean.csv"`` (processed file)
      2. ``DATA_RAW / "TSLA_raw.csv"`` (raw file)
      3. download via yfinance (from 2015-01-01) and cache the result as the
         raw file.

    Returns:
        pandas.DataFrame sorted by ``Date`` with a fresh 0..n-1 index.

    Raises:
        RuntimeError: if the yfinance download returns no rows.
    """
    proc = DATA_PROC / "TSLA_clean.csv"
    raw = DATA_RAW / "TSLA_raw.csv"

    # 1) processed file, 2) raw file — first one found wins.
    for csv_path in (proc, raw):
        if csv_path.exists():
            df = pd.read_csv(csv_path, parse_dates=["Date"])
            return df.sort_values("Date").reset_index(drop=True)

    # 3) fallback: download
    print("Local CSV not found. Downloading TSLA via yfinance...")
    df = yf.download(TICKER, start="2015-01-01", progress=False)

    # Never cache an empty frame: it would be picked up as the "raw file" on
    # every later run and hide the failed download.
    if df is None or df.empty:
        raise RuntimeError(f"yfinance returned no data for {TICKER}.")

    # Some yfinance versions return two-level columns even for a single
    # ticker; keep only the first level so plain names such as "Close" exist.
    # NOTE(review): assumes level 0 holds the price-field names (yfinance's
    # default column grouping) — confirm against the installed version.
    if isinstance(df.columns, pd.MultiIndex):
        df.columns = df.columns.get_level_values(0)

    df = df.reset_index().rename(columns={"Adj Close": "AdjClose"})
    df = df.sort_values("Date").reset_index(drop=True)

    # data/raw is not created anywhere else, so make sure it exists before writing.
    raw.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(raw, index=False)
    print("Saved:", raw)
    return df

# Load the price table, verify the two columns the notebook relies on, and
# keep only those columns with missing rows removed.
df = load_tsla()
required_cols = {"Date", TARGET_COL}
assert required_cols.issubset(df.columns), f"Expected columns 'Date' & '{TARGET_COL}' in dataframe."
df = df.loc[:, ["Date", TARGET_COL]].dropna()
df.tail()


In [None]:
# Full price history of the target column.
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(df["Date"], df[TARGET_COL], label=f"{TICKER} Close")
ax.set_title(f"{TICKER} Price History")
ax.set_xlabel("Date")
ax.set_ylabel("Price")
ax.legend()
fig.tight_layout()
plt.show()


In [None]:
# Model the target column as a plain float array (no date index attached).
series = df[TARGET_COL].astype(float).to_numpy()

# Stepwise, non-seasonal auto-ARIMA search for the (p,d,q) order.
auto_arima_kwargs = dict(
    seasonal=False,
    stepwise=True,
    suppress_warnings=True,
    error_action="ignore",
    maxiter=50,
)
arima_model = pm.auto_arima(series, **auto_arima_kwargs)
print(arima_model.summary())

# H-step-ahead point forecast together with its 95% interval (alpha=0.05).
arima_fc, arima_ci = arima_model.predict(n_periods=H, return_conf_int=True, alpha=0.05)

# Business-day calendar of H dates starting the day after the last observation.
last_date = df["Date"].iloc[-1]
future_dates = pd.bdate_range(start=last_date + pd.Timedelta(days=1), periods=H)

# Column 0 of the interval array is the lower bound, column 1 the upper bound.
arima_future = pd.DataFrame({"Date": future_dates, "ARIMA_Forecast": arima_fc})
arima_future["ARIMA_Lower"] = arima_ci[:, 0]
arima_future["ARIMA_Upper"] = arima_ci[:, 1]

arima_path = CSV_DIR / f"arima_future_{FORECAST_MONTHS}m.csv"
arima_future.to_csv(arima_path, index=False)
print("Saved ARIMA future:", arima_path)
arima_future.head()


In [None]:
# Fit the min-max scaler on the complete series and map it into [0, 1].
series_col = series.reshape(-1, 1)  # scikit-learn expects a 2-D (n_samples, 1) array
scaler = MinMaxScaler(feature_range=(0, 1))
scaled = scaler.fit(series_col).transform(series_col)

def make_sequences(data, time_step=60):
    """Slice a single-feature series into sliding windows and next-step targets.

    Args:
        data: 2-D array-like; only column 0 is used.
        time_step: number of consecutive rows in each input window.

    Returns:
        Tuple ``(X, y)``. ``X`` has shape ``(n_windows, time_step, 1)`` where
        window ``i`` holds rows ``i .. i + time_step - 1`` of column 0; ``y``
        has shape ``(n_windows,)`` and holds row ``i + time_step``.
        ``n_windows`` is ``len(data) - time_step``, or 0 when the series is not
        longer than ``time_step`` (empty, correctly shaped arrays are returned
        in that case instead of failing on the reshape).
    """
    column = np.asarray(data)[:, 0]
    n_windows = max(len(column) - time_step, 0)

    # Row i of the index grid is [i, i+1, ..., i+time_step-1]; fancy indexing
    # with it gathers every window in one vectorised step.
    window_idx = np.arange(n_windows)[:, None] + np.arange(time_step)[None, :]
    X = column[window_idx]

    # Target for window i is the observation right after it. Copy so the
    # result never aliases the caller's array.
    y = column[time_step:time_step + n_windows].copy()

    return X.reshape(n_windows, time_step, 1), y

# Seed the Python, NumPy and TensorFlow random generators so weight
# initialisation and batch shuffling repeat between runs.
# NOTE(review): some GPU kernels can still be non-deterministic — confirm if
# bit-exact reproducibility is required.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# The model needs at least one full window plus one target to train on.
if len(scaled) <= TIME_STEP:
    raise ValueError(
        f"Need more than {TIME_STEP} observations to build LSTM windows, got {len(scaled)}."
    )

X, y = make_sequences(scaled, TIME_STEP)

# Two stacked LSTM layers followed by a small dense head that outputs the
# next (scaled) value.
model = Sequential([
    LSTM(64, return_sequences=True, input_shape=(TIME_STEP,1)),
    LSTM(64, return_sequences=False),
    Dense(32),
    Dense(1)
])
model.compile(optimizer="adam", loss="mean_squared_error")
hist = model.fit(X, y, epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=1)

# Recursive forecast: start from the last observed window and feed each
# prediction back in as the newest observation for the next step.
last_window = scaled[-TIME_STEP:].copy().reshape(1, TIME_STEP, 1)
lstm_preds_scaled = []

for _ in range(H):
    pred = model.predict(last_window, verbose=0)
    lstm_preds_scaled.append(pred[0,0])
    # Drop the oldest step and append the prediction, keeping the window length fixed.
    last_window = np.append(last_window[:,1:,:], pred.reshape(1,1,1), axis=1)

# Map the scaled predictions back to price units.
lstm_preds = scaler.inverse_transform(np.array(lstm_preds_scaled).reshape(-1,1)).flatten()

lstm_future = pd.DataFrame({
    "Date": pd.bdate_range(df["Date"].iloc[-1] + pd.Timedelta(days=1), periods=H),
    "LSTM_Forecast": lstm_preds
})

lstm_path = CSV_DIR / f"lstm_future_{FORECAST_MONTHS}m.csv"
lstm_future.to_csv(lstm_path, index=False)
print("Saved LSTM future:", lstm_path)
lstm_future.head()


In [None]:
# Show only the trailing ~2 years of history so the forecast region stays readable.
history_start = df["Date"].max() - pd.Timedelta(days=365*2)
hist_mask = df["Date"] >= history_start
hist_df = df.loc[hist_mask].copy()

fig, ax = plt.subplots(figsize=(14, 6))
ax.plot(hist_df["Date"], hist_df[TARGET_COL], label="Historical (last ~2y)", linewidth=1.5, color="gray")

# ARIMA point forecast with its shaded 95% band.
ax.plot(arima_future["Date"], arima_future["ARIMA_Forecast"], label="ARIMA Forecast", linewidth=2, color="tab:green")
ax.fill_between(
    arima_future["Date"].values,
    arima_future["ARIMA_Lower"].values,
    arima_future["ARIMA_Upper"].values,
    alpha=0.2, label="ARIMA 95% CI", color="tab:green"
)

# LSTM point forecast.
ax.plot(lstm_future["Date"], lstm_future["LSTM_Forecast"], label="LSTM Forecast", linewidth=2, color="tab:red")

ax.set_title(f"{TICKER} — {FORECAST_MONTHS}-Month Forecast (ARIMA vs LSTM)")
ax.set_xlabel("Date")
ax.set_ylabel("Price")
ax.legend()
fig.tight_layout()

# Save before showing so the file is written from the fully laid-out figure.
fig_path = FIG_DIR / f"forecast_comparison_{FORECAST_MONTHS}m.png"
fig.savefig(fig_path, dpi=160)
plt.show()
print("Saved figure:", fig_path)


In [None]:
def _trend_label(slope):
    """Classify a fitted slope as "upward", "downward" or "flat"."""
    if slope > 0:
        return "upward"
    if slope < 0:
        return "downward"
    return "flat"


# --- Trend slope (simple linear fit) on each forecast
# The slope is per forecast step, i.e. price units per future business day.
x_ar = np.arange(len(arima_future))
slope_arima = np.polyfit(x_ar, arima_future["ARIMA_Forecast"].values, 1)[0]

x_ls = np.arange(len(lstm_future))
slope_lstm = np.polyfit(x_ls, lstm_future["LSTM_Forecast"].values, 1)[0]

# --- ARIMA CI width stats
ci_width = arima_future["ARIMA_Upper"] - arima_future["ARIMA_Lower"]
start_width = ci_width.iloc[0]
end_width = ci_width.iloc[-1]
ci_stats = {
    "mean_width": float(ci_width.mean()),
    "median_width": float(ci_width.median()),
    "min_width": float(ci_width.min()),
    "max_width": float(ci_width.max()),
    "start_width": float(start_width),
    "end_width": float(end_width),
    # Relative widening from first to last step; undefined when the first width is zero.
    "pct_increase_start_to_end": float((end_width - start_width) / start_width * 100 if start_width != 0 else np.nan)
}

# --- Textual analysis
trend_arima = _trend_label(slope_arima)
trend_lstm = _trend_label(slope_lstm)

analysis_lines = [
    f"Ticker: {TICKER}",
    f"Forecast horizon: {FORECAST_MONTHS} months (~{H} trading days)",
    "",
    "Trend Analysis:",
    f" - ARIMA trend: {trend_arima} (slope={slope_arima:.4f})",
    f" - LSTM  trend: {trend_lstm}  (slope={slope_lstm:.4f})",
    "",
    "Volatility & Uncertainty (ARIMA 95% CI):",
    f" - Mean CI width:  {ci_stats['mean_width']:.2f}",
    f" - Median CI width:{ci_stats['median_width']:.2f}",
    f" - Start width:    {ci_stats['start_width']:.2f}",
    f" - End width:      {ci_stats['end_width']:.2f}",
    f" - Min/Max width:  {ci_stats['min_width']:.2f} / {ci_stats['max_width']:.2f}",
    f" - % change (start→end): {ci_stats['pct_increase_start_to_end']:.2f}%",
    "",
    "Interpretation:",
    " - Confidence intervals typically widen across the horizon, reflecting increasing uncertainty in longer-term forecasts.",
    " - Wider CIs imply lower reliability for point estimates farther into the future; use ranges instead of single values for decisions.",
    "",
    "Opportunities & Risks (Illustrative):",
]

# The opportunity line is used as soon as either model trends upward.
if slope_arima > 0 or slope_lstm > 0:
    analysis_lines.append(" - Opportunity: Upward trend suggests potential price appreciation if macro/firm-specific conditions remain supportive.")
else:
    analysis_lines.append(" - Risk: Downward/flat trend may indicate headwinds or limited upside; caution on long exposure.")

analysis_lines += [
    " - Risk: CI widening highlights scenario risk; consider hedging and rebalancing thresholds.",
    " - Consider sensitivity to shocks (earnings, rate changes, macro events).",
]

summary_txt = "\n".join(analysis_lines)
summary_path = FINAL_DIR / f"forecast_analysis_summary_{FORECAST_MONTHS}m.txt"
# The summary contains non-ASCII characters (e.g. the arrow in "start→end").
# Writing with the platform default encoding fails on locales such as cp1252,
# so the encoding is pinned to UTF-8.
summary_path.write_text(summary_txt, encoding="utf-8")

print(summary_txt)
print("\nSaved summary:", summary_path)


In [None]:
# Put both forecasts side by side, keyed on Date. The outer join keeps any
# date that appears in either table.
arima_part = arima_future.rename(columns={"ARIMA_Forecast": "ARIMA"})
lstm_part = lstm_future.rename(columns={"LSTM_Forecast": "LSTM"})
combined = arima_part.merge(lstm_part, on="Date", how="outer").sort_values("Date")

combined_path = CSV_DIR / f"forecast_combined_{FORECAST_MONTHS}m.csv"
combined.to_csv(combined_path, index=False)
print("Saved combined forecast CSV:", combined_path)
combined.head()
