In [None]:
import os
import pandas as pd
import numpy as np
import io
from azure.storage.blob import ContainerClient, BlobClient

# =========================
# CONFIG
# =========================
import os
from dotenv import load_dotenv

load_dotenv()  # loads .env file if present (for local dev)

AZURE_CONN_STR = os.getenv("AZURE_CONN_STR")
CONTAINER = "stock-data"
MODEL_READY_FOLDER = "model_ready_data"
PREDICTIONS_FOLDER = "predictions"
OUTPUT_FOLDER = "backtest_outputs"
INITIAL_CAPITAL = 100000
TOP_N = 10
REBALANCE_FREQ = "Q"  # Q = Quarterly, M = Monthly, W = Weekly

# =========================
# METRICS
# =========================
def calculate_cagr(df):
    start_val = df["portfolio_value"].iloc[0]
    end_val = df["portfolio_value"].iloc[-1]
    years = (df["date"].iloc[-1] - df["date"].iloc[0]).days / 365.25
    return (end_val / start_val) ** (1 / years) - 1 if years > 0 else np.nan

def calculate_sharpe(df, rf_rate=0.02):
    returns = df["portfolio_value"].pct_change().dropna()
    excess = returns - rf_rate / 252
    return np.sqrt(252) * excess.mean() / excess.std() if excess.std() != 0 else np.nan

def calculate_sortino(df, rf_rate=0.02):
    returns = df["portfolio_value"].pct_change().dropna()
    downside = returns[returns < 0]
    excess = returns - rf_rate / 252
    return np.sqrt(252) * excess.mean() / downside.std() if downside.std() != 0 else np.nan

def calculate_mdd(df):
    roll_max = df["portfolio_value"].cummax()
    drawdown = (df["portfolio_value"] - roll_max) / roll_max
    return drawdown.min()

# =========================
# AZURE BLOB FUNCTIONS
# =========================
container_client = ContainerClient.from_connection_string(AZURE_CONN_STR, CONTAINER)

def read_parquet_from_blob(blob_name):
    blob_client = container_client.get_blob_client(blob_name)
    data = blob_client.download_blob().readall()
    return pd.read_parquet(io.BytesIO(data))

def upload_to_blob(local_path, blob_path):
    blob_client = container_client.get_blob_client(blob_path)
    with open(local_path, "rb") as data:
        blob_client.upload_blob(data, overwrite=True)
    print(f"📤 Uploaded to Blob: {blob_path}")

# =========================
# GET PRICE FILES MAP
# =========================
all_blobs = list(container_client.list_blobs(name_starts_with=f"{MODEL_READY_FOLDER}/"))
ticker_files = {os.path.basename(b.name).split("_")[0]: b.name for b in all_blobs if b.name.endswith(".parquet")}

# =========================
# LOAD RANKING DATA
# =========================
val_df = read_parquet_from_blob(f"{PREDICTIONS_FOLDER}/final_scored_val.parquet")
test_df = read_parquet_from_blob(f"{PREDICTIONS_FOLDER}/final_scored_test.parquet")
ranking_df = pd.concat([val_df, test_df], ignore_index=True)

ranking_df.rename(columns={
    "Date": "date",
    "Ticker": "ticker",
    "model1_prob_m1": "model1_prob",
    "model2_pred_return_m2": "model2_pred_return"
}, inplace=True)

ranking_df["date"] = pd.to_datetime(ranking_df["date"])
ranking_df.sort_values(["date", "final_score"], ascending=[True, False], inplace=True)

# =========================
# BACKTEST
# =========================
portfolio_value = INITIAL_CAPITAL
portfolio_history = []
trade_signals = []  # one row per holding per rebalance with action+sector
price_cache = {}
sector_cache = {}

available_dates = sorted(ranking_df["date"].unique())

# Determine rebalancing schedule
if REBALANCE_FREQ == "Q":
    rebalance_dates = pd.date_range(start=available_dates[0], end=available_dates[-1], freq="QS")
elif REBALANCE_FREQ == "M":
    rebalance_dates = pd.date_range(start=available_dates[0], end=available_dates[-1], freq="MS")
elif REBALANCE_FREQ == "W":
    rebalance_dates = pd.date_range(start=available_dates[0], end=available_dates[-1], freq="W")
else:
    rebalance_dates = pd.to_datetime(available_dates)

current_holdings = []

for i, rebalance_date in enumerate(rebalance_dates):
    # Nearest available trading date
    day_df = ranking_df[ranking_df["date"] >= rebalance_date]
    if day_df.empty:
        continue
    rebalance_actual_date = day_df["date"].min()

    # Pick top N tickers
    top_stocks = day_df[day_df["date"] == rebalance_actual_date].sort_values("final_score", ascending=False).head(TOP_N)
    new_holdings = top_stocks["ticker"].tolist()

    buys = [t for t in new_holdings if t not in current_holdings]
    sells = [t for t in current_holdings if t not in new_holdings]
    holds = [t for t in new_holdings if t in current_holdings]

    # Ensure price+sector info cached
    for ticker in set(new_holdings + sells):
        if ticker not in ticker_files:
            continue
        if ticker not in price_cache:
            df_price = read_parquet_from_blob(ticker_files[ticker])
            df_price.rename(columns={"Date": "date"}, inplace=True)
            df_price["date"] = pd.to_datetime(df_price["date"])
            price_cache[ticker] = df_price
            sector_cache[ticker] = df_price["Sector"].iloc[0] if "Sector" in df_price.columns else "Unknown"

    # Add trade signal rows
    for ticker in buys:
        trade_signals.append({
            "rebalance_date": rebalance_actual_date,
            "ticker": ticker,
            "sector": sector_cache.get(ticker, "Unknown"),
            "action": "BUY"
        })
    for ticker in holds:
        trade_signals.append({
            "rebalance_date": rebalance_actual_date,
            "ticker": ticker,
            "sector": sector_cache.get(ticker, "Unknown"),
            "action": "HOLD"
        })
    for ticker in sells:
        trade_signals.append({
            "rebalance_date": rebalance_actual_date,
            "ticker": ticker,
            "sector": sector_cache.get(ticker, "Unknown"),
            "action": "SELL"
        })

    current_holdings = new_holdings
    equal_allocation = portfolio_value / len(current_holdings) if current_holdings else 0

    # Advance until next rebalance
    next_rebalance_date = (rebalance_dates[i + 1]
                           if i + 1 < len(rebalance_dates)
                           else available_dates[-1])

    current_date = rebalance_actual_date
    while current_date <= next_rebalance_date:
        day_returns = []
        for ticker in current_holdings:
            df_price = price_cache.get(ticker)
            if df_price is None:
                continue

            price_col = next((c for c in ["Adj Close", "adj_close", "Close", "close"] if c in df_price.columns), None)
            if price_col is None:
                continue

            today_idx = df_price["date"].searchsorted(current_date)
            next_idx = today_idx + 1 if today_idx + 1 < len(df_price) else None
            if next_idx is None:
                continue

            today_price = df_price.iloc[today_idx][price_col]
            next_price = df_price.iloc[next_idx][price_col]
            ret = (next_price - today_price) / today_price
            day_returns.append(ret)

        if day_returns:
            portfolio_value *= (1 + np.mean(day_returns))

        portfolio_history.append({"date": current_date, "portfolio_value": portfolio_value})
        current_date += pd.Timedelta(days=1)

# =========================
# RESULTS
# =========================
results_df = pd.DataFrame(portfolio_history).drop_duplicates(subset="date").sort_values("date")
if results_df.empty:
    raise ValueError("Portfolio history is empty — check date alignment.")

cagr = calculate_cagr(results_df)
sharpe = calculate_sharpe(results_df)
sortino = calculate_sortino(results_df)
mdd = calculate_mdd(results_df)

print(f"CAGR: {cagr:.2%}")
print(f"Sharpe Ratio: {sharpe:.2f}")
print(f"Sortino Ratio: {sortino:.2f}")
print(f"Max Drawdown: {mdd:.2%}")

# Save locally
results_df.to_csv("backtest_results.csv", index=False)
pd.DataFrame(trade_signals).to_csv("trade_signals.csv", index=False)
pd.DataFrame({"CAGR": [cagr], "Sharpe": [sharpe], "Sortino": [sortino], "Max_Drawdown": [mdd]}).to_csv("backtest_summary.csv", index=False)

# Upload to Blob
upload_to_blob("backtest_results.csv", f"{OUTPUT_FOLDER}/backtest_results.csv")
upload_to_blob("trade_signals.csv", f"{OUTPUT_FOLDER}/trade_signals.csv")
upload_to_blob("backtest_summary.csv", f"{OUTPUT_FOLDER}/backtest_summary.csv")

print("✅ Backtest complete & uploaded to Blob.")


CAGR: 55.29%
Sharpe Ratio: 1.18
Sortino Ratio: 1.68
Max Drawdown: -48.64%
📤 Uploaded to Blob: backtest_outputs/backtest_results.csv
📤 Uploaded to Blob: backtest_outputs/trade_signals.csv
📤 Uploaded to Blob: backtest_outputs/backtest_summary.csv
✅ Backtest complete & uploaded to Blob.
