In [None]:
# =========================
# Training Data Preparation
# (post feature-engineering)
# =========================

import os, io
import pandas as pd
from azure.storage.blob import BlobServiceClient
from sklearn.preprocessing import OneHotEncoder

# ---------- CONFIG ----------
import os
from dotenv import load_dotenv

load_dotenv()  # loads .env file if present (for local dev)

# The storage connection string comes from the environment (or .env), so no
# credential is written into the notebook itself.
AZURE_CONNECTION_STRING = os.getenv("AZURE_CONN_STR")
if not AZURE_CONNECTION_STRING:
    # os.getenv returns None when the variable is missing; stop here with an
    # actionable message instead of handing None/"" to the Azure client below.
    raise RuntimeError(
        "Environment variable AZURE_CONN_STR is not set; "
        "export it or add it to a .env file before running this notebook."
    )
CONTAINER_NAME = "stock-data"

INPUT_FOLDER  = "model_ready_data"     # blob prefix of the feature-engineered parquet files
OUTPUT_LOCAL  = "training_data"        # local folder to write splits
OUTPUT_BLOB   = "training_data"        # blob prefix the splits are uploaded under

TARGET_HORIZON_DAYS   = 252            # rows to look ahead per ticker (~12 months)
HIGH_GROWTH_THRESHOLD = 0.25           # >=25% next-yr return => class 1
DROP_OUTLIER_FLAGS    = False          # set True if you want to drop *_is_outlier cols
UPLOAD_TO_BLOB        = True           # set False to skip pushing splits back to Azure

# Make sure the local output folder exists before anything is written to it.
os.makedirs(OUTPUT_LOCAL, exist_ok=True)

# ---------- CONNECT ----------
blob_service = BlobServiceClient.from_connection_string(AZURE_CONNECTION_STRING)
container    = blob_service.get_container_client(CONTAINER_NAME)

# ---------- LOAD ALL MODEL-READY PARQUETS ----------
print("📥 Loading feature-engineered parquet files...")
# NOTE(review): name_starts_with is a plain prefix match, so a sibling prefix
# such as "<INPUT_FOLDER>_old/" would be picked up too — confirm none exist.
blob_names = [
    b.name for b in container.list_blobs(name_starts_with=INPUT_FOLDER)
    if b.name.lower().endswith(".parquet")
]
if not blob_names:
    # Without this guard the pd.concat call below is handed an empty list and
    # fails with an error that does not mention the missing input.
    raise FileNotFoundError(
        f"No .parquet blobs found under prefix '{INPUT_FOLDER}' "
        f"in container '{CONTAINER_NAME}'."
    )

# Download each blob fully into memory and parse it as parquet.
dfs = []
for name in blob_names:
    payload = container.download_blob(name).readall()  # raw parquet bytes
    dfs.append(pd.read_parquet(io.BytesIO(payload)))

data = pd.concat(dfs, ignore_index=True)
# ensure chronological order per ticker
data = data.sort_values(["Ticker", "Date"]).reset_index(drop=True)

# enforce dtypes
# NOTE(review): astype(str) may turn missing values into literal strings such
# as "nan", which would later get their own one-hot column — confirm that
# Sector has no nulls.
data["Ticker"] = data["Ticker"].astype(str)
data["Sector"] = data["Sector"].astype(str)
print(f"✅ Loaded {len(data):,} rows from {len(blob_names)} files.")

# ---------- TARGETS (no leakage) ----------
# Per ticker, the Adj Close found TARGET_HORIZON_DAYS rows later is compared
# with the current one: future_return = AdjClose[t+h] / AdjClose[t] - 1
adj_close_ahead = data.groupby("Ticker")["Adj Close"].shift(-TARGET_HORIZON_DAYS)
adj_close_now = data["Adj Close"]
data["future_return"] = adj_close_ahead / adj_close_now - 1

# Binary class: 1 when the forward return reaches the threshold, else 0.
meets_threshold = data["future_return"] >= HIGH_GROWTH_THRESHOLD
data["high_growth_label"] = meets_threshold.astype(int)

# Rows without a forward price (e.g. the last rows of each ticker) carry a NaN
# future_return and are removed.
before = len(data)
data = data.dropna(subset=["future_return"])
data = data.reset_index(drop=True)
print(f"🧼 Dropped {before - len(data):,} tail rows with no future label.")

# ---------- OPTIONAL: drop outlier flag columns ----------
# Only active when DROP_OUTLIER_FLAGS is switched on in the config.
if DROP_OUTLIER_FLAGS:
    outlier_cols = list(
        filter(lambda col: col.endswith("_is_outlier"), data.columns)
    )
    data = data.drop(columns=outlier_cols)
    print(f"🧹 Dropped {len(outlier_cols)} outlier-flag columns.")

# ---------- ONE-HOT ENCODE SECTOR ----------
from packaging import version
import sklearn
from sklearn.preprocessing import OneHotEncoder

# scikit-learn >= 1.2 is given `sparse_output=False`, older releases
# `sparse=False`; either way the encoder returns a dense array.
uses_sparse_output = version.parse(sklearn.__version__) >= version.parse("1.2")
dense_kwarg = "sparse_output" if uses_sparse_output else "sparse"
ohe = OneHotEncoder(handle_unknown="ignore", **{dense_kwarg: False})

# One indicator column per fitted Sector category, aligned to data's index.
sector_mat = ohe.fit_transform(data[["Sector"]])
sector_cols = []
for cat in ohe.categories_[0]:
    sector_cols.append(f"Sector_{cat}")
sector_df = pd.DataFrame(sector_mat, index=data.index, columns=sector_cols)

# Swap the raw Sector column for its indicator columns.
data_without_sector = data.drop(columns=["Sector"])
data = pd.concat([data_without_sector, sector_df], axis=1)

# ---------- FEATURE LIST ----------
# Every column except the identifier/target columns below counts as a feature.
exclude = {"Date", "Ticker", "future_return", "high_growth_label"}
feature_cols = []
for col in data.columns:
    if col in exclude:
        continue
    feature_cols.append(col)
print(f"🧩 Using {len(feature_cols)} features.")

# Persist the feature names next to the splits so training scripts can read them.
feature_list_path = os.path.join(OUTPUT_LOCAL, "feature_columns.csv")
feature_series = pd.Series(feature_cols, name="features")
feature_series.to_csv(feature_list_path, index=False)

# ---------- TIME SPLIT PER TICKER (70/15/15) ----------
def split_by_time(df_t, train_frac=0.70, val_frac=0.15):
    """Cut a frame into contiguous train / validation / test slices by position.

    Slicing is purely positional (``iloc``), so the caller must pass rows that
    are already in chronological order.

    Parameters
    ----------
    df_t : pandas.DataFrame
        Rows to split (in this notebook: the rows of a single ticker).
    train_frac : float, default 0.70
        Fraction of rows, counted from the start, assigned to the train slice.
    val_frac : float, default 0.15
        Fraction of rows that follow the train slice and form the validation
        slice. Whatever remains becomes the test slice.

    Returns
    -------
    tuple of (train, val, test)
        Three non-overlapping, consecutive slices of ``df_t``.

    Raises
    ------
    ValueError
        If a fraction lies outside [0, 1] or the two fractions sum to more than 1.
    """
    fractions_ok = (
        0.0 <= train_frac <= 1.0
        and 0.0 <= val_frac <= 1.0
        and train_frac + val_frac <= 1.0
    )
    if not fractions_ok:
        raise ValueError(
            "train_frac and val_frac must each be in [0, 1] and sum to at most 1; "
            f"got train_frac={train_frac}, val_frac={val_frac}"
        )

    n = len(df_t)
    # int() truncates, so rows left over from rounding end up in the later slices.
    i1 = int(train_frac * n)
    i2 = int((train_frac + val_frac) * n)
    return df_t.iloc[:i1], df_t.iloc[i1:i2], df_t.iloc[i2:]

# Split every ticker separately, then stack the per-ticker pieces.
# NOTE(review): future_return looks TARGET_HORIZON_DAYS rows ahead and the
# slices are contiguous per ticker, so the last train rows of each ticker are
# labelled with prices that fall inside the val slice (same for val -> test).
# Consider purging that many rows at each boundary — confirm with the modelling owner.
train_parts, val_parts, test_parts = [], [], []
buckets = (train_parts, val_parts, test_parts)
for _ticker, ticker_rows in data.groupby("Ticker", sort=False):
    ordered_rows = ticker_rows.sort_values("Date")
    for bucket, piece in zip(buckets, split_by_time(ordered_rows)):
        bucket.append(piece)

train_df, val_df, test_df = (
    pd.concat(parts).reset_index(drop=True)
    for parts in buckets
)

print(f"📊 Splits -> Train: {train_df.shape}, Val: {val_df.shape}, Test: {test_df.shape}")

# ---------- SAVE LOCALLY ----------
# One parquet file per split inside OUTPUT_LOCAL.
train_path, val_path, test_path = (
    os.path.join(OUTPUT_LOCAL, file_name)
    for file_name in ("train.parquet", "val.parquet", "test.parquet")
)

# Written in train -> val -> test order, without the index.
for split_df, split_path in (
    (train_df, train_path),
    (val_df, val_path),
    (test_df, test_path),
):
    split_df.to_parquet(split_path, index=False)

print(f"💾 Saved:\n - {train_path}\n - {val_path}\n - {test_path}")

# ---------- OPTIONAL: UPLOAD TO AZURE ----------
# Pushes the three local split files under the OUTPUT_BLOB prefix, replacing
# any blob that already has the same name.
if UPLOAD_TO_BLOB:
    uploads = (
        (train_path, "train.parquet"),
        (val_path, "val.parquet"),
        (test_path, "test.parquet"),
    )
    for local_fp, blob_base in uploads:
        blob_name = f"{OUTPUT_BLOB}/{blob_base}"
        with open(local_fp, "rb") as f:
            container.upload_blob(name=blob_name, data=f, overwrite=True)
        print(f"☁️  Uploaded: {blob_name}")

print("✅ Training data preparation complete.")


📥 Loading feature-engineered parquet files...
✅ Loaded 4,028,304 rows from 403 files.
🧼 Dropped 101,556 tail rows with no future label.
🧩 Using 52 features.
📊 Splits -> Train: (2748516, 56), Val: (589040, 56), Test: (589192, 56)
💾 Saved:
 - training_data/train.parquet
 - training_data/val.parquet
 - training_data/test.parquet
✅ Training data preparation complete.


In [None]:
import pandas as pd

# Locations of the three parquet splits
train_path = "training_data/train.parquet"
val_path   = "training_data/val.parquet"
test_path  = "training_data/test.parquet"

# Read each split back from disk
train_df, val_df, test_df = (
    pd.read_parquet(split_path)
    for split_path in (train_path, val_path, test_path)
)

# Row/column counts per split
for caption, frame in (
    ("Train shape:", train_df),
    ("Val shape:", val_df),
    ("Test shape:", test_df),
):
    print(caption, frame.shape)

# First rows of every split
for banner, frame in (
    ("\n--- TRAIN HEAD ---", train_df),
    ("\n--- VAL HEAD ---", val_df),
    ("\n--- TEST HEAD ---", test_df),
):
    print(banner)
    print(frame.head())

# Column dtypes, taken from the training split
print("\n--- Column dtypes ---")
print(train_df.dtypes)

# Class shares of high_growth_label, printed in train / val / test order
print("\nHigh Growth label counts:")
for frame in (train_df, val_df, test_df):
    print(frame["high_growth_label"].value_counts(normalize=True))


Train shape: (2748516, 56)
Val shape: (589040, 56)
Test shape: (589192, 56)

--- TRAIN HEAD ---
                       Date       Open       High        Low      Close  \
0 2000-01-06 05:00:00+00:00  44.080830  44.349072  41.577251  42.918453   
1 2000-01-07 05:00:00+00:00  42.247852  47.165592  42.203148  46.494991   
2 2000-01-10 05:00:00+00:00  49.356224  49.803291  48.327969  49.311516   
3 2000-01-11 05:00:00+00:00  49.311516  49.311516  47.523247  48.640915   
4 2000-01-12 05:00:00+00:00  48.640915  48.640915  45.824390  47.657368   

   Adj Close   Volume Ticker     SMA_20     EMA_20  ...  \
0  35.999908  2534434      A  33.254918  34.867515  ...   
1  38.999901  2819626      A  33.830542  35.261076  ...   
2  41.362392  2148446      A  34.556165  35.842153  ...   
3  40.799900  1855985      A  35.231163  36.314320  ...   
4  39.974907  1429874      A  35.939912  36.662947  ...   

   Sector_Consumer Cyclical  Sector_Consumer Defensive  Sector_Energy  \
0                       0

In [None]:
# Show the last rows of the test split. The banner says TAIL because tail()
# is what gets printed here.
print("\n--- TEST TAIL ---")
print(test_df.tail())


--- TEST HEAD ---
                            Date        Open        High         Low  \
589187 2024-07-31 04:00:00+00:00  357.779999  359.690002  348.989990   
589188 2024-08-01 04:00:00+00:00  350.420013  353.859985  334.299988   
589189 2024-08-02 04:00:00+00:00  331.799988  332.130005  319.040009   
589190 2024-08-05 04:00:00+00:00  310.570007  319.649994  305.000000   
589191 2024-08-06 04:00:00+00:00  318.470001  322.910004  316.140015   

             Close   Adj Close  Volume Ticker      SMA_20      EMA_20  ...  \
589187  351.190002  351.190002  634400   ZBRA  326.451004  327.548474  ...   
589188  342.579987  342.579987  580900   ZBRA  327.937503  328.980047  ...   
589189  322.299988  322.299988  447800   ZBRA  328.527002  328.343851  ...   
589190  317.859985  317.859985  464600   ZBRA  328.696001  327.345388  ...   
589191  316.559998  316.559998  367400   ZBRA  328.762001  326.318208  ...   

        Sector_Consumer Cyclical  Sector_Consumer Defensive  Sector_Energy  \
5