In [1]:
import pandas as pd
import numpy as np
import zipfile
import os

MAGNIFICENT_7 = {'AAPL', 'MSFT', 'GOOG', 'AMZN', 'NVDA', 'META', 'TSLA'}

CACHE = {}

def extract_and_load(zip_path: str, filenames: list):
    """Extracts files from ZIP and loads them into DataFrames. Uses caching."""
    global CACHE
    
    if zip_path in CACHE:
        print("Using cached data...")
        return CACHE[zip_path]
    
    dataframes = {}
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        for filename in filenames:
            with zip_ref.open(filename) as file:
                dataframes[filename] = pd.read_csv(file, delimiter=';', header=0)
    
    CACHE[zip_path] = dataframes
    return dataframes

def load_and_process(zip_path):
    print("Loading and processing Mag 7 data from ZIP...")
    
    filenames = ['us-shareprices-daily.csv', 'us-income-quarterly.csv', 'us-balance-quarterly.csv']
    data = extract_and_load(zip_path, filenames)
    
    df = data['us-shareprices-daily.csv']
    df['date'] = pd.to_datetime(df['Date'])
    df = df[df['Ticker'].isin(MAGNIFICENT_7)][['Ticker', 'date', 'Close']].rename(columns={'Close': 'close'})
    
    income_df = data['us-income-quarterly.csv']
    income_df['date'] = pd.to_datetime(income_df['Report Date'])
    income_df = income_df[income_df['Ticker'].isin(MAGNIFICENT_7)][['Ticker', 'date', 'Net Income', 'Shares (Basic)']]
    income_df['p_e_ratio'] = np.nan
    
    balance_df = data['us-balance-quarterly.csv']
    balance_df['date'] = pd.to_datetime(balance_df['Report Date'])
    balance_df = balance_df[balance_df['Ticker'].isin(MAGNIFICENT_7)][['Ticker', 'date', 'Total Liabilities', 'Total Equity']]
    
    merged_df = df.merge(income_df, on=['Ticker', 'date'], how='left')\
                  .merge(balance_df, on=['Ticker', 'date'], how='left')\
                  .sort_values(['Ticker', 'date'])\
                  .ffill()

    merged_df['p_e_ratio'] = merged_df['close'] / (merged_df['Net Income'] / merged_df['Shares (Basic)'].replace(0, np.nan))
    merged_df['p_e_ratio'] = merged_df['p_e_ratio'].where(merged_df['p_e_ratio'] >= 0, np.nan).clip(upper=500)

    merged_df['sma_50'] = merged_df.groupby('Ticker')['close'].rolling(window=50, min_periods=50).mean().reset_index(drop=True)
    merged_df['next_day_close'] = merged_df.groupby('Ticker')['close'].shift(-1)
    merged_df['next_day_direction'] = (merged_df['next_day_close'] > merged_df['close']).astype(int)

    feature_cols = ['close', 'p_e_ratio', 'sma_50']
    for col in feature_cols:
        merged_df[col] = merged_df[col].replace([np.inf, -np.inf], np.nan)
    merged_df = merged_df.dropna(subset=feature_cols + ['next_day_direction'])

    print("Feature stats:")
    print(merged_df[feature_cols].describe())

    # Save the processed dataset locally
    merged_df.to_csv('mag7_processed_final3.csv', index=False)
    print("Saved mag7_processed_final3.csv locally.")
    
    return merged_df

def main():
    zip_path = 'data/mag7_data.zip'  # Define zip_path here so it is accessible globally
    if not os.path.exists(zip_path):
        print(f"Error: ZIP file not found at {zip_path}")
        return
    
    result = load_and_process(zip_path)  # Now zip_path is correctly passed

if __name__ == "__main__":
    main()


Loading and processing Mag 7 data from ZIP...
Feature stats:
             close    p_e_ratio       sma_50
count  8127.000000  8127.000000  8127.000000
mean    155.364700   193.980066   152.022184
std      97.308702   138.756884    94.996129
min       3.620000    27.990842     3.870600
25%      86.950000    98.693750    78.956600
50%     146.140000   129.085524   143.678800
75%     218.470000   246.427372   216.149400
max     502.300000   500.000000   411.813400
Saved mag7_processed_final3.csv locally.
