<a href="https://colab.research.google.com/github/edithluv/Debunking-Market-Myths-Capstone-Project-Submission/blob/main/Debunking-Market-Myths-Capstone-Project-Submission.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
import pandas as pd
import numpy as np
import yfinance as yf
from scipy.stats import ttest_1samp
import warnings
import traceback # Import traceback for detailed error printing

# Optional: silence FutureWarning and RuntimeWarning for cleaner notebook output.
# NOTE(review): these filters are process-wide from this point on, so warnings of
# these two categories raised by later code (e.g. pandas deprecations or numeric
# warnings) will not be shown either -- comment the lines out when debugging.
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)

def get_price_data(data_downloaded, ticker, column_preference=None):
    """
    Return the first preferred price column available for ``ticker``.

    The lookup handles three column layouts:

    * multi-level, field first  -- e.g. ``('Adj Close', '^SP500TR')``
    * multi-level, ticker first -- e.g. ``('^SP500TR', 'Adj Close')``
    * single-level              -- e.g. ``'Adj Close'``

    Args:
        data_downloaded: DataFrame of downloaded price data.
        ticker: Ticker symbol used as the second (or first) column level.
        column_preference: Column names to try, in priority order.
            Defaults to ``['Adj Close', 'Close']``.

    Returns:
        The selected column (``data_downloaded[...]``), or ``None`` when none
        of the preferred columns exists in any supported layout. A line
        describing the choice (or a warning) is printed in both cases.
    """
    # A None sentinel avoids sharing one mutable list across calls.
    if column_preference is None:
        column_preference = ['Adj Close', 'Close']

    columns = data_downloaded.columns

    # Multi-level columns, field first: ('Adj Close', '^SP500TR').
    for col_name in column_preference:
        column_tuple = (col_name, ticker)
        if column_tuple in columns:
            print(f"Using multi-level column: {column_tuple}")
            return data_downloaded[column_tuple]

    # Multi-level columns, ticker first: ('^SP500TR', 'Adj Close').
    for col_name in column_preference:
        column_tuple = (ticker, col_name)
        if column_tuple in columns:
            print(f"Using multi-level column: {column_tuple}")
            return data_downloaded[column_tuple]

    # Single-level columns. A bare name can also "match" the first level of a
    # MultiIndex, which would select a sub-frame rather than one price column,
    # so this branch is only taken for a flat column index.
    if not isinstance(columns, pd.MultiIndex):
        for col_name in column_preference:
            if col_name in columns:
                print(f"Using single-level column: {col_name}")
                return data_downloaded[col_name]

    # None of the preferred columns is present in any supported layout.
    print(f"Warning: Neither {column_preference} found for {ticker}. Available columns: {list(data_downloaded.columns)}")
    return None


def analyze_myth1(ticker='^SP500TR', start_date='1970-01-01', end_date='2023-12-31', rolling_years=20):
    """
    Test Myth 1: every ``rolling_years``-year holding period had a positive return.

    Monthly prices for ``ticker`` are downloaded with yfinance, converted to
    month-over-month returns, compounded over each rolling window of
    ``rolling_years * 12`` months and annualized. The myth counts as supported
    when the smallest rolling annualized return is strictly positive.

    Args:
        ticker: Symbol passed to ``yf.download``.
        start_date: Start of the download period.
        end_date: End of the download period.
        rolling_years: Length of the rolling holding period in years; must be
            positive.

    Returns:
        ``True`` if the minimum rolling annualized return is above zero,
        ``False`` if it is not, or ``None`` when the analysis could not be
        completed (invalid window, failed/empty download, missing price
        column, too little data, NaN result, or an unexpected exception).
    """
    print("\n--- Analyzing Myth 1 ---")
    print(f"Ticker: {ticker}, Rolling Window: {rolling_years} years, Period: {start_date} to {end_date}")

    # Reject a non-positive window before any network call; previously this
    # only surfaced later as an "unexpected error" from the rolling math.
    if rolling_years <= 0:
        print(f"Error: rolling_years must be positive (got {rolling_years}).")
        return None

    try:
        data_downloaded = yf.download(ticker, start=start_date, end=end_date, interval='1mo', progress=False)

        # --- Diagnostic Start ---
        print(f"yf.download returned object of type: {type(data_downloaded)}")
        if isinstance(data_downloaded, pd.DataFrame):
            print(f"DataFrame is empty: {data_downloaded.empty}")
            print(f"DataFrame shape: {data_downloaded.shape}")
            print(f"DataFrame columns: {list(data_downloaded.columns)}")
        # --- Diagnostic End ---

        if not isinstance(data_downloaded, pd.DataFrame) or data_downloaded.empty:
            print(f"Error: Download failed or returned empty data for ticker '{ticker}'.")
            return None

        price_data = get_price_data(data_downloaded, ticker, ['Adj Close', 'Close'])
        if price_data is None:
            print(f"Error: Could not find suitable price data ('Adj Close' or 'Close') for {ticker}.")
            return None

        # Report the column that was actually selected. Its name is a tuple
        # for multi-level columns and a plain string for single-level ones, so
        # checking only for the ('Adj Close', ticker) tuple would mislabel the
        # single-level case.
        selected_name = getattr(price_data, 'name', None)
        name_parts = selected_name if isinstance(selected_name, tuple) else (selected_name,)
        price_label = 'Adj Close' if 'Adj Close' in name_parts else 'Close'

        monthly_returns = price_data.pct_change().dropna()
        if monthly_returns.empty:
            print(f"Error: Not enough data to calculate returns. Original price data had {price_data.shape[0]} rows.")
            return None

        rolling_window = rolling_years * 12
        if len(monthly_returns) < rolling_window:
            print(f"Error: Insufficient data ({len(monthly_returns)} months) for {rolling_window}-month rolling window.")
            return None

        # Compound the monthly growth factors inside each window, then convert
        # the window's total growth into an equivalent per-year rate.
        window_growth = (1 + monthly_returns).rolling(window=rolling_window).apply(np.prod, raw=True)
        annualized_returns = window_growth ** (12 / rolling_window) - 1
        min_rolling_return = annualized_returns.min()

        if pd.isna(min_rolling_return):
            print("Error: Calculation resulted in NaN minimum return.")
            return None

        print(f"Minimum observed {rolling_years}-year rolling annualized return: {min_rolling_return:.4f}")
        # Plain bool so callers do not receive a numpy scalar.
        result = bool(min_rolling_return > 0)
        print(f"Conclusion: Myth 1 (always positive {rolling_years}yr return) is {'supported' if result else 'NOT supported'} by historical data (using {price_label} price).")
        return result

    except Exception as e:
        # Top-level boundary of the analysis: report and signal failure
        # instead of aborting the whole report.
        print(f"An unexpected error occurred during Myth 1 analysis: {e}")
        traceback.print_exc()
        return None


def analyze_myth2(ticker='SPY', start_date='2010-01-01', end_date='2023-12-31', commission_per_share=0.005):
    """
    Test Myth 2: a simple momentum day trade is reliably profitable after costs.

    Strategy: if the previous trading day's close-to-close return was positive,
    buy at today's open and sell at today's close; otherwise stay flat (return
    0 for the day). The net daily returns are tested against zero with a
    one-sample t-test.

    Two details matter for correctness:

    * The signal for day t uses the return that ended on day t-1. Using day
      t's own close-to-close return would look ahead at the close of the very
      day being traded.
    * ``commission_per_share`` is a dollar amount per share, so it is divided
      by the entry (open) price to express it as a fraction of capital before
      it is subtracted from the fractional return.

    Both multi-level ``(field, ticker)`` and single-level column layouts are
    accepted; 'Adj Close' is used for the signal when present, else 'Close'.

    Args:
        ticker: Symbol passed to ``yf.download``.
        start_date: Start of the download period.
        end_date: End of the download period.
        commission_per_share: Round-trip trading cost in dollars per share.

    Returns:
        ``True`` when the mean net daily return is positive and the t-test
        p-value is below 0.05, ``False`` otherwise, or ``None`` when the
        analysis could not be completed.
    """
    print("\n--- Analyzing Myth 2 ---")
    print(f"Ticker: {ticker}, Strategy: Momentum Day Trade, Costs: ${commission_per_share}/share round-trip, Period: {start_date} to {end_date}")
    try:
        data_downloaded = yf.download(ticker, start=start_date, end=end_date, interval='1d', progress=False)

        # --- Diagnostic Start ---
        print(f"yf.download returned object of type: {type(data_downloaded)}")
        if isinstance(data_downloaded, pd.DataFrame):
            print(f"DataFrame is empty: {data_downloaded.empty}")
            print(f"DataFrame shape: {data_downloaded.shape}")
            print(f"DataFrame columns: {list(data_downloaded.columns)}")
        # --- Diagnostic End ---

        if not isinstance(data_downloaded, pd.DataFrame) or data_downloaded.empty:
            print(f"Error: Download failed or returned empty data for ticker '{ticker}'.")
            return None

        available_columns = data_downloaded.columns
        has_multilevel_columns = isinstance(available_columns, pd.MultiIndex)

        def column_key(metric):
            # Multi-level frames are addressed as (field, ticker); flat frames by field name.
            return (metric, ticker) if has_multilevel_columns else metric

        signal_price_metric = 'Adj Close' if column_key('Adj Close') in available_columns else 'Close'
        print(f"Using '{signal_price_metric}' price for signal generation.")

        # 'Open'/'Close' drive the intraday return; the signal column may be
        # 'Close' again, so duplicates are skipped while keeping order.
        required_keys = []
        for metric in ('Open', 'Close', signal_price_metric):
            key = column_key(metric)
            if key not in required_keys:
                required_keys.append(key)

        missing = [key for key in required_keys if key not in available_columns]
        if missing:
            print(f"Error: Missing required columns in downloaded data for '{ticker}'. Missing: {missing}. Available: {list(data_downloaded.columns)}")
            return None

        open_price = data_downloaded[column_key('Open')]
        close_price = data_downloaded[column_key('Close')]
        signal_price = data_downloaded[column_key(signal_price_metric)]

        # pct_change() on day t is the return from close(t-1) to close(t).
        # Shifting by one row makes the value on day t the *previous* day's
        # return, which is known before day t's open.
        return_prev_day = signal_price.pct_change().shift(1)
        # NaN compares False, so days without a usable signal are "no trade".
        trade_signal = return_prev_day > 0

        # Open-to-close return earned on days the strategy is in the market.
        intraday_return = (close_price / open_price) - 1

        # Dollars per share -> fraction of the capital committed at the open.
        transaction_cost = commission_per_share / open_price

        # Net return on traded days, exactly 0 on flat days.
        net_daily_return = (intraday_return - transaction_cost).where(trade_signal, 0.0)

        # Traded days with missing open/close data produce NaN and are dropped.
        net_returns_for_test = net_daily_return.dropna()
        if net_returns_for_test.empty:
            print("Error: No valid net returns available for t-test after dropping NaNs.")
            return None

        avg_net_return = net_returns_for_test.mean()
        t_stat, p_value = ttest_1samp(net_returns_for_test, 0, nan_policy='omit')

        print(f"Average Net Daily Return: {avg_net_return:.6f}")
        print(f"T-statistic: {t_stat:.4f}, P-value: {p_value:.4f}")
        # Plain bool so callers do not receive a numpy scalar.
        result = bool(avg_net_return > 0 and p_value < 0.05)
        print(f"Conclusion: Myth 2 (consistent net profit from strategy) is {'supported' if result else 'NOT supported'} by statistical test (using {signal_price_metric} for signal).")
        return result

    except Exception as e:
        # Top-level boundary of the analysis: report and signal failure
        # instead of aborting the whole report.
        print(f"An unexpected error occurred during Myth 2 analysis: {e}")
        traceback.print_exc()
        return None


def analyze_myth3(assets=None,
                  start_date='2005-01-01', end_date='2023-12-31',
                  risk_free_rate_annual=0.01):
    """
    Test Myth 3: higher volatility goes hand in hand with a higher Sharpe ratio.

    Monthly prices are downloaded for every asset, converted to monthly
    returns and restricted to the months for which all assets have a return.
    For each asset the function computes the annualized mean return
    (mean * 12), annualized volatility (std * sqrt(12)) and the Sharpe ratio
    against ``risk_free_rate_annual``. The myth counts as supported when the
    Sharpe ratio never decreases as assets are ordered by volatility.

    Args:
        assets: Mapping of display name -> ticker. Defaults to
            ``{'Low Vol': 'BIL', 'Med Vol': 'SPY', 'High Vol': 'IWM'}``.
        start_date: Start of the download period.
        end_date: End of the download period.
        risk_free_rate_annual: Annual risk-free rate used in the Sharpe ratio.

    Returns:
        DataFrame indexed by asset name with columns 'Ticker',
        'Annualized Return', 'Annualized Volatility' and 'Sharpe Ratio', or
        ``None`` when no usable return data could be assembled. Assets whose
        download or processing fails are skipped with a warning.
    """
    # A None sentinel avoids sharing one mutable dict across calls.
    if assets is None:
        assets = {'Low Vol': 'BIL', 'Med Vol': 'SPY', 'High Vol': 'IWM'}

    print("\n--- Analyzing Myth 3 ---")
    print(f"Assets: {assets}, Risk-Free Rate: {risk_free_rate_annual:.2%}, Period: {start_date} to {end_date}")

    returns_by_asset = {}

    for name, ticker in assets.items():
        print(f"Processing asset: {name} ({ticker})")
        try:
            data_downloaded = yf.download(ticker, start=start_date, end=end_date, interval='1mo', progress=False)

            # --- Diagnostic Start ---
            print(f"  yf.download returned type: {type(data_downloaded)}")
            if isinstance(data_downloaded, pd.DataFrame):
                print(f"  DataFrame empty: {data_downloaded.empty}, shape: {data_downloaded.shape}, columns: {list(data_downloaded.columns)}")
            # --- Diagnostic End ---

            if not isinstance(data_downloaded, pd.DataFrame) or data_downloaded.empty:
                print(f"  Warning: Download failed or returned empty data for {ticker} ({name}). Skipping.")
                continue

            price_data = get_price_data(data_downloaded, ticker, ['Adj Close', 'Close'])
            if price_data is None:
                print(f"  Warning: Could not find suitable price data for {ticker} ({name}). Skipping.")
                continue

            # Report the column that was actually selected; its name is a
            # tuple for multi-level columns and a plain string otherwise.
            selected_name = getattr(price_data, 'name', None)
            name_parts = selected_name if isinstance(selected_name, tuple) else (selected_name,)
            price_label = 'Adj Close' if 'Adj Close' in name_parts else 'Close'

            returns_by_asset[name] = price_data.pct_change()
            print(f"  Successfully processed {ticker} ({name}) using {price_label} price.")

        except Exception as e:
            # Best effort per asset: keep going, but show the traceback like
            # the other analyses do so the failure can be diagnosed.
            print(f"  Warning: Error processing {ticker} ({name}): {e}")
            traceback.print_exc()

    if not returns_by_asset:
        print("Error: No return data available for any asset after download attempts.")
        return None

    # One column per asset (dict keys become column names); dropping rows with
    # any NaN keeps only the months for which every asset has a return.
    all_monthly_returns = pd.concat(returns_by_asset, axis=1).dropna()
    if all_monthly_returns.empty:
        print("Error: All return data resulted in NaN after dropping initial NaNs.")
        return None

    print(f"Shape of combined monthly returns data (after dropna): {all_monthly_returns.shape}")

    months_in_year = 12
    annualized_return = all_monthly_returns.mean() * months_in_year
    annualized_vol = all_monthly_returns.std() * np.sqrt(months_in_year)
    # Zero volatility would make the ratio infinite; treat it as undefined.
    annualized_vol_safe = annualized_vol.replace(0, np.nan)
    sharpe_ratio = (annualized_return - risk_free_rate_annual) / annualized_vol_safe

    results_df = pd.DataFrame({
        'Ticker': [assets[name] for name in annualized_return.index],
        'Annualized Return': annualized_return,
        'Annualized Volatility': annualized_vol,
        'Sharpe Ratio': sharpe_ratio,
    }, index=annualized_return.index)

    print("\nMyth 3 Results (Full Period):")
    print(results_df.to_string(float_format='{:.4f}'.format))

    results_df_dropna = results_df.dropna(subset=['Sharpe Ratio', 'Annualized Volatility'])
    if len(results_df_dropna) < 2:
        print("\nConclusion: Cannot test Myth 3 premise with fewer than two valid data points.")
        return results_df

    # Order assets from least to most volatile and require that the Sharpe
    # ratio never drops from one asset to the next.
    sorted_by_vol = results_df_dropna.sort_values(by='Annualized Volatility')
    sharpe_increases_with_vol = all(np.diff(sorted_by_vol['Sharpe Ratio'].values) >= 0)
    print(f"\nConclusion: Myth 3 (Sharpe Ratio consistently increases with Volatility) is {'supported' if sharpe_increases_with_vol else 'NOT supported'} across these assets for this period.")
    return results_df


# Main Execution Block
# Runs the three myth analyses in order and prints a one-line verdict for each,
# followed by a note on which price series the analyses rely on.
if __name__ == "__main__":
    print("===================================================")
    print(" Starting Quantitative Market Myth Analysis        ")
    print("===================================================")

    myth1_result = analyze_myth1()
    myth2_result = analyze_myth2()
    myth3_results_df = analyze_myth3()

    print("\n===================================================")
    print(" Analysis Summary Report                           ")
    print("===================================================")

    # Myths 1 and 2 return True/False, or None when the analysis failed.
    myth1_status = 'Failed/Incomplete' if myth1_result is None else f'Supported = {myth1_result}'
    myth2_status = 'Failed/Incomplete' if myth2_result is None else f'Supported = {myth2_result}'

    # Myth 3 returns the per-asset table (or None), so the verdict is derived
    # from it here: the Sharpe ratio must never decrease as volatility rises.
    if myth3_results_df is None:
        myth3_status = 'Failed/Incomplete'
    else:
        valid_rows = myth3_results_df.dropna(subset=['Sharpe Ratio', 'Annualized Volatility'])
        if len(valid_rows) < 2:
            myth3_status = 'Completed (Insufficient data for premise test)'
        else:
            sharpe_by_vol = valid_rows.sort_values(by='Annualized Volatility')['Sharpe Ratio'].values
            sharpe_increases_with_vol = all(np.diff(sharpe_by_vol) >= 0)
            myth3_status = f'Supported = {sharpe_increases_with_vol}'

    print(f"* Myth 1 (Market Always Rises Long-Term): {myth1_status}")
    print(f"* Myth 2 (Day Trading Easy Profit w/ Costs): {myth2_status}")
    print(f"* Myth 3 (Higher Volatility -> Higher Sharpe): {myth3_status}")
    if myth3_results_df is not None:
        print("  (See detailed table above for Myth 3 results if analysis completed)")

    print("===================================================")
    print("\nNote on Data Used:")
    print(" - Analysis attempted to use 'Adj Close' prices first.")
    print(" - If 'Adj Close' was unavailable, 'Close' prices were used as a fallback.")
    # The earlier wording said 'Close' ignores stock splits; Yahoo's 'Close' is
    # already split-adjusted, and with auto-adjustment it reflects dividends too.
    print(" - Yahoo 'Close' prices are split-adjusted; they exclude dividends unless")
    print("   yfinance auto-adjustment (auto_adjust=True) was in effect for the download.")
    print("===================================================")



 Starting Quantitative Market Myth Analysis        

--- Analyzing Myth 1 ---
Ticker: ^SP500TR, Rolling Window: 20 years, Period: 1970-01-01 to 2023-12-31
yf.download returned object of type: <class 'pandas.core.frame.DataFrame'>
DataFrame is empty: False
DataFrame shape: (432, 5)
DataFrame columns: [('Close', '^SP500TR'), ('High', '^SP500TR'), ('Low', '^SP500TR'), ('Open', '^SP500TR'), ('Volume', '^SP500TR')]
Using multi-level column: ('Close', '^SP500TR')
Minimum observed 20-year rolling annualized return: 0.0479
Conclusion: Myth 1 (always positive 20yr return) is supported by historical data (using Close price).

--- Analyzing Myth 2 ---
Ticker: SPY, Strategy: Momentum Day Trade, Costs: $0.005/share round-trip, Period: 2010-01-01 to 2023-12-31
yf.download returned object of type: <class 'pandas.core.frame.DataFrame'>
DataFrame is empty: False
DataFrame shape: (3522, 5)
DataFrame columns: [('Close', 'SPY'), ('High', 'SPY'), ('Low', 'SPY'), ('Open', 'SPY'), ('Volume', 'SPY')]
Using 'C