# Fundamental Data Download and Preparation

This notebook is to download the required data from yfinance once for all the interested tickers. Once we have the data, we no longer need to keep going out to the internet and request more data, reducing network load

In [1]:
%pip install yfinance
%pip install pandas

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import yfinance as yf
import pandas as pd
import os
from datetime import datetime

## Select tickers

In [3]:
import os

# Get tickers from data folder
data_files = [f for f in os.listdir('data') if f.endswith('_stock_data.csv')]
tickers = [f.replace('_stock_data.csv', '') for f in data_files]

print(f"Found tickers with technical data: {tickers}")
print(f"Total tickers found: {len(tickers)}")

Found tickers with technical data: ['AAPL', 'ACMR', 'AFRM', 'ALAB', 'AMSC', 'AMZN', 'ANET', 'APH', 'ATAT', 'ATI', 'BK', 'BRK-B', 'CCJ', 'CLS', 'COIN', 'CRCL', 'CRWD', 'C', 'DASH', 'DAVE', 'DOCS', 'FIX', 'FRFHF', 'FUTU', 'GOOGL', 'HEI', 'HOOD', 'IREN', 'JBL', 'JPM', 'KLAC', 'KNSA', 'LIF', 'META', 'MIRM', 'MSFT', 'NET', 'NVDA', 'ONC', 'OUST', 'PLTR', 'PWR', 'RBLX', 'RKLB', 'RMBS', 'RYTM', 'SFM', 'SHOP', 'SNEX', 'SNOW', 'SOFI', 'SYM', 'TBBK', 'TOST', 'TSLA', 'TSM', 'TSSI', 'URBN', 'VEEV', 'VRT', 'WMT', 'XOM', 'ZS']
Total tickers found: 63


## For each ticker, download the info and fundamental data

In [12]:
import yfinance as yf
import json
import os

def extract_key_fundamentals(ticker):
   """Extract key fundamental metrics for analysis"""
   
   ticker_obj = yf.Ticker(ticker)
   info = ticker_obj.info
   
   # Key metrics for stock analysis
   key_metrics = {
       'company_info': {
           'symbol': info.get('symbol'),
           'longName': info.get('longName'),
           'sector': info.get('sector'),
           'industry': info.get('industry')
       },
       'valuation': {
           'marketCap': info.get('marketCap'),
           'currentPrice': info.get('currentPrice'),
           'trailingPE': info.get('trailingPE'),
           'forwardPE': info.get('forwardPE'),
           'priceToBook': info.get('priceToBook'),
           'priceToSales': info.get('priceToSalesTrailing12Months')
       },
       'financial_performance': {
           'totalRevenue': info.get('totalRevenue'),
           'netIncomeToCommon': info.get('netIncomeToCommon'),
           'returnOnAssets': info.get('returnOnAssets'),
           'returnOnEquity': info.get('returnOnEquity'),
           'profitMargins': info.get('profitMargins'),
           'operatingMargins': info.get('operatingMargins')
       },
       'balance_sheet': {
           'totalCash': info.get('totalCash'),
           'totalDebt': info.get('totalDebt'),
           'bookValue': info.get('bookValue'),
           'totalCashPerShare': info.get('totalCashPerShare')
       },
       'growth_metrics': {
           'earningsGrowth': info.get('earningsGrowth'),
           'revenueGrowth': info.get('revenueGrowth'),
           'earningsQuarterlyGrowth': info.get('earningsQuarterlyGrowth')
       },
       'dividend_info': {
           'dividendRate': info.get('dividendRate'),
           'dividendYield': info.get('dividendYield'),
           'payoutRatio': info.get('payoutRatio')
       }
   }
   
   return key_metrics

# Create fundamentals folder
os.makedirs('fundamentals', exist_ok=True)

# Extract key metrics for all tickers
for ticker in tickers:
   print(f"\nProcessing {ticker}...")
   try:
       key_data = extract_key_fundamentals(ticker)
       
       # Save to file
       filename = f'fundamentals/{ticker}_key_info.json'
       with open(filename, 'w') as f:
           json.dump(key_data, f, indent=2, default=str)
       
       print(f"✅ Key fundamentals saved to {filename}")
       
   except Exception as e:
       print(f"❌ Error processing {ticker}: {e}")

print(f"\nCompleted processing fundamentals for {len(tickers)} tickers")


Processing AAPL...
✅ Key fundamentals saved to fundamentals/AAPL_key_info.json

Processing ACMR...
✅ Key fundamentals saved to fundamentals/ACMR_key_info.json

Processing AFRM...
✅ Key fundamentals saved to fundamentals/AFRM_key_info.json

Processing ALAB...
✅ Key fundamentals saved to fundamentals/ALAB_key_info.json

Processing AMSC...
✅ Key fundamentals saved to fundamentals/AMSC_key_info.json

Processing AMZN...
✅ Key fundamentals saved to fundamentals/AMZN_key_info.json

Processing ANET...
✅ Key fundamentals saved to fundamentals/ANET_key_info.json

Processing APH...
✅ Key fundamentals saved to fundamentals/APH_key_info.json

Processing ATAT...
✅ Key fundamentals saved to fundamentals/ATAT_key_info.json

Processing ATI...
✅ Key fundamentals saved to fundamentals/ATI_key_info.json

Processing BK...
✅ Key fundamentals saved to fundamentals/BK_key_info.json

Processing BRK-B...
✅ Key fundamentals saved to fundamentals/BRK-B_key_info.json

Processing CCJ...
✅ Key fundamentals saved to

In [13]:
import yfinance as yf
import json
import os
import pandas as pd

# Get tickers from data folder
data_files = [f for f in os.listdir('data') if f.endswith('_stock_data.csv')]
tickers = [f.replace('_stock_data.csv', '') for f in data_files]

def extract_quarterly_key_fundamentals(ticker):
    """Extract key quarterly fundamental metrics"""
    
    ticker_obj = yf.Ticker(ticker)
    
    # Get quarterly data
    quarterly_financials = ticker_obj.quarterly_financials
    quarterly_balance_sheet = ticker_obj.quarterly_balance_sheet
    quarterly_cashflow = ticker_obj.quarterly_cashflow
    
    # Extract key metrics for last 20 quarters - 5 years
    quarterly_data = {}
    
    if not quarterly_financials.empty:
        for date in quarterly_financials.columns[:20]:  # Last 20 quarters
            quarter_str = date.strftime('%Y-%m-%d') if hasattr(date, 'strftime') else str(date)
            
            quarterly_data[quarter_str] = {
                'income_statement': {
                    'total_revenue': quarterly_financials.loc['Total Revenue', date] if 'Total Revenue' in quarterly_financials.index else None,
                    'net_interest_income': quarterly_financials.loc['Net Interest Income', date] if 'Net Interest Income' in quarterly_financials.index else None,
                    'net_income': quarterly_financials.loc['Net Income', date] if 'Net Income' in quarterly_financials.index else None,
                    'diluted_eps': quarterly_financials.loc['Diluted EPS', date] if 'Diluted EPS' in quarterly_financials.index else None
                },
                'balance_sheet': {
                    'total_assets': quarterly_balance_sheet.loc['Total Assets', date] if 'Total Assets' in quarterly_balance_sheet.index else None,
                    'total_debt': quarterly_balance_sheet.loc['Total Debt', date] if 'Total Debt' in quarterly_balance_sheet.index else None,
                    'stockholders_equity': quarterly_balance_sheet.loc['Stockholders Equity', date] if 'Stockholders Equity' in quarterly_balance_sheet.index else None,
                    'book_value': quarterly_balance_sheet.loc['Tangible Book Value', date] if 'Tangible Book Value' in quarterly_balance_sheet.index else None
                },
                'cash_flow': {
                    'operating_cash_flow': quarterly_cashflow.loc['Operating Cash Flow', date] if 'Operating Cash Flow' in quarterly_cashflow.index else None,
                    'free_cash_flow': quarterly_cashflow.loc['Free Cash Flow', date] if 'Free Cash Flow' in quarterly_cashflow.index else None,
                    'capital_expenditure': quarterly_cashflow.loc['Capital Expenditure', date] if 'Capital Expenditure' in quarterly_cashflow.index else None
                }
            }
    
    return quarterly_data

# Extract quarterly data for all tickers
print("Extracting 5 years of quarterly fundamentals for all tickers...")
for ticker in tickers:
    print(f"\nProcessing quarterly data for {ticker}...")
    try:
        quarterly_data = extract_quarterly_key_fundamentals(ticker)
        
        # Save to file (this will overwrite the old 4-quarter files)
        filename = f'fundamentals/{ticker}_quarterly_key_metrics.json'
        with open(filename, 'w') as f:
            json.dump(quarterly_data, f, indent=2, default=str)
        
        print(f"✅ {len(quarterly_data)} quarters of data saved to {filename}")
        
        # Show summary for first few quarters
        count = 0
        for quarter, data in quarterly_data.items():
            if count < 2:  # Show only first 2 quarters to avoid clutter
                print(f"  {quarter}:")
                if data['income_statement']['total_revenue']:
                    print(f"    Revenue: ${data['income_statement']['total_revenue']:,.0f}")
                if data['income_statement']['net_income']:
                    print(f"    Net Income: ${data['income_statement']['net_income']:,.0f}")
            count += 1
        
    except Exception as e:
        print(f"❌ Error processing {ticker}: {e}")

print(f"\n🎉 Completed processing quarterly fundamentals for {len(tickers)} tickers")
print("Now we have ~20 quarters (5 years) of data instead of just 4!")
print("This will significantly improve our regression model reliability.")

Extracting 5 years of quarterly fundamentals for all tickers...

Processing quarterly data for AAPL...
✅ 5 quarters of data saved to fundamentals/AAPL_quarterly_key_metrics.json
  2025-03-31:
    Revenue: $95,359,000,000
    Net Income: $24,780,000,000
  2024-12-31:
    Revenue: $124,300,000,000
    Net Income: $36,330,000,000

Processing quarterly data for ACMR...
✅ 5 quarters of data saved to fundamentals/ACMR_quarterly_key_metrics.json
  2025-03-31:
    Revenue: $172,347,000
    Net Income: $20,380,000
  2024-12-31:
    Revenue: $223,471,000
    Net Income: $31,080,000

Processing quarterly data for AFRM...
✅ 6 quarters of data saved to fundamentals/AFRM_quarterly_key_metrics.json
  2025-03-31:
    Revenue: $783,134,000
    Net Income: $2,804,000
  2024-12-31:
    Revenue: $866,381,000
    Net Income: $80,360,000

Processing quarterly data for ALAB...
✅ 5 quarters of data saved to fundamentals/ALAB_quarterly_key_metrics.json
  2025-03-31:
    Revenue: $159,442,000
    Net Income: $3

In [4]:
%pip install matplotlib

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
import yfinance as yf
from datetime import datetime

def get_latest_quarterly_data(ticker):
    """Try to get the most recent quarterly data from yfinance"""
    
    ticker_obj = yf.Ticker(ticker)
    
    # Get quarterly financials
    quarterly_financials = ticker_obj.quarterly_financials
    quarterly_balance_sheet = ticker_obj.quarterly_balance_sheet
    
    print(f"\n{ticker} - Latest Available Quarters:")
    if not quarterly_financials.empty:
        latest_quarters = quarterly_financials.columns[:4]  # Show last 4 quarters
        for date in latest_quarters:
            print(f"  {date.strftime('%Y-%m-%d')}")
    
    return quarterly_financials, quarterly_balance_sheet

# Test with a few tickers
test_tickers = ['JPM', 'BAC', 'WFC']  # Major banks report quickly

for ticker in test_tickers:
    financials, balance_sheet = get_latest_quarterly_data(ticker)


JPM - Latest Available Quarters:
  2025-03-31
  2024-12-31
  2024-09-30
  2024-06-30

BAC - Latest Available Quarters:
  2025-03-31
  2024-12-31
  2024-09-30
  2024-06-30

WFC - Latest Available Quarters:
  2025-03-31
  2024-12-31
  2024-09-30
  2024-06-30


In [6]:
import pandas as pd
import numpy as np
import yfinance as yf
import json
import os
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# Try to import ML libraries (optional)
try:
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.preprocessing import StandardScaler
    from sklearn.metrics import r2_score, mean_absolute_error
    import xgboost as xgb
    ML_AVAILABLE = True
except ImportError:
    ML_AVAILABLE = False
    print("⚠️ ML libraries not available. Using fundamental methods only.")

class CompletePredictionSystem:
    """Complete stock prediction system integrating all our developed methods"""
    
    def __init__(self, ticker):
        self.ticker = ticker.upper()
        self.data_folder = 'data'
        self.fundamentals_folder = 'fundamentals'
        self.results_folder = 'results'
        
        # Create folders if they don't exist
        for folder in [self.data_folder, self.fundamentals_folder, self.results_folder]:
            os.makedirs(folder, exist_ok=True)
        
        # Initialize data containers
        self.stock_data = None
        self.quarterly_fundamentals = None
        self.key_info = None
        self.correlation_data = None
        
        print(f"🎯 Initializing Complete Prediction System for {self.ticker}")
    
    def download_stock_data(self, period="2y"):
        """Download stock price data"""
        try:
            print(f"📊 Downloading stock data for {self.ticker}...")
            ticker_obj = yf.Ticker(self.ticker)
            self.stock_data = ticker_obj.history(period=period)
            
            if self.stock_data.empty:
                raise ValueError(f"No stock data found for {self.ticker}")
            
            # Save to CSV
            filepath = f"{self.data_folder}/{self.ticker}_stock_data.csv"
            self.stock_data.to_csv(filepath)
            print(f"✅ Stock data saved to {filepath}")
            
            return True
            
        except Exception as e:
            print(f"❌ Error downloading stock data: {e}")
            return False
    
    def download_fundamentals(self):
        """Download quarterly fundamentals and key info"""
        try:
            print(f"📈 Downloading fundamentals for {self.ticker}...")
            ticker_obj = yf.Ticker(self.ticker)
            
            # Get quarterly data
            quarterly_financials = ticker_obj.quarterly_financials
            quarterly_balance_sheet = ticker_obj.quarterly_balance_sheet
            quarterly_cashflow = ticker_obj.quarterly_cashflow
            
            # Extract quarterly fundamentals
            quarterly_data = {}
            
            if not quarterly_financials.empty:
                for date in quarterly_financials.columns[:8]:  # Last 8 quarters
                    quarter_str = date.strftime('%Y-%m-%d') if hasattr(date, 'strftime') else str(date)
                    
                    quarterly_data[quarter_str] = {
                        'income_statement': {
                            'total_revenue': self._safe_extract(quarterly_financials, 'Total Revenue', date),
                            'net_interest_income': self._safe_extract(quarterly_financials, 'Net Interest Income', date),
                            'net_income': self._safe_extract(quarterly_financials, 'Net Income', date),
                            'diluted_eps': self._safe_extract(quarterly_financials, 'Diluted EPS', date)
                        },
                        'balance_sheet': {
                            'total_assets': self._safe_extract(quarterly_balance_sheet, 'Total Assets', date),
                            'total_debt': self._safe_extract(quarterly_balance_sheet, 'Total Debt', date),
                            'stockholders_equity': self._safe_extract(quarterly_balance_sheet, 'Stockholders Equity', date),
                            'book_value': self._safe_extract(quarterly_balance_sheet, 'Tangible Book Value', date)
                        },
                        'cash_flow': {
                            'operating_cash_flow': self._safe_extract(quarterly_cashflow, 'Operating Cash Flow', date),
                            'free_cash_flow': self._safe_extract(quarterly_cashflow, 'Free Cash Flow', date),
                            'capital_expenditure': self._safe_extract(quarterly_cashflow, 'Capital Expenditure', date)
                        }
                    }
            
            # Save quarterly fundamentals
            quarterly_file = f"{self.fundamentals_folder}/{self.ticker}_quarterly_key_metrics.json"
            with open(quarterly_file, 'w') as f:
                json.dump(quarterly_data, f, indent=2, default=str)
            
            # Get key info
            info = ticker_obj.info
            key_info = {
                'valuation': {
                    'marketCap': info.get('marketCap'),
                    'currentPrice': info.get('currentPrice'),
                    'beta': info.get('beta')
                },
                'dividend_info': {
                    'dividendRate': info.get('dividendRate'),
                    'dividendYield': info.get('dividendYield'),
                    'payoutRatio': info.get('payoutRatio')
                },
                'basic_info': {
                    'sector': info.get('sector'),
                    'industry': info.get('industry'),
                    'longName': info.get('longName')
                }
            }
            
            # Save key info
            info_file = f"{self.fundamentals_folder}/{self.ticker}_key_info.json"
            with open(info_file, 'w') as f:
                json.dump(key_info, f, indent=2, default=str)
            
            self.quarterly_fundamentals = quarterly_data
            self.key_info = key_info
            
            print(f"✅ Fundamentals saved: {len(quarterly_data)} quarters available")
            return True
            
        except Exception as e:
            print(f"❌ Error downloading fundamentals: {e}")
            return False
    
    def _safe_extract(self, df, key, date):
        """Safely extract value from DataFrame"""
        try:
            if key in df.index:
                value = df.loc[key, date]
                return float(value) if pd.notna(value) else None
            return None
        except:
            return None
    
# Quick fix - replace the create_correlation_dataset method in your script
def create_correlation_dataset(self):
    """Create aligned dataset for correlation analysis - FIXED VERSION"""
    try:
        print(f"🔗 Creating correlation dataset...")
        
        if self.stock_data is None or self.quarterly_fundamentals is None:
            raise ValueError("Stock data and fundamentals must be downloaded first")
        
        # Get shares outstanding
        market_cap = self.key_info.get('valuation', {}).get('marketCap')
        current_price = self.key_info.get('valuation', {}).get('currentPrice')
        
        if market_cap and current_price:
            shares_outstanding = float(market_cap) / float(current_price)
        else:
            # Alternative calculation using recent price and market cap
            shares_outstanding = 1e9  # Default fallback for large companies like GOOGL
            print("⚠️ Using estimated shares outstanding")
        
        correlation_data = []
        
        for quarter, data in self.quarterly_fundamentals.items():
            try:
                quarter_date = pd.to_datetime(quarter)
                quarter_start = quarter_date - pd.DateOffset(months=3)
                
                # Handle timezone issues
                stock_data_to_use = self.stock_data.copy()
                
                # Make all dates timezone-naive for comparison
                if hasattr(stock_data_to_use.index, 'tz') and stock_data_to_use.index.tz is not None:
                    stock_data_to_use.index = stock_data_to_use.index.tz_convert(None)
                
                if hasattr(quarter_date, 'tz') and quarter_date.tz is not None:
                    quarter_date = quarter_date.tz_localize(None)
                    quarter_start = quarter_start.tz_localize(None)
                
                # Get average stock price for quarter
                mask = (stock_data_to_use.index >= quarter_start) & (stock_data_to_use.index <= quarter_date)
                quarter_prices = stock_data_to_use.loc[mask, 'Close']
                
                avg_stock_price = quarter_prices.mean() if len(quarter_prices) > 0 else None
                
                if avg_stock_price and pd.notna(avg_stock_price):
                    # Calculate per share metrics
                    total_debt = data['balance_sheet']['total_debt']
                    debt_per_share = (total_debt / shares_outstanding) if total_debt else None
                    
                    total_book_value = data['balance_sheet']['book_value']
                    book_value_per_share = (total_book_value / shares_outstanding) if total_book_value else None
                    
                    diluted_eps = data['income_statement']['diluted_eps']
                    
                    # Calculate ratios with safety checks
                    roe = None
                    if diluted_eps and total_book_value and shares_outstanding:
                        try:
                            roe = (diluted_eps * shares_outstanding) / total_book_value
                        except:
                            pass
                    
                    pb_ratio = None
                    if book_value_per_share and book_value_per_share > 0:
                        pb_ratio = avg_stock_price / book_value_per_share
                    
                    pe_ratio = None
                    if diluted_eps and diluted_eps > 0:
                        pe_ratio = avg_stock_price / diluted_eps
                    
                    row = {
                        'quarter': quarter,
                        'Avg Price': float(avg_stock_price),
                        'Earnings': float(diluted_eps) if diluted_eps else None,
                        'Debt': float(debt_per_share) if debt_per_share else None,
                        'Book Value': float(book_value_per_share) if book_value_per_share else None,
                        'ROE': float(roe) if roe else None,
                        'P/B Ratio': float(pb_ratio) if pb_ratio else None,
                        'P/E Ratio': float(pe_ratio) if pe_ratio else None
                    }
                    correlation_data.append(row)
                    
            except Exception as quarter_error:
                print(f"⚠️ Skipping quarter {quarter}: {quarter_error}")
                continue
        
        self.correlation_data = pd.DataFrame(correlation_data) if correlation_data else None
        
        if self.correlation_data is not None and len(self.correlation_data) > 0:
            print(f"✅ Correlation dataset created: {len(self.correlation_data)} quarters")
            print(f"   Available data: {list(self.correlation_data.columns)}")
        else:
            print("❌ Failed to create correlation dataset")
        
        return self.correlation_data
        
    except Exception as e:
        print(f"❌ Error creating correlation dataset: {e}")
        import traceback
        traceback.print_exc()
        return None
    
        
    def simple_price_prediction(self):
        """Method 1: Simple correlation-based prediction"""
        try:
            print(f"🎯 Running simple correlation prediction...")
            
            if self.correlation_data is None:
                raise ValueError("Correlation dataset not available")
            
            df = self.correlation_data.dropna()
            
            if len(df) < 3:
                return None, "Insufficient data for simple prediction"
            
            # Determine stock type based on correlations
            corr_matrix = df[['Avg Price', 'Book Value', 'P/B Ratio']].corr()
            book_value_corr = abs(corr_matrix.loc['Avg Price', 'Book Value'])
            pb_ratio_corr = abs(corr_matrix.loc['Avg Price', 'P/B Ratio'])
            
            stock_type = "Value" if book_value_corr > pb_ratio_corr else "Growth"
            
            current_price = df['Avg Price'].iloc[0]
            current_book_value = df['Book Value'].iloc[0]
            
            if stock_type == "Value":
                # Value stock prediction
                df['Historical_PB'] = df['Avg Price'] / df['Book Value']
                avg_pb_ratio = df['Historical_PB'].mean()
                avg_roe = df['ROE'].mean() if 'ROE' in df.columns else 0.02
                
                next_quarter_book_value = current_book_value * (1 + avg_roe/4)
                predicted_price = next_quarter_book_value * avg_pb_ratio
                
                method = f"Book Value Growth (ROE: {avg_roe:.2%}, Avg P/B: {avg_pb_ratio:.2f})"
            else:
                # Growth stock prediction
                current_pb = df['P/B Ratio'].iloc[0]
                pb_trend = np.polyfit(range(len(df)), df['P/B Ratio'], 1)[0]
                
                next_quarter_pb = current_pb + pb_trend
                predicted_price = current_book_value * next_quarter_pb
                
                method = f"P/B Trend (Current: {current_pb:.2f}, Trend: {pb_trend:+.3f})"
            
            price_change = (predicted_price - current_price) / current_price * 100
            
            result = {
                'method': 'Simple Correlation',
                'stock_type': stock_type,
                'current_price': current_price,
                'predicted_price': predicted_price,
                'price_change_pct': price_change,
                'prediction_method': method,
                'correlation_strength': max(book_value_corr, pb_ratio_corr),
                'confidence': 'High' if max(book_value_corr, pb_ratio_corr) > 0.7 else 'Medium'
            }
            
            return result, None
            
        except Exception as e:
            return None, f"Simple prediction error: {str(e)}"
    
    def ml_ensemble_prediction(self):
        """Method 3: ML Ensemble prediction (if libraries available)"""
        if not ML_AVAILABLE:
            return None, "ML libraries not available"
        
        try:
            print(f"🤖 Running ML ensemble prediction...")
            
            if self.correlation_data is None:
                return None, "Correlation dataset not available"
            
            df = self.correlation_data.dropna()
            
            if len(df) < 4:
                return None, "Insufficient data for ML prediction"
            
            # Prepare features
            feature_cols = ['Book Value', 'Earnings', 'ROE', 'Debt', 'P/B Ratio']
            available_features = [col for col in feature_cols if col in df.columns]
            
            X = df[available_features].fillna(0)
            y = df['Avg Price']
            
            if len(X) < 3:
                return None, "Insufficient feature data"
            
            # Split data
            X_train = X.iloc[:-1]
            y_train = y.iloc[:-1]
            X_current = X.iloc[-1:].copy()
            current_price = y.iloc[-1]
            
            # Scale features
            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train)
            X_current_scaled = scaler.transform(X_current)
            
            # Train models
            models = {}
            predictions = {}
            
            # Random Forest
            try:
                rf = RandomForestRegressor(n_estimators=50, max_depth=5, random_state=42)
                rf.fit(X_train, y_train)
                predictions['RandomForest'] = rf.predict(X_current)[0]
                models['RandomForest'] = rf
            except:
                pass
            
            # XGBoost
            try:
                xgb_model = xgb.XGBRegressor(n_estimators=50, max_depth=4, random_state=42, verbosity=0)
                xgb_model.fit(X_train, y_train)
                predictions['XGBoost'] = xgb_model.predict(X_current)[0]
                models['XGBoost'] = xgb_model
            except:
                pass
            
            if not predictions:
                return None, "No ML models succeeded"
            
            # Ensemble prediction
            ensemble_prediction = np.mean(list(predictions.values()))
            price_change = (ensemble_prediction - current_price) / current_price * 100
            
            # Calculate confidence
            pred_std = np.std(list(predictions.values()))
            confidence = "High" if pred_std < current_price * 0.05 else "Medium" if pred_std < current_price * 0.10 else "Low"
            
            result = {
                'method': 'ML Ensemble',
                'current_price': current_price,
                'predicted_price': ensemble_prediction,
                'price_change_pct': price_change,
                'individual_predictions': predictions,
                'confidence': confidence,
                'feature_count': len(available_features),
                'data_points': len(df)
            }
            
            return result, None
            
        except Exception as e:
            return None, f"ML ensemble error: {str(e)}"
    
    def dcf_fundamental_prediction(self):
        """Method 5: DCF fundamental prediction"""
        try:
            print(f"💰 Running DCF fundamental prediction...")
            
            if self.correlation_data is None or self.key_info is None:
                return None, "Required data not available"
            
            df = self.correlation_data.dropna()
            
            if len(df) < 3:
                return None, "Insufficient data for DCF"
            
            current_price = df['Avg Price'].iloc[0]
            current_book_value = df['Book Value'].iloc[0]
            current_earnings = df['Earnings'].iloc[0]
            current_roe = df['ROE'].iloc[0] if pd.notna(df['ROE'].iloc[0]) else 0.1
            
            # Extract dividend information
            dividend_rate = float(self.key_info.get('dividend_info', {}).get('dividendRate', 0) or 0)
            dividend_yield = float(self.key_info.get('dividend_info', {}).get('dividendYield', 0) or 0)
            payout_ratio = float(self.key_info.get('dividend_info', {}).get('payoutRatio', 0) or 0)
            
            # Calculate historical growth rates
            if len(df) >= 4:
                book_value_growth = df['Book Value'].pct_change().mean() * 4
                earnings_growth = df['Earnings'].pct_change().mean() * 4
            else:
                book_value_growth = current_roe * 0.6
                earnings_growth = current_roe * 0.8
            
            # Clean up extreme values
            book_value_growth = np.clip(book_value_growth, -0.5, 0.5)
            earnings_growth = np.clip(earnings_growth, -0.5, 0.5)
            
            # Model parameters
            risk_free_rate = 0.04
            equity_risk_premium = 0.06
            beta = float(self.key_info.get('valuation', {}).get('beta', 1.2) or 1.2)
            required_return = np.clip(risk_free_rate + beta * equity_risk_premium, 0.08, 0.20)
            
            # DCF Models
            models = {}
            
            # Residual Income Model
            future_roe = np.clip(current_roe, 0.05, 0.30)
            retention_ratio = np.clip(1 - (payout_ratio if payout_ratio > 0 else 0.4), 0.3, 0.9)
            future_book_value = current_book_value * (1 + future_roe * retention_ratio)
            
            if future_roe > required_return:
                residual_income = (future_roe - required_return) * future_book_value
                pv_residual_income = sum([residual_income * (0.8 ** i) / ((1 + required_return) ** i) 
                                        for i in range(1, 6)])
                models['RIM'] = future_book_value + pv_residual_income
            else:
                models['RIM'] = future_book_value
            
            # Earnings Power Value
            if current_earnings > 0:
                normalized_earnings = current_earnings * (1 + earnings_growth * 0.5)
                models['EPV'] = max(normalized_earnings / required_return, current_book_value * 0.8)
            else:
                models['EPV'] = current_book_value
            
            # Dividend Discount Model (if applicable)
            if dividend_rate > 0 and payout_ratio > 0:
                if payout_ratio < 0.8:
                    dividend_growth = earnings_growth * (1 - payout_ratio)
                else:
                    dividend_growth = 0.02
                
                dividend_growth = np.clip(dividend_growth, 0, 0.15)
                
                if required_return > dividend_growth:
                    next_year_dividend = dividend_rate * (1 + dividend_growth)
                    models['DDM'] = next_year_dividend / (required_return - dividend_growth)
            
            # Weighted ensemble
            if len(models) > 1:
                weights = {'RIM': 0.5, 'EPV': 0.3, 'DDM': 0.2}
            else:
                weights = {list(models.keys())[0]: 1.0}
            
            # Normalize weights to available models
            available_models = list(models.keys())
            total_weight = sum(weights.get(model, 0) for model in available_models)
            normalized_weights = {model: weights.get(model, 0) / total_weight for model in available_models}
            
            # Calculate weighted average
            dcf_intrinsic_value = sum(models[model] * normalized_weights[model] for model in available_models)
            
            # Sanity checks
            dcf_intrinsic_value = np.clip(dcf_intrinsic_value, 
                                        current_book_value * 0.3, 
                                        current_book_value * 5.0)
            
            price_change_pct = (dcf_intrinsic_value - current_price) / current_price * 100
            
            # Calculate confidence
            model_values = list(models.values())
            model_std = np.std(model_values) / np.mean(model_values) if len(model_values) > 1 else 0
            
            if model_std < 0.15 and len(df) >= 4:
                confidence = "High"
            elif model_std < 0.30 and len(df) >= 3:
                confidence = "Medium"
            else:
                confidence = "Low"
            
            result = {
                'method': 'DCF Fundamental',
                'current_price': current_price,
                'predicted_price': dcf_intrinsic_value,
                'price_change_pct': price_change_pct,
                'individual_models': models,
                'model_weights': normalized_weights,
                'confidence': confidence,
                'fundamental_inputs': {
                    'current_roe': current_roe,
                    'book_value_growth': book_value_growth,
                    'earnings_growth': earnings_growth,
                    'required_return': required_return
                },
                'data_points': len(df)
            }
            
            return result, None
            
        except Exception as e:
            return None, f"DCF prediction error: {str(e)}"
    
    def analyze_support_consensus(self, simple_result, ml_result, dcf_result):
        """Analyze support/consensus across methods"""
        try:
            if not simple_result:
                return None, "No base prediction available"
            
            current_price = simple_result['current_price']
            simple_pred = simple_result['predicted_price']
            
            # Determine direction
            if simple_pred > current_price * 1.02:
                direction = 'UP'
            elif simple_pred < current_price * 0.98:
                direction = 'DOWN'
            else:
                direction = 'FLAT'
            
            # Check support from other methods
            supporting_methods = []
            
            # Check ML support
            if ml_result:
                ml_pred = ml_result['predicted_price']
                if direction == 'UP' and ml_pred > current_price * 1.02:
                    supporting_methods.append('ML')
                elif direction == 'DOWN' and ml_pred < current_price * 0.98:
                    supporting_methods.append('ML')
                elif direction == 'FLAT' and current_price * 0.98 <= ml_pred <= current_price * 1.02:
                    supporting_methods.append('ML')
            
            # Check DCF support
            if dcf_result:
                dcf_pred = dcf_result['predicted_price']
                if direction == 'UP' and dcf_pred > current_price * 1.02:
                    supporting_methods.append('DCF')
                elif direction == 'DOWN' and dcf_pred < current_price * 0.98:
                    supporting_methods.append('DCF')
                elif direction == 'FLAT' and current_price * 0.98 <= dcf_pred <= current_price * 1.02:
                    supporting_methods.append('DCF')
            
            # Create consensus prediction
            support_count = len(supporting_methods)
            if support_count == 0:
                consensus_prediction = direction
            elif support_count == 1:
                consensus_prediction = f"{direction}+"
            else:
                consensus_prediction = f"{direction}++"
            
            return {
                'direction': direction,
                'consensus_prediction': consensus_prediction,
                'supporting_methods': supporting_methods,
                'support_count': support_count,
                'confidence_level': 'High' if support_count >= 2 else 'Medium' if support_count == 1 else 'Low'
            }, None
            
        except Exception as e:
            return None, f"Consensus analysis error: {str(e)}"
    
    def get_current_price(self):
        """Get the most recent stock price"""
        try:
            if self.stock_data is not None:
                return self.stock_data['Close'].iloc[-1]
            else:
                ticker_obj = yf.Ticker(self.ticker)
                current_data = ticker_obj.history(period="1d")
                return current_data['Close'].iloc[-1] if not current_data.empty else None
        except:
            return None
    
    def run_complete_analysis(self):
        """Run the complete prediction analysis"""
        print(f"\n{'='*60}")
        print(f"🚀 COMPLETE PREDICTION ANALYSIS FOR {self.ticker}")
        print(f"{'='*60}")
        
        # Step 1: Download data
        if not self.download_stock_data():
            return None
        
        if not self.download_fundamentals():
            return None
        
        # Step 2: Create correlation dataset
        if self.create_correlation_dataset() is None:
            return None
        
        # Step 3: Run all prediction methods
        simple_result, simple_error = self.simple_price_prediction()
        ml_result, ml_error = self.ml_ensemble_prediction()
        dcf_result, dcf_error = self.dcf_fundamental_prediction()
        
        # Step 4: Analyze consensus
        consensus_result, consensus_error = self.analyze_support_consensus(simple_result, ml_result, dcf_result)
        
        # Step 5: Get current price for comparison
        current_market_price = self.get_current_price()
        
        # Step 6: Compile results
        results = {
            'ticker': self.ticker,
            'analysis_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'current_market_price': current_market_price,
            'stock_type': simple_result.get('stock_type', 'Unknown') if simple_result else 'Unknown',
            'methods': {
                'simple': simple_result,
                'ml_ensemble': ml_result,
                'dcf_fundamental': dcf_result
            },
            'consensus': consensus_result,
            'errors': {
                'simple': simple_error,
                'ml_ensemble': ml_error,
                'dcf_fundamental': dcf_error,
                'consensus': consensus_error
            }
        }
        
        # Step 7: Display results
        self.display_results(results)
        
        # Step 8: Save results
        self.save_results(results)
        
        return results
    
    def display_results(self, results):
        """Display formatted results"""
        print(f"\n📊 PREDICTION RESULTS FOR {results['ticker']}")
        print("=" * 60)
        
        current_price = results.get('current_market_price')
        if current_price:
            print(f"💰 Current Market Price: ${current_price:.2f}")
        
        stock_type = results.get('stock_type', 'Unknown')
        print(f"📈 Stock Type: {stock_type}")
        
        print(f"\n🎯 INDIVIDUAL METHOD PREDICTIONS:")
        print("-" * 40)
        
        methods = results.get('methods', {})
        
        # Simple method
        if methods.get('simple'):
            simple = methods['simple']
            change_pct = simple.get('price_change_pct', 0)
            print(f"📊 Simple Correlation: ${simple.get('predicted_price', 0):.2f} ({change_pct:+.1f}%)")
            print(f"   Confidence: {simple.get('confidence', 'Unknown')}")
        
        # ML Ensemble
        if methods.get('ml_ensemble'):
            ml = methods['ml_ensemble']
            change_pct = ml.get('price_change_pct', 0)
            print(f"🤖 ML Ensemble: ${ml.get('predicted_price', 0):.2f} ({change_pct:+.1f}%)")
            print(f"   Confidence: {ml.get('confidence', 'Unknown')}")
        
        # DCF
        if methods.get('dcf_fundamental'):
            dcf = methods['dcf_fundamental']
            change_pct = dcf.get('price_change_pct', 0)
            print(f"💎 DCF Fundamental: ${dcf.get('predicted_price', 0):.2f} ({change_pct:+.1f}%)")
            print(f"   Confidence: {dcf.get('confidence', 'Unknown')}")
        
        # Consensus
        consensus = results.get('consensus')
        if consensus:
            print(f"\n🎯 CONSENSUS PREDICTION:")
            print("-" * 30)
            direction = consensus.get('consensus_prediction', 'Unknown')
            supporting = ', '.join(consensus.get('supporting_methods', []))
            confidence = consensus.get('confidence_level', 'Unknown')
            
            # Add emoji based on direction
            if 'UP' in direction:
                emoji = "📈"
            elif 'DOWN' in direction:
                emoji = "📉"
            else:
                emoji = "➡️"
            
            print(f"{emoji} Direction: {direction}")
            print(f"🤝 Supporting Methods: {supporting if supporting else 'None'}")
            print(f"🎯 Confidence Level: {confidence}")
        
        # Errors (if any)
        errors = results.get('errors', {})
        error_count = sum(1 for error in errors.values() if error)
        if error_count > 0:
            print(f"\n⚠️ WARNINGS:")
            for method, error in errors.items():
                if error:
                    print(f"   {method}: {error}")
    
    def save_results(self, results):
        """Save results to files"""
        try:
            # Save detailed JSON results
            json_file = f"{self.results_folder}/{self.ticker}_prediction_results.json"
            with open(json_file, 'w') as f:
                json.dump(results, f, indent=2, default=str)
            
            # Create summary CSV
            summary_data = {
                'Ticker': [self.ticker],
                'Analysis_Date': [results.get('analysis_date')],
                'Current_Price': [results.get('current_market_price')],
                'Stock_Type': [results.get('stock_type')],
                'Simple_Prediction': [results.get('methods', {}).get('simple', {}).get('predicted_price')],
                'Simple_Change_%': [results.get('methods', {}).get('simple', {}).get('price_change_pct')],
                'ML_Prediction': [results.get('methods', {}).get('ml_ensemble', {}).get('predicted_price')],
                'ML_Change_%': [results.get('methods', {}).get('ml_ensemble', {}).get('price_change_pct')],
                'DCF_Prediction': [results.get('methods', {}).get('dcf_fundamental', {}).get('predicted_price')],
                'DCF_Change_%': [results.get('methods', {}).get('dcf_fundamental', {}).get('price_change_pct')],
                'Consensus_Direction': [results.get('consensus', {}).get('consensus_prediction')],
                'Supporting_Methods': [', '.join(results.get('consensus', {}).get('supporting_methods', []))],
                'Confidence_Level': [results.get('consensus', {}).get('confidence_level')]
            }
            
            summary_df = pd.DataFrame(summary_data)
            csv_file = f"{self.results_folder}/{self.ticker}_prediction_summary.csv"
            summary_df.to_csv(csv_file, index=False)
            
            print(f"\n💾 RESULTS SAVED:")
            print(f"   📄 Detailed: {json_file}")
            print(f"   📊 Summary: {csv_file}")
            
        except Exception as e:
            print(f"⚠️ Error saving results: {e}")


# USAGE FUNCTIONS
def analyze_single_ticker(ticker):
    """Analyze a single ticker - main entry point"""
    system = CompletePredictionSystem(ticker)
    return system.run_complete_analysis()

def analyze_multiple_tickers(tickers):
    """Analyze multiple tickers and create comparison matrix"""
    print(f"\n🔄 ANALYZING {len(tickers)} TICKERS...")
    print("=" * 60)
    
    all_results = []
    
    for i, ticker in enumerate(tickers, 1):
        print(f"\n[{i}/{len(tickers)}] Processing {ticker}...")
        try:
            system = CompletePredictionSystem(ticker)
            result = system.run_complete_analysis()
            if result:
                all_results.append(result)
                print(f"✅ {ticker} completed successfully")
            else:
                print(f"❌ {ticker} failed")
        except Exception as e:
            print(f"❌ {ticker} error: {e}")
    
    # Create comparison matrix
    if all_results:
        create_comparison_matrix(all_results)
    
    return all_results

def create_comparison_matrix(results_list):
    """Create comparison matrix from multiple results"""
    try:
        print(f"\n📊 CREATING COMPARISON MATRIX...")
        
        comparison_data = []
        
        for result in results_list:
            ticker = result.get('ticker')
            methods = result.get('methods', {})
            consensus = result.get('consensus', {})
            
            row = {
                'Ticker': ticker,
                'Type': result.get('stock_type'),
                'Current_Price': result.get('current_market_price'),
                'Simple_Prediction': methods.get('simple', {}).get('predicted_price'),
                'Simple_Change_%': methods.get('simple', {}).get('price_change_pct'),
                'ML_Prediction': methods.get('ml_ensemble', {}).get('predicted_price'),
                'ML_Change_%': methods.get('ml_ensemble', {}).get('price_change_pct'),
                'DCF_Prediction': methods.get('dcf_fundamental', {}).get('predicted_price'),
                'DCF_Change_%': methods.get('dcf_fundamental', {}).get('price_change_pct'),
                'Consensus_Direction': consensus.get('consensus_prediction'),
                'Supporting_Methods': ', '.join(consensus.get('supporting_methods', [])),
                'Confidence': consensus.get('confidence_level')
            }
            comparison_data.append(row)
        
        # Create DataFrame
        comparison_df = pd.DataFrame(comparison_data)
        
        # Save comparison matrix
        os.makedirs('results', exist_ok=True)
        matrix_file = 'results/multi_ticker_comparison_matrix.csv'
        comparison_df.to_csv(matrix_file, index=False)
        
        # Display summary
        print(f"\n📋 COMPARISON MATRIX SUMMARY:")
        print("=" * 60)
        print(f"Total tickers analyzed: {len(comparison_df)}")
        
        # Direction distribution
        if 'Consensus_Direction' in comparison_df.columns:
            direction_counts = comparison_df['Consensus_Direction'].str.replace(r'\+', '', regex=True).value_counts()
            print(f"\nDirection Distribution:")
            for direction, count in direction_counts.items():
                if pd.notna(direction):
                    emoji = "📈" if direction == "UP" else "📉" if direction == "DOWN" else "➡️"
                    print(f"  {emoji} {direction}: {count} stocks")
        
        # Confidence distribution
        if 'Confidence' in comparison_df.columns:
            confidence_counts = comparison_df['Confidence'].value_counts()
            print(f"\nConfidence Distribution:")
            for confidence, count in confidence_counts.items():
                if pd.notna(confidence):
                    print(f"  {confidence}: {count} stocks")
        
        # Top opportunities
        if 'Simple_Change_%' in comparison_df.columns:
            positive_changes = comparison_df[comparison_df['Simple_Change_%'] > 0].sort_values('Simple_Change_%', ascending=False)
            if len(positive_changes) > 0:
                print(f"\n🚀 TOP OPPORTUNITIES (Simple Method):")
                for _, row in positive_changes.head(5).iterrows():
                    print(f"  {row['Ticker']}: {row['Simple_Change_%']:+.1f}% ({row['Consensus_Direction']})")
        
        print(f"\n💾 Comparison matrix saved to: {matrix_file}")
        
        return comparison_df
        
    except Exception as e:
        print(f"❌ Error creating comparison matrix: {e}")
        return None


# MAIN EXECUTION EXAMPLES
if __name__ == "__main__":
    print("🎯 COMPLETE STOCK PREDICTION SYSTEM")
    print("=" * 50)
    
    # Example 1: Single ticker analysis
    print("\n📊 EXAMPLE 1: Single Ticker Analysis")
    print("-" * 40)
    
    # Uncomment to run single ticker analysis
    # ticker = "JPM"  # Change to your desired ticker
    # result = analyze_single_ticker(ticker)
    
    # Example 2: Multiple ticker analysis
    print("\n📊 EXAMPLE 2: Multiple Ticker Analysis")
    print("-" * 40)
    
    # Uncomment to run multiple ticker analysis
    # tickers = ["JPM", "BAC", "WFC", "C", "GS"]  # Change to your desired tickers
    # results = analyze_multiple_tickers(tickers)
    
    print("\n💡 USAGE INSTRUCTIONS:")
    print("-" * 30)
    print("1. For single ticker: analyze_single_ticker('TICKER')")
    print("2. For multiple tickers: analyze_multiple_tickers(['TICKER1', 'TICKER2', ...])")
    print("3. Results are automatically saved to 'results/' folder")
    print("4. Requires internet connection for yfinance data download")
    print("5. Optional: Install scikit-learn and xgboost for ML methods")
    
    print(f"\n📁 FOLDER STRUCTURE:")
    print("├── data/              # Stock price data (CSV)")
    print("├── fundamentals/      # Quarterly fundamental data (JSON)")
    print("└── results/           # Prediction results and summaries")


def quick_prediction(ticker):
    """Quick prediction function - simplified interface"""
    print(f"🚀 Quick Prediction for {ticker.upper()}")
    print("=" * 40)
    
    try:
        result = analyze_single_ticker(ticker)
        
        if result and result.get('consensus'):
            consensus = result['consensus']
            direction = consensus.get('consensus_prediction', 'Unknown')
            confidence = consensus.get('confidence_level', 'Unknown')
            supporting = ', '.join(consensus.get('supporting_methods', []))
            
            # Simple summary
            print(f"\n🎯 QUICK SUMMARY:")
            print(f"Direction: {direction}")
            print(f"Confidence: {confidence}")
            print(f"Support: {supporting if supporting else 'None'}")
            
            return direction, confidence, supporting
        else:
            print("❌ Unable to generate prediction")
            return None, None, None
            
    except Exception as e:
        print(f"❌ Error: {e}")
        return None, None, None


# Advanced usage examples
def batch_analyze_from_file(filename):
    """Analyze tickers from a text file (one ticker per line)"""
    try:
        with open(filename, 'r') as f:
            tickers = [line.strip().upper() for line in f if line.strip()]
        
        print(f"📂 Loaded {len(tickers)} tickers from {filename}")
        return analyze_multiple_tickers(tickers)
        
    except Exception as e:
        print(f"❌ Error reading file: {e}")
        return None

def update_existing_analysis(ticker):
    """Update analysis for a ticker with fresh data"""
    print(f"🔄 Updating analysis for {ticker}...")
    
    # This will automatically download fresh data
    return analyze_single_ticker(ticker)

🎯 COMPLETE STOCK PREDICTION SYSTEM

📊 EXAMPLE 1: Single Ticker Analysis
----------------------------------------

📊 EXAMPLE 2: Multiple Ticker Analysis
----------------------------------------

💡 USAGE INSTRUCTIONS:
------------------------------
1. For single ticker: analyze_single_ticker('TICKER')
2. For multiple tickers: analyze_multiple_tickers(['TICKER1', 'TICKER2', ...])
3. Results are automatically saved to 'results/' folder
4. Requires internet connection for yfinance data download
5. Optional: Install scikit-learn and xgboost for ML methods

📁 FOLDER STRUCTURE:
├── data/              # Stock price data (CSV)
├── fundamentals/      # Quarterly fundamental data (JSON)
└── results/           # Prediction results and summaries


In [18]:
direction, confidence, support = quick_prediction("GOOGL")
print(f"Prediction: {direction} | Confidence: {confidence}")

🚀 Quick Prediction for GOOGL
🎯 Initializing Complete Prediction System for GOOGL

🚀 COMPLETE PREDICTION ANALYSIS FOR GOOGL
📊 Downloading stock data for GOOGL...
✅ Stock data saved to data/GOOGL_stock_data.csv
📈 Downloading fundamentals for GOOGL...
✅ Fundamentals saved: 5 quarters available
🔗 Creating correlation dataset...
❌ Error creating correlation dataset: Invalid comparison between dtype=datetime64[ns, America/New_York] and Timestamp
❌ Unable to generate prediction
Prediction: None | Confidence: None
