In [None]:
import pandas as pd
import numpy as np
import yfinance as yf
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.metrics import accuracy_score, mean_squared_error
import xgboost as xgb
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

class StockPredictor:
    def __init__(self, symbol='AAPL'):
        self.symbol = symbol
        self.data = None
        self.features = None
        self.target = None
        self.models = {}
        self.results = {}
    
    def collect_data(self):
        """Collect 5 years of historical OHLC stock data"""
        print(f"Collecting 5 years of data for {self.symbol}...")
        
        # Download 5 years of data
        ticker = yf.Ticker(self.symbol)
        self.data = ticker.history(period='5y')
        
        # Clean data
        self.data = self.data.dropna()
        
        print(f" Collected {len(self.data)} trading days")
        print(f" Date range: {self.data.index[0].date()} to {self.data.index[-1].date()}")
        
        return self.data
    
    def create_features(self):
        """Create technical indicators and features"""
        print("Creating features...")
        
        df = self.data.copy()
        
        # Target: Next day returns
        df['Returns'] = df['Close'].pct_change()
        df['Target'] = df['Returns'].shift(-1)  # Predict next day return
        
        # Price features
        df['Price_Change'] = (df['Close'] - df['Open']) / df['Open']
        df['High_Low_Ratio'] = (df['High'] - df['Low']) / df['Close']
        df['Volume_Ratio'] = df['Volume'] / df['Volume'].rolling(20).mean()
        
        # Moving averages
        for window in [5, 10, 20, 50]:
            df[f'SMA_{window}'] = df['Close'].rolling(window).mean()
            df[f'Price_SMA_{window}'] = df['Close'] / df[f'SMA_{window}']
        
        # Technical indicators
        df['RSI'] = self.calculate_rsi(df['Close'])
        df['MACD'] = self.calculate_macd(df['Close'])
        df['BB_Position'] = self.calculate_bollinger_position(df['Close'])
        
        # Momentum features
        for period in [1, 3, 5, 10]:
            df[f'Momentum_{period}'] = df['Close'] / df['Close'].shift(period) - 1
        
        # Volatility
        df['Volatility'] = df['Returns'].rolling(10).std()
        
        # Lagged returns
        for lag in [1, 2, 3, 5]:
            df[f'Returns_Lag_{lag}'] = df['Returns'].shift(lag)
        
        # Volume indicators
        df['Volume_Change'] = df['Volume'].pct_change()
        df['Price_Volume'] = df['Price_Change'] * df['Volume_Ratio']
        
        self.data = df
        print(f" Created {len([col for col in df.columns if col not in ['Open', 'High', 'Low', 'Close', 'Volume']])} features")
    
    def calculate_rsi(self, prices, window=14):
        """Calculate RSI indicator"""
        delta = prices.diff()
        gain = (delta.where(delta > 0, 0)).rolling(window=window).mean()
        loss = (-delta.where(delta < 0, 0)).rolling(window=window).mean()
        rs = gain / loss
        return 100 - (100 / (1 + rs))
    
    def calculate_macd(self, prices):
        """Calculate MACD indicator"""
        ema12 = prices.ewm(span=12).mean()
        ema26 = prices.ewm(span=26).mean()
        return ema12 - ema26
    
    def calculate_bollinger_position(self, prices, window=20):
        """Calculate position within Bollinger Bands"""
        sma = prices.rolling(window).mean()
        std = prices.rolling(window).std()
        upper = sma + (std * 2)
        lower = sma - (std * 2)
        return (prices - lower) / (upper - lower)
    
    def prepare_data(self):
        """Prepare features and target for ML"""
        print("Preparing data for ML...")
        
        # Select feature columns
        feature_cols = [col for col in self.data.columns 
                       if col not in ['Open', 'High', 'Low', 'Close', 'Volume', 
                                    'Returns', 'Target'] and 'SMA_' not in col]
        
        # Create feature matrix
        self.features = self.data[feature_cols].copy()
        self.target = self.data['Target'].copy()
        
        # Remove NaN values
        mask = ~(self.features.isnull().any(axis=1) | self.target.isnull())
        self.features = self.features[mask]
        self.target = self.target[mask]
        
        print(f" Features shape: {self.features.shape}")
        print(f" Target shape: {self.target.shape}")
        
        return self.features, self.target
    
    def split_data(self, test_days=60):
        """Split data into train/test with last 60 days as test"""
        print(f"Splitting data - last {test_days} days as test set...")
        
        split_idx = len(self.features) - test_days
        
        X_train = self.features.iloc[:split_idx]
        X_test = self.features.iloc[split_idx:]
        y_train = self.target.iloc[:split_idx]
        y_test = self.target.iloc[split_idx:]
        
        print(f" Train set: {len(X_train)} samples")
        print(f" Test set: {len(X_test)} samples")
        
        return X_train, X_test, y_train, y_test
    
    def train_random_forest(self, X_train, y_train):
        """Train Random Forest with hyperparameter tuning"""
        print("Training Random Forest...")
        
        # Parameter grid
        rf_params = {
            'n_estimators': [100, 200, 300],
            'max_depth': [10, 15, 20],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4]
        }
        
        # Time series cross-validation
        tscv = TimeSeriesSplit(n_splits=3)
        
        rf = RandomForestRegressor(random_state=42)
        rf_grid = GridSearchCV(rf, rf_params, cv=tscv, scoring='neg_mean_squared_error', n_jobs=-1)
        rf_grid.fit(X_train, y_train)
        
        self.models['Random Forest'] = rf_grid.best_estimator_
        
        print(f"✓ Best RF params: {rf_grid.best_params_}")
        return rf_grid.best_estimator_
    
    def train_xgboost(self, X_train, y_train):
        """Train XGBoost with hyperparameter tuning"""
        print("Training XGBoost...")
        
        # Parameter grid
        xgb_params = {
            'n_estimators': [100, 200, 300],
            'max_depth': [3, 6, 9],
            'learning_rate': [0.01, 0.1, 0.2],
            'subsample': [0.8, 0.9, 1.0]
        }
        
        # Time series cross-validation
        tscv = TimeSeriesSplit(n_splits=3)
        
        xgb_model = xgb.XGBRegressor(random_state=42)
        xgb_grid = GridSearchCV(xgb_model, xgb_params, cv=tscv, scoring='neg_mean_squared_error', n_jobs=-1)
        xgb_grid.fit(X_train, y_train)
        
        self.models['XGBoost'] = xgb_grid.best_estimator_
        
        print(f"✓ Best XGBoost params: {xgb_grid.best_params_}")
        return xgb_grid.best_estimator_
    
    def evaluate_models(self, X_test, y_test):
        """Evaluate models and calculate directional accuracy"""
        print("\nEvaluating models...")
        
        for model_name, model in self.models.items():
            # Predictions
            y_pred = model.predict(X_test)
            
            # Regression metrics
            mse = mean_squared_error(y_test, y_pred)
            rmse = np.sqrt(mse)
            
            # Directional accuracy (key metric for trading)
            actual_direction = np.sign(y_test)
            predicted_direction = np.sign(y_pred)
            directional_accuracy = accuracy_score(actual_direction, predicted_direction)
            
            # Store results
            self.results[model_name] = {
                'predictions': y_pred,
                'rmse': rmse,
                'directional_accuracy': directional_accuracy
            }
            
            print(f"\n{model_name} Results:")
            print(f"  RMSE: {rmse:.6f}")
            print(f"  Directional Accuracy: {directional_accuracy:.2%}")
    
    def plot_results(self, X_test, y_test):
        """Plot prediction results"""
        fig, axes = plt.subplots(2, 2, figsize=(15, 10))
        
        # Plot 1: Actual vs Predicted
        ax1 = axes[0, 0]
        for model_name in self.models.keys():
            y_pred = self.results[model_name]['predictions']
            ax1.scatter(y_test, y_pred, alpha=0.6, label=model_name)
        
        ax1.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
        ax1.set_xlabel('Actual Returns')
        ax1.set_ylabel('Predicted Returns')
        ax1.set_title('Actual vs Predicted Returns')
        ax1.legend()
        ax1.grid(True)
        
        # Plot 2: Time series
        ax2 = axes[0, 1]
        test_dates = self.features.index[-len(y_test):]
        ax2.plot(test_dates, y_test.values, label='Actual', linewidth=2)
        
        for model_name in self.models.keys():
            y_pred = self.results[model_name]['predictions']
            ax2.plot(test_dates, y_pred, label=f'{model_name}', alpha=0.8)
        
        ax2.set_title('Returns Over Time (Test Period)')
        ax2.legend()
        ax2.grid(True)
        plt.setp(ax2.xaxis.get_majorticklabels(), rotation=45)
        
        # Plot 3: Directional Accuracy
        ax3 = axes[1, 0]
        models = list(self.results.keys())
        accuracies = [self.results[model]['directional_accuracy'] for model in models]
        
        bars = ax3.bar(models, accuracies, color=['skyblue', 'lightcoral'])
        ax3.set_ylabel('Directional Accuracy')
        ax3.set_title('Model Comparison')
        ax3.set_ylim(0, 1)
        
        for bar, acc in zip(bars, accuracies):
            ax3.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01, 
                    f'{acc:.1%}', ha='center', fontweight='bold')
        
        # Plot 4: Feature Importance
        ax4 = axes[1, 1]
        if 'Random Forest' in self.models:
            rf_model = self.models['Random Forest']
            importance = pd.DataFrame({
                'feature': self.features.columns,
                'importance': rf_model.feature_importances_
            }).sort_values('importance', ascending=True).tail(10)
            
            ax4.barh(importance['feature'], importance['importance'])
            ax4.set_title('Top 10 Feature Importance (RF)')
            ax4.grid(True)
        
        plt.tight_layout()
        plt.show()
    
    def run_prediction_pipeline(self):
        """Run complete ML pipeline"""
        print("="*50)
        print("MACHINE LEARNING STOCK PRICE PREDICTOR")
        print("="*50)
        
        # 1. Collect data
        self.collect_data()
        
        # 2. Create features
        self.create_features()
        
        # 3. Prepare data
        self.prepare_data()
        
        # 4. Split data
        X_train, X_test, y_train, y_test = self.split_data(test_days=60)
        
        # 5. Train models
        self.train_random_forest(X_train, y_train)
        self.train_xgboost(X_train, y_train)
        
        # 6. Evaluate models
        self.evaluate_models(X_test, y_test)
        
        # 7. Plot results
        self.plot_results(X_test, y_test)
        
        # 8. Summary
        print("\n" + "="*50)
        print("FINAL RESULTS")
        print("="*50)
        
        for model_name, results in self.results.items():
            print(f"{model_name}:")
            print(f"  Directional Accuracy: {results['directional_accuracy']:.1%}")
            print(f"  RMSE: {results['rmse']:.6f}")
        
        best_model = max(self.results.keys(), 
                        key=lambda x: self.results[x]['directional_accuracy'])
        best_accuracy = self.results[best_model]['directional_accuracy']
        
        print(f"\n Best Model: {best_model}")
        print(f" Best Directional Accuracy: {best_accuracy:.1%}")
        
        return self.results


# Usage Example
if __name__ == "__main__":
    # Create predictor for Apple stock
    predictor = StockPredictor('AAPL')
    
    # Run complete pipeline
    results = predictor.run_prediction_pipeline()


# Test different stocks
def test_multiple_stocks():
    """Test predictor on multiple stocks"""
    stocks = ['AAPL', 'MSFT', 'GOOGL', 'TSLA', 'AMZN']
    results_summary = {}
    
    for stock in stocks:
        print(f"\n{'='*20} TESTING {stock} {'='*20}")
        
        try:
            predictor = StockPredictor(stock)
            results = predictor.run_prediction_pipeline()
            
            # Get best accuracy
            best_accuracy = max([r['directional_accuracy'] for r in results.values()])
            results_summary[stock] = best_accuracy
            
        except Exception as e:
            print(f"Error with {stock}: {e}")
            results_summary[stock] = None
    
    # Print summary
    print("\n" + "="*40)
    print("MULTI-STOCK RESULTS SUMMARY")
    print("="*40)
    
    for stock, accuracy in results_summary.items():
        if accuracy:
            print(f"{stock}: {accuracy:.1%}")

# Uncomment to test multiple stocks
# test_multiple_stocks()

MACHINE LEARNING STOCK PRICE PREDICTOR
Collecting 5 years of data for AAPL...
 Collected 1255 trading days
 Date range: 2020-08-13 to 2025-08-12
Creating features...
 Created 29 features
Preparing data for ML...
 Features shape: (1235, 19)
 Target shape: (1235,)
Splitting data - last 60 days as test set...
 Train set: 1175 samples
 Test set: 60 samples
Training Random Forest...
✓ Best RF params: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 300}
Training XGBoost...


In [3]:
pip install xgboost yfinance pandas scikit-learn matplotlib

Collecting xgboost
  Downloading xgboost-3.0.4-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.4-py3-none-win_amd64.whl (56.8 MB)
   ---------------------------------------- 0.0/56.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/56.8 MB ? eta -:--:--
   ---------------------------------------- 0.3/56.8 MB ? eta -:--:--
   ---------------------------------------- 0.3/56.8 MB ? eta -:--:--
   ---------------------------------------- 0.5/56.8 MB 872.4 kB/s eta 0:01:05
    --------------------------------------- 0.8/56.8 MB 1.0 MB/s eta 0:00:55
    --------------------------------------- 1.0/56.8 MB 1.1 MB/s eta 0:00:52
    --------------------------------------- 1.3/56.8 MB 1.1 MB/s eta 0:00:50
   - -------------------------------------- 1.6/56.8 MB 1.1 MB/s eta 0:00:49
   - -------------------------------------- 1.8/56.8 MB 1.1 MB/s eta 0:00:50
   - -------------------------------------- 2.1/56.8 MB 1.1 MB/s eta 0:00:50
   - ---------------------------


[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: C:\path\to\your\directory\anaconda\New folder\python.exe -m pip install --upgrade pip
