# 📈 Multibagger Stock Prediction Pipeline
## Using the EV × L (Earnings Visibility × Longevity) Framework + Advanced ML Models

**Goal**: Predict which stocks will deliver 2x+ returns within 3 years, using fundamental analysis and machine learning

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, roc_curve
import xgboost as xgb
from tensorflow import keras
import shap
import pickle
import warnings
# Suppress every warning category so library notices do not clutter cell output
warnings.filterwarnings(action='ignore')

# Plot appearance: seaborn-flavoured matplotlib style plus the "husl" colour palette
plt.style.use('seaborn-v0_8')
sns.set_palette(palette="husl")

print("✅ Libraries imported successfully!")

## 📊 Step 1: Data Loading and Exploration

In [None]:
# Bring in the project's predictor class
from multibagger_prediction import MultibaggerPredictor

# Build a predictor and ask it for a synthetic dataset
predictor = MultibaggerPredictor()
df = predictor.load_data(generate_synthetic=True)

# Quick sanity check: size of the frame and share of positive labels
multibagger_ratio = df['multibagger'].mean()
print(f"Dataset shape: {df.shape}")
print(f"Multibagger ratio: {multibagger_ratio:.2%}")

# Preview the first five rows (last expression, so the notebook renders it)
df.head(5)

In [None]:
# Descriptive statistics for the dataset's columns; the trailing expression
# makes the notebook render the resulting table
summary_stats = df.describe()
summary_stats

In [None]:
# Count the missing entries in each column
missing_per_column = df.isna().sum()
print("Missing values:")
print(missing_per_column)

# Bar chart of how many rows carry each target label
class_counts = df['multibagger'].value_counts()
plt.figure(figsize=(8, 5))
class_counts.plot.bar()
plt.title('Class Distribution: Multibagger vs Non-Multibagger')
plt.xlabel('Multibagger (1=Yes, 0=No)')
plt.ylabel('Count')
plt.xticks(rotation=0)  # keep the class labels horizontal
plt.show()

## 🔍 Step 2: EV × L Framework Implementation

In [None]:
# Clean the raw frame first
df_clean = predictor.preprocess_data(df)
print(f"After preprocessing: {df_clean.shape}")

# Attach the EV and L scores to the cleaned data
df_scored = predictor.compute_ev_l_scores(df_clean)

# One histogram per score: (column, display name, bar colour, threshold line position)
histogram_specs = [
    ('EV_Score', 'EV Score', 'blue', 0.6),
    ('L_Score', 'L Score', 'green', 0.5),
]

# Side-by-side panels, each with a dashed red line marking the threshold
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
for ax, (column, display_name, bar_color, threshold) in zip(axes, histogram_specs):
    ax.hist(df_scored[column], bins=30, alpha=0.7, color=bar_color)
    ax.axvline(threshold, color='red', linestyle='--', label=f'Threshold ({threshold})')
    ax.set_title(f'{display_name} Distribution')
    ax.set_xlabel(display_name)
    ax.set_ylabel('Frequency')
    ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Scatter the two scores against each other, colouring points by the target label
plt.figure(figsize=(10, 8))
scatter = plt.scatter(
    df_scored['EV_Score'],
    df_scored['L_Score'],
    c=df_scored['multibagger'],
    cmap='viridis',
    alpha=0.6,
)

# Both threshold lines share the same dashed red styling
threshold_style = dict(color='red', linestyle='--', alpha=0.7)
plt.axvline(0.6, label='EV Threshold', **threshold_style)
plt.axhline(0.5, label='L Threshold', **threshold_style)

plt.xlabel('EV Score (Earnings Visibility)')
plt.ylabel('L Score (Longevity)')
plt.title('EV × L Framework: Stock Distribution')
plt.colorbar(scatter, label='Multibagger (1=Yes, 0=No)')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

# Let the predictor narrow the scored frame down to candidate stocks
df_filtered = predictor.filter_candidates(df_scored)
filtered_ratio = df_filtered['multibagger'].mean()
print(f"\nFiltered dataset shape: {df_filtered.shape}")
print(f"Multibagger ratio in filtered data: {filtered_ratio:.2%}")

## ⚙️ Step 3: Feature Engineering and Data Preparation

In [None]:
# Split the filtered candidates into a feature matrix and a target vector
X, y = predictor.prepare_features(df_filtered)

feature_names = list(X.columns)
print(f"Feature matrix shape: {X.shape}")
print(f"Target vector shape: {y.shape}")
print(f"\nFeatures: {feature_names}")

# Pairwise feature correlations, drawn as an annotated heatmap centred on zero
correlation_matrix = X.corr()
heatmap_options = dict(
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    fmt='.2f',
)
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Feature Correlation Heatmap')
plt.tight_layout()
plt.show()

In [None]:
# Sequential 70% / 15% / 15% split by row position, with no shuffling.
# NOTE(review): this is only a genuine time-based split if the rows are already
# in chronological order — confirm against how the data is produced.
n_samples = len(X)
split_idx = int(0.7 * n_samples)
val_idx = int(0.85 * n_samples)

train_rows = slice(None, split_idx)
val_rows = slice(split_idx, val_idx)
test_rows = slice(val_idx, None)

X_train, X_val, X_test = (X.iloc[rows] for rows in (train_rows, val_rows, test_rows))
y_train, y_val, y_test = (y.iloc[rows] for rows in (train_rows, val_rows, test_rows))

for split_name, subset in (("Training", X_train), ("Validation", X_val), ("Test", X_test)):
    print(f"{split_name} set: {subset.shape}")

# Fit the scaler on the training rows only, then reuse it for the other splits
feature_scaler = predictor.scaler
X_train_scaled = feature_scaler.fit_transform(X_train)
X_val_scaled = feature_scaler.transform(X_val)
X_test_scaled = feature_scaler.transform(X_test)

print("\n✅ Data preparation completed!")

## 🤖 Step 4: Model Training

In [None]:
# Hand the scaled training and validation splits to the predictor for fitting
models = predictor.train_models(
    X_train_scaled,
    y_train,
    X_val_scaled,
    y_val,
)
n_models = len(models)
print(f"\n✅ Trained {n_models} models successfully!")

## 📈 Step 5: Model Evaluation

In [None]:
# Score every trained model on the held-out test split
results_df = predictor.evaluate_models(X_test_scaled, y_test)

# One text line per model, listing each score to three decimals
score_columns = ['Accuracy', 'Precision', 'Recall', 'F1', 'AUC']
print("\n📊 Model Performance Comparison:")
print("=" * 80)
for _, record in results_df.iterrows():
    score_text = " | ".join(f"{column}: {record[column]:.3f}" for column in score_columns)
    print(f"{record['Model']:<20} | {score_text}")

# 2x2 grid of bar charts, one panel per metric (filled row by row)
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

metrics = ['Accuracy', 'Precision', 'Recall', 'F1']
for ax, metric in zip(axes.flat, metrics):
    results_df.plot(x='Model', y=metric, kind='bar', ax=ax, color='skyblue')
    ax.set_title(f'{metric} Comparison')
    ax.set_ylabel(metric)
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## 🎯 Step 6: Advanced Analysis

In [None]:
# Generate comprehensive visualizations
# Delegates all plotting to the predictor, passing the filtered candidate frame
# together with the scaled test features and their labels.
# NOTE(review): what plot_results actually draws is defined in
# multibagger_prediction and is not visible from this notebook.
predictor.plot_results(df_filtered, X_test_scaled, y_test)

In [None]:
# SHAP analysis for feature importance
def _positive_class_shap_values(values):
    """Return the 2-D (samples, features) SHAP matrix for the positive class.

    ``TreeExplainer.shap_values`` has returned different shapes for
    classifiers across SHAP releases:

    * a list with one (samples, features) array per class (older releases),
    * a single (samples, features, classes) array (recent releases),
    * a single (samples, features) array for single-output models.

    Indexing the result with ``[1]`` is only correct for the list form; on a
    3-D array it would select the second *sample* instead of the second class.
    """
    if isinstance(values, list):
        return values[1]
    values = np.asarray(values)
    if values.ndim == 3:
        return values[:, :, 1]
    return values


if 'Random Forest' in predictor.models:
    print("🔍 SHAP Analysis for Random Forest Model:")

    # Explain only the first rows of the test split to keep the cell fast
    n_explain = 100
    X_explained = X_test.iloc[:n_explain]  # unscaled frame, used for feature names/values in the plots

    # Create SHAP explainer
    rf_model = predictor.models['Random Forest']
    explainer = shap.TreeExplainer(rf_model)
    shap_values = explainer.shap_values(X_test_scaled[:n_explain])
    positive_shap_values = _positive_class_shap_values(shap_values)

    # Feature importance plot
    plt.figure(figsize=(10, 6))
    shap.summary_plot(positive_shap_values, X_explained, plot_type="bar", show=False)
    plt.title('SHAP Feature Importance (Random Forest)')
    plt.tight_layout()
    plt.show()

    # Summary plot
    plt.figure(figsize=(10, 6))
    shap.summary_plot(positive_shap_values, X_explained, show=False)
    plt.title('SHAP Summary Plot (Random Forest)')
    plt.tight_layout()
    plt.show()

## 💾 Step 7: Model Saving and Deployment

In [None]:
# Persist the trained model through the predictor
predictor.save_model('best_multibagger_model.pkl')

# Hand-written example stocks used to smoke-test prediction on new data.
# Every entry carries a display 'name' plus the feature values passed to the model.
sample_stocks = [
    {
        'name': 'High Growth Tech Stock',
        'PE_Ratio': 25.0,
        'EPS': 12.5,
        'ROE': 28.0,
        'ROA': 15.2,
        'Market_Cap': 10000000000,
        'Revenue_Growth': 35.0,
        'Profit_Margin': 22.5,
        'Debt_Equity': 0.2,
        'Volatility': 0.4,
        'SectorGrowth': 18.0
    },
    {
        'name': 'Stable Value Stock',
        'PE_Ratio': 12.0,
        'EPS': 8.0,
        'ROE': 18.0,
        'ROA': 10.5,
        'Market_Cap': 5000000000,
        'Revenue_Growth': 8.0,
        'Profit_Margin': 12.0,
        'Debt_Equity': 0.3,
        'Volatility': 0.2,
        'SectorGrowth': 6.0
    },
    {
        'name': 'Risky Penny Stock',
        'PE_Ratio': 45.0,
        'EPS': 2.0,
        'ROE': 8.0,
        'ROA': 3.0,
        'Market_Cap': 100000000,
        'Revenue_Growth': 50.0,
        'Profit_Margin': 5.0,
        'Debt_Equity': 1.2,
        'Volatility': 0.8,
        'SectorGrowth': 25.0
    }
]

print("🔮 Sample Predictions:")
print("=" * 60)

for stock in sample_stocks:
    # Separate the display name from the model inputs WITHOUT mutating the
    # entry: popping 'name' in place would make a second pass over
    # sample_stocks fail with KeyError and would lose the names for later use.
    name = stock['name']
    features = {key: value for key, value in stock.items() if key != 'name'}
    # NOTE(review): the result is formatted as a percentage and compared with
    # 0.7 / 0.5 below, so it is presumably a probability in [0, 1] — confirm.
    probability = predictor.predict_new_stock(features)

    print(f"\n📊 {name}:")
    print(f"   Multibagger Probability: {probability:.1%}")

    # Map the probability onto a coarse recommendation
    if probability >= 0.7:
        print("   Recommendation: 🚀 STRONG BUY")
    elif probability >= 0.5:
        print("   Recommendation: ⚡ MODERATE BUY")
    else:
        print("   Recommendation: ⚠️ AVOID/HOLD")

## 🎉 Step 8: Summary and Next Steps

In [None]:
# Headline figures from the run
print("🎯 MULTIBAGGER PREDICTION PIPELINE SUMMARY")
print("=" * 50)

total_stocks, n_candidates = len(df), len(df_filtered)
print(f"📊 Total stocks analyzed: {total_stocks}")
pass_rate = n_candidates / total_stocks * 100
print(f"🎯 Stocks passing EV×L filter: {n_candidates} ({pass_rate:.1f}%)")
print(f"🤖 Models trained: {len(predictor.models)}")

# The "best" model is the row with the highest F1 in the evaluation table
f1_scores = results_df['F1']
best_model_name = results_df.loc[f1_scores.idxmax(), 'Model']
print(f"🏆 Best model: {best_model_name}")
print(f"📈 Best F1 Score: {f1_scores.max():.3f}")
print("💾 Model saved: best_multibagger_model.pkl")

# Follow-up suggestions, printed as a numbered list
next_steps = [
    "Run Streamlit app: streamlit run streamlit_app.py",
    "Test with real stock data from Yahoo Finance",
    "Implement backtesting on historical data",
    "Add more sophisticated features (technical indicators, sentiment)",
    "Deploy to cloud platform for production use",
]
print("\n🚀 NEXT STEPS:")
for number, step in enumerate(next_steps, start=1):
    print(f"{number}. {step}")

print("\n✅ Pipeline completed successfully!")