In [39]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import os

# Global plot styling: these two calls run once when the cell executes and
# affect every figure created afterwards in this kernel.
# NOTE(review): the 'seaborn-v0_8-darkgrid' style name presumably requires
# matplotlib 3.6 or newer — confirm against the pinned environment.
plt.style.use('seaborn-v0_8-darkgrid')
# Make seaborn's "husl" palette the default color cycle.
sns.set_palette("husl")

class StockViz:
    """
    Create and save exploratory visualizations of stock market data.

    Every ``*.csv`` file in ``data_dir`` is loaded as one stock, keyed by the
    file name without its extension. Each frame is expected to carry a
    ``Close`` column; the first CSV column is used as a parsed date index.

    Every ``plot_*`` method writes one PNG into ``output_dir``, prints the
    saved path and returns ``self`` so calls can be chained.
    """

    # Number of trading days used to annualize daily statistics.
    TRADING_DAYS = 252
    # Annual risk-free rate, in percent, subtracted in the Sharpe ratio.
    RISK_FREE_RATE_PCT = 4.0
    # How many stocks from each end of the Sharpe ranking are charted.
    SHARPE_BOTTOM_N = 5
    SHARPE_TOP_N = 15

    # Sector -> tickers, shared by the sector comparison and summary plots.
    SECTORS = {
        'Technology': ['AAPL', 'MSFT', 'NVDA', 'GOOGL', 'META', 'AVGO'],
        'Financials': ['JPM', 'BAC', 'GS', 'MS', 'BLK', 'AXP'],
        'Healthcare': ['UNH', 'JNJ', 'LLY', 'PFE', 'MRK', 'TMO'],
        'Consumer Discretionary': ['AMZN', 'TSLA', 'HD', 'MCD', 'NKE', 'SBUX'],
        'Consumer Staples': ['WMT', 'PG', 'KO', 'PEP'],
        'Industrials': ['BA', 'CAT', 'UPS', 'RTX', 'HON'],
        'Energy': ['XOM', 'CVX', 'COP'],
        'Communication Services': ['DIS', 'NFLX', 'CMCSA']
    }

    def __init__(self, data_dir='data', output_dir='eda'):
        """
        Remember the input/output directories and create the output one.

        Args:
            data_dir: directory containing one CSV file per stock.
            output_dir: directory the PNG files are written to; created
                (including parents) if it does not exist.
        """
        self.data_dir = data_dir
        self.output_dir = output_dir
        self.stocks_data = {}

        # create output directory
        os.makedirs(output_dir, exist_ok=True)

    def load_data(self):
        """
        Load every CSV file of ``data_dir`` into ``self.stocks_data``.

        Files are read in sorted order so the resulting dict (and every plot
        iterating over it) is deterministic across platforms.

        Returns:
            self, for chaining.
        """
        print("Loading stock data")
        files = sorted(f for f in os.listdir(self.data_dir) if f.endswith('.csv'))

        for file in files:
            # splitext strips only the trailing extension; str.replace would
            # also remove a '.csv' occurring in the middle of the name
            stock = os.path.splitext(file)[0]
            df = pd.read_csv(os.path.join(self.data_dir, file),
                             index_col=0, parse_dates=True)
            self.stocks_data[stock] = df

        print(f"\nTotal stocks loaded: {len(self.stocks_data)}")
        return self

    # ------------------------------------------------------------------
    # private helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _normalized_close(df):
        """
        Return the closing price rebased so its first valid value is 100.

        Returns None when the ``Close`` column holds no valid value.
        """
        close = df['Close']
        valid = close.dropna()
        if valid.empty:
            return None
        # rebase on the first non-missing close so a leading NaN does not
        # turn the whole series into NaN
        return (close / valid.iloc[0]) * 100

    @classmethod
    def _annualized_stats(cls, df):
        """
        Return (annualized return %, annualized volatility %, Sharpe ratio)
        computed from the daily percentage changes of ``Close``.

        The Sharpe ratio is 0 when the volatility is not strictly positive.
        """
        daily_returns = df['Close'].pct_change().dropna()
        ann_return = ((1 + daily_returns.mean())**cls.TRADING_DAYS - 1) * 100
        ann_vol = daily_returns.std() * np.sqrt(cls.TRADING_DAYS) * 100
        sharpe = (ann_return - cls.RISK_FREE_RATE_PCT) / ann_vol if ann_vol > 0 else 0
        return ann_return, ann_vol, sharpe

    def _plot_normalized(self, ax, stocks, **line_kwargs):
        """
        Draw the normalized close of every loaded stock in ``stocks`` on ``ax``.

        Stocks that are not loaded or have no valid close are skipped.
        Returns the number of lines drawn.
        """
        drawn = 0
        for stock in stocks:
            df = self.stocks_data.get(stock)
            if df is None:
                continue
            normalized = self._normalized_close(df)
            if normalized is None:
                continue
            ax.plot(normalized.index, normalized, label=stock, **line_kwargs)
            drawn += 1
        return drawn

    def _save_figure(self, fig, basename):
        """Save ``fig`` into the output directory, report it and release it."""
        filename = f'{self.output_dir}/{basename}'
        fig.savefig(filename, dpi=300, bbox_inches='tight')
        print(f"Saved: {filename}")
        # close this specific figure rather than whichever one is "current"
        plt.close(fig)

    # ------------------------------------------------------------------
    # plots
    # ------------------------------------------------------------------

    def plot_price_history(self, stocks=None, figsize=(15, 8)):
        """
        Plot closing price history for selected stocks

        Args:
            stocks: tickers to draw; defaults to a set of top performers plus
                the S&P 500 benchmark. Tickers that are not loaded are skipped.
            figsize: figure size passed to matplotlib.

        Returns:
            self, for chaining.
        """
        if stocks is None:
            # top performers + S&P 500 benchmark
            stocks = ['^GSPC', 'NVDA', 'TSLA', 'AVGO', 'AAPL', 'NFLX']

        fig, ax = plt.subplots(figsize=figsize)
        drawn = self._plot_normalized(ax, stocks, linewidth=2)

        ax.set_xlabel('Date', fontsize=12)
        ax.set_ylabel('Normalized Price (Base=100)', fontsize=12)
        ax.set_title('Stock Price History - Top Performers (Normalized)', fontsize=14, fontweight='bold')
        if drawn:
            # legend() warns when there is nothing to label
            ax.legend(loc='best', fontsize=10)
        ax.grid(True, alpha=0.3)
        fig.tight_layout()

        self._save_figure(fig, 'price_history_normalized.png')
        return self

    def plot_correlation_matrix(self, stocks=None, figsize=(12, 10)):
        """
        Plot correlation matrix of returns

        Each pair of stocks is correlated over the dates both have returns
        for, independent of the order of ``stocks``.

        Args:
            stocks: tickers to include; defaults to the benchmark plus one
                stock per sector. Tickers that are not loaded are skipped.
            figsize: figure size passed to matplotlib.

        Returns:
            self, for chaining. Nothing is saved when none of the requested
            stocks is loaded.
        """
        if stocks is None:
            # mix of top performers and different sectors
            stocks = ['^GSPC',     # Benchmark
                      'NVDA',      # Technology (top performer)
                      'BLK',       # Financial
                      'UNH',       # Healthcare
                      'TSLA',      # Consumer Discretionary
                      'PEP',       # Consumer Staples
                      'CAT',       # Industrial
                      'COP',       # Energy
                      'NFLX',      # Communication
                      'NEE',       # Utilities
                      'PLD']       # Real Estate

        # calculate returns
        returns_by_stock = {
            stock: self.stocks_data[stock]['Close'].pct_change()
            for stock in stocks
            if stock in self.stocks_data
        }
        if not returns_by_stock:
            print("Skipped correlation matrix: none of the requested stocks are loaded")
            return self

        # Concatenate on the union of all dates. Assigning columns one by one
        # into an empty frame would reindex every stock to the first stock's
        # dates and silently drop the history the first stock lacks.
        returns_df = pd.concat(returns_by_stock, axis=1)

        # calculate correlation (pairwise, ignoring missing values)
        corr_matrix = returns_df.corr()

        # plot
        fig, ax = plt.subplots(figsize=figsize)
        sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm',
                    center=0, square=True, linewidths=1, cbar_kws={"shrink": 0.8},
                    ax=ax)
        ax.set_title('Correlation Matrix of Daily Returns', fontsize=14, fontweight='bold', pad=20)
        fig.tight_layout()

        self._save_figure(fig, 'correlation_matrix.png')
        return self

    def plot_sector_comparison(self, figsize=(20, 10)):
        """
        Compare performance across all sectors listed in ``SECTORS``

        One panel per sector, four panels per row; each panel shows the
        normalized close of the sector's loaded stocks.

        Returns:
            self, for chaining.
        """
        sectors = self.SECTORS
        ncols = 4
        nrows = max(1, -(-len(sectors) // ncols))  # ceiling division

        # squeeze=False keeps a 2-D array even for a single row
        fig, axes = plt.subplots(nrows, ncols, figsize=figsize, squeeze=False)
        axes = axes.flatten()

        for ax, (sector, stocks) in zip(axes, sectors.items()):
            drawn = self._plot_normalized(ax, stocks, linewidth=1.5, alpha=0.8)

            ax.set_xlabel('Date', fontsize=9)
            ax.set_ylabel('Normalized Price (Base=100)', fontsize=9)
            ax.set_title(f'{sector}', fontsize=11, fontweight='bold')
            if drawn:
                ax.legend(loc='best', fontsize=8, ncol=2)
            ax.grid(True, alpha=0.3)
            ax.tick_params(labelsize=8)

        # hide panels left over when the sector count is not a multiple of ncols
        for ax in axes[len(sectors):]:
            ax.set_visible(False)

        fig.suptitle('Performance Comparison Across All Sectors (2000-2024)',
                     fontsize=14, fontweight='bold', y=0.995)
        fig.tight_layout()

        self._save_figure(fig, 'sector_comparison_all.png')
        return self

    def plot_sector_summary_stats(self, figsize=(14, 10)):
        """
        Create summary statistics comparison across sectors

        For every sector of ``SECTORS`` with at least one loaded stock, the
        average annualized return, volatility and Sharpe ratio of its stocks
        are charted together with the min-to-max return range.

        Returns:
            self, for chaining. Nothing is saved when no sector stock is loaded.
        """
        sector_stats = []

        for sector, stocks in self.SECTORS.items():
            stats = [self._annualized_stats(self.stocks_data[stock])
                     for stock in stocks
                     if stock in self.stocks_data]
            if not stats:
                # np.max / np.min raise on an empty sequence; a sector with no
                # data has nothing to show
                continue

            returns, volatilities, sharpes = zip(*stats)
            sector_stats.append({
                'Sector': sector,
                'Avg Return': np.mean(returns),
                'Avg Volatility': np.mean(volatilities),
                'Avg Sharpe': np.mean(sharpes),
                'Max Return': np.max(returns),
                'Min Return': np.min(returns)
            })

        if not sector_stats:
            print("Skipped sector summary: no sector stocks are loaded")
            return self

        stats_df = pd.DataFrame(sector_stats)

        fig, axes = plt.subplots(2, 2, figsize=figsize)

        # average return by sector
        axes[0, 0].barh(stats_df['Sector'], stats_df['Avg Return'], color='steelblue', alpha=0.7)
        axes[0, 0].set_xlabel('Average Annualized Return (%)', fontsize=10)
        axes[0, 0].set_title('Average Return by Sector', fontsize=12, fontweight='bold')
        axes[0, 0].grid(True, alpha=0.3, axis='x')

        # average volatility by sector
        axes[0, 1].barh(stats_df['Sector'], stats_df['Avg Volatility'], color='coral', alpha=0.7)
        axes[0, 1].set_xlabel('Average Annualized Volatility (%)', fontsize=10)
        axes[0, 1].set_title('Average Volatility by Sector', fontsize=12, fontweight='bold')
        axes[0, 1].grid(True, alpha=0.3, axis='x')

        # average sharpe ratio by sector
        axes[1, 0].barh(stats_df['Sector'], stats_df['Avg Sharpe'], color='seagreen', alpha=0.7)
        axes[1, 0].set_xlabel('Average Sharpe Ratio', fontsize=10)
        axes[1, 0].set_title('Risk-Adjusted Returns by Sector', fontsize=12, fontweight='bold')
        axes[1, 0].axvline(x=0, color='black', linestyle='-', linewidth=0.8)
        axes[1, 0].grid(True, alpha=0.3, axis='x')

        # return range by sector (min to max)
        y_pos = np.arange(len(stats_df))
        axes[1, 1].barh(y_pos, stats_df['Max Return'] - stats_df['Min Return'],
                        left=stats_df['Min Return'], color='mediumpurple', alpha=0.7)
        axes[1, 1].set_yticks(y_pos)
        axes[1, 1].set_yticklabels(stats_df['Sector'])
        axes[1, 1].set_xlabel('Return Range (%)', fontsize=10)
        axes[1, 1].set_title('Return Dispersion Within Sectors', fontsize=12, fontweight='bold')
        axes[1, 1].axvline(x=0, color='black', linestyle='-', linewidth=0.8)
        axes[1, 1].grid(True, alpha=0.3, axis='x')

        fig.tight_layout()

        self._save_figure(fig, 'sector_summary_stats.png')
        return self

    def plot_risk_return_scatter(self, figsize=(12, 8)):
        """
        Plot risk-return scatter for all stocks

        Each loaded stock is one point (x = annualized volatility,
        y = annualized return); a fixed list of notable tickers is labelled.

        Returns:
            self, for chaining. Nothing is saved when no stock is loaded.
        """
        if not self.stocks_data:
            print("Skipped risk-return scatter: no stocks are loaded")
            return self

        labels = list(self.stocks_data)
        stats = [self._annualized_stats(df) for df in self.stocks_data.values()]
        returns = [ann_return for ann_return, _, _ in stats]
        volatilities = [ann_vol for _, ann_vol, _ in stats]

        fig, ax = plt.subplots(figsize=figsize)
        scatter = ax.scatter(volatilities, returns, s=100, alpha=0.6, c=returns, cmap='RdYlGn')

        # add labels for notable stocks
        notable = {'^GSPC', 'NVDA', 'NFLX', 'AAPL', 'TSLA', 'AVGO',
                   'INTC', 'CSCO', 'PFE', 'CMCSA', 'BAC'}
        for label, vol, ret in zip(labels, volatilities, returns):
            if label in notable:
                ax.annotate(label, (vol, ret),
                            fontsize=9, fontweight='bold')

        ax.set_xlabel('Annualized Volatility (%)', fontsize=12)
        ax.set_ylabel('Annualized Return (%)', fontsize=12)
        ax.set_title('Risk-Return Profile', fontsize=14, fontweight='bold')
        ax.grid(True, alpha=0.3)
        fig.colorbar(scatter, ax=ax, label='Return (%)')
        fig.tight_layout()

        self._save_figure(fig, 'risk_return_scatter.png')
        return self

    def plot_top_vs_worst(self, figsize=(15, 8)):
        """
        Compare top performers vs worst performers

        Two side-by-side panels of normalized closes; the top-performer panel
        uses a logarithmic y axis.

        Returns:
            self, for chaining.
        """
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=figsize)

        panels = [
            (ax1, 'Top 5 Performers', ['NVDA', 'NFLX', 'AAPL', 'TSLA', 'AVGO']),
            (ax2, 'Bottom 5 Performers', ['INTC', 'CSCO', 'PFE', 'CMCSA', 'BAC']),
        ]
        for ax, title, stocks in panels:
            drawn = self._plot_normalized(ax, stocks, linewidth=2, alpha=0.8)

            ax.set_xlabel('Date', fontsize=11)
            ax.set_ylabel('Normalized Price (Base=100)', fontsize=11)
            ax.set_title(title, fontsize=12, fontweight='bold')
            if drawn:
                ax.legend(loc='best', fontsize=9)
            ax.grid(True, alpha=0.3)

        # top performers span orders of magnitude, so use a log scale there
        ax1.set_yscale('log')

        fig.tight_layout()
        self._save_figure(fig, 'top_vs_worst_performers.png')
        return self

    def plot_sharpe_comparison(self, figsize=(14, 8)):
        """
        Compare Sharpe ratios across stocks

        Charts the ``SHARPE_BOTTOM_N`` lowest and ``SHARPE_TOP_N`` highest
        Sharpe ratios; when fewer stocks than that are loaded, every stock is
        charted exactly once.

        Returns:
            self, for chaining. Nothing is saved when no stock is loaded.
        """
        if not self.stocks_data:
            print("Skipped Sharpe comparison: no stocks are loaded")
            return self

        sharpe_data = [
            {'Stock': stock, 'Sharpe_Ratio': self._annualized_stats(df)[2]}
            for stock, df in self.stocks_data.items()
        ]

        sharpe_df = pd.DataFrame(sharpe_data)
        sharpe_df = sharpe_df.sort_values('Sharpe_Ratio', ascending=True)

        # plot top 15 and bottom 5; with too few stocks head() and tail()
        # would overlap and draw the same stock twice
        if len(sharpe_df) > self.SHARPE_BOTTOM_N + self.SHARPE_TOP_N:
            top_bottom = pd.concat([
                sharpe_df.head(self.SHARPE_BOTTOM_N),
                sharpe_df.tail(self.SHARPE_TOP_N)
            ])
        else:
            top_bottom = sharpe_df

        fig, ax = plt.subplots(figsize=figsize)
        colors = ['red' if x < 0 else 'green' for x in top_bottom['Sharpe_Ratio']]
        ax.barh(range(len(top_bottom)), top_bottom['Sharpe_Ratio'], color=colors, alpha=0.7)
        ax.set_yticks(range(len(top_bottom)))
        ax.set_yticklabels(top_bottom['Stock'])
        ax.set_xlabel('Sharpe Ratio', fontsize=12)
        ax.set_title('Risk-Adjusted Returns (Sharpe Ratio, 4% Risk-Free Rate)',
                     fontsize=14, fontweight='bold')
        ax.axvline(x=0, color='black', linestyle='-', linewidth=0.8)
        ax.grid(True, alpha=0.3, axis='x')
        fig.tight_layout()

        self._save_figure(fig, 'sharpe_ratio_comparison.png')
        return self

    def generate_viz(self):
        """
        Generate all visualizations

        Runs every ``plot_*`` method with its defaults, printing a numbered
        heading before each one.

        Returns:
            self, for chaining.
        """
        print("\n" + "="*80)
        print("GENERATING VISUALIZATIONS")
        print("="*80 + "\n")

        steps = [
            ("Price History (Top Performers + Benchmark)", self.plot_price_history),
            ("Correlation Matrix", self.plot_correlation_matrix),
            ("Sector Comparison (All Sectors)", self.plot_sector_comparison),
            ("Sector Summary Statistics", self.plot_sector_summary_stats),
            ("Risk-Return Scatter", self.plot_risk_return_scatter),
            ("Top vs Worst Performers", self.plot_top_vs_worst),
            ("Sharpe Ratio Comparison", self.plot_sharpe_comparison),
        ]
        for number, (title, plot) in enumerate(steps, start=1):
            # a blank line separates every heading from the previous step
            separator = "" if number == 1 else "\n"
            print(f"{separator}{number}. {title}")
            plot()

        print("\n" + "="*80)
        print("ALL VISUALIZATIONS COMPLETE!")
        print("="*80 + "\n")

        return self

In [40]:
def main():
    """
    Entry point: print the banner, then load the data and render every chart
    """
    rule = "="*80
    print("\n" + rule)
    print(" "*25 + "STOCK MARKET VISUALIZATIONS")
    print(rule + "\n")

    # build the visualizer, read the CSV files, then produce all plots;
    # load_data() returns the instance, so the calls can be chained
    StockViz(data_dir='data', output_dir='eda').load_data().generate_viz()

# Run the full pipeline only when this code executes as the main module,
# not when it is imported from elsewhere.
if __name__ == "__main__":
    main()


                         STOCK MARKET VISUALIZATIONS

Loading stock data

Total stocks loaded: 51

GENERATING VISUALIZATIONS

1. Price History (Top Performers + Benchmark)
Saved: eda/price_history_normalized.png

2. Correlation Matrix
Saved: eda/correlation_matrix.png

3. Sector Comparison (All Sectors)
Saved: eda/sector_comparison_all.png

4. Sector Summary Statistics
Saved: eda/sector_summary_stats.png

5. Risk-Return Scatter
Saved: eda/risk_return_scatter.png

6. Top vs Worst Performers
Saved: eda/top_vs_worst_performers.png

7. Sharpe Ratio Comparison
Saved: eda/sharpe_ratio_comparison.png

ALL VISUALIZATIONS COMPLETE!

