# Stock Data Analysis Demo

This notebook demonstrates how to use the functions in the `stock_analysis.py` script to analyze the stock market data stored in `data/StockData.csv`.

In [None]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from pathlib import Path

# Import functions from our analysis script
from stock_analysis import load_data, clean_data, basic_stats, plot_top_stocks, analyze_time_trends, correlation_analysis

## 1. Load and Explore the Data

First, let's load the CSV file and take a look at its structure.

In [None]:
# Read the raw CSV through the loader imported from stock_analysis
file_path = 'data/StockData.csv'
df = load_data(file_path)

# Report the frame's dimensions, then preview its first five rows
print(f"Dataset shape: {df.shape}", "\nFirst 5 rows:", sep="\n")
df.head(5)

In [None]:
# Column dtypes, followed by the number of missing entries in each column
print("Data types:", df.dtypes, sep="\n")
print("\nMissing values per column:", df.isna().sum(), sep="\n")

# Distinct values of the Ticker column, with a short sample for orientation
unique_tickers = df['Ticker'].unique()
print(
    f"\nNumber of unique stock tickers: {len(unique_tickers)}",
    f"Sample tickers: {unique_tickers[:10]}",
    sep="\n",
)

## 2. Clean the Data

Now let's clean the data to handle missing values and improve data types.

In [None]:
# Pass the raw frame through the cleaning helper imported from stock_analysis
cleaned_df = clean_data(df)

# Compare the dimensions before and after cleaning
print(f"Original shape: {df.shape}", f"Cleaned shape: {cleaned_df.shape}", sep="\n")

# Preview the first five cleaned rows
cleaned_df.head(5)

In [None]:
# Report the span of the Date column, but only when the cleaned frame has one
if 'Date' in cleaned_df:
    print("Date range:")
    print(
        f"Earliest date: {cleaned_df['Date'].min()}",
        f"Latest date: {cleaned_df['Date'].max()}",
        sep="\n",
    )

## 3. Basic Statistics

Let's calculate and explore some basic statistics about the stocks.

In [None]:
# Calculate basic statistics
# NOTE(review): `stats` is not referenced again in the visible cells; whether
# basic_stats prints its own report or only returns a value cannot be told
# from this notebook — confirm against stock_analysis.py.
stats = basic_stats(cleaned_df)

# Display summary statistics
# describe() is the cell's last expression, so the notebook renders its table
# directly below the printed heading.
print("Overall summary statistics:")
cleaned_df.describe()

In [None]:
# Mean EPS per ticker, ranked from highest to lowest, keeping the first ten
top_eps = (
    cleaned_df
    .groupby('Ticker')['EPS']
    .mean()
    .sort_values(ascending=False)
    .head(10)
)
print("Top 10 stocks by average EPS:")
top_eps

In [None]:
# Mean Revenue per ticker, ranked from highest to lowest, keeping the first ten
top_revenue = (
    cleaned_df
    .groupby('Ticker')['Revenue']
    .mean()
    .sort_values(ascending=False)
    .head(10)
)
print("Top 10 stocks by average Revenue:")
top_revenue

## 4. Data Visualization

Now let's create some visualizations to better understand the data.

In [None]:
# Bar chart of the ten tickers in top_eps, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(12, 6))
top_eps.plot.bar(ax=ax)
ax.set_title('Top 10 Stocks by Average EPS')
ax.set_ylabel('EPS')
ax.set_xlabel('Ticker')
# Tilt the ticker labels so neighbouring symbols do not overlap
plt.setp(ax.get_xticklabels(), rotation=45)
ax.grid(axis='y', alpha=0.3)
fig.tight_layout()
plt.show()

In [None]:
# Bar chart of the ten tickers in top_revenue, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(12, 6))
top_revenue.plot.bar(ax=ax)
ax.set_title('Top 10 Stocks by Average Revenue')
ax.set_ylabel('Revenue (in millions)')
ax.set_xlabel('Ticker')
# Tilt the ticker labels so neighbouring symbols do not overlap
plt.setp(ax.get_xticklabels(), rotation=45)
ax.grid(axis='y', alpha=0.3)
fig.tight_layout()
plt.show()

## 5. Time Series Analysis

Let's examine how stock metrics change over time for a single ticker: the one with the highest average EPS.

In [None]:
# top_eps is sorted in descending order, so its first index label is the
# ticker with the highest average EPS
top_ticker = top_eps.index[0]
print(f"Analyzing time series for {top_ticker}")

# Keep only that ticker's rows and order them by the Date column
ticker_df = (
    cleaned_df
    .loc[cleaned_df['Ticker'].eq(top_ticker)]
    .sort_values(by='Date')
)
ticker_df.head(5)

In [None]:
# Two stacked panels for the selected ticker: EPS on top, Price underneath
fig, (eps_ax, price_ax) = plt.subplots(2, 1, figsize=(14, 10))

# Upper panel: EPS against Date
eps_ax.plot(ticker_df['Date'], ticker_df['EPS'], marker='o')
eps_ax.set_title(f'{top_ticker} - EPS Over Time')
eps_ax.set_ylabel('EPS')
eps_ax.grid(True, alpha=0.3)

# Lower panel: Price against Date
price_ax.plot(ticker_df['Date'], ticker_df['Price'], marker='o', color='green')
price_ax.set_title(f'{top_ticker} - Price Over Time')
price_ax.set_ylabel('Price ($)')
price_ax.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

## 6. Correlation Analysis

Let's analyze correlations between different stock metrics.

In [None]:
# Calculate correlations for numerical columns
# NOTE(review): the numeric-column selection (and any plotting side effects)
# happen inside correlation_analysis in stock_analysis.py and are not visible
# here — confirm before relying on the result's shape.
corr_matrix = correlation_analysis(cleaned_df)
# Bare last expression so the notebook renders whatever the helper returned
corr_matrix

In [None]:
# Annotated heatmap of corr_matrix on a fixed [-1, 1] colour scale,
# drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    corr_matrix,
    annot=True,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Correlation Matrix of Stock Metrics')
fig.tight_layout()
plt.show()

## 7. Sector-based Analysis

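If we had sector information, we could analyze performance by sector. For now, we'll simulate this by grouping companies by the first letter of their ticker symbol. This grouping is only a placeholder to demonstrate the technique and carries no economic meaning.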

In [None]:
# Group tickers by the first character of their symbol and compare mean EPS.
#
# The grouping key is built as a standalone Series instead of being written
# into cleaned_df as a new 'AlphaGroup' column. Assigning the column mutated
# the shared frame in place, so any earlier cell re-run afterwards would see an
# extra column, and pandas can emit SettingWithCopyWarning if clean_data
# returned a slice of the raw frame.
alpha_group = cleaned_df['Ticker'].str[0].rename('AlphaGroup')

# Mean EPS per first-letter group, highest first. groupby aligns the key Series
# with cleaned_df on the index, and rows whose key is NaN are dropped, exactly
# as they were when grouping on the column.
alpha_group_eps = (
    cleaned_df
    .groupby(alpha_group)['EPS']
    .mean()
    .sort_values(ascending=False)
)

# Plot
plt.figure(figsize=(12, 6))
alpha_group_eps.plot(kind='bar')
plt.title('Average EPS by Ticker First Letter')
plt.ylabel('Average EPS')
plt.xlabel('First Letter of Ticker')
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()

## 8. Advanced Analysis: EPS and Price Relationship

Let's examine whether there's a relationship between EPS and stock price.

In [None]:
# Scatter plot of EPS vs. Price with a least-squares trend line.
#
# The column check runs before any column access, so a frame that lacks either
# column skips the plot instead of raising KeyError on the scatter call.
if 'EPS' in cleaned_df.columns and 'Price' in cleaned_df.columns:
    # np.polyfit cannot handle NaN, so keep only rows where both values are
    # present; the scatter is drawn from the same rows so that the points and
    # the fitted line describe the same data.
    valid_data = cleaned_df.dropna(subset=['EPS', 'Price'])
    x = valid_data['EPS']
    y = valid_data['Price']

    plt.figure(figsize=(10, 8))
    plt.scatter(x, y, alpha=0.5)
    plt.title('Relationship between EPS and Stock Price')
    plt.xlabel('Earnings Per Share (EPS)')
    plt.ylabel('Stock Price ($)')
    plt.grid(True, alpha=0.3)

    # A degree-1 fit needs at least two distinct x values; with fewer, the
    # slope is undefined and polyfit would warn about a poorly conditioned fit.
    if x.nunique() > 1:
        slope, intercept = np.polyfit(x, y, 1)

        # A straight line is fully determined by its two end points, so there
        # is no need to draw a segment through every (unsorted) data point.
        x_ends = np.array([x.min(), x.max()])
        plt.plot(x_ends, slope * x_ends + intercept, color='red', linewidth=2)

        # Anchor the equation in axes-fraction coordinates (lower-right
        # corner). Scaling data values, as in x.max()*0.7 / y.min()*1.1,
        # misplaces the label whenever the minimum is zero or negative.
        sign = '-' if intercept < 0 else '+'
        ax = plt.gca()
        ax.text(
            0.95,
            0.05,
            f'y = {slope:.2f}x {sign} {abs(intercept):.2f}',
            transform=ax.transAxes,
            ha='right',
            va='bottom',
            fontsize=12,
        )

    plt.tight_layout()
    plt.show()
else:
    print("Skipping EPS vs. Price plot: 'EPS' and/or 'Price' column not found")

## 9. Dividend Analysis

Let's analyze the dividend amounts across different stocks.

In [None]:
# Rank tickers by mean dividend amount; the whole cell is skipped when the
# cleaned frame has no DivAmt column
if 'DivAmt' in cleaned_df:
    top_div = (
        cleaned_df
        .groupby('Ticker')['DivAmt']
        .mean()
        .sort_values(ascending=False)
        .head(10)
    )

    # Bar chart of the ten highest average payers on an explicit Axes
    fig, ax = plt.subplots(figsize=(12, 6))
    top_div.plot.bar(ax=ax)
    ax.set_title('Top 10 Stocks by Average Dividend Amount')
    ax.set_ylabel('Dividend Amount ($)')
    ax.set_xlabel('Ticker')
    plt.setp(ax.get_xticklabels(), rotation=45)
    ax.grid(axis='y', alpha=0.3)
    fig.tight_layout()
    plt.show()

## 10. Conclusion

In this notebook, we've analyzed the stock data from multiple angles:

1. Basic data exploration and cleaning
2. Statistical analysis of stock performance
3. Visualization of top performers
4. Time series analysis for selected stocks
5. Correlation analysis between different metrics
6. Group-based analysis
7. Relationship between EPS and stock price
8. Dividend analysis

This analysis provides insights into stock performance metrics and their relationships. Further analysis could include:

- More sophisticated time series forecasting
- Sector-based analysis with proper sector classifications
- Portfolio optimization based on these metrics
- Risk analysis using volatility measures