# NOVA FINANCIAL ANALYSIS - EXPLORATORY DATA ANALYSIS
Task 1: Data Understanding and Initial Insights

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Set style for better visualizations
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

##### LOAD AND EXAMINE STOCK DATA

In [7]:
import os
import pandas as pd

def load_stock_data():
    """Load all available stock data files"""
    stock_symbols = ['AAPL', 'AMZN', 'GOOG', 'META', 'MSFT', 'NVDA', 'TSLA']
    stock_data = {}

    for symbol in stock_symbols:
        try:
            filename = os.path.join("..", "data", "yfinance_data", f"{symbol}_historical_data.csv")
            df = pd.read_csv(filename)
            df['Date'] = pd.to_datetime(df['Date'])
            df['Symbol'] = symbol
            stock_data[symbol] = df
            print(f"✓ Loaded {symbol}: {len(df)} records")
        except FileNotFoundError:
            print(f"✗ Could not load {symbol}_historical_data.csv")

    return stock_data

def load_news_data():
    """Load analyst ratings/news data"""
    try:
        filename = os.path.join("..", "data", "raw_analyst_ratings.csv")
        news_df = pd.read_csv(filename)
        if 'date' in news_df.columns:
            news_df['date'] = pd.to_datetime(news_df['date'], errors='coerce')
        print(f"✓ Loaded news data: {len(news_df)} records")
        return news_df
    except FileNotFoundError:
        print("✗ Could not load raw_analyst_ratings.csv")
        return pd.DataFrame()

# Load the data
print("\n1. LOADING DATA")
print("-" * 30)
stock_data = load_stock_data()
news_data = load_news_data()



1. LOADING DATA
------------------------------
✓ Loaded AAPL: 10998 records
✓ Loaded AMZN: 6846 records
✓ Loaded GOOG: 5020 records
✓ Loaded META: 2926 records
✓ Loaded MSFT: 9672 records
✓ Loaded NVDA: 6421 records
✓ Loaded TSLA: 3545 records
✓ Loaded news data: 1407328 records


### BASIC DATA EXPLORATION


In [8]:

print("\nBASIC DATA STRUCTURE ANALYSIS")

# Examine stock data structure
if stock_data:
    sample_symbol = list(stock_data.keys())[0]
    sample_df = stock_data[sample_symbol]

    print(f"\nStock Data Structure (using {sample_symbol} as example):")
    print(f"Shape: {sample_df.shape}")
    print(f"Columns: {list(sample_df.columns)}")
    print(f"Date range: {sample_df['Date'].min()} to {sample_df['Date'].max()}")
    print(f"Data types:\n{sample_df.dtypes}")

    # Check for missing values
    print(f"\nMissing values:")
    print(sample_df.isnull().sum())

# Examine news data structure
if news_data is not None:
    print(f"\n\nNews Data Structure:")
    print(f"Shape: {news_data.shape}")
    print(f"Columns: {list(news_data.columns)}")
    print(f"Data types:\n{news_data.dtypes}")

    # Check for missing values
    print(f"\nMissing values:")
    print(news_data.isnull().sum())


BASIC DATA STRUCTURE ANALYSIS

Stock Data Structure (using AAPL as example):
Shape: (10998, 10)
Columns: ['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'Dividends', 'Stock Splits', 'Symbol']
Date range: 1980-12-12 00:00:00 to 2024-07-30 00:00:00
Data types:
Date            datetime64[ns]
Open                   float64
High                   float64
Low                    float64
Close                  float64
Adj Close              float64
Volume                   int64
Dividends              float64
Stock Splits           float64
Symbol                  object
dtype: object

Missing values:
Date            0
Open            0
High            0
Low             0
Close           0
Adj Close       0
Volume          0
Dividends       0
Stock Splits    0
Symbol          0
dtype: int64


News Data Structure:
Shape: (1407328, 6)
Columns: ['Unnamed: 0', 'headline', 'url', 'publisher', 'date', 'stock']
Data types:
Unnamed: 0                        int64
headline              

### DESCRIPTIVE STATISTICS FOR STOCK DATA

In [None]:
def analyze_stock_statistics(stock_data):
    # Generate comprehensive statistics for stock data
    # Combine all stock data
    all_stocks = []
    for symbol, df in stock_data.items():
        all_stocks.append(df)

    combined_df = pd.concat(all_stocks, ignore_index=True)

    print("\n Overall Stock Market Statistics: \n")
    # Basic statistics for numerical columns
    numerical_cols = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']
    available_cols = [col for col in numerical_cols if col in combined_df.columns]

    if available_cols:
        desc_stats = combined_df[available_cols].describe()
        print(desc_stats)

    # Analysis by symbol
    print(f"\n --Analysis by Stock Symbol-- \n")

    for symbol, df in stock_data.items():
        if 'Close' in df.columns:
            avg_price = df['Close'].mean()
            price_volatility = df['Close'].std()
            avg_volume = df['Volume'].mean() if 'Volume' in df.columns else 'N/A'

            print(f"{symbol}:")
            print(f"  Average Close Price: ${avg_price:.2f}")
            print(f"  Price Volatility (σ): ${price_volatility:.2f}")
            print(f"  Average Volume: {avg_volume:,.0f}" if avg_volume != 'N/A' else f"  Average Volume: {avg_volume}")
            print(f"  Date Range: {df['Date'].min().strftime('%Y-%m-%d')} to {df['Date'].max().strftime('%Y-%m-%d')}")
            print()

if stock_data:
    analyze_stock_statistics(stock_data)


 Overall Stock Market Statistics: 

               Open          High           Low         Close     Adj Close  \
count  45428.000000  45428.000000  45428.000000  45428.000000  45428.000000   
mean      46.798370     47.397792     46.190101     46.810590     45.367891   
std       78.259474     79.277620     77.218532     78.274673     78.049554   
min        0.034896      0.035547      0.033333      0.034115      0.031291   
25%        0.690500      0.708000      0.673541      0.693604      0.585204   
50%       13.646101     13.827710     13.463250     13.644750     13.076301   
75%       47.242501     47.720626     46.628125     47.193626     44.768961   
max      542.349976    542.809998    528.359985    539.909973    539.909973   

             Volume  
count  4.542800e+04  
mean   2.177785e+08  
std    3.076645e+08  
min    0.000000e+00  
25%    4.746455e+07  
50%    9.921200e+07  
75%    2.610595e+08  
max    9.230856e+09  

 --Analysis by Stock Symbol-- 

AAPL:
  Average Clos

### TIME SERIES ANALYSIS


In [None]:
def time_series_analysis(stock_data, start_date=None, end_date=None):
    # Analyze time series patterns in stock data within a time range
    # Create visualization for stock price trends
    fig = make_subplots(
        rows=len(stock_data), cols=1,
        subplot_titles=[f"{symbol} Stock Price Trend" for symbol in stock_data.keys()],
        vertical_spacing=0.02
    )
    for i, (symbol, df) in enumerate(stock_data.items(), 1):
        if 'Close' in df.columns and 'Date' in df.columns:
            # Filter by date range if provided
            if start_date and end_date:
                mask = (df['Date'] >= pd.to_datetime(start_date)) & (df['Date'] <= pd.to_datetime(end_date))
                df_filtered = df.loc[mask]
            else:
                df_filtered = df

            fig.add_trace(
                go.Scatter(x=df_filtered['Date'], y=df_filtered['Close'], name=f"{symbol} Close Price"),
                row=i, col=1
            )

    fig.update_layout(height=300*len(stock_data), title_text="Stock Price Trends Over Time")
    fig.show()

    # Analyze trading volume patterns
    print("\nTrading Volume Analysis:")
    for symbol, df in stock_data.items():
        if 'Volume' in df.columns and 'Date' in df.columns:
            # Filter by date range if provided
            if start_date and end_date:
                mask = (df['Date'] >= pd.to_datetime(start_date)) & (df['Date'] <= pd.to_datetime(end_date))
                df_filtered = df.loc[mask]
            else:
                df_filtered = df

            # Group by day of week
            df_filtered['DayOfWeek'] = df_filtered['Date'].dt.day_name()
            avg_volume_by_day = df_filtered.groupby('DayOfWeek')['Volume'].mean()

            print(f"\n{symbol} - Average Volume by Day of Week:")
            for day, volume in avg_volume_by_day.items():
                print(f"  {day}: {volume:,.0f}")

if stock_data:
    time_series_analysis(stock_data, start_date='2024-01-01', end_date='2024-07-30')



Trading Volume Analysis:

AAPL - Average Volume by Day of Week:
  Friday: 71,735,586
  Monday: 61,006,489
  Thursday: 66,320,969
  Tuesday: 59,917,119
  Wednesday: 58,023,648

AMZN - Average Volume by Day of Week:
  Friday: 46,877,866
  Monday: 39,002,163
  Thursday: 44,982,483
  Tuesday: 40,054,610
  Wednesday: 41,056,076

GOOG - Average Volume by Day of Week:
  Friday: 23,354,945
  Monday: 21,277,681
  Thursday: 20,751,752
  Tuesday: 18,419,068
  Wednesday: 19,159,566

META - Average Volume by Day of Week:
  Friday: 19,645,555
  Monday: 14,825,026
  Thursday: 18,062,086
  Tuesday: 13,247,810
  Wednesday: 14,983,731

MSFT - Average Volume by Day of Week:
  Friday: 22,265,931
  Monday: 18,256,052
  Thursday: 22,612,045
  Tuesday: 19,984,106
  Wednesday: 19,984,469

NVDA - Average Volume by Day of Week:
  Friday: 477,550,893
  Monday: 431,851,037
  Thursday: 468,005,924
  Tuesday: 440,199,748
  Wednesday: 458,723,848

TSLA - Average Volume by Day of Week:
  Friday: 93,058,552
  Monday:

### Latest available date or end_date for each stock

In [20]:
for symbol, df in stock_data.items():
    if 'Date' in df.columns:
        latest_date = df['Date'].max()
        print(f"{symbol}: {latest_date.strftime('%Y-%m-%d')}")

AAPL: 2024-07-30
AMZN: 2024-07-30
GOOG: 2024-07-30
META: 2024-07-30
MSFT: 2024-07-30
NVDA: 2024-07-30
TSLA: 2024-07-30


## Ploting stock price trends for indiviual

In [21]:
# Define time range
start_date = '2024-01-01'
end_date = '2024-07-30'

#### AAPL Stock Price

In [75]:
df = stock_data['AAPL']
df['Date'] = pd.to_datetime(df['Date'])
# Filter the DataFrame by the date range
df_filtered = df[(df['Date'] >= start_date) & (df['Date'] <= end_date)]
# Plot with custom line color and centered title
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=df_filtered['Date'],
    y=df_filtered['Close'],
    name='AAPL Close Price',
    line=dict(color='Blue')
))
fig.update_layout(
    title=dict(
        text=f'AAPL Stock Price from {start_date} to {end_date}',
        x=0.5,
        xanchor='center'
    ),
    xaxis_title='Date',
    yaxis_title='Close Price'
)
fig.show()


#### AMZN Stock Price

In [77]:
df = stock_data['AMZN']
df['Date'] = pd.to_datetime(df['Date'])
# Filter the DataFrame by the date range
df_filtered = df[(df['Date'] >= start_date) & (df['Date'] <= end_date)]
# Plot with custom line color and centered title
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=df_filtered['Date'],
    y=df_filtered['Close'],
    name='AMZN Close Price',
    line=dict(color='#ef553b')
))
fig.update_layout(
    title=dict(
        text=f'AMZN Stock Price from {start_date} to {end_date}',
        x=0.5,
        xanchor='center'
    ),
    xaxis_title='Date',
    yaxis_title='Close Price'
)
fig.show()


#### GOOG Stock Price

In [78]:
df = stock_data['GOOG']
df['Date'] = pd.to_datetime(df['Date'])
# Filter the DataFrame by the date range
df_filtered = df[(df['Date'] >= start_date) & (df['Date'] <= end_date)]
# Plot with custom line color and centered title
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=df_filtered['Date'],
    y=df_filtered['Close'],
    name='GOOG Close Price',
    line=dict(color='#00cc96')
))
fig.update_layout(
    title=dict(
        text=f'GOOG Stock Price from {start_date} to {end_date}',
        x=0.5,
        xanchor='center'
    ),
    xaxis_title='Date',
    yaxis_title='Close Price'
)
fig.show()

#### META Stock Price

In [79]:
df = stock_data['META']
df['Date'] = pd.to_datetime(df['Date'])
# Filter the DataFrame by the date range
df_filtered = df[(df['Date'] >= start_date) & (df['Date'] <= end_date)]
# Plot with custom line color and centered title
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=df_filtered['Date'],
    y=df_filtered['Close'],
    name='META Close Price',
    line=dict(color='#ab63fa')
))
fig.update_layout(
    title=dict(
        text=f'META Stock Price from {start_date} to {end_date}',
        x=0.5,
        xanchor='center'
    ),
    xaxis_title='Date',
    yaxis_title='Close Price'
)
fig.show()

#### MSFT Stock Price

In [80]:
df = stock_data['MSFT']
df['Date'] = pd.to_datetime(df['Date'])
# Filter the DataFrame by the date range
df_filtered = df[(df['Date'] >= start_date) & (df['Date'] <= end_date)]
# Plot with custom line color and centered title
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=df_filtered['Date'],
    y=df_filtered['Close'],
    name='MSFT Close Price',
    line=dict(color='#ffa15a')
))
fig.update_layout(
    title=dict(
        text=f'MSFT Stock Price from {start_date} to {end_date}',
        x=0.5,
        xanchor='center'
    ),
    xaxis_title='Date',
    yaxis_title='Close Price'
)
fig.show()

#### NVDA Stock Price

In [83]:
df = stock_data['NVDA']
df['Date'] = pd.to_datetime(df['Date'])
# Filter the DataFrame by the date range
df_filtered = df[(df['Date'] >= start_date) & (df['Date'] <= end_date)]
# Plot with custom line color and centered title
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=df_filtered['Date'],
    y=df_filtered['Close'],
    name='NVDA Close Price',
    line=dict(color='#19d3f3')
))
fig.update_layout(
    title=dict(
        text=f'NVDA Stock Price from {start_date} to {end_date}',x=0.5,xanchor='center'),
    xaxis_title='Date',
    yaxis_title='Close Price'
)
fig.show()

#### TSLA Stock Price

In [84]:
df = stock_data['TSLA']
df['Date'] = pd.to_datetime(df['Date'])
# Filter the DataFrame by the date range
df_filtered = df[(df['Date'] >= start_date) & (df['Date'] <= end_date)]
# Plot with custom line color and centered title
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=df_filtered['Date'],
    y=df_filtered['Close'],
    name='TSLA Close Price',
    line=dict(color='#ef653b')
))
fig.update_layout(
    title=dict(
        text=f'TSLA Stock Price from {start_date} to {end_date}',x=0.5,xanchor='center'),
    xaxis_title='Date',
    yaxis_title='Close Price'
)
fig.show()

## Trading Volume Analysis by Day of Week


In [85]:
stocks = ['AAPL', 'AMZN', 'GOOG', 'META', 'MSFT', 'NVDA', 'TSLA']

print(f"Trading Volume Analysis by Day of Week ({start_date} to {end_date})")
for symbol in stocks:
    if symbol in stock_data:
        df = stock_data[symbol].copy()
        df['Date'] = pd.to_datetime(df['Date'])

        # Filter date range
        df_filtered = df[(df['Date'] >= start_date) & (df['Date'] <= end_date)]

        # Calculate average volume by day of week
        df_filtered['DayOfWeek'] = df_filtered['Date'].dt.day_name()
        avg_volume_by_day = df_filtered.groupby('DayOfWeek')['Volume'].mean()

        print(f"\n{symbol} - Average Trading Volume:")
        print("-" * 40)
        for day, volume in avg_volume_by_day.items():
            print(f"{day}: {volume:,.0f}")
    else:
        print(f"\n{symbol} data not found in stock_data.")


Trading Volume Analysis by Day of Week (2024-01-01 to 2024-07-30)

AAPL - Average Trading Volume:
----------------------------------------
Friday: 71,735,586
Monday: 61,006,489
Thursday: 66,320,969
Tuesday: 59,917,119
Wednesday: 58,023,648

AMZN - Average Trading Volume:
----------------------------------------
Friday: 46,877,866
Monday: 39,002,163
Thursday: 44,982,483
Tuesday: 40,054,610
Wednesday: 41,056,076

GOOG - Average Trading Volume:
----------------------------------------
Friday: 23,354,945
Monday: 21,277,681
Thursday: 20,751,752
Tuesday: 18,419,068
Wednesday: 19,159,566

META - Average Trading Volume:
----------------------------------------
Friday: 19,645,555
Monday: 14,825,026
Thursday: 18,062,086
Tuesday: 13,247,810
Wednesday: 14,983,731

MSFT - Average Trading Volume:
----------------------------------------
Friday: 22,265,931
Monday: 18,256,052
Thursday: 22,612,045
Tuesday: 19,984,106
Wednesday: 19,984,469

NVDA - Average Trading Volume:
--------------------------------

### NEWS DATA ANALYSIS

In [None]:
def analyze_news_data(news_data):
    # Analyze news/analyst ratings data

    if news_data is None:
        print("No news data available for analysis")
        return
    print(f"Total news articles/ratings: {len(news_data)}")

    # Analyze by columns present
    if 'headline' in news_data.columns:
        print(f"\nHeadline Analysis:")
        print(f"  Average headline length: {news_data['headline'].str.len().mean():.1f} characters")
        print(f"  Shortest headline: {news_data['headline'].str.len().min()} characters")
        print(f"  Longest headline: {news_data['headline'].str.len().max()} characters")

    if 'publisher' in news_data.columns:
        print(f"\nPublisher Analysis:")
        publisher_counts = news_data['publisher'].value_counts().head(10)
        print("Top 10 Publishers:")
        for publisher, count in publisher_counts.items():
            print(f"  {publisher}: {count} articles")

    if 'stock' in news_data.columns:
        print(f"\nStock Coverage Analysis:")
        stock_counts = news_data['stock'].value_counts()
        print("Articles per Stock Symbol:")
        for stock, count in stock_counts.items():
            print(f"  {stock}: {count} articles")

    if 'date' in news_data.columns:
        print(f"\nTemporal Analysis:")
        print(f"  Date range: {news_data['date'].min()} to {news_data['date'].max()}")

        # Analyze publication frequency over time
        news_data['date_only'] = news_data['date'].dt.date
        daily_counts = news_data.groupby('date_only').size()
        print(f"  Average articles per day: {daily_counts.mean():.1f}")
        print(f"  Max articles in a day: {daily_counts.max()}")
        print(f"  Days with most activity: {daily_counts.nlargest(3).index.tolist()}")

analyze_news_data(news_data)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  CBLI: 330 articles
  MVIS: 329 articles
  PPO: 329 articles
  NVRO: 329 articles
  AKAO: 329 articles
  SPB: 328 articles
  PWRD: 328 articles
  CGIX: 328 articles
  CMS: 328 articles
  MDC: 328 articles
  AIZ: 328 articles
  AL: 327 articles
  BONT: 327 articles
  ADMS: 326 articles
  SAFM: 326 articles
  CW: 323 articles
  LJPC: 322 articles
  PLT: 322 articles
  DOX: 322 articles
  CRK: 321 articles
  GENE: 321 articles
  CYOU: 320 articles
  TRNX: 320 articles
  CCXI: 320 articles
  USG: 320 articles
  ACHC: 320 articles
  WDFC: 320 articles
  AEIS: 319 articles
  WBS: 319 articles
  TWOU: 319 articles
  CHU: 319 articles
  STRA: 319 articles
  PACB: 319 articles
  XXII: 318 articles
  HUBG: 318 articles
  UIS: 318 articles
  SMH: 318 articles
  DPLO: 318 articles
  SRCL: 317 articles
  WWAV: 317 articles
  PETS: 317 articles
  PAG: 317 articles
  NGL: 317 articles
  UFS: 317 articles
  BIO: 317 articles
  OZRK: 316

### DATA QUALITY ASSESSMENT

In [None]:
def assess_data_quality(stock_data, news_data):
    # Assess overall data quality

    print("Stock Data Quality:")
    for symbol, df in stock_data.items():
        # Check for missing values
        missing_pct = (df.isnull().sum() / len(df)) * 100

        # Check for duplicated dates
        if 'Date' in df.columns:
            duplicate_dates = df['Date'].duplicated().sum()

        # Check for logical inconsistencies (High < Low, etc.)
        logical_errors = 0
        if all(col in df.columns for col in ['High', 'Low', 'Open', 'Close']):
            logical_errors += (df['High'] < df['Low']).sum()
            logical_errors += (df['High'] < df['Open']).sum()
            logical_errors += (df['High'] < df['Close']).sum()
            logical_errors += (df['Low'] > df['Open']).sum()
            logical_errors += (df['Low'] > df['Close']).sum()

        print(f"\n{symbol}:")
        print(f"  Missing data: {missing_pct.max():.1f}% (worst column)")
        print(f"  Duplicate dates: {duplicate_dates}")
        print(f"  Logical errors: {logical_errors}")

        # Data completeness score
        completeness = 100 - missing_pct.max()
        consistency = 100 - (duplicate_dates/len(df)*100) - (logical_errors/len(df)*100)
        quality_score = (completeness + consistency) / 2

        print(f"  Quality Score: {quality_score:.1f}/100")

    if news_data is not None:
        print(f"\n\nNews Data Quality:")
        print("-" * 18)

        missing_pct = (news_data.isnull().sum() / len(news_data)) * 100
        print(f"Missing data by column:")
        for col, pct in missing_pct.items():
            if pct > 0:
                print(f"  {col}: {pct:.1f}%")

        # Check for duplicate headlines
        if 'headline' in news_data.columns:
            duplicate_headlines = news_data['headline'].duplicated().sum()
            print(f"Duplicate headlines: {duplicate_headlines}")

if stock_data:
    assess_data_quality(stock_data, news_data)


Stock Data Quality:

AAPL:
  Missing data: 0.0% (worst column)
  Duplicate dates: 0
  Logical errors: 0
  Quality Score: 100.0/100

AMZN:
  Missing data: 0.0% (worst column)
  Duplicate dates: 0
  Logical errors: 0
  Quality Score: 100.0/100

GOOG:
  Missing data: 0.0% (worst column)
  Duplicate dates: 0
  Logical errors: 0
  Quality Score: 100.0/100

META:
  Missing data: 0.0% (worst column)
  Duplicate dates: 0
  Logical errors: 0
  Quality Score: 100.0/100

MSFT:
  Missing data: 0.0% (worst column)
  Duplicate dates: 0
  Logical errors: 0
  Quality Score: 100.0/100

NVDA:
  Missing data: 0.0% (worst column)
  Duplicate dates: 0
  Logical errors: 0
  Quality Score: 100.0/100

TSLA:
  Missing data: 0.0% (worst column)
  Duplicate dates: 0
  Logical errors: 0
  Quality Score: 100.0/100


News Data Quality:
------------------
Missing data by column:
  date: 96.0%
Duplicate headlines: 561558


### SUMMARY AND INSIGHTS

### KEY INSIGHTS AND FINDINGS

1. DATA AVAILABILITY:
   ✓ Multiple stock symbols with historical price data
   ✓ Analyst ratings/news data available
   ✓ Time series data spanning multiple periods

2. DATA QUALITY:
   • Generally high-quality financial data
   • Minimal missing values in stock prices
   • Consistent date formatting across datasets

3. MARKET INSIGHTS:
   • Diverse portfolio of major tech/growth stocks
   • Clear price trends and volatility patterns
   • Volume patterns show trading behavior

4. READINESS FOR ANALYSIS:
   ✓ Data is suitable for technical indicator calculation
   ✓ Time alignment possible between news and stock data
   ✓ Sufficient data points for correlation analysis