In [None]:
# --- Task 1: Exploratory Data Analysis (EDA) ---

import pandas as pd
import numpy as np
import os
import glob
import matplotlib.pyplot as plt
from textblob import TextBlob
import nltk
# Uncomment and run the line below ONLY if you get an error that says 'punkt' is missing
# nltk.download('punkt') 

# Define paths (assuming you are running this from the 'notebooks' folder)
RAW_DATA_PATH = os.path.join('..', 'data', 'raw')
PROCESSED_DATA_PATH = os.path.join('..', 'data', 'processed')
TICKERS = ['AAPL', 'AMZN', 'GOOG', 'META', 'MSFT', 'NVDA']


def load_stock_csv(file_path):
    """Read one stock price CSV and return it indexed by date with daily returns.

    Args:
        file_path: Path (or file-like object) of a CSV that contains at least
            a 'Date' and a 'Close' column.

    Returns:
        DataFrame indexed by the parsed 'Date' column in ascending order, with
        an added 'Daily_Return' column holding the percentage change of
        'Close' relative to the previous row (NaN for the first row).

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
    """
    df = pd.read_csv(file_path)

    # Data Cleaning: Convert to datetime and set as index
    df['Date'] = pd.to_datetime(df['Date'])
    df = df.set_index('Date').sort_index()

    # Calculate Daily Returns in percent (Core feature for Task 3)
    df['Daily_Return'] = df['Close'].pct_change() * 100
    return df


print("Starting Task 1: Data Loading and Preparation...")

# --- 1. Load All Stock Data ---
# Maps ticker symbol -> cleaned price DataFrame; tickers whose file is missing
# are skipped with a warning instead of aborting the whole run.
all_stock_data = {}
for ticker in TICKERS:
    file_path = os.path.join(RAW_DATA_PATH, f'{ticker}.csv')
    try:
        df = load_stock_csv(file_path)
    except FileNotFoundError:
        print(f"WARNING: Stock file for {ticker} not found at {file_path}")
    else:
        all_stock_data[ticker] = df

print(f"\nSuccessfully loaded stock data for: {list(all_stock_data.keys())}")

# Only preview AAPL when it was actually loaded; indexing the dict blindly
# would raise KeyError in exactly the missing-file case tolerated above.
if 'AAPL' in all_stock_data:
    print("\nAAPL Stock Data Head:")
    print(all_stock_data['AAPL'].head())
else:
    print("\nWARNING: AAPL stock data was not loaded; skipping preview.")


# --- 2. Load and Prepare News Data (FINAL ROBUST FIX) ---
# Reads the raw analyst-ratings CSV, parses its 'date' column into a
# midnight-normalized UTC 'Date' index, and drops rows whose date is unparseable.
NEWS_FILE = 'raw_analyst_ratings.csv'
try:
    df_news = pd.read_csv(os.path.join(RAW_DATA_PATH, NEWS_FILE))
except FileNotFoundError:
    print(f"FATAL ERROR: News file not found. Check if the file name '{NEWS_FILE}' is correct and is in '{RAW_DATA_PATH}'.")
    # Every later step needs df_news; stop here with the real cause instead of
    # letting the script continue and fail with an unrelated NameError.
    raise

print(f"\nNews file '{NEWS_FILE}' loaded. Applying date conversion fix...")

# Parse heterogeneous date strings; unparseable values become NaT, and all
# timestamps are converted to UTC before being truncated to midnight.
# NOTE(review): the resulting index is tz-aware (UTC) while the stock frames
# use a tz-naive 'Date' index, and truncating in UTC may move timestamps that
# carry a non-UTC offset onto a different calendar day -- confirm this is the
# intended alignment before joining the two in Task 3.
df_news['Date'] = pd.to_datetime(
    df_news['date'],
    format='mixed',
    errors='coerce',
    utc=True
).dt.normalize()

# Drop rows where date conversion failed (NaT values), then replace the
# original string column with the parsed 'Date' index.
df_news = df_news.dropna(subset=['Date'])
df_news = df_news.drop(columns=['date']).set_index('Date').sort_index().copy()

print("\nNews Data Info after successful date conversion:")
df_news.info()


# ----------------------------------------------------------------------------------
# --- The rest of the Task 1 analysis (Publisher, Sentiment, Visualization) ---
# ----------------------------------------------------------------------------------

# --- 3. Publisher Analysis ---
print("\n--- 3. Publisher Analysis ---")

# Tally how many articles each publisher contributed.
publisher_counts = df_news['publisher'].value_counts()

# Show only the ten leading rows of that tally.
top_publishers = publisher_counts.head(10)
print("Top 10 Publishers by Article Count:")
print(top_publishers.to_string())


# --- 4. Sentiment Analysis (TextBlob) ---
print("\n--- 4. Sentiment Analysis ---")

def get_sentiment(text):
    """Return the TextBlob polarity of ``text`` as a float in [-1, 1].

    Missing values (None / NaN) are scored as neutral, i.e. 0.0. Any other
    value is converted to ``str`` first because TextBlob only accepts strings.
    """
    headline_missing = pd.isna(text)
    if not headline_missing:
        blob = TextBlob(str(text))
        return blob.sentiment.polarity
    return 0.0

# Score every headline and store the result next to it
headline_scores = df_news['headline'].apply(get_sentiment)
df_news['sentiment_score'] = headline_scores

# Collapse to one value per index entry (the article date): the mean score
scores_by_day = df_news.groupby(df_news.index)['sentiment_score']
daily_sentiment = scores_by_day.mean().rename('Avg_Daily_Sentiment')

print("\nAverage Daily Sentiment Descriptive Stats:")
sentiment_summary = daily_sentiment.describe()
print(sentiment_summary)


# --- 5. Visualization: Average Daily Sentiment  ---
# Draw on an explicit Axes so every styling call targets the same object.
_, ax = plt.subplots(figsize=(14, 6))
daily_sentiment.plot(
    ax=ax,
    title='Average Daily News Sentiment Over Time',
    color='darkorange',
    linewidth=1.5,
)
ax.set_xlabel("Date")
ax.set_ylabel("Average Sentiment Score (-1: Negative, 1: Positive)")

# Maximize Data-Ink Ratio: hide the frame edges that carry no information
for side in ('right', 'top'):
    ax.spines[side].set_visible(False)

# Horizontal guide lines only, kept faint so the series stays dominant
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

# --- 6. Save Daily Sentiment Data for Task 3 ---
# Create the processed-data folder on first run; a no-op when it already exists
os.makedirs(PROCESSED_DATA_PATH, exist_ok=True)

# Write the per-day average sentiment series to CSV
sentiment_csv_path = os.path.join(PROCESSED_DATA_PATH, 'daily_sentiment.csv')
daily_sentiment.to_csv(sentiment_csv_path)

print("\n--- Task 1: EDA Complete. Daily sentiment saved for Task 3. ---")

Starting Task 1: Data Loading and Preparation...

Successfully loaded stock data for: ['AAPL', 'AMZN', 'GOOG', 'META', 'MSFT', 'NVDA']

AAPL Stock Data Head:
               Close      High       Low      Open      Volume  Daily_Return
Date                                                                        
2009-01-02  2.721686  2.730385  2.554037  2.575630   746015200           NaN
2009-01-05  2.836553  2.884539  2.780469  2.794266  1181608400      4.220416
2009-01-06  2.789767  2.914229  2.770872  2.877641  1289310400     -1.649399
2009-01-07  2.729484  2.774170  2.706990  2.753477   753048800     -2.160860
2009-01-08  2.780169  2.793666  2.700393  2.712090   673500800      1.856959

News file 'raw_analyst_ratings.csv' loaded. Applying date conversion fix...

News Data Info after successful date conversion:
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1407328 entries, 2009-02-14 00:00:00+00:00 to 2020-06-11 00:00:00+00:00
Data columns (total 5 columns):
 #   Column      No