In [1]:
import pandas as pd
from IPython.display import display
import matplotlib.pyplot as plt
import sys
import os
import seaborn as sns

In [None]:
# ---
# title: Time Series Analysis for 2020 Financial News Dataset
# description: Notebook to analyze publication frequency and publishing times of news articles
# ---

# ### 1. Adjust Python Path
# Resolve the parent of the current working directory and make it importable,
# so the project-local `src` package below can be found when the notebook is
# launched from a sub-folder of the repository.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if project_root not in sys.path:
    sys.path.append(project_root)

# This import must stay below the sys.path adjustment above.
from src.time_series_utils import (
    extract_publication_hour,
    get_publication_frequency,
)

# Use seaborn's white grid background for every figure in this notebook
sns.set_style("whitegrid")

# ### 2. Load Cleaned News Dataset
# Columns the rest of the notebook relies on.
news_columns = ['headline', 'publisher_domain', 'date', 'date_only', 'stock']
news_df = pd.read_csv('../data/fnspid_news_cleaned_2020.csv')

# Fail fast on the first required column that is absent from the file.
first_missing = next(
    (name for name in news_columns if name not in news_df.columns),
    None,
)
if first_missing is not None:
    raise ValueError(f"Missing column: {first_missing}")

# Parse the timestamp columns: `date` becomes timezone-aware UTC, while
# `date_only` is parsed without forcing a timezone.
news_df = news_df.assign(
    date=pd.to_datetime(news_df['date'], utc=True),
    date_only=pd.to_datetime(news_df['date_only']),
)

# ### 3. Publication Frequency Over Time
# Get publication frequency from the project helper. The result is expected to
# expose `Date` and `Article_Count` columns, both of which are plotted below.
freq_df = get_publication_frequency(news_df, date_column='date_only')
print("=== Publication Frequency (Sample) ===")
display(freq_df.head(10))

# Visualize publication frequency
# savefig does not create missing parent directories, so make sure the output
# folder exists first (a no-op when it is already there). Without this the
# cell fails with FileNotFoundError on a fresh checkout.
os.makedirs('plots', exist_ok=True)

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(freq_df['Date'], freq_df['Article_Count'], color='blue')
ax.set_title('Daily Publication Frequency of News Articles (2020)')
ax.set_xlabel('Date')
ax.set_ylabel('Number of Articles')
ax.tick_params(axis='x', labelrotation=45)  # slant date labels so they do not overlap
fig.tight_layout()
fig.savefig('plots/publication_frequency_2020.png')
plt.show()

# Identify spikes (top 5% days)
# A day counts as a spike when its article count reaches the 95th percentile
# of all daily counts; days tied with the cutoff are included.
article_counts = freq_df['Article_Count']
spike_threshold = article_counts.quantile(0.95)
spike_days = freq_df.loc[article_counts >= spike_threshold]
print("\n=== Spike Days in Publication Frequency ===")
display(spike_days)

# ### 4. Publication Hour Analysis
# The project helper is expected to return the frame with a `publication_hour`
# column derived from `date` (that column is read below).
news_df = extract_publication_hour(news_df, date_column='date')
print("\n=== Sample with Publication Hour ===")
preview_columns = ['headline', 'date', 'publication_hour']
display(news_df[preview_columns].head())

# Tally articles per hour, ordered by hour value rather than by frequency,
# then flatten the counts into a two-column table.
hour_counts = news_df['publication_hour'].value_counts().sort_index()
hour_df = hour_counts.rename_axis('Hour').reset_index(name='Article_Count')
print("\n=== Articles by Publication Hour ===")
display(hour_df)

# Visualize publication hour distribution
# savefig does not create missing parent directories, so make sure the output
# folder exists first (a no-op when it is already there). Without this the
# cell fails with FileNotFoundError on a fresh checkout.
os.makedirs('plots', exist_ok=True)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='Hour', y='Article_Count', data=hour_df, color='green', ax=ax)
ax.set_title('Distribution of News Articles by Publication Hour (2020)')
ax.set_xlabel('Hour of Day (UTC)')
ax.set_ylabel('Number of Articles')
fig.tight_layout()
fig.savefig('plots/publication_hour_distribution_2020.png')
plt.show()

# ### 5. Save Results
# Write both summary tables under ../data as CSV, omitting the row index.
for summary_df, csv_path in (
    (freq_df, '../data/publication_frequency_2020.csv'),
    (hour_df, '../data/publication_hour_2020.csv'),
):
    summary_df.to_csv(csv_path, index=False)