**FNSPID Data**

Part 1. Environment Setup and Authentication

In [None]:
#  Install and import required libraries
print("Installing required libraries...")
!pip install datasets -q
!pip install langdetect -q
!pip install huggingface_hub -q
print("Library installation completed.")

from datasets import load_dataset
from langdetect import detect, LangDetectException
import pandas as pd
from tqdm.notebook import tqdm
from huggingface_hub import login
from google.colab import userdata
from google.colab import drive

# Attach Google Drive so later cells can read and write under /content/drive.
# A failure is only reported; the notebook keeps running.
drive_mount_point = '/content/drive'
try:
    drive.mount(drive_mount_point)
    print("Google Drive mounted successfully.")
except Exception as mount_error:
    print(f"Google Drive mount failed: {mount_error}")

# Log in to the Hugging Face Hub using the Colab secret named 'HF_TOKEN'.
# Any error raised while reading the secret or logging in is reported,
# and execution continues without the login.
try:
    login(
        token=userdata.get('HF_TOKEN')
    )
    print("Hugging Face token login successful.")
except Exception as login_error:
    print(f"Hugging Face login failed. Proceeding anonymously. Error: {login_error}")


Part 2. Filtering Configuration

In [None]:
# Filtering configuration consumed by the streaming-processing cell.
print("\nDefining filtering conditions...")

# Ticker symbols kept by the filter (30 symbols, listed alphabetically).
djia_tickers = set(
    """
    AAPL AMGN AXP  BA   CAT  CRM  CSCO CVX  DIS  DOW
    GS   HD   HON  IBM  INTC JNJ  JPM  KO   MCD  MMM
    MRK  MSFT NKE  PG   TRV  UNH  V    VZ   WBA  WMT
    """.split()
)

# Inclusive date window, kept as ISO strings because the filter compares
# them lexicographically against the leading characters of each record's date.
start_date_str, end_date_str = "2021-01-01", "2023-12-31"

# Destination of the cleaned news CSV on Google Drive.
output_path = "/content/drive/MyDrive/djia_news_cleaned_2021_2023.csv"

print(f"Filtering conditions defined. Output file will be saved to: {output_path}")


Part 3. Streaming Dataset Loading

In [None]:
# Open the FNSPID dataset in streaming mode and keep its 'train' split.
# If anything goes wrong the error is printed and `news_stream` is left as None,
# which the processing cell checks before doing any work.
print("\nLoading dataset in streaming mode...")
news_stream = None
try:
    dataset_splits = load_dataset("Zihan1004/FNSPID", streaming=True)
    news_stream = dataset_splits['train']
    print("Dataset loaded successfully.")
except Exception as load_error:
    print(f"Dataset loading failed: {load_error}")
    news_stream = None


Part 4. One-Pass Streaming Cleaning and Filtering

In [None]:
# Single-pass processing: filter, validate, and collect DJIA news in one sweep of the stream.
# `news_stream` is None when the loading cell failed; in that case this cell does nothing.
if news_stream is not None:
    print("\n--- Starting single-pass streaming processing ---")

    clean_records = []          # rows that passed every check
    found_tickers = set()       # tickers with at least one row in clean_records
    skipped_record_count = 0    # incomplete, undetectable, or otherwise unprocessable records

    progress_bar = tqdm(news_stream, desc="Processing data stream")

    for record in progress_bar:
        try:
            # .get() yields None for absent keys instead of raising
            date_val = record.get('Date')
            ticker_val = record.get('Stock_symbol')
            title_val = record.get('Article_title')

            # Skip records if any required field is missing or empty
            if not date_val or not ticker_val or not title_val:
                skipped_record_count += 1
                continue

            # Date range and DJIA ticker filter. The first 10 characters are compared
            # as text against the ISO bounds (assumes dates start with YYYY-MM-DD).
            news_date = date_val[:10]
            if not (start_date_str <= news_date <= end_date_str and ticker_val in djia_tickers):
                continue

            # Language detection check: only detectability is validated, the detected
            # language itself is not inspected.
            try:
                detect(title_val)
            except LangDetectException:
                skipped_record_count += 1
                continue

            # Append cleaned record with selected fields only
            clean_records.append({
                'Date': date_val,
                'Article_title': title_val,
                'Stock_symbol': ticker_val
            })

            # Register the ticker only once a record has actually been kept, so the
            # integrity report below matches the contents of the saved file. The
            # progress text changes only when a new ticker appears, so it is
            # refreshed only then.
            if ticker_val not in found_tickers:
                found_tickers.add(ticker_val)
                progress_bar.set_description(
                    f"Processed records | Found {len(found_tickers)}/{len(djia_tickers)} tickers"
                )

        except Exception:
            # Deliberate best-effort: a malformed record is counted and skipped
            # rather than aborting the whole pass over the stream.
            skipped_record_count += 1
            continue

    print("\nStreaming data processing completed.")

    #  Post-processing and reporting
    print(f"\nIntegrity check: Found records for {len(found_tickers)}/{len(djia_tickers)} DJIA companies.")
    missing_tickers = djia_tickers - found_tickers

    if not missing_tickers:
        print("All DJIA tickers were found within the specified time range.")
    else:
        print(f"Warning: {len(missing_tickers)} DJIA tickers were missing: {sorted(missing_tickers)}")

    if clean_records:
        # Build the frame once from the collected dicts, then switch to the
        # lower-case column names used by the downstream cells.
        df_final_clean_news = pd.DataFrame(clean_records)
        df_final_clean_news = df_final_clean_news.rename(
            columns={'Date': 'date', 'Article_title': 'title', 'Stock_symbol': 'ticker'}
        )
        print("\nColumn names have been renamed.")

        print(f"\nFinal dataset contains {len(df_final_clean_news)} high-quality news records.")
        print(f"{skipped_record_count} records were skipped due to missing or invalid data.")

        print("\nPreview of the final dataset:")
        print(df_final_clean_news.head())

        print(f"\nSaving cleaned dataset to: {output_path} ...")
        df_final_clean_news.to_csv(output_path, index=False)
        print("Data successfully saved to Google Drive.")
    else:
        print("\nNo valid records were found after applying all filters.")
        print(f"{skipped_record_count} records were skipped due to missing or invalid data.")


Part 5. News Volume Statistics by Ticker

In [None]:
from collections import Counter
from datasets import load_dataset
from datetime import datetime
from tqdm import tqdm

# Tickers whose news volume is being audited, in the order they are reported.
targets = ['CSCO', 'HD', 'HON', 'JNJ', 'JPM', 'MCD', 'PG', 'UNH', 'VZ']
target_set = set(targets)  # set for the per-record membership test

# Counters: every dated record per ticker, and the subset inside the date window.
counts_all = Counter()
counts_2021_2023 = Counter()

# Inclusive date window. Record dates are reduced to their calendar day before
# comparison, so every record dated 2023-12-31 is inside the window.
start_date = datetime(2021, 1, 1)
end_date = datetime(2023, 12, 31)

# Load FNSPID dataset (no subsets available)
news_stream = load_dataset(
    "Zihan1004/FNSPID",
    split="train",
    streaming=True
)

# Iterate through streaming data with progress bar
for record in tqdm(news_stream, desc="Scanning news records", unit="records"):
    ticker = record.get("Stock_symbol")
    date_str = record.get("Date")

    if not ticker or not date_str:
        continue

    # Nothing below applies to other tickers; skipping here avoids parsing their dates.
    if ticker not in target_set:
        continue

    # Count all records
    counts_all[ticker] += 1

    # Parse only the leading YYYY-MM-DD part, the same prefix the filtering cell
    # compares. This accepts any trailing time/zone text and keeps the whole of
    # the last day in range. Values without a valid date prefix are left out of
    # the windowed count.
    try:
        date_obj = datetime.strptime(date_str[:10], "%Y-%m-%d")
    except (TypeError, ValueError):
        continue

    if start_date <= date_obj <= end_date:
        counts_2021_2023[ticker] += 1

# Output comparison results
print("\nOverall counts:")
for t in targets:
    print(f"{t}: {counts_all[t]} records")

print("\nCounts for 2021–2023:")
for t in targets:
    print(f"{t}: {counts_2021_2023[t]} records")

Part 6. Diagnostic Analysis for Rejected Records

In [None]:
from datasets import load_dataset
from tqdm import tqdm
from langdetect import detect, LangDetectException

# Configuration
targets_to_investigate = ['CSCO', 'HD', 'HON', 'JNJ', 'JPM', 'MCD', 'PG', 'UNH', 'VZ']
max_samples_per_ticker = 5  # Maximum rejected samples per ticker
start_date_str = "2021-01-01"
end_date_str = "2023-12-31"

# Load streaming dataset
news_stream = load_dataset("Zihan1004/FNSPID", streaming=True)['train']

# Start diagnostic process: collect up to `max_samples_per_ticker` records per
# target ticker that fail at least one check, together with the reasons.
if news_stream:
    print("\n--- Starting diagnostic analysis for target companies ---")

    found_samples = {ticker: [] for ticker in targets_to_investigate}
    tickers_with_enough_samples = set()

    progress_bar = tqdm(news_stream, desc="Scanning data stream for diagnostics")

    for record in progress_bar:
        # Stop early if all target samples are collected
        if len(tickers_with_enough_samples) == len(targets_to_investigate):
            progress_bar.close()
            break

        ticker_val = record.get('Stock_symbol')

        if ticker_val in targets_to_investigate and len(found_samples[ticker_val]) < max_samples_per_ticker:

            rejection_reasons = []  # Store rejection reasons for this record

            # Diagnostic 1: Record completeness check
            date_val = record.get('Date')
            title_val = record.get('Article_title')
            if not date_val or not title_val:
                rejection_reasons.append("Incomplete record (missing date or title)")

            # Diagnostic 2: Date range check (text comparison on the first 10 characters)
            if date_val:
                try:
                    news_date = date_val[:10]
                    if not (start_date_str <= news_date <= end_date_str):
                        rejection_reasons.append(f"Date out of range (date = {news_date})")
                except (TypeError, IndexError):
                    rejection_reasons.append("Invalid date format")

            # Diagnostic 3: Language check
            if title_val:
                try:
                    if detect(title_val) != 'en':
                        rejection_reasons.append("Non-English language")
                except LangDetectException:
                    rejection_reasons.append("Language detection failed")

            # Record rejected samples
            if rejection_reasons:
                found_samples[ticker_val].append({
                    "record": {
                        "Date": date_val,
                        "Stock_symbol": ticker_val,
                        "Article_title": title_val
                    },
                    "reasons": rejection_reasons
                })

                if len(found_samples[ticker_val]) >= max_samples_per_ticker:
                    tickers_with_enough_samples.add(ticker_val)

    # Print diagnostic results
    print("\n--- Diagnostic Results ---")
    for ticker, samples in found_samples.items():
        if not samples:
            print(f"\n{ticker}: No rejected samples found (either no data or all records passed).")
        else:
            print(f"\n{ticker}: {len(samples)} rejected samples found")
            for s in samples:
                rec = s["record"]
                reasons = "; ".join(s["reasons"])
                # Records collected by Diagnostic 1 can have no title at all, so fall
                # back to an empty string before slicing instead of raising on None.
                title_preview = str(rec['Article_title'] or "")[:50]
                print(f"  - Date: {rec['Date']} | Title: {title_preview}... | Reason: {reasons}")


**DJIA Price Data Download and Preprocessing (2021–2023)**

In [None]:
# yfinance is used below but no earlier visible cell imports it, so import it
# here to keep this cell runnable on a fresh kernel.
import yfinance as yf

# Download Dow Jones Industrial Average (^DJI) price data
djia = yf.download("^DJI", start="2021-01-01", end="2023-12-31")
if djia.empty:
    raise RuntimeError("yfinance returned no rows for ^DJI; check network access and the date range.")

# Keep the untouched download in its own file so the cleaned CSV written below
# does not overwrite it.
save_path = "/content/drive/MyDrive/DJIA_2021_2023_raw.csv"
djia.to_csv(save_path)

print(f"File successfully saved to {save_path}")

# Cleaned output location (the path the visualization cell loads as DJIA prices).
# A dedicated name is used rather than `output_path`, so the news-CSV path set
# in the filtering configuration cell is not overwritten by running this cell.
price_output_path = "/content/drive/MyDrive/DJIA_2021_2023.csv"

# Columns of the cleaned file, in output order
columns = ["date", "open", "high", "low", "close", "volume"]

# Select columns by NAME from the in-memory frame instead of re-reading the CSV
# with skipped header rows and positional names: the order in which yfinance
# emits the price fields is not guaranteed to be open/high/low/close, and
# positional naming would silently mislabel them.
price_df = djia.copy()
if isinstance(price_df.columns, pd.MultiIndex):
    # Multi-level header (price field, ticker): keep the price-field level only.
    price_df.columns = price_df.columns.get_level_values(0)
price_df = price_df.rename_axis("date").reset_index()
price_df.columns = [str(col).strip().lower() for col in price_df.columns]

missing_columns = [col for col in columns if col not in price_df.columns]
if missing_columns:
    raise KeyError(f"Downloaded DJIA data is missing expected columns: {missing_columns}")
price_df = price_df[columns]

# Convert the date column to datetime format
price_df["date"] = pd.to_datetime(price_df["date"], errors="coerce")

# Remove rows with invalid or missing dates
price_df = price_df.dropna(subset=["date"])

# Remove duplicate trading days based on the date column
price_df = price_df.drop_duplicates(subset=["date"], keep="first")

# Sort the data chronologically by date
price_df = price_df.sort_values("date").reset_index(drop=True)

# Save the cleaned price data to Google Drive
price_df.to_csv(price_output_path, index=False)

print("Cleaning completed")
print(f"Saved to: {price_output_path}")
print(f"Dataset shape: {price_df.shape}")
print(price_df.head())


**FIQA-2018 Balanced Dataset Construction and Deduplication**

In [None]:
import pandas as pd
from datasets import load_dataset
from google.colab import drive
from pathlib import Path

# Half-width of the score band around zero that is labelled "Neutral".
NEUTRAL_BAND = 0.3
# Seed shared by every sampling step so the balanced dataset is reproducible.
RANDOM_STATE = 42

# Mount Drive BEFORE touching the save directory: creating
# /content/drive/MyDrive on an unmounted runtime would leave a local directory
# inside the mount point, which can make the mount itself fail.
drive.mount("/content/drive")

SAVE_DIR = Path("/content/drive/MyDrive")
SAVE_DIR.mkdir(parents=True, exist_ok=True)
FINAL_SAVE_PATH = SAVE_DIR / "fiqa_1.csv"

ds = load_dataset("pauri32/fiqa-2018")

# Stack the three splits into one frame; balancing below works on the full pool.
df_all = pd.concat(
    [pd.DataFrame(ds[split_name]) for split_name in ("train", "validation", "test")],
    ignore_index=True
)

print(f"Total number of records loaded: {df_all.shape[0]}")

def label_from_score(score, neutral_band=NEUTRAL_BAND):
    """Map a sentiment score to "Positive", "Negative", or "Neutral".

    Scores strictly greater than ``neutral_band`` are "Positive", scores strictly
    less than ``-neutral_band`` are "Negative", and every other value (the band
    edges included) is "Neutral".
    """
    if score > neutral_band:
        return "Positive"
    if score < -neutral_band:
        return "Negative"
    return "Neutral"

# Derive the categorical label of every row from its sentiment score.
df_all["label"] = df_all["sentiment_score"].apply(
    label_from_score, neutral_band=NEUTRAL_BAND
)

def make_balanced_sample(df, random_state=RANDOM_STATE):
    """Return a shuffled frame holding the same number of rows for each label.

    Each of the three labels ("Positive", "Neutral", "Negative") is sampled down
    to the size of the least frequent label present in ``df``; the pieces are
    concatenated, shuffled, and re-indexed. ``random_state`` seeds both the
    per-label sampling and the final shuffle.
    """
    per_class = int(df["label"].value_counts().min())

    sampled_parts = [
        df[df["label"] == cls].sample(per_class, random_state=random_state)
        for cls in ("Positive", "Neutral", "Negative")
    ]

    combined = pd.concat(sampled_parts, ignore_index=True)
    shuffled = combined.sample(frac=1, random_state=random_state)
    return shuffled.reset_index(drop=True)

# Build the class-balanced sample from the labelled pool.
balanced_df = make_balanced_sample(df_all)

print("\nBalanced dataset label distribution (before deduplication):")
print(balanced_df["label"].value_counts())

# Report fully identical rows before removing them.
num_dup_before = balanced_df.duplicated().sum()
print("\nDuplicate check before deduplication:")
print(f"Total rows: {balanced_df.shape[0]}")
print(f"Number of duplicated rows: {num_dup_before}")

if num_dup_before > 0:
    print("\nSample duplicated rows:")
    print(balanced_df[balanced_df.duplicated()].head())

balanced_df = balanced_df.drop_duplicates().reset_index(drop=True)

# Dropping duplicates can remove more rows from one class than another, which
# would leave the "balanced" dataset unbalanced. When that happens, downsample
# again; sampling without replacement from deduplicated rows cannot create new
# duplicates.
if balanced_df["label"].value_counts().nunique() > 1:
    balanced_df = make_balanced_sample(balanced_df)
    print("\nDeduplication changed the class sizes; classes were re-balanced.")

num_dup_after = balanced_df.duplicated().sum()
print("\nDuplicate check after deduplication:")
print(f"Total rows: {balanced_df.shape[0]}")
print(f"Number of duplicated rows: {num_dup_after}")

print("\nBalanced dataset label distribution (after deduplication):")
print(balanced_df["label"].value_counts())

balanced_df.to_csv(FINAL_SAVE_PATH, index=False)
print(f"\nFinal dataset saved to: {FINAL_SAVE_PATH}")


**Exploratory Visualization of FNSPID News, DJIA Prices, and FIQA Sentiment**

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Input files produced by the earlier sections: cleaned FNSPID news,
# DJIA prices, and the labelled FIQA sample.
news_path = "/content/drive/MyDrive/djia_news_cleaned_2021_2023.csv"
djia_path = "/content/drive/MyDrive/DJIA_2021_2023.csv"
fiqa_path = "/content/drive/MyDrive/fiqa_1.csv"

news_df = pd.read_csv(news_path)
djia_df = pd.read_csv(djia_path)
fiqa_df = pd.read_csv(fiqa_path)

# Font sizes applied to every figure drawn after this point.
plot_style = {
    "font.size": 14,
    "axes.titlesize": 18,
    "axes.labelsize": 16,
    "xtick.labelsize": 12,
    "ytick.labelsize": 12,
    "legend.fontsize": 12
}
plt.rcParams.update(plot_style)

# Line colour shared by the two time-series plots.
timeline_color = "#6EC6FF"

# Parse the date columns; values that cannot be parsed become NaT.
for dated_frame in (news_df, djia_df):
    dated_frame["date"] = pd.to_datetime(dated_frame["date"], errors="coerce")

# Force the closing price to numeric; unparseable values become NaN.
djia_df["close"] = pd.to_numeric(djia_df["close"], errors="coerce")

# Daily FNSPID article counts: rows without a parsed date are dropped,
# the rest are grouped by calendar day and ordered chronologically.
dated_news = news_df.dropna(subset=["date"])
news_counts = dated_news.groupby(dated_news["date"].dt.date).size().sort_index()

fig, ax = plt.subplots(figsize=(11.5, 2.6))
ax.plot(news_counts.index, news_counts.values, linewidth=1.5, color=timeline_color)
ax.set_title("FNSPID News: Daily News Counts (2021–2023)")
ax.set_xlabel("Date")
ax.set_ylabel("Number of News")
fig.tight_layout()
fig.savefig("/content/drive/MyDrive/fnspid_news_distribution.png", dpi=300)
plt.show()

# DJIA closing-price line: keep rows that have both a date and a close value,
# ordered by date.
djia_plot = djia_df.dropna(subset=["date", "close"]).sort_values("date")

fig, ax = plt.subplots(figsize=(11.5, 2.6))
ax.plot(djia_plot["date"], djia_plot["close"], linewidth=1.5, color=timeline_color)
ax.set_title("DJIA Historical Prices: Closing Price Trend (2021–2023)")
ax.set_xlabel("Date")
ax.set_ylabel("Close Price")
fig.tight_layout()
fig.savefig("/content/drive/MyDrive/djia_close_trend.png", dpi=300)
plt.show()

# FIQA label shares as a pie chart. Counts are put in a fixed label order;
# a label absent from the file is shown with a count of 0.
label_order = ["Positive", "Neutral", "Negative"]
label_counts = fiqa_df["label"].value_counts().reindex(label_order).fillna(0)

# One colour per entry of label_order.
colors = ["#A8E6CF", "#A0CED9", "#FFB7B2"]

fig, ax = plt.subplots(figsize=(5, 5))
ax.pie(
    label_counts,
    labels=label_counts.index,
    autopct="%1.1f%%",
    startangle=90,
    colors=colors
)
ax.set_title("FIQA-2018 Sentiment Label Distribution")
fig.tight_layout()
fig.savefig("/content/drive/MyDrive/fiqa_label_distribution.png", dpi=300)
plt.show()

# FNSPID news count by ticker (top 10)
# value_counts() orders tickers by descending frequency; keep the first ten.
ticker_counts = news_df["ticker"].value_counts().head(10)

# Reverse the order so the most-covered ticker is drawn at the top of the chart.
fig, ax = plt.subplots(figsize=(6.5, 4))
ax.barh(
    ticker_counts.index[::-1],
    ticker_counts.values[::-1],
    color="#6EC6FF"
)

ax.set_title("FNSPID News Coverage by Ticker (Top 10)")
ax.set_xlabel("Number of News Articles")
ax.set_ylabel("Ticker")
fig.tight_layout()
fig.savefig("/content/drive/MyDrive/fnspid_news_by_ticker_top10.png", dpi=300)
plt.show()
