# In [None]:
import nest_asyncio
nest_asyncio.apply()

import asyncio
from ib_insync import IB, Stock
import pandas as pd
from datetime import datetime, timedelta

# --- Helper function to clean headlines ---
def clean_headline(text: str) -> str:
    """
    Strip a leading prefix that ends at the first "!" from a headline.

    Everything up to and including the first exclamation mark is dropped and
    the remainder is whitespace-trimmed. Text that contains no exclamation
    mark is returned with only its surrounding whitespace trimmed.
    """
    # partition() yields an empty separator when "!" does not occur at all.
    _prefix, bang, remainder = text.partition("!")
    kept = remainder if bang else text
    return kept.strip()

def _parse_news_time(time_str):
    """Parse a news timestamp of the form 'YYYY-MM-DD HH:MM:SS[.fraction]'.

    Anything after the first "." is discarded before parsing.
    Raises ValueError when the remaining text does not match the format.
    """
    return datetime.strptime(time_str.split(".")[0], "%Y-%m-%d %H:%M:%S")


def _build_news_frame(rows):
    """Build the final table from (Ticker, Time, Provider, ArticleId, Headline) tuples.

    Headlines are cleaned with clean_headline, duplicates on
    (Ticker, Time, Provider, Headline) are dropped, and the ArticleId column
    is left out of the result.
    """
    news_df = pd.DataFrame(
        rows, columns=["Ticker", "Time", "Provider", "ArticleId", "Headline"]
    )
    # Clean first so that rows differing only in the stripped prefix collapse.
    news_df["Headline"] = news_df["Headline"].apply(clean_headline)
    news_df = news_df.drop_duplicates(subset=["Ticker", "Time", "Provider", "Headline"])
    # Selecting the final columns both drops ArticleId and fixes the order.
    return news_df[["Ticker", "Time", "Provider", "Headline"]]


async def main():
    """Download 2024 historical news headlines for a fixed ticker list and save them to Excel.

    Connects to TWS/Gateway on 127.0.0.1:7497, lists the available news
    providers, requests headlines day by day for every ticker, then cleans,
    de-duplicates and writes the result to 'bac.xlsx'.

    Returns None. Connection failures and a missing provider list are reported
    on stdout and end the run early; the connection is always closed once it
    has been established.
    """
    ib = IB()

    # Connect asynchronously (with a longer timeout)
    try:
        await ib.connectAsync(host='127.0.0.1', port=7497, clientId=101, timeout=30)
        print("Connected to TWS/Gateway.")
    except Exception as e:
        print(f"Connection error: {e}")
        return

    # From here on the session is open: guarantee it is closed on every exit
    # path, including unexpected exceptions while collecting or saving.
    try:
        # Use the awaitable API variants: the blocking ones re-enter the
        # running event loop and only work because nest_asyncio is applied.
        news_providers = await ib.reqNewsProvidersAsync()
        if not news_providers:
            print("No news providers found. Check your IBKR Account Management for API news subscriptions.")
            return

        print("Available News Providers:")
        for p in news_providers:
            print(f"  Code: {p.code}, Name: {p.name}")

        # Build the provider codes string by joining all provider codes with a plus sign.
        provider_codes = "+".join(p.code for p in news_providers)
        print("Using provider codes for historical news:", provider_codes)

        # List of tickers to process.
        tickers = [
            "NVDA", "AMZN", "AAPL", "TSLA", "BAC", "INTC", "PFE", "SMCI", "AMD", "MARA",
            "PLTR", "RIVN", "NIO", "SNAP", "PLUG", "LCID", "TLRY", "SOFI", "SOUN", "CLSK",
        ]

        # Master list of (Ticker, Time, Provider, ArticleId, Headline) tuples.
        master_news_list = []

        # Date range for historical news: calendar year 2024, both days inclusive.
        news_start_date = datetime(2024, 1, 1)
        news_end_date = datetime(2024, 12, 31)
        # news_end_date is midnight at the START of the last day, so compare
        # against the following midnight to keep headlines from that day.
        range_end_exclusive = news_end_date + timedelta(days=1)
        max_headlines = 300

        # Process each ticker sequentially.
        for ticker in tickers:
            print("\n" + "=" * 80)
            print(f"Processing ticker: {ticker}")

            # Create a stock contract and get its conId.
            contract = Stock(ticker, 'SMART', 'USD')
            details = await ib.reqContractDetailsAsync(contract)
            if not details:
                print(f"No contract details found for {ticker}. Skipping.")
                continue

            con_id = details[0].contract.conId
            print(f"{ticker} conId: {con_id}")

            # Headlines received for this ticker only.
            local_news_list = []

            # --- Callback functions for historical news ---
            # Ticker and target list are bound as defaults so each pair of
            # callbacks keeps pointing at the ticker it was created for.
            def historicalNews(reqId, timeStr, providerCode, articleId, headline,
                               current_ticker=ticker, sink=local_news_list):
                print(f"[{current_ticker} Historical News] Time={timeStr}, Provider={providerCode}, ArticleId={articleId}")
                print(f"                          Headline={headline}")
                sink.append((current_ticker, timeStr, providerCode, articleId, headline))

            def historicalNewsEnd(reqId, hasMore, current_ticker=ticker):
                print(f"[{current_ticker} Historical News End] ReqId={reqId}, hasMore={hasMore}")

            # NOTE(review): replacing the wrapper handlers presumably bypasses
            # ib_insync's own result collection, which is why the return value
            # of the request below is ignored - confirm against the library.
            # NOTE(review): a response arriving after the next ticker installs
            # its handlers would be attributed to that ticker - confirm whether
            # the waits below are long enough in practice.
            ib.wrapper.historicalNews = historicalNews
            ib.wrapper.historicalNewsEnd = historicalNewsEnd

            # Request historical news day-by-day.
            current_date = news_start_date
            while current_date <= news_end_date:
                formatted_start = current_date.strftime("%Y-%m-%d 00:00:00")
                formatted_end = current_date.strftime("%Y-%m-%d 23:59:59")
                print(f"[{ticker}] Requesting news from '{formatted_start}' to '{formatted_end}'")
                try:
                    await ib.reqHistoricalNewsAsync(
                        con_id,
                        provider_codes,
                        formatted_start,
                        formatted_end,
                        max_headlines,
                        []  # additional options if needed
                    )
                except Exception as e:
                    # Best effort: one failed day must not abort the whole run.
                    print(f"Error during request for {formatted_start} for {ticker}: {e}")
                await asyncio.sleep(1)  # pause between daily requests
                current_date += timedelta(days=1)

            # Allow extra time for all historical news callbacks to arrive.
            print(f"Waiting for historical news responses for {ticker}...")
            await asyncio.sleep(10)

            # Keep only headlines whose timestamp parses and lies in the range.
            for tkr, time_str, provider, articleId, headline in local_news_list:
                try:
                    news_time = _parse_news_time(time_str)
                except ValueError as e:
                    print(f"Error parsing time '{time_str}' for {ticker}: {e}")
                    continue
                if news_start_date <= news_time < range_end_exclusive:
                    master_news_list.append((tkr, news_time, provider, articleId, headline))

            print(f"{ticker} produced {len(local_news_list)} headlines.")

        print("\n=== Summary of Collected Historical Headlines ===")
        print(f"Total headlines (before duplicate removal): {len(master_news_list)}")

        news_df = _build_news_frame(master_news_list)
        print(f"Total unique headlines after duplicate removal: {len(news_df)}")

        # Save the DataFrame to an Excel file.
        output_filename = "bac.xlsx"
        news_df.to_excel(output_filename, sheet_name="Historical News 2024", index=False)
        print(f"\nFiltered data saved to '{output_filename}'.")
    finally:
        # Disconnect from IBKR no matter how the collection ended.
        ib.disconnect()
        print("Disconnected from TWS/Gateway.")

if __name__ == "__main__":
    # Script entry point: build the top-level coroutine and run it to completion.
    entry_coroutine = main()
    asyncio.run(entry_coroutine)
