# 1. Load all dependencies and set up environment variables

In [None]:
import dotenv

from quotaclimat.data_ingestion.factiva.utils_data_processing.detect_keywords import (
    create_combined_regex_pattern,
)
from quotaclimat.data_ingestion.factiva.utils_data_processing.utils_extract import (
    _build_factiva_where_clause,
    create_streaming_instance,
    delete_stream,
    get_stream_extended,
    get_streams,
    load_json_values,
    poll_snapshot_explain,
    submit_snapshot_explain,
    get_account_statistics
)
from quotaclimat.data_processing.mediatree.keyword.keyword import THEME_KEYWORDS


# Make sure to set FACTIVA_USERKEY in the .env file at the root of the project
dotenv.load_dotenv()

# 2. Set up all extraction parameters

In [None]:
# Path to the JSON file whose values are loaded as the sources to extract.
FOLLOWED_SOURCES_PATH = "quotaclimat/data_ingestion/factiva/inputs/followed_sources.json"
# Start date passed to the explain and stream-creation calls (ISO "YYYY-MM-DD" string).
START_DATE = "2025-11-15"
# Minimum article word count for the extraction.
# NOTE(review): 0 presumably means "no lower bound" - confirm against the Factiva helpers.
MINIMAL_WORD_COUNT = 0
# Safety switch: the streaming instance is only created when this is True.
LAUNCH_CREATE_STREAMING_INSTANCE = False

In [None]:
# Create regex pattern for article filtering.
# Only French keywords explicitly flagged as NOT being at high risk of false
# positives are kept; entries without the flag default to "high risk" and are
# therefore excluded.
unique_keywords = {
    entry.get("keyword")
    for theme_entries in THEME_KEYWORDS.values()
    for entry in theme_entries
    if not entry.get("high_risk_of_false_positive", True)
    and entry.get("language") == "french"
}
# An entry without a "keyword" key would contribute None; drop it so only
# real keywords reach the pattern builder.
unique_keywords.discard(None)

# Sort the de-duplicated keywords: set iteration order changes between
# interpreter runs (string hash randomisation), so sorting guarantees that
# the same keyword list is handed to the pattern builder on every run.
keywords_filtered = sorted(unique_keywords)

keyword_regex = create_combined_regex_pattern(
    keywords_filtered, bigquery_compatible=True
)


In [None]:
# Set up sources to extract: load the values stored in the followed-sources
# JSON file. The result is passed as `source_codes` to the Factiva calls below.
all_sources = load_json_values(FOLLOWED_SOURCES_PATH)

# 3. Verify the associated number of items to extract before creating a streaming instance

In [None]:
# Submit an "explain" job with the same filters that the streaming instance
# will use, so the volume of matching items can be checked beforehand.
# The word-count filter reads MINIMAL_WORD_COUNT from the parameters cell so
# that changing the constant actually affects this request.
submit_result = submit_snapshot_explain(
    source_codes=all_sources,
    start_date=START_DATE,
    end_date=None,
    minimal_word_count=MINIMAL_WORD_COUNT,
    language_code="fr",
    regex_pattern=keyword_regex,
)

if submit_result["success"]:
    explain_id = submit_result["explain_id"]
    print(f"Job submitted successfully! ID: {explain_id}")

    # Poll the submitted job for its result (at most 10 attempts, with
    # wait_seconds=60 between them as configured below).
    poll_result = poll_snapshot_explain(
        explain_id=explain_id,
        max_attempts=10,
        wait_seconds=60,
    )

    print(f"Polling result: {poll_result}")
else:
    # Submission failed: report the error returned by the helper.
    print(f"Error during submission: {submit_result['error']}")

# 4. Create Streaming Instance

## Warning: this action will consume Factiva credits. Please check the number of articles extracted beforehand.

In [None]:
# Create the streaming instance only when explicitly enabled in the
# parameters cell: per the warning above, this consumes Factiva credits.
if LAUNCH_CREATE_STREAMING_INSTANCE:
    # Same filters as the explain request; the word-count filter reads
    # MINIMAL_WORD_COUNT so both requests stay in sync with the constant.
    stream_info = create_streaming_instance(
        source_codes=all_sources,
        start_date=START_DATE,
        minimal_word_count=MINIMAL_WORD_COUNT,
        language_code="fr",
        regex_pattern=keyword_regex,
    )

# 5. Manage streaming instance

## 5.1 Check streaming instance status

In [None]:
# Fetch the result of get_streams() and show it as the cell output
# (the bare expression on the last line is what the notebook displays).
streams_overview = get_streams()
streams_overview

In [None]:
# Placeholder: replace 'xxx' with the ID of the stream to inspect
# before running this cell.
stream_id = 'xxx'
# Fetch the extended information for that stream and display it as the cell output.
stream_extended = get_stream_extended(stream_id=stream_id)
stream_extended

## 5.2 Verify Factiva account statistics

In [None]:
# Fetch the Factiva account statistics and display them as the cell output.
account_statistics = get_account_statistics()
account_statistics

## 5.3 Delete streaming instance

In [None]:
# Placeholder: replace 'xxx' with the ID of the stream to delete. The ID is
# spelled out here rather than reused from another cell, so it must be set
# explicitly before running. The result is stored but not displayed.
results_delete = delete_stream(stream_id='xxx')