# 1. Load all dependencies and set up environment variables

In [None]:
import dotenv
import pandas as pd

from quotaclimat.data_ingestion.factiva.utils_data_processing.detect_keywords import (
    create_combined_regex_pattern,
)
from quotaclimat.data_ingestion.factiva.utils_data_processing.utils_extract import (
    load_json_values,
    poll_snapshot_explain,
    submit_snapshot_explain,
    get_account_statistics
)

from quotaclimat.data_ingestion.factiva.factiva_to_s3.factiva_api_utils import (
    submit_time_series, poll_time_series, download_time_series_results)

from quotaclimat.data_processing.mediatree.keyword.keyword import THEME_KEYWORDS


# Load variables from a .env file into the process environment.
# Make sure to set FACTIVA_USERKEY in the .env file at the root of the project
# (presumably read by the Factiva helpers imported above — verify).
dotenv.load_dotenv()

# 2. Set up all extraction parameters

In [None]:
# JSON file passed to load_json_values() to obtain the source codes to query.
FOLLOWED_SOURCES_PATH = "quotaclimat/data_ingestion/factiva/inputs/followed_sources.json"
# Date range passed as start_date / end_date to the Snapshot Explain and
# Time Series submissions (ISO "YYYY-MM-DD" strings).
# NOTE(review): whether END_DATE is inclusive depends on the Factiva API — confirm.
START_DATE = "2025-01-01"
END_DATE = "2025-12-31"
# Passed as minimal_word_count to the Time Series submission.
MINIMAL_WORD_COUNT = 0

In [None]:
# Build the regex pattern used to filter articles.
# Only French keywords explicitly flagged as NOT being at high risk of false
# positives are kept; entries missing the "high_risk_of_false_positive" flag
# are treated as risky and therefore excluded.
unique_keywords = {
    item.get("keyword")
    for theme_entries in THEME_KEYWORDS.values()
    for item in theme_entries
    if item.get("language") == "french"
    and not item.get("high_risk_of_false_positive", True)
}

# keywords_filtered holds the de-duplicated list of selected keywords
keywords_filtered = list(unique_keywords)

keyword_regex = create_combined_regex_pattern(
    keywords_filtered,
    bigquery_compatible=True,
)

In [None]:
# Set up sources to extract: load them from the followed-sources JSON file.
# The result is passed as source_codes to both submissions below.
all_sources = load_json_values(FOLLOWED_SOURCES_PATH)

# 3. Set up Snapshot Explain extract

In [None]:
# Submit the Snapshot Explain job with the shared extraction parameters.
# The word-count threshold comes from MINIMAL_WORD_COUNT (not a literal) so
# that this job and the Time Series job below always use the same filter.
submit_result = submit_snapshot_explain(
    source_codes=all_sources,
    start_date=START_DATE,
    end_date=END_DATE,
    minimal_word_count=MINIMAL_WORD_COUNT,
    language_code="fr",
    regex_pattern=keyword_regex,
)

In [None]:
# Report a failed submission; otherwise poll the Snapshot Explain job and
# print whatever the poll returns.
if not submit_result["success"]:
    print(f"Error during submission: {submit_result['error']}")
else:
    explain_id = submit_result["explain_id"]
    print(f"Job submitted successfully! ID: {explain_id}")

    # presumably up to 10 polls spaced 60 seconds apart — confirm in the helper
    poll_result = poll_snapshot_explain(
        explain_id=explain_id, max_attempts=10, wait_seconds=60
    )
    print(f"Polling result: {poll_result}")

# 4. Set up Time Series extract

In [None]:
# Submit the Time Series job: same sources, date range, language and keyword
# regex as the Snapshot Explain job, with counts requested at "MONTH" frequency.
time_series_params = {
    "source_codes": all_sources,
    "start_date": START_DATE,
    "end_date": END_DATE,
    "minimal_word_count": MINIMAL_WORD_COUNT,
    "language_code": "fr",
    "regex_pattern": keyword_regex,
    "frequency": "MONTH",
}
submit_ts_result = submit_time_series(**time_series_params)

In [None]:
# Poll the Time Series job and extract the download link for the next cell.
#
# Fail fast when the submission did not succeed: otherwise the lookup of
# poll_ts_result below would raise a NameError on a fresh kernel, or silently
# reuse a stale poll_ts_result left over from a previous run of this cell.
if not submit_ts_result["success"]:
    raise RuntimeError(
        f"Error submitting Time Series job: {submit_ts_result['error']}"
    )

analytics_id = submit_ts_result["analytics_id"]
print(f"Time Series job submitted successfully! ID: {analytics_id}")

poll_ts_result = poll_time_series(
    analytics_id=analytics_id,
    max_attempts=10,
    wait_seconds=30,
)

print(f"Time Series result: {poll_ts_result}")

# Only read the link from the result of the poll that was just performed.
download_link = poll_ts_result["download_link"]
if not download_link:
    # An empty link means there is nothing to download; stop before the
    # download cell is run with an unusable value.
    raise RuntimeError(
        f"Time Series job {analytics_id} returned no download link: {poll_ts_result}"
    )

In [None]:
# Download the Time Series results and load them into a DataFrame.
data = pd.DataFrame(
    download_time_series_results(download_link=download_link, timeout=30)
)
# Cast the "count" column to integers.
data["count"] = data["count"].astype(int)

# 5. Manage account

## 5.1 Verify account statistics

In [None]:
# Fetch the account statistics; the bare expression on the last line makes the
# notebook display the result.
account_statistics = get_account_statistics()
account_statistics