In [1]:
import pandas as pd
from pathlib import Path
import sys

In [2]:
# Make the project's ingestion modules importable from this notebook.
# Path().resolve() is the current working directory; its parent is assumed to be
# the repository root (notebook launched from a sub-folder) — TODO confirm.
project_root = Path().resolve().parent
scripts_path = project_root / "scripts"

# The imports below are package-qualified (`scripts.ingest...`), so Python needs the
# directory that CONTAINS `scripts/` on sys.path, i.e. the project root. Adding only
# `scripts/` itself does not make `scripts.*` resolvable on a fresh kernel.
# `scripts_path` is still appended, as before, so any code relying on it keeps working.
for extra_path in (project_root, scripts_path):
    if str(extra_path) not in sys.path:
        sys.path.append(str(extra_path))

from scripts.ingest.ingest_synth import load_synth_docs
from scripts.ingest.ingest_cfpb import load_cfpb_docs
from scripts.ingest.ingest_enron_mail import load_enron_docs

In [5]:
# --- Data loading ---
# Load the synthetic documents through the project loader.
print("Chargement des données synthétiques...")
synth_docs = load_synth_docs()

# Preview only the first two documents: echoing the whole list floods the cell
# output and bloats the saved notebook.
synth_docs[:2]

Chargement des données synthétiques...


[Document(metadata={'date': '2025-09-29', 'customer_id': 'e71320a6-0a0e-4203-b014-f1892aa20454', 'priority': 'medium', 'source': 'synthetic', 'id': '4a54fee1-d078-4a27-b604-333488bc9464', 'lang': 'en', 'type': 'ticket_support'}, page_content='Unauthorized charge on account [ACCOUNT]\n\nHello,\n\nI have noticed a failed direct debit affecting my account [ACCOUNT]. The transaction reference is [NUMBER] and occurred on 2025-09-29. Please advise the next steps and confirm any refund process.\n\nCustomer: Cameron Lewis - Email: [email]\n'),
 Document(metadata={'date': '2025-09-27', 'customer_id': '11f1bd3e-aadb-4955-9623-4b799bfac0ec', 'priority': 'low', 'source': 'synthetic', 'id': 'cbe10fd6-d624-40e3-8a4e-7e417c75d7fb', 'lang': 'en', 'type': 'email_finance'}, page_content='Dispute transaction #[NUMBER] dated 2025-09-27\n\nDear support,\n\nI would like to request the statement for the period 2025-08-05 to 2025-08-15. My account is [ACCOUNT]. Please send as PDF.\n\nBest, Gregory Brown\n[ema

In [6]:
# Flatten each document into one row: its metadata fields plus a "content"
# column holding the document text.
df_synth = pd.DataFrame(
    [
        {**synth_doc.metadata, "content": synth_doc.page_content}
        for synth_doc in synth_docs
    ]
)

In [7]:
# Preview the first five synthetic rows.
df_synth.head(n=5)

Unnamed: 0,date,customer_id,priority,source,id,lang,type,content,product_category,total,change
0,2025-09-29,e71320a6-0a0e-4203-b014-f1892aa20454,medium,synthetic,4a54fee1-d078-4a27-b604-333488bc9464,en,ticket_support,Unauthorized charge on account [ACCOUNT]\n\nHe...,,,
1,2025-09-27,11f1bd3e-aadb-4955-9623-4b799bfac0ec,low,synthetic,cbe10fd6-d624-40e3-8a4e-7e417c75d7fb,en,email_finance,Dispute transaction #[NUMBER] dated 2025-09-27...,,,
2,2025-10-10,,,synthetic,bf1318dd-b433-491b-8eb8-550bc3cbff03,fr,report_summary,Résumé hebdomadaire des ventes pour cartes ban...,cartes bancaires,133267.09,-0.155
3,2025-08-19,,,synthetic,43349b7e-78e2-48d6-bcda-abd01820987b,en,report_summary,Weekly sales summary for credit cards\n\nRepor...,credit cards,124481.27,-0.104
4,2025-10-08,,,synthetic,f96cb194-cc62-4285-9e7e-1c16dad1c6ed,en,report_summary,Weekly sales summary for mortgages\n\nReport f...,mortgages,342040.78,-0.047


In [8]:
# Count missing values per column of the synthetic frame.
df_synth.isna().sum()

date                 0
customer_id         25
priority            25
source               0
id                   0
lang                 0
type                 0
content              0
product_category    75
total               75
change              75
dtype: int64

In [9]:
# Single source of truth for the CFPB limit, so the log message and the loader
# argument cannot drift apart.
CFPB_LIMIT = 1000

print(f"Chargement des données CFPB (limite de {CFPB_LIMIT})...")
cfpb_docs = load_cfpb_docs(limit=CFPB_LIMIT)

# One row per document: metadata fields plus a "content" column with the text.
df_cfpb = pd.DataFrame(
    [doc.metadata | {"content": doc.page_content} for doc in cfpb_docs]
)

Chargement des données CFPB (limite de 1000)...
DF length before dropna: 10000


In [10]:
# (rows, columns) of the CFPB frame.
# NOTE(review): the recorded run shows 498 rows for limit=1000, while the loader
# logged 10000 rows before dropna — confirm at which stage `limit` is applied.
df_cfpb.shape

(498, 15)

In [11]:
# Preview the first ten CFPB rows.
df_cfpb.head(n=10)

Unnamed: 0,id,product,sub_product,issue,sub_issue,state_geo,company,company_response,timely_response,consumer_consent,date_received,date_sent_to_company,source,lang,content
0,3642453,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Information belongs to someone else,NV,Experian Information Solutions Inc.,Closed with explanation,Yes,Consent provided,2020-05-08,2020-05-08,cfpb,en,These are not my accounts.
1,15070435,"Money transfer, virtual currency, or money ser...",Mobile or digital wallet,Fraud or scam,,NJ,"Early Warning Services, LLC",Closed with explanation,Yes,Consent provided,2025-08-04,2025-08-04,cfpb,en,On XX/XX/2025 at XXXXXXXX XXXX I contacted XX...
2,8113747,Credit reporting or other personal consumer re...,Credit reporting,Incorrect information on your report,Information belongs to someone else,IL,Experian Information Solutions Inc.,Closed with non-monetary relief,Yes,Consent provided,2024-01-05,2024-01-05,cfpb,en,Kindly address this issue on my credit report....
3,15267150,Credit reporting or other personal consumer re...,Credit reporting,Incorrect information on your report,Personal information incorrect,CA,Experian Information Solutions Inc.,Closed with explanation,Yes,Consent provided,2025-08-13,2025-08-13,cfpb,en,I have never legally gone by these personal in...
4,10640055,Debt collection,I do not know,Attempts to collect debt not owed,Debt was paid,TX,Resurgent Capital Services L.P.,Closed with non-monetary relief,Yes,Consent provided,2024-10-31,2024-10-31,cfpb,en,Your agency placed an unauthorized collection ...
5,16109385,Credit card,General-purpose credit card or charge card,Problem with a purchase shown on your statement,Overcharged for something you did purchase wit...,DE,U.S. BANCORP,Closed with explanation,Yes,Consent provided,2025-09-22,2025-09-22,cfpb,en,"On XX/XX/year> in XXXX XXXX, we took what we t..."
6,15698868,Credit card,General-purpose credit card or charge card,Fees or interest,Problem with fees,KY,FIRST PORTFOLIO SERVICING INC,Closed with explanation,Yes,Consent provided,2025-09-03,2025-09-16,cfpb,en,I opened a REVVI card in an effort to build cr...
7,12085091,Credit reporting or other personal consumer re...,Credit reporting,Improper use of your report,Reporting company used your report improperly,FL,"Lockhart, Morris & Montgomery Inc.",Closed with explanation,Yes,Consent provided,2025-02-14,2025-02-21,cfpb,en,There are XXXX collections being reported to t...
8,15330926,Credit reporting or other personal consumer re...,Credit reporting,Improper use of your report,Reporting company used your report improperly,NJ,"EQUIFAX, INC.",Closed with non-monetary relief,Yes,Consent provided,2025-08-16,2025-08-16,cfpb,en,"Violation of Gramm-Leach-Bliley Act ( GLBA ), ..."
9,15309050,Credit reporting or other personal consumer re...,Credit reporting,Improper use of your report,Reporting company used your report improperly,PA,Experian Information Solutions Inc.,Closed with explanation,Yes,Consent provided,2025-08-15,2025-08-15,cfpb,en,"Finding unauthorized accounts, credit inquirie..."


In [12]:
# Single source of truth for the Enron limit, so the log message and the loader
# argument cannot drift apart.
ENRON_LIMIT = 1000

print(f"Chargement des données Enron (limite de {ENRON_LIMIT})...")
enron_docs = load_enron_docs(limit=ENRON_LIMIT)

# One row per e-mail: metadata fields plus a "content" column with the text.
df_enron = pd.DataFrame(
    [doc.metadata | {"content": doc.page_content} for doc in enron_docs]
)

Chargement des données Enron (limite de 1000)...


In [13]:
# Preview the first five Enron rows (five is the default for head()).
df_enron.head(n=5)

Unnamed: 0,id,subject,from,to,date,file_path,source,lang,content
0,<29790972.1075855665306.JavaMail.evans@thyme>,"December 14, 2000 - Bear Stearns' predictions ...",1.11913372.-2@multexinvestornetwork.com,pallen@enron.com,"Wed, 13 Dec 2000 18:41:00 -0800",allen-p\all_documents\1,enron,en,"Subject: December 14, 2000 - Bear Stearns' pre..."
1,<21975671.1075855665520.JavaMail.evans@thyme>,Bloomberg Power Lines Report,messenger@ecm.bloomberg.com,,"Wed, 13 Dec 2000 08:35:00 -0800",allen-p\all_documents\10,enron,en,Subject: Bloomberg Power Lines Report\n\nHere ...
2,<7452188.1075855667684.JavaMail.evans@thyme>,Consolidated positions: Issues & To Do list,phillip.allen@enron.com,keith.holst@enron.com,"Mon, 09 Oct 2000 07:16:00 -0700",allen-p\all_documents\100,enron,en,Subject: Consolidated positions: Issues & To D...
3,<23790115.1075855667708.JavaMail.evans@thyme>,Consolidated positions: Issues & To Do list,phillip.allen@enron.com,keith.holst@enron.com,"Mon, 09 Oct 2000 07:00:00 -0700",allen-p\all_documents\101,enron,en,Subject: Consolidated positions: Issues & To D...
4,<5860470.1075855667730.JavaMail.evans@thyme>,,phillip.allen@enron.com,david.delainey@enron.com,"Thu, 05 Oct 2000 06:26:00 -0700",allen-p\all_documents\102,enron,en,"Subject: \n\nDave, \n\n Here are the names of ..."
