## Nettoyage des datasets

### Car_Reviews

In [0]:
%pip install -U mlflow
dbutils.library.restartPython()
# ML & NLP
%pip install -U sentence-transformers
%pip install -U openai

In [0]:
import pandas as pd
import numpy as np
import json
from datetime import datetime
from collections import Counter
import re


from sentence_transformers import SentenceTransformer


from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Visualisation
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

# OpenAI
from openai import OpenAI

In [0]:
# Read the Car_Reviews CSV with Spark (quoted, multi-line fields), then
# convert the result to a pandas DataFrame.
sdf = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .option("multiLine", "true")
    .option("quote", '"')
    .option("escape", '"')
    .csv("gs://bucket-autoai/Car_Reviews.csv")
)

df1 = sdf.toPandas()
# display(df1)



In [0]:
df1.info()

In [0]:
# Convertir en string les colonnes en object
text_cols = ["Review", "Vehicle_Title"]

for col in text_cols:
    df1[col] = df1[col].astype("string")

In [0]:
#Suppression des doublons
df1=df1.drop_duplicates()

In [0]:
#Remplace les valeurs manquantes de Review
df1["Review"] = df1["Review"].fillna("")
#display(df1)


In [0]:
df1["modelYear"] = df1["Vehicle_Title"].str.split().str[0]
df1["make"] = df1["Vehicle_Title"].str.split().str[1]
df1["model"] = df1["Vehicle_Title"].str.split().str[2]
df1["summary"] = df1["Review"]
display(df1)

In [0]:
df1 = df1.drop(columns=["Recommend", "Vehicle_Title", "Review"])
display(df1)

In [0]:
df1['source']='SAV'
display(df1)

### Complaints

In [0]:
# Read the complaints CSV with Spark (quoted, multi-line fields), then
# convert the result to a pandas DataFrame.
sdf0 = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .option("multiLine", "true")
    .option("quote", '"')
    .option("escape", '"')
    .csv("gs://bucket-autoai/complaints.csv")
)

df2 = sdf0.toPandas()
display(df2)

In [0]:
df2.shape

In [0]:
df2.duplicated().sum()

In [0]:
df2.info()

In [0]:
df2=df2[['model','modelYear','make','summary']]
df2['source']='REVIEW'
df2.shape

In [0]:
df2= df2.drop_duplicates()
df2.shape

In [0]:
display(df2.isna().sum())

In [0]:
# Convertir en string les colonnes en object
text_cols = ['summary', 'make', 'model']

for col in text_cols:
    df2[col] = df2[col].astype("string")

In [0]:
import pandas as pd
df_concat = pd.concat([df1, df2], ignore_index=True)
print(df_concat)

In [0]:
df_concat.head()


In [0]:
str_cols = df_concat.select_dtypes(include=["object", "string"]).columns
df_concat[str_cols] = df_concat[str_cols].apply(lambda s: s.str.lower())


In [0]:
df_concat.head()

In [0]:
start = pd.Timestamp("2023-01-01")
end   = pd.Timestamp("2026-01-30")

n = len(df_concat)
random_days = np.random.randint(0, (end - start).days + 1, size=n)

df_concat["date"] = start + pd.to_timedelta(random_days, unit="D")

In [0]:
df_concat.head()

### Complaints unified


In [0]:
# Read the unified complaints CSV with Spark (quoted, multi-line fields),
# then convert the result to a pandas DataFrame.
sdf1 = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .option("multiLine", "true")
    .option("quote", '"')
    .option("escape", '"')
    .csv("gs://bucket-autoai/complaints_unified.csv")
)

df3 = sdf1.toPandas()
display(df3)

In [0]:
df3.info()

In [0]:
# Convertir en string les colonnes en object
text_cols = ['complaint_id', 'text', 'vehicle_make', 'vehicle_model', 'source']

for col in text_cols:
    df3[col] = df3[col].astype("string")

In [0]:
df3 = df3.rename(columns={
    "vehicle_make": "make",
    "vehicle_model": "model",
    "vehicle_year": "modelYear",
    "text" :"summary"
    
})

In [0]:
df3.info()

In [0]:
str_cols = df3.select_dtypes(include=["object", "string"]).columns
df3[str_cols] = df3[str_cols].applymap(lambda x: x.lower() if isinstance(x, str) else x)


In [0]:
df3 = df3.drop(columns=["complaint_id"])


In [0]:
df3.head()

### Concatenation df_concat+complaints_unified

In [0]:
# Stack the (car reviews + complaints) frame and the unified complaints into
# the final dataset used by the rest of the notebook.
df1_concat = pd.concat([df_concat, df3], ignore_index=True)
print(df1_concat)

## Configuration OpenAI

In [0]:
# Use Databricks secrets through the CLI
!pip install databricks-cli

In [0]:
print(df1_concat)

In [0]:
# List the workspace through the CLI
!databricks workspace ls


In [0]:
# NOTE(review): this creates the scope on every run — presumably it fails
# once the scope already exists; confirm whether this cell should be kept.
!databricks secrets create-scope --scope openai-scope

In [0]:
!databricks secrets list-scopes

In [0]:
# Read the OpenAI key from the "openai-scope" secret scope
openai_api_key = dbutils.secrets.get(
    scope="openai-scope",
    key="openai-api-key"
)

# `os` is imported here; the later cell that builds the OpenAI client reads
# the key back with os.getenv, so this cell has to run first.
import os
os.environ["OPENAI_API_KEY"] = openai_api_key

print(" Cl√© OpenAI charg√©e depuis Databricks Secrets")

In [0]:
print(df1_concat)

In [0]:
print(df1_concat)

In [0]:
print(df1_concat)

## Prompts GPT pour le tri, l'analyse des retours clients et l'anticipation des problèmes

In [0]:
# Classification prompt. It is a str.format template: `{text}` is the only
# placeholder, and the doubled braces render as literal JSON braces.
PROMPT_QUALIFICATION = """Tu es un expert en relation client automobile.
Classifie le texte suivant en une seule cat√©gorie :

[plainte, demande, question, avis, reclamation_garantie, autre]

R√®gles :
- R√©ponds uniquement en JSON.
- Ne rajoute aucune explication.
- Si ambigu, r√©ponds "autre".

Format :
{{"type_retour": "...", "confidence": 0.9}}

Texte :
<<<
{text}
>>>
"""

# Detailed-analysis prompt (component, problem, severity, safety impact,
# weak signal). Same template conventions as above.
PROMPT_ANALYSE = """Tu es un expert qualit√© automobile.
Analyse la plainte/avis ci-dessous et extrait les informations suivantes :

- composant (freinage, batterie, moteur, electronique, suspension, carrosserie, transmission, climatisation, direction, autre)
- probleme (phrase courte et pr√©cise)
- gravite (faible, moyenne, elevee, critique)
- impact_securite (true/false)
- signal_faible (true/false) - d√©tecte les probl√®mes √©mergents ou inhabituels

R√®gles :
- R√©ponds uniquement en JSON.
- Si une information est manquante, mets null.
- Ne pas inventer.

Format :
{{
 "composant": "...",
 "probleme": "...",
 "gravite": "...",
 "impact_securite": true,
 "signal_faible": false
}}

Texte :
<<<
{text}
>>>
"""

# Summary prompt; `{data}` is the only placeholder and receives the
# aggregated figures serialized by the caller.
PROMPT_SYNTHESE_TEMPORELLE = """Tu es un expert qualit√© automobile.

Analyse ces donn√©es de r√©clamations group√©es par p√©riode et identifie :
1. Les probl√®mes en augmentation
2. Les nouveaux probl√®mes √©mergents
3. Les composants √† surveiller

Donn√©es :
{data}

Fournis une synth√®se en 3 sections :
- TENDANCES CRITIQUES
- SIGNAUX FAIBLES
- RECOMMANDATIONS

Format professionnel, concis."""

print("Prompts configur√©s")

In [0]:
print(df1_concat)

## Classe LLM Analyzer

In [0]:
class LLMAnalyzer:
    """GPT-backed analyzer for customer feedback, with an in-memory cache.

    Wraps a chat-completions client and exposes helpers that classify a
    text, extract structured details from it, and summarise aggregated data.
    """

    # Fallback answers used whenever the model reply is missing or unusable.
    _DEFAULT_QUALIFICATION = {"type_retour": "autre", "confidence": 0.0}
    _DEFAULT_ANALYSIS = {
        "composant": None,
        "probleme": None,
        "gravite": None,
        "impact_securite": False,
        "signal_faible": False,
    }

    def __init__(self, client, model="gpt-4o-mini"):
        """Store the client and model name and start with an empty cache.

        Args:
            client: Object exposing ``chat.completions.create(...)``.
            model: Name of the chat model passed to every request.
        """
        self.client = client
        self.model = model
        self.cache = {}  # prompt -> reply, avoids repeated identical calls
        self.call_count = 0  # number of successful API calls

    def call_gpt(self, prompt, use_cache=True):
        """Send ``prompt`` to the model and return the reply text.

        Args:
            prompt: Full prompt sent as a single user message.
            use_cache: When true, reuse and store replies keyed by prompt.

        Returns:
            The reply content, or ``None`` when the request raised.
        """
        if use_cache and prompt in self.cache:
            return self.cache[prompt]

        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[{"role": "user", "content": prompt}],
                temperature=0.0
            )
            result = response.choices[0].message.content
            self.call_count += 1

            if use_cache:
                self.cache[prompt] = result

            return result
        except Exception as e:
            # Best-effort boundary: one failed request must not abort a
            # whole batch, so report it and let callers use their fallback.
            print(f"‚ùå Erreur GPT: {str(e)}")
            return None

    @staticmethod
    def _parse_json_object(raw):
        """Return the JSON object contained in ``raw``, or ``None``.

        Markdown code fences around the payload are removed first. Replies
        that are empty, not valid JSON, or valid JSON that is not an object
        (list, string, number) yield ``None`` so that callers can always
        treat a successful result as a dict.
        """
        if not raw:
            return None
        cleaned = raw.replace("```json", "").replace("```", "").strip()
        try:
            parsed = json.loads(cleaned)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            return None
        return parsed if isinstance(parsed, dict) else None

    def qualify_text(self, text):
        """Classify ``text`` into a feedback type.

        Only the first 500 characters are sent. Returns the parsed reply, or
        ``{"type_retour": "autre", "confidence": 0.0}`` when the reply is
        missing or unusable.
        """
        text = "" if text is None else str(text)
        prompt = PROMPT_QUALIFICATION.format(text=text[:500])  # bound prompt size
        parsed = self._parse_json_object(self.call_gpt(prompt))
        if parsed is None:
            return dict(self._DEFAULT_QUALIFICATION)
        return parsed

    def analyze_text(self, text):
        """Extract component, problem, severity and flags from ``text``.

        Only the first 800 characters are sent. Returns the parsed reply, or
        a dict of null/False defaults when the reply is missing or unusable.
        """
        text = "" if text is None else str(text)
        prompt = PROMPT_ANALYSE.format(text=text[:800])
        parsed = self._parse_json_object(self.call_gpt(prompt))
        if parsed is None:
            return dict(self._DEFAULT_ANALYSIS)
        return parsed

    def generate_temporal_summary(self, temporal_data):
        """Ask the model for a written summary of ``temporal_data``.

        The data is serialized to indented JSON (non-serializable values are
        stringified) and the request bypasses the cache.
        """
        prompt = PROMPT_SYNTHESE_TEMPORELLE.format(
            data=json.dumps(temporal_data, ensure_ascii=False, indent=2, default=str)
        )
        return self.call_gpt(prompt, use_cache=False)

    def get_stats(self):
        """Return the number of successful calls and of cached prompts."""
        return {
            'total_calls': self.call_count,
            'cached_entries': len(self.cache)
        }

print("Classe LLMAnalyzer cr√©√©e")

### Fonction de pretraitement

In [0]:
def clean_text(text):
    """Return a lower-cased, letters-only version of ``text``.

    Steps, in order: missing values become ``""``; the text is lower-cased;
    identifier-like tokens (8+ alphanumeric characters containing at least
    one digit) are removed; every character that is not in the allowed
    letter set or whitespace becomes a space; runs of whitespace are
    collapsed and the result is stripped.

    Args:
        text: Any scalar; non-string values are converted with ``str``.

    Returns:
        The cleaned string (possibly empty).
    """
    if pd.isna(text):
        return ""
    # The character filter below only keeps lower-case letters, so normalise
    # the case first instead of blanking out every capital letter.
    text = str(text).lower()
    # Drop ID-like tokens only: the digit lookahead keeps ordinary long words
    # ("transmission", "steering") that a bare {8,} pattern would delete.
    text = re.sub(r"\b(?=[a-z]*\d)[a-z0-9]{8,}\b", "", text)
    text = re.sub(r"[^a-z√†√¢√ß√©√®√™√´√Æ√Ø√¥√ª√π√º√ø√±√¶≈ì\s]", " ", text)
    # Collapse whitespace last, after the substitutions that introduce gaps
    # (this also covers "\n" and "\r").
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def preprocess_dataframe(df, text_col='text'):
    """Clean the text column and keep only rows with enough cleaned text.

    Args:
        df: Input DataFrame; it is not modified.
        text_col: Name of the column holding the raw text.

    Returns:
        A new DataFrame with an added 'text_clean' column, restricted to rows
        whose cleaned text is longer than 10 characters, with a fresh index.
    """
    print("\nüîÑ Pr√©traitement...")

    # Cleaning
    print(f"  Nettoyage colonne: {text_col}")
    cleaned = df.assign(text_clean=df[text_col].apply(clean_text))

    # Keep only texts longer than 10 characters
    long_enough = cleaned['text_clean'].str.len() > 10
    kept = cleaned[long_enough].reset_index(drop=True)
    print(f"  ‚úÖ {len(kept)}/{len(cleaned)} textes valides (> 10 caract√®res)")

    return kept

print("‚úÖ Fonctions de pr√©traitement cr√©√©es")

### Prétraitement et Embeddings

In [0]:
# Load the sentence-embedding model once, at cell level
print("üîÑ Chargement du mod√®le d'embeddings...")
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
print("‚úÖ Mod√®le charg√©")

def generate_embeddings(texts):
    """Encode ``texts`` with the module-level embedding model.

    Prints the number of texts and the shape of the result, and shows the
    encoder's progress bar. Returns whatever ``embedding_model.encode``
    returns.
    """
    n_texts = len(texts)
    print(f"\nüîÑ G√©n√©ration embeddings pour {n_texts} textes...")
    vectors = embedding_model.encode(texts, show_progress_bar=True)
    print(f"  ‚úÖ Shape: {vectors.shape}")
    return vectors

def cluster_embeddings(embeddings, n_clusters=8, silhouette_sample_size=10000):
    """Cluster ``embeddings`` with K-Means and print a silhouette score.

    Args:
        embeddings: Array-like of embedding vectors, one row per text.
        n_clusters: Number of K-Means clusters.
        silhouette_sample_size: Maximum number of points used to compute the
            silhouette score. The score needs pairwise distances, whose cost
            grows quadratically with the number of points, so larger inputs
            are scored on a seeded random subset. ``None`` always scores the
            full data.

    Returns:
        ``(clusters, kmeans)``: the label assigned to every row and the
        fitted KMeans instance.
    """
    print(f"\nüîÑ Clustering avec K={n_clusters}...")
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    clusters = kmeans.fit_predict(embeddings)

    # Only subsample when the data is actually larger than the cap
    sample_size = None
    if silhouette_sample_size is not None and len(clusters) > silhouette_sample_size:
        sample_size = silhouette_sample_size
    score = silhouette_score(
        embeddings, clusters, sample_size=sample_size, random_state=42
    )
    print(f"  ‚úÖ Silhouette Score: {score:.3f}")
    return clusters, kmeans

### Fonction d'analyse GPT

In [0]:
def _as_dict(value):
    """Return ``value`` if it is a dict, else an empty dict."""
    return value if isinstance(value, dict) else {}


def _as_bool(value):
    """Coerce a model-provided flag (bool, null, or "true"/"false") to bool."""
    if isinstance(value, str):
        return value.strip().lower() in ("true", "1", "yes", "oui")
    return bool(value)


def _normalize_label(label):
    """Strip and lower-case a string label; pass other values through."""
    return label.strip().lower() if isinstance(label, str) else label


def _map_with_progress(texts, func):
    """Apply ``func`` to every text, printing progress every 10 items."""
    total = len(texts)
    results = []
    for idx, text in enumerate(texts):
        if idx % 10 == 0:
            print(f"  Progression: {idx}/{total}", flush=True)
        # Non-dict answers are replaced by {} so callers can rely on .get()
        results.append(_as_dict(func(text)))
    return results


def analyze_with_gpt(df, llm_analyzer, sample_size=None):
    """Qualify every text, then analyse complaints/reviews in detail.

    Args:
        df: DataFrame with a 'text_clean' column; it is not modified.
        llm_analyzer: Object exposing ``qualify_text(text)``,
            ``analyze_text(text)`` and ``get_stats()``.
        sample_size: Number of rows to analyse (seeded random sample).
            ``None`` analyses every row.

    Returns:
        A copy of the analysed rows with the added columns 'type_retour',
        'confidence', 'composant', 'probleme', 'gravite', 'impact_securite'
        and 'signal_faible'. The two flag columns always hold real booleans.
    """
    print("\n" + "="*80)
    print("ü§ñ ANALYSE GPT")
    print("="*80)

    # Sample if requested. `is not None` so that 0 means "no rows" rather
    # than silently falling through to "all rows".
    if sample_size is not None and len(df) > sample_size:
        print(f"‚ö†Ô∏è √âchantillonnage: {sample_size}/{len(df)} textes")
        df_sample = df.sample(n=sample_size, random_state=42).copy()
    else:
        df_sample = df.copy()

    print(f"üìä Textes √† analyser: {len(df_sample)}")

    # 1. Qualification
    print("\n[1/2] üè∑Ô∏è Qualification des retours...")
    qualifications = _map_with_progress(
        df_sample['text_clean'], llm_analyzer.qualify_text
    )

    # Normalise the label so that e.g. "Plainte " still matches the filter
    df_sample['type_retour'] = [
        _normalize_label(q.get('type_retour')) for q in qualifications
    ]
    df_sample['confidence'] = [q.get('confidence', 0.0) for q in qualifications]

    print(f"\n  ‚úÖ Qualification termin√©e")
    print(f"  Distribution:")
    print(df_sample['type_retour'].value_counts().to_string())

    # 2. Detailed analysis (complaints / warranty claims / reviews only)
    print("\n[2/2] üîç Analyse d√©taill√©e...")
    mask = df_sample['type_retour'].isin(['plainte', 'reclamation_garantie', 'avis'])
    df_to_analyze = df_sample[mask]

    print(f"  Textes √† analyser en d√©tail: {len(df_to_analyze)}/{len(df_sample)}")

    analyses = _map_with_progress(
        df_to_analyze['text_clean'], llm_analyzer.analyze_text
    )

    # Defaults for the rows that are not analysed in detail
    df_sample['composant'] = None
    df_sample['probleme'] = None
    df_sample['gravite'] = None
    df_sample['impact_securite'] = False
    df_sample['signal_faible'] = False

    # Fill only the analysed rows; `analyses` follows the row order of `mask`
    if analyses:
        df_sample.loc[mask, 'composant'] = [a.get('composant') for a in analyses]
        df_sample.loc[mask, 'probleme'] = [a.get('probleme') for a in analyses]
        df_sample.loc[mask, 'gravite'] = [a.get('gravite') for a in analyses]
        # The prompt allows null for missing information; coerce the flags so
        # the columns stay boolean.
        df_sample.loc[mask, 'impact_securite'] = [
            _as_bool(a.get('impact_securite')) for a in analyses
        ]
        df_sample.loc[mask, 'signal_faible'] = [
            _as_bool(a.get('signal_faible')) for a in analyses
        ]

    stats = llm_analyzer.get_stats()
    print(f"\n‚úÖ Analyse GPT termin√©e")
    print(f"  ‚Ä¢ Appels GPT totaux: {stats['total_calls']}")
    print(f"  ‚Ä¢ Entr√©es en cache: {stats['cached_entries']}")

    return df_sample

print("‚úÖ Fonction analyze_with_gpt cr√©√©e")

### Analyse temporelle 

In [0]:
def analyze_temporal_trends(df, freq='M'):
    """Aggregate feedback counts per period and flag unusually busy periods.

    Args:
        df: DataFrame with 'date', 'composant', 'gravite' and
            'signal_faible' columns; it is not modified.
        freq: Period frequency ('M' = month, 'W' = week, 'Q' = quarter).

    Returns:
        Dict with 'evolution_globale' (rows per period),
        'evolution_composant' and 'evolution_gravite' (period x value count
        tables), 'signaux_faibles' (flagged rows per period) and 'anomalies'
        (periods whose count exceeds mean + 2 standard deviations; only
        computed when there are more than 3 periods).
    """
    banner = "="*80
    print("\n" + banner)
    print("üìà ANALYSE TEMPORELLE")
    print(banner)

    # Parse the dates (unparseable values become NaT) and drop undated rows
    parsed = df.assign(date=pd.to_datetime(df['date'], errors='coerce'))
    df_temp = parsed.loc[parsed['date'].notna()].copy()
    df_temp['periode'] = df_temp['date'].dt.to_period(freq)

    def _counts_by(column):
        # Period x value table, restricted to rows where `column` is known
        known = df_temp[df_temp[column].notna()]
        return known.groupby(['periode', column]).size().unstack(fill_value=0)

    # Overall volume, then the per-component and per-severity breakdowns
    evolution = df_temp.groupby('periode').size()
    evolution_comp = _counts_by('composant')
    evolution_grav = _counts_by('gravite')

    # Weak signals per period
    weak_rows = df_temp[df_temp['signal_faible'] == True]
    signaux = weak_rows.groupby('periode').size()

    # Anomalies: periods well above the average volume
    if len(evolution) > 3:
        threshold = evolution.mean() + 2*evolution.std()
        anomalies = evolution[evolution > threshold]
    else:
        anomalies = pd.Series(dtype='int64')

    print(f"‚úÖ P√©riodes analys√©es: {len(evolution)}")
    print(f"‚úÖ Anomalies d√©tect√©es: {len(anomalies)}")
    print(f"‚úÖ Signaux faibles: {signaux.sum()}")

    return {
        'evolution_globale': evolution,
        'evolution_composant': evolution_comp,
        'evolution_gravite': evolution_grav,
        'signaux_faibles': signaux,
        'anomalies': anomalies
    }

print("‚úÖ Fonction analyze_temporal_trends cr√©√©e")

### Visualisation

In [0]:
def plot_temporal_trends(temporal_results):
    """Show the overall volume curve and the curves of the top 5 components.

    Args:
        temporal_results: Dict with 'evolution_globale' (count per period)
            and 'evolution_composant' (period x component count table).
    """
    # Overall volume
    overall = temporal_results['evolution_globale']
    fig1 = px.line(
        x=overall.index.astype(str),
        y=overall.values,
        title="√âvolution temporelle - Volume global",
        labels={'x': 'P√©riode', 'y': 'Nombre de retours'}
    )
    fig1.update_traces(mode='lines+markers')
    fig1.show()

    # Top components (skipped when no component was identified)
    by_component = temporal_results['evolution_composant']
    if by_component.empty:
        return

    periods = by_component.index.astype(str)
    fig2 = go.Figure()
    for comp in by_component.sum().nlargest(5).index:
        trace = go.Scatter(
            x=periods,
            y=by_component[comp],
            mode='lines+markers',
            name=comp
        )
        fig2.add_trace(trace)
    fig2.update_layout(
        title="Top 5 composants",
        xaxis_title='P√©riode',
        yaxis_title='Nombre'
    )
    fig2.show()

def plot_component_sunburst(df):
    """Show a source > component > severity sunburst.

    Only rows where both 'composant' and 'gravite' are set are plotted;
    nothing is shown when no such row exists.
    """
    has_both = df['composant'].notna() & df['gravite'].notna()
    df_filtered = df[has_both]
    if len(df_filtered) == 0:
        return

    fig = px.sunburst(
        df_filtered,
        path=['source', 'composant', 'gravite'],
        title='R√©partition Source ‚Üí Composants ‚Üí Gravit√©'
    )
    fig.show()

print("‚úÖ Fonctions de visualisation cr√©√©es")

### Sauvegarde des résultats

In [0]:
def _series_to_json_dict(series):
    """Convert a count Series to a JSON-safe dict (string keys, int values)."""
    return {str(key): int(value) for key, value in series.items()}


def save_results(df, temporal_results, output_path='results/'):
    """Write the analysed data and the temporal results to disk.

    Three files are created in ``output_path``: 'resultats_complets.csv',
    'analyse_complete.xlsx' (several sheets) and 'temporal.json'.

    Args:
        df: Analysed DataFrame; needs the 'source', 'type_retour',
            'composant', 'signal_faible', 'date' and 'probleme' columns.
        temporal_results: Dict with 'evolution_globale', 'anomalies' and
            'signaux_faibles' Series.
        output_path: Target directory, with or without a trailing separator.
    """
    import os
    if output_path:
        os.makedirs(output_path, exist_ok=True)

    # Join instead of concatenating, so 'results' works as well as 'results/'
    csv_path = os.path.join(output_path, 'resultats_complets.csv')
    xlsx_path = os.path.join(output_path, 'analyse_complete.xlsx')
    json_path = os.path.join(output_path, 'temporal.json')

    print("\n" + "="*80)
    print("üíæ SAUVEGARDE")
    print("="*80)

    # CSV
    df.to_csv(csv_path, index=False)
    print(f"‚úÖ {csv_path}")

    # Excel
    with pd.ExcelWriter(xlsx_path, engine='openpyxl') as writer:
        df.to_excel(writer, sheet_name='Donn√©es', index=False)

        stats = df.groupby(['source', 'type_retour']).size().unstack(fill_value=0)
        stats.to_excel(writer, sheet_name='Stats')

        if df['composant'].notna().any():
            top = df[df['composant'].notna()]['composant'].value_counts().head(20)
            top.to_excel(writer, sheet_name='Top composants')

        signaux = df[df['signal_faible'] == True][['date', 'source', 'composant', 'probleme']]
        if len(signaux) > 0:
            signaux.to_excel(writer, sheet_name='Signaux faibles', index=False)

        temporal_results['evolution_globale'].to_excel(writer, sheet_name='√âvolution')

    print(f"‚úÖ {xlsx_path}")

    # JSON. The Series are indexed by pandas Period objects; json.dump only
    # accepts str/int/float/bool/None keys and `default=` is never applied to
    # keys, so the keys are converted to strings explicitly.
    temporal_json = {
        'evolution': _series_to_json_dict(temporal_results['evolution_globale']),
        'anomalies': _series_to_json_dict(temporal_results['anomalies']),
        'signaux_faibles': int(temporal_results['signaux_faibles'].sum()) if len(temporal_results['signaux_faibles']) > 0 else 0
    }
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(temporal_json, f, indent=2, default=str)
    print(f"‚úÖ {json_path}")

print("‚úÖ Fonction save_results cr√©√©e")

### Pipeline principale

In [0]:
# Print the run banner with the current local date and time
print("="*80)
print("üöÄ POC IA - ANALYSE RETOURS CLIENTS AUTOMOBILE")
print("="*80)
print(f"\nDate: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

#### Etape 1 : vérification des données

In [0]:
print("\n[1/6] üì• V√©rification des donn√©es...")

# Check that df1_concat exists; errors are reported, not re-raised, so the
# notebook keeps running after this cell.
try:
    print(f"  Type: {type(df1_concat)}")
    print(f"  Lignes: {len(df1_concat)}")
    print(f"  Colonnes: {list(df1_concat.columns)}")
    
    # Work on a copy
    df_all = df1_concat.copy()
    
    # Convert the date column if present (unparseable values become NaT)
    if 'date' in df_all.columns:
        df_all['date'] = pd.to_datetime(df_all['date'], errors='coerce')
    
    print(f"\n‚úÖ {len(df_all)} lignes charg√©es")
    
    # Show the row count per source
    if 'source' in df_all.columns:
        print(f"\nüìä Par source:")
        print(df_all['source'].value_counts().to_string())
    
    display(df_all.head(5))
    
except NameError:
    # Raised when df1_concat was never defined in this session
    print("‚ùå ERREUR: df1_concat n'existe pas")
    print("Assure-toi que df1_concat est charg√© dans ton environnement")
except Exception as e:
    print(f"‚ùå ERREUR: {e}")
    import traceback
    traceback.print_exc()

### Etape 2: Prétraitement

In [0]:
print("\n[2/6] üßπ Pr√©traitement...")

# Name of the text column to clean; change it if the dataset uses another
# name (e.g. 'complaint', 'review', 'description')
TEXT_COLUMN = 'summary'  # Change if different

# Rebuilds df_all from df1_concat, replacing any df_all created earlier
df_all = preprocess_dataframe(df1_concat, text_col=TEXT_COLUMN)

print("\nüìã Aper√ßu apr√®s pr√©traitement:")
display(df_all[['date', 'source', TEXT_COLUMN, 'text_clean']].head(5))

### Etape 3: Embeddings & Clustering

In [0]:
print("\n[3/6] üî¢ Embeddings & Clustering...")

# Embed every cleaned text, then assign each row to one of 8 clusters
embeddings = generate_embeddings(df_all['text_clean'].tolist())
df_all['cluster'], kmeans = cluster_embeddings(embeddings, n_clusters=8)

# Row count per (source, cluster) pair
print("\nüìä Distribution des clusters:")
cluster_dist = df_all.groupby(['source', 'cluster']).size().unstack(fill_value=0)
display(cluster_dist)

### Etape 4: Analyse GPT

In [0]:
# Build the OpenAI client from the OPENAI_API_KEY environment variable.
# `os` is not part of the notebook's import cell; it is imported in the cell
# that loads the key from Databricks secrets, which must have run first.
openai_client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY")
)

print("‚úÖ Client OpenAI cr√©√© :", type(openai_client))

In [0]:
print("\n[4/6] ü§ñ Analyse GPT...")

# Create the analyzer
llm_analyzer = LLMAnalyzer(client=openai_client, model="gpt-4o-mini")

# Adjust the sample size to the available budget
SAMPLE_SIZE = 50  # Start with 50 for a test run

print(f"üí∞ Sample size configur√©: {SAMPLE_SIZE}")
# NOTE(review): the 0.01 per-text figure is a hard-coded estimate — confirm
# it against current pricing.
print(f"üí° Co√ªt estim√©: ~{SAMPLE_SIZE * 0.01:.2f}‚Ç¨ avec GPT-4o-mini")

# Run the analysis on a sample of df_all
df_analyzed = analyze_with_gpt(df_all, llm_analyzer, sample_size=SAMPLE_SIZE)

print("\nüìä R√©sultats qualification:")
display(df_analyzed['type_retour'].value_counts())

print("\nüìä R√©sultats composants:")
display(df_analyzed['composant'].value_counts().head(10))

In [0]:
# Sample of analysed rows (those where a component was identified)
print("\nüìã Exemples d'analyse GPT:")
display(df_analyzed[df_analyzed['composant'].notna()][[
    'text_clean', 'type_retour', 'composant', 'probleme', 'gravite', 'signal_faible'
]].head(10))

### Étape 5 : Analyse temporelle & Visualisations

In [0]:
# Reduce the 'date' column to calendar dates (datetime.date objects);
# analyze_temporal_trends parses the column again with pd.to_datetime.
print(df_analyzed[['date']])
df_analyzed['date']=pd.to_datetime(df_analyzed['date'])
df_analyzed['date']=df_analyzed['date'].dt.date
print(df_analyzed[['date']])

In [0]:
print("\n[5/6] üìà Analyse temporelle...")

# Monthly aggregation of the analysed sample
temporal_results = analyze_temporal_trends(df_analyzed, freq='M')

print("\nüìä √âvolution globale:")
display(temporal_results['evolution_globale'].to_frame('count'))

# Show the anomalous periods only when some were found
if len(temporal_results['anomalies']) > 0:
    print("\n‚ö†Ô∏è Anomalies d√©tect√©es:")
    display(temporal_results['anomalies'].to_frame('count'))

### Etape 6: Synthèse & Export

In [0]:
print("\n[6/6] üìù Synth√®se ex√©cutive & Export...")

# Aggregated figures handed to the model for the written summary
summary_data = {
    'periode': f"{df_analyzed['date'].min()} ‚Üí {df_analyzed['date'].max()}",
    'total': len(df_analyzed),
    'sources': df_analyzed['source'].value_counts().to_dict(),
    'top_composants': df_analyzed[df_analyzed['composant'].notna()]['composant'].value_counts().head(5).to_dict(),
    'critiques': len(df_analyzed[df_analyzed['gravite'] == 'critique']),
    'signaux_faibles': len(df_analyzed[df_analyzed['signal_faible'] == True]),
    'anomalies': len(temporal_results['anomalies'])
}

# Generate the GPT summary (None is printed if the call failed)
print("üîÑ G√©n√©ration synth√®se GPT...")
executive_summary = llm_analyzer.generate_temporal_summary(summary_data)

print("\n" + "="*80)
print("üìã SYNTH√àSE EX√âCUTIVE")
print("="*80)
print(executive_summary)
print("="*80)

In [0]:
# Save the CSV, Excel and JSON outputs under results/
save_results(df_analyzed, temporal_results, output_path='results/')

### Final

In [0]:
# Closing banner
print("\n" + "="*80)
print("‚úÖ PIPELINE TERMIN√â !")
print("="*80)

# Output location and the files written by save_results
print("\nüìÇ R√©sultats: results/")
print("\nüìä Fichiers g√©n√©r√©s:")
print("  ‚Ä¢ resultats_complets.csv")
print("  ‚Ä¢ analyse_complete.xlsx")
print("  ‚Ä¢ temporal.json")

# Headline metrics computed from the analysed sample
print("\nüìà M√©triques:")
print(f"  ‚Ä¢ Retours analys√©s: {len(df_analyzed)}")
print(f"  ‚Ä¢ Composants identifi√©s: {df_analyzed['composant'].nunique()}")
print(f"  ‚Ä¢ Signaux faibles: {df_analyzed['signal_faible'].sum()}")
print(f"  ‚Ä¢ Anomalies temporelles: {len(temporal_results['anomalies'])}")
print(f"  ‚Ä¢ Appels GPT: {llm_analyzer.get_stats()['total_calls']}")

print("\nüéâ POC termin√© avec succ√®s !")