# 🔍 Weaviate Explorer Dashboard

Dashboard interattiva per esplorare il database vettoriale Weaviate con articoli di news.


## 📦 Setup e Imports

In [None]:
import sys
import os
sys.path.append('/home/jovyan/work/src')

import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import ipywidgets as widgets
from IPython.display import display, HTML, clear_output
from datetime import datetime, timedelta
import json
import re
from collections import Counter

# Display configuration: matplotlib theme plus pandas output limits
# (show every column, truncate cell text at 100 characters).
plt.style.use('seaborn-v0_8')
pd.options.display.max_columns = None
pd.options.display.max_colwidth = 100

print("📦 Imports completati!")

## 🔌 Connessione a Weaviate

In [None]:
import weaviate
from weaviate.exceptions import WeaviateException

# Connection settings
WEAVIATE_URL = "http://weaviate:8080"  # container-internal URL
INDEX_NAME = "NewsArticles_DEV"

try:
    # Create the client, then probe readiness before touching the schema
    client = weaviate.Client(url=WEAVIATE_URL)

    if not client.is_ready():
        print("❌ Weaviate non è ready")
    else:
        print("✅ Connessione a Weaviate riuscita!")

        # List the classes declared in the schema
        schema = client.schema.get()
        classes = [entry['class'] for entry in schema.get('classes', [])]
        print(f"📋 Classi disponibili: {classes}")

        # Report the object count of the target class, if it exists
        if INDEX_NAME not in classes:
            print(f"⚠️  Classe {INDEX_NAME} non trovata")
        else:
            result = client.query.aggregate(INDEX_NAME).with_meta_count().do()
            count = result['data']['Aggregate'][INDEX_NAME][0]['meta']['count']
            print(f"📊 Articoli nel database: {count:,}")

except Exception as e:
    # Any failure above leaves the notebook without a usable client;
    # later cells check `client` before querying.
    print(f"❌ Errore connessione Weaviate: {e}")
    client = None

## 📊 Statistiche Generali

In [None]:
def get_articles_stats(limit=1000):
    """Fetch article metadata from Weaviate and return it as a DataFrame.

    Args:
        limit: Maximum number of articles requested from Weaviate.
            Defaults to 1000, the value that used to be hard-coded.

    Returns:
        A DataFrame with one row per article, or ``None`` when the client
        is unavailable, the query fails, or no article is returned. When a
        ``published_date`` column is present it is parsed to datetimes
        (unparseable values become ``NaT``) and a derived ``date`` column
        is added; ``quality_score`` is coerced to numeric (invalid values
        become ``NaN``).
    """
    if not client:
        return None

    try:
        # Request the metadata properties of up to `limit` articles
        result = client.query.get(
            INDEX_NAME,
            ['title', 'domain', 'source', 'published_date', 'url', 'quality_score']
        ).with_limit(limit).do()

        # Weaviate can report GraphQL failures in an 'errors' entry of the
        # response; surface them instead of an opaque KeyError text or a
        # misleading "no articles" message.
        errors = result.get('errors')
        if errors:
            print(f"❌ Errore nel recupero statistiche: {errors}")
            return None

        articles = result['data']['Get'][INDEX_NAME]

        if not articles:
            print("📭 Nessun articolo trovato")
            return None

        # Convert to a DataFrame
        df = pd.DataFrame(articles)

        # Data cleaning: parse dates and keep a day-level column for grouping
        if 'published_date' in df.columns:
            df['published_date'] = pd.to_datetime(df['published_date'], errors='coerce')
            df['date'] = df['published_date'].dt.date

        if 'quality_score' in df.columns:
            df['quality_score'] = pd.to_numeric(df['quality_score'], errors='coerce')

        return df

    except Exception as e:
        # Notebook boundary: report the failure and let the caller handle None
        print(f"❌ Errore nel recupero statistiche: {e}")
        return None

# Load the data
print("🔄 Caricamento articoli in corso...")
df_articles = get_articles_stats()

if df_articles is not None:
    print(f"✅ Caricati {len(df_articles):,} articoli")
    # The 'date' column only exists when published_date was returned, and it
    # holds NaT for unparseable dates. min()/max() on a column mixing date
    # objects and NaT raises TypeError, so drop the missing values first.
    if 'date' in df_articles.columns:
        _valid_dates = df_articles['date'].dropna()
        if not _valid_dates.empty:
            print(f"📅 Periodo: {_valid_dates.min()} - {_valid_dates.max()}")
    print(f"🏷️  Domini: {df_articles['domain'].nunique()}")
    print(f"📰 Fonti: {df_articles['source'].nunique()}")
else:
    print("❌ Impossibile caricare i dati")

## 📈 Visualizzazioni Interattive

In [None]:
if df_articles is None:
    print("❌ Nessun dato disponibile per le visualizzazioni")
else:
    # 1. Share of articles per domain (pie chart)
    domain_counts = df_articles['domain'].value_counts()
    fig_domain = px.pie(
        names=domain_counts.index,
        values=domain_counts.values,
        title="📊 Distribuzione Articoli per Dominio",
        color_discrete_sequence=px.colors.qualitative.Set3,
    )
    fig_domain.show()

    # 2. Articles per day (line chart), only when a day column is available
    if 'date' in df_articles.columns:
        daily_counts = df_articles.groupby('date').size().reset_index(name='count')
        fig_time = px.line(
            daily_counts,
            x='date',
            y='count',
            markers=True,
            title="📅 Articoli Pubblicati nel Tempo",
        )
        fig_time.show()

    # 3. Ten sources with the most articles (horizontal bars)
    source_counts = df_articles['source'].value_counts().head(10)
    fig_sources = px.bar(
        x=source_counts.values,
        y=source_counts.index,
        color=source_counts.values,
        orientation='h',
        color_continuous_scale='viridis',
        title="📰 Top 10 Fonti per Numero Articoli",
    )
    fig_sources.show()

    # 4. Quality score histogram, skipped when no score is available
    if 'quality_score' in df_articles.columns and df_articles['quality_score'].notna().any():
        fig_quality = px.histogram(
            df_articles,
            x='quality_score',
            nbins=20,
            title="📈 Distribuzione Quality Score",
        )
        fig_quality.show()

## 🔍 Explorer Interattivo

In [None]:
if df_articles is not None:
    # Filter widgets: all domains selected by default, only the first five sources
    domain_options = list(df_articles['domain'].unique())
    source_options = list(df_articles['source'].unique())

    domain_filter = widgets.SelectMultiple(
        options=domain_options,
        value=domain_options,
        description='Domini:',
        disabled=False
    )

    source_filter = widgets.SelectMultiple(
        options=source_options,
        value=source_options[:5],  # first 5
        description='Fonti:',
        disabled=False
    )

    # Date slider, built only when at least one valid date exists.
    # NaT entries are dropped first: min()/max() on a column mixing date
    # objects and NaT raises, and pd.date_range rejects NaT bounds.
    date_range = None
    if 'date' in df_articles.columns:
        valid_dates = df_articles['date'].dropna()
        if not valid_dates.empty:
            date_options = pd.date_range(
                valid_dates.min(), valid_dates.max(), freq='D'
            ).tolist()
            date_range = widgets.SelectionRangeSlider(
                options=date_options,
                index=(0, len(date_options) - 1),
                description='Periodo:',
                disabled=False
            )

    # Number of results to show
    limit_slider = widgets.IntSlider(
        value=20,
        min=10,
        max=100,
        step=10,
        description='Risultati:'
    )

    # Output area
    output = widgets.Output()

    def update_results(*args):
        """Re-apply the current widget filters and redraw the results table.

        Args:
            *args: Ignored; present so the function can be used both as an
                ipywidgets observer callback and called directly.
        """
        with output:
            clear_output(wait=True)

            # Domain and source filters
            mask = (
                df_articles['domain'].isin(domain_filter.value) &
                df_articles['source'].isin(source_filter.value)
            )

            # Date filter, when the slider exists. Comparing as datetime64
            # makes rows without a date evaluate to False instead of raising
            # on a NaT-vs-date comparison.
            if date_range is not None:
                start_date, end_date = date_range.value
                article_dates = pd.to_datetime(df_articles['date'], errors='coerce')
                mask &= article_dates.between(start_date, end_date)

            # Cap the number of rows shown
            filtered_df = df_articles[mask].head(limit_slider.value)

            print(f"📊 Risultati filtrati: {len(filtered_df):,} articoli")

            if len(filtered_df) > 0:
                # Show the table with whichever display columns are available
                display_columns = ['title', 'domain', 'source']
                if 'date' in filtered_df.columns:
                    display_columns.append('date')
                if 'quality_score' in filtered_df.columns:
                    display_columns.append('quality_score')

                display(filtered_df[display_columns])
            else:
                print("❌ Nessun articolo trovato con i filtri selezionati")

    # Wire the widgets to the refresh callback
    domain_filter.observe(update_results, names='value')
    source_filter.observe(update_results, names='value')
    limit_slider.observe(update_results, names='value')

    if date_range is not None:
        date_range.observe(update_results, names='value')

    # Layout: filters on the left, results on the right
    filters = widgets.VBox([
        widgets.HTML("<h3>🔧 Filtri</h3>"),
        domain_filter,
        source_filter,
        date_range if date_range is not None else widgets.HTML(""),
        limit_slider
    ])

    dashboard = widgets.HBox([filters, output])
    display(dashboard)

    # Initial render
    update_results()
else:
    print("❌ Nessun dato disponibile per l'explorer")

## 🔎 Ricerca Semantica

In [None]:
def semantic_search(query, limit=10):
    """Run a nearText (semantic) search over the articles.

    Args:
        query: Free-text concept to search for.
        limit: Maximum number of results requested (default 10).

    Returns:
        A DataFrame with the requested article properties plus a
        ``similarity`` column computed as ``1 - distance`` and rounded to
        three decimals, or ``None`` when the client is unavailable, the
        query fails, or nothing matches.
    """
    if not client:
        print("❌ Client Weaviate non disponibile")
        return None

    try:
        result = client.query.get(
            INDEX_NAME,
            ['title', 'content', 'domain', 'source', 'published_date', 'url']
        ).with_near_text({
            'concepts': [query]
        }).with_limit(limit).with_additional(['distance']).do()

        # Weaviate can report GraphQL failures in an 'errors' entry of the
        # response; surface them instead of reporting "no results" or an
        # opaque KeyError text.
        errors = result.get('errors')
        if errors:
            print(f"❌ Errore ricerca semantica: {errors}")
            return None

        articles = result['data']['Get'][INDEX_NAME]

        if not articles:
            print(f"🔍 Nessun risultato per: '{query}'")
            return None

        # Convert to a DataFrame
        search_df = pd.DataFrame(articles)

        # Turn the returned distance into a similarity score
        search_df['similarity'] = [1 - float(item['_additional']['distance']) for item in articles]
        search_df['similarity'] = search_df['similarity'].round(3)

        return search_df

    except Exception as e:
        # Notebook boundary: report the failure and let the caller handle None
        print(f"❌ Errore ricerca semantica: {e}")
        return None

# Search widgets: results area, trigger button, result-count slider, query box
search_output = widgets.Output()

search_button = widgets.Button(
    button_style='info',
    description='🔍 Cerca'
)

search_limit = widgets.IntSlider(
    description='Risultati:',
    min=5,
    max=50,
    step=5,
    value=10
)

search_input = widgets.Text(
    description='Ricerca:',
    placeholder='Inserisci query di ricerca...',
    value='calcio serie a',
    style={'description_width': 'initial'}
)

def on_search_click(b):
    """Run a semantic search for the text-box content and render the hits.

    Args:
        b: Argument supplied by the button click callback (unused).
    """
    with search_output:
        clear_output(wait=True)

        query = search_input.value.strip()
        if query == "":
            print("⚠️ Inserisci una query di ricerca")
            return

        print(f"🔍 Ricerca per: '{query}'...")
        results = semantic_search(query, search_limit.value)
        if results is None:
            return

        print(f"✅ Trovati {len(results)} risultati")

        # Results table
        display(results[['title', 'domain', 'source', 'similarity']])

        if len(results) == 0:
            return

        # Details of the first result
        print("\n📄 Primo risultato:")
        top_hit = results.iloc[0]
        print(f"Titolo: {top_hit['title']}")
        print(f"Fonte: {top_hit['source']} | Dominio: {top_hit['domain']}")
        print(f"Similarità: {top_hit['similarity']}")

        if 'content' in top_hit and top_hit['content']:
            body = top_hit['content']
            # Preview is capped at 300 characters, with an ellipsis when cut
            if len(body) > 300:
                content_preview = body[:300] + "..."
            else:
                content_preview = body
            print(f"Contenuto: {content_preview}")

# Search layout: heading, one row of controls, then the results area
search_box = widgets.VBox(children=[
    widgets.HTML("<h3>🔎 Ricerca Semantica</h3>"),
    widgets.HBox(children=[search_input, search_limit, search_button]),
    search_output,
])

# Run the search whenever the button is clicked
search_button.on_click(on_search_click)

display(search_box)

## 💾 Export e Utilità

In [None]:
if df_articles is not None:
    # Export buttons
    export_csv_btn = widgets.Button(description='📄 Export CSV', button_style='success')
    export_json_btn = widgets.Button(description='📋 Export JSON', button_style='warning')

    export_output = widgets.Output()

    def export_csv(b):
        """Write the loaded articles to a timestamped CSV file.

        Args:
            b: Argument supplied by the button click callback (unused).
        """
        with export_output:
            clear_output(wait=True)
            filename = f"weaviate_articles_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
            df_articles.to_csv(filename, index=False)
            print(f"✅ Esportato in {filename} ({len(df_articles):,} righe)")

    def export_json(b):
        """Write the loaded articles to a timestamped JSON file (one record per row).

        Args:
            b: Argument supplied by the button click callback (unused).
        """
        with export_output:
            clear_output(wait=True)
            filename = f"weaviate_articles_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
            df_articles.to_json(filename, orient='records', date_format='iso')
            print(f"✅ Esportato in {filename} ({len(df_articles):,} record)")

    export_csv_btn.on_click(export_csv)
    export_json_btn.on_click(export_json)

    # Quick statistics
    stats_btn = widgets.Button(description='📊 Statistiche', button_style='info')

    def show_stats(b):
        """Print summary statistics of the loaded articles.

        Args:
            b: Argument supplied by the button click callback (unused).
        """
        with export_output:
            clear_output(wait=True)
            print("📊 STATISTICHE DATASET")
            print("=" * 40)
            print(f"📰 Totale articoli: {len(df_articles):,}")
            print(f"🏷️  Domini unici: {df_articles['domain'].nunique()}")
            print(f"📰 Fonti uniche: {df_articles['source'].nunique()}")

            if 'date' in df_articles.columns:
                # Drop missing dates first: min()/max() on a column mixing
                # date objects and NaT raises TypeError.
                valid_dates = df_articles['date'].dropna()
                if not valid_dates.empty:
                    print(f"📅 Periodo: {valid_dates.min()} - {valid_dates.max()}")
                print(f"📈 Giorni coperti: {df_articles['date'].nunique()}")

            if 'quality_score' in df_articles.columns:
                qs_stats = df_articles['quality_score'].describe()
                print(f"⭐ Quality Score medio: {qs_stats['mean']:.3f}")
                print(f"⭐ Quality Score mediano: {qs_stats['50%']:.3f}")

            print("\n🏆 TOP DOMINI:")
            for domain, count in df_articles['domain'].value_counts().head(5).items():
                print(f"  {domain}: {count:,} articoli")

            print("\n📰 TOP FONTI:")
            for source, count in df_articles['source'].value_counts().head(5).items():
                print(f"  {source}: {count:,} articoli")

    stats_btn.on_click(show_stats)

    # Export layout
    export_box = widgets.VBox([
        widgets.HTML("<h3>💾 Export e Utilità</h3>"),
        widgets.HBox([export_csv_btn, export_json_btn, stats_btn]),
        export_output
    ])

    display(export_box)
else:
    print("❌ Nessun dato disponibile per l'export")

## 🎯 Conclusioni

Questa dashboard ti permette di:

- 📊 **Esplorare** i dati del database Weaviate
- 🔍 **Cercare** articoli con ricerca semantica
- 📈 **Visualizzare** statistiche e distribuzioni
- 🔧 **Filtrare** risultati per dominio, fonte, data
- 💾 **Esportare** dati in CSV/JSON

### 🚀 Prossimi Passi:

1. Esplora gli altri notebook nella cartella
2. Personalizza visualizzazioni per le tue esigenze
3. Aggiungi nuove analisi e metriche
4. Integra con altri sistemi di monitoraggio