# Pipeline de Rotulagem de Estados Operacionais de Caminhões

Este notebook implementa um pipeline completo para identificar e rotular estados operacionais quando a velocidade está próxima de zero, cruzando dados de velocidade, aceleração linear, orientação e outros sinais disponíveis.


## Célula 0: Configuração e Parâmetros


In [None]:
# Data handling and plotting libraries used throughout the notebook.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
# NOTE(review): this silences every warning category for the whole session,
# including deprecation notices from pandas/matplotlib — confirm it is intended.
warnings.filterwarnings('ignore')

# Import the project utilities
import sys
# Make modules located in the current working directory importable.
sys.path.append('.')
from labeling_utils import (
    get_db_connection, discover_schema, check_data_availability,
    query_data, find_stop_segments, find_moving_segments,
    merge_basculamento_segments, extract_features, classify_stop_segment
)

# Plot styling
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
# IPython magic (not plain Python): render figures inline in the notebook.
%matplotlib inline


: 

In [None]:
# ============================================================
# ADJUSTABLE PARAMETERS
# ============================================================
# These module-level constants are read by the cells below.

V_STOP = 0.5           # km/h - threshold for considering the truck stopped
MIN_STOP_SEC = 30      # minimum stop duration (seconds)
GAP_SEC = 5            # tolerance for gaps between segments (seconds)
TH_CONF = 0.6          # minimum confidence threshold
WINDOW_SEC = 10        # sliding window used for features (seconds)

# Parameters for the basculamento (dumping) merge
MAX_SHORT_MOVE_SEC = 30.0    # maximum duration of a short move
MAX_SHORT_MOVE_SPEED = 5.0   # maximum speed of a short move

# Prototyping parameters
PROTOTYPE_DEVICES = None      # None = all devices, or a list such as ['TRK-101', ...]
PROTOTYPE_DAYS = 1             # number of days to prototype with

# Echo the main thresholds so the run is self-documenting.
print("‚úÖ Par√¢metros configurados:")
print(f"   V_STOP = {V_STOP} km/h")
print(f"   MIN_STOP_SEC = {MIN_STOP_SEC} s")
print(f"   GAP_SEC = {GAP_SEC} s")
print(f"   TH_CONF = {TH_CONF}")


## Célula 1: Schema Discovery e Conexão


In [None]:
# Discover the database schema and print each entry under its key.
schema = discover_schema()
print("\nüìä Schema descoberto:")
for group_name, group_columns in schema.items():
    print(f"\n{group_name}:")
    for column_name in group_columns:
        print(f"  - {column_name}")


In [None]:
# Check what data is available and report it.
availability = check_data_availability()

first_record = availability['min_time']
last_record = availability['max_time']
total_records = availability['total_records']
known_devices = availability['devices']

print("\nüìÖ Disponibilidade de Dados:")
print(f"   Primeiro registro: {first_record}")
print(f"   √öltimo registro:   {last_record}")
print(f"   Total de registros: {total_records:,}")
print(f"\nüì± Devices dispon√≠veis ({len(known_devices)}):")
for device_name in known_devices:
    print(f"   - {device_name}")

# Flag each column the pipeline depends on as present or missing.
print(f"\n‚úÖ Colunas cr√≠ticas dispon√≠veis:")
critical_cols = ['linear_accel_magnitude', 'pitch', 'roll', 'speed_kmh']
for column_name in critical_cols:
    if column_name in availability['available_columns']:
        marker = "‚úÖ"
    else:
        marker = "‚ùå"
    print(f"   {marker} {column_name}")


## Célula 2: Extração de Dados


In [None]:
# Choose the devices used for prototyping: the explicit list when one is
# configured, otherwise the first five available devices.
if PROTOTYPE_DEVICES is not None:
    device_ids = PROTOTYPE_DEVICES
    print(f"üì± Usando devices especificados: {device_ids}")
else:
    device_ids = availability['devices'][:5]
    print(f"üì± Selecionando primeiros {len(device_ids)} devices para prototipagem")

print(f"\nDevices: {device_ids}")


In [None]:
# Define the query window: the last PROTOTYPE_DAYS days up to the newest record.
t_end = availability['max_time']
t_start = t_end - timedelta(days=PROTOTYPE_DAYS)
window_hours = (t_end - t_start).total_seconds() / 3600

print(f"\nüìÖ Per√≠odo de consulta:")
print(f"   In√≠cio: {t_start}")
print(f"   Fim:    {t_end}")
print(f"   Dura√ß√£o: {window_hours:.1f} horas")

# Load the data
print("\n‚è≥ Carregando dados do banco...")
df = query_data(device_ids, t_start, t_end)

row_count = len(df)
device_count = df['device_id'].nunique()
first_seen = df['time'].min()
last_seen = df['time'].max()

print(f"\n‚úÖ Dados carregados:")
print(f"   Registros: {row_count:,}")
print(f"   Devices: {device_count}")
print(f"   Per√≠odo: {first_seen} at√© {last_seen}")
print(f"\nüìä Colunas dispon√≠veis:")
print(df.columns.tolist())


In [None]:
# Data-quality check: null counts for the critical columns.
print("\nüîç Qualidade dos Dados:")
print(f"\nValores nulos por coluna cr√≠tica:")
critical_cols_check = ['speed_kmh', 'linear_accel_magnitude', 'pitch', 'roll']
for col in critical_cols_check:
    if col not in df.columns:
        print(f"   {col}: ‚ùå COLUNA N√ÉO ENCONTRADA")
        continue
    null_count = df[col].isna().sum()
    null_pct = (null_count / len(df)) * 100
    print(f"   {col}: {null_count:,} nulos ({null_pct:.1f}%)")

# Basic statistics
print(f"\nüìä Estat√≠sticas de velocidade:")
has_speed = 'speed_kmh' in df.columns
if has_speed:
    speed_summary = df['speed_kmh'].describe()
    print(speed_summary)


## Célula 3: Segmentação de Estados


In [None]:
# Find the stop segments using the thresholds configured above.
print("üîç Identificando segmentos de parada...")
stop_params = {
    'speed_col': 'speed_kmh',
    'v_stop': V_STOP,
    'min_stop_sec': MIN_STOP_SEC,
    'gap_sec': GAP_SEC,
}
stop_segments = find_stop_segments(df, **stop_params)

n_stops = len(stop_segments)
print(f"\n‚úÖ Segmentos de parada encontrados: {n_stops}")
if n_stops > 0:
    summary_cols = ['duration_s', 'is_stop']
    print(f"\nEstat√≠sticas:")
    print(stop_segments[summary_cols].describe())
    print(f"\nPrimeiros segmentos:")
    print(stop_segments.head())


In [None]:
# Find the moving segments.
print("üîç Identificando segmentos de movimento...")
moving_segments = find_moving_segments(df, speed_col='speed_kmh', v_stop=V_STOP)

n_moving = len(moving_segments)
print(f"\n‚úÖ Segmentos de movimento encontrados: {n_moving}")
if n_moving > 0:
    print(f"\nEstat√≠sticas:")
    print(moving_segments[['duration_s', 'is_stop']].describe())

    # Average speed for the first five moving segments
    print(f"\nüìä Velocidade durante movimento:")
    for _, seg in moving_segments.head(5).iterrows():
        same_device = df['device_id'] == seg['device_id']
        after_start = df['time'] >= seg['t_start']
        before_end = df['time'] <= seg['t_end']
        seg_data = df[same_device & after_start & before_end]
        if seg_data.empty or 'speed_kmh' not in seg_data.columns:
            continue
        avg_speed = seg_data['speed_kmh'].mean()
        print(f"   {seg['device_id']}: {avg_speed:.1f} km/h (dura√ß√£o: {seg['duration_s']:.1f}s)")


In [None]:
# Merge basculamento stops that are separated only by a short move.
print("üîç Aplicando merge de basculamento com andadinha curta...")

merge_params = {
    'max_short_move_sec': MAX_SHORT_MOVE_SEC,
    'max_short_move_speed': MAX_SHORT_MOVE_SPEED,
}
merged_stop_segments = merge_basculamento_segments(
    stop_segments, moving_segments, df, **merge_params
)

print(f"\n‚úÖ Merge aplicado")
has_merge_flag = 'basculamento_merge' in merged_stop_segments.columns
if has_merge_flag:
    n_merged = merged_stop_segments['basculamento_merge'].sum()
    print(f"   Segmentos marcados para merge: {n_merged}")

# From here on the pipeline works on the merged segments
final_stop_segments = merged_stop_segments.copy()


## Célula 4: Feature Engineering


In [None]:
# Extract features for every stop segment.
#
# For each row of `final_stop_segments`, the samples of that device inside
# [t_start, t_end] (both bounds inclusive) are passed to `extract_features`,
# and the result is merged into the segment's own fields via `dict.update`.
# Segments with fewer than two samples are skipped. The outputs are
# `segments_df` (one row per kept segment) and `feature_cols` (the columns
# treated as features by the classification cell).
print("üîß Extraindo features dos segmentos de parada...")

segments_with_features = []

for _, seg in final_stop_segments.iterrows():
    # Samples belonging to this segment
    seg_data = df[
        (df['device_id'] == seg['device_id']) &
        (df['time'] >= seg['t_start']) &
        (df['time'] <= seg['t_end'])
    ].copy()

    if len(seg_data) < 2:
        continue

    # Extract features
    features = extract_features(seg_data, window_sec=WINDOW_SEC)

    # Combine the segment fields with its features
    seg_dict = seg.to_dict()
    seg_dict.update(features)

    segments_with_features.append(seg_dict)

segments_df = pd.DataFrame(segments_with_features)

# Segment bookkeeping columns; every other column is treated as a feature.
non_feature_cols = {
    'device_id', 't_start', 't_end', 'duration_s', 'is_stop',
    'basculamento_merge', 'merge_t_end',
}
feature_cols = [c for c in segments_df.columns if c not in non_feature_cols]

print(f"\n‚úÖ Features extra√≠das para {len(segments_df)} segmentos")
# Count the same list that is printed below so the header and the listing agree.
print(f"\nüìä Features dispon√≠veis ({len(feature_cols)}):")
for col in feature_cols[:20]:  # Show the first 20 only
    print(f"   - {col}")
if len(feature_cols) > 20:
    print(f"   ... e mais {len(feature_cols) - 20} features")


## Célula 5: Sistema de Rotulagem


In [None]:
# Classify every stop segment.
#
# For each row of `segments_df`, the non-null feature values are converted to
# float and handed to `classify_stop_segment`, whose three results are stored
# as label, confidence and rule_trace. The output is `labeled_df`, one row per
# segment with the segment fields, the classification and the features used.
print("üè∑Ô∏è  Classificando segmentos...")

labeled_segments = []

for _, seg in segments_df.iterrows():
    # Collect the non-null features as plain floats
    features_dict = {
        col: float(seg[col])
        for col in feature_cols
        if col in seg.index and pd.notna(seg[col])
    }

    # Classify
    label, confidence, rule_trace = classify_stop_segment(features_dict, th_conf=TH_CONF)

    # Build the result row
    result = {
        'device_id': seg['device_id'],
        't_start': seg['t_start'],
        't_end': seg['t_end'],
        'duration_s': seg['duration_s'],
        'is_moving': False,  # every row here is a stop
        'label': label,
        'confidence': confidence,
        'rule_trace': rule_trace
    }

    # Append the features
    result.update(features_dict)

    labeled_segments.append(result)

labeled_df = pd.DataFrame(labeled_segments)

if labeled_df.empty:
    # Without any classified segment the frame would have no columns at all and
    # the summaries below would raise KeyError. Keep the expected columns, with
    # explicit dtypes so boolean/numeric operations on them still work.
    labeled_df = pd.DataFrame(columns=[
        'device_id', 't_start', 't_end', 'duration_s', 'is_moving',
        'label', 'confidence', 'rule_trace',
    ]).astype({'duration_s': float, 'is_moving': bool, 'confidence': float})

print(f"\n‚úÖ Segmentos classificados: {len(labeled_df)}")
print(f"\nüìä Distribui√ß√£o de labels:")
print(labeled_df['label'].value_counts())
print(f"\nüìä Confian√ßa m√©dia por label:")
print(labeled_df.groupby('label')['confidence'].agg(['mean', 'std', 'min', 'max']))


In [None]:
# Add the moving segments to the final dataset.
print("üìä Adicionando segmentos de movimento...")

# One fixed-label row per moving segment
moving_labeled = [
    {
        'device_id': row['device_id'],
        't_start': row['t_start'],
        't_end': row['t_end'],
        'duration_s': row['duration_s'],
        'is_moving': True,
        'label': 'MOVIMENTO',
        'confidence': 1.0,
        'rule_trace': 'Segmento de movimento (speed > V_STOP)'
    }
    for _, row in moving_segments.iterrows()
]

moving_df = pd.DataFrame(moving_labeled)

# Stack stops and moves, ordered by device and start time
final_labeled_df = (
    pd.concat([labeled_df, moving_df], ignore_index=True)
    .sort_values(['device_id', 't_start'])
    .reset_index(drop=True)
)

print(f"\n‚úÖ Dataset final criado: {len(final_labeled_df)} segmentos")
print(f"\nüìä Distribui√ß√£o final:")
print(final_labeled_df.groupby(['is_moving', 'label']).size())


## Célula 6: Visualizações


In [None]:
# 1. Scatter: speed vs accel_magnitude, coloured by segment label.
#
# Plots at most 10000 samples of `df`. Each sample is tagged with the label of
# the segment in `final_labeled_df` that covers it (same device, time inside
# [t_start, t_end]); samples that fall in no segment are left out of the plot.
if 'speed_kmh' in df.columns and 'linear_accel_magnitude' in df.columns:
    # Create the figure only when there is something to draw, so the fallback
    # branch does not leave an empty figure behind.
    fig, ax = plt.subplots(figsize=(12, 8))

    # Subsample for plotting when there are many points. Always work on a
    # copy: adding the helper column below must not modify `df`, which is
    # shared with the other cells.
    max_points = 10000
    plot_df = (df.sample(max_points) if len(df) > max_points else df).copy()

    # Tag each sample with the label of its segment
    plot_df['segment_type'] = 'unknown'

    for _, seg in final_labeled_df.iterrows():
        mask = (
            (plot_df['device_id'] == seg['device_id']) &
            (plot_df['time'] >= seg['t_start']) &
            (plot_df['time'] <= seg['t_end'])
        )
        plot_df.loc[mask, 'segment_type'] = seg['label']

    # One scatter series per label
    for label in plot_df['segment_type'].unique():
        if label == 'unknown':
            continue
        data = plot_df[plot_df['segment_type'] == label]
        ax.scatter(
            data['speed_kmh'],
            data['linear_accel_magnitude'],
            label=label,
            alpha=0.3,
            s=10
        )

    ax.axvline(V_STOP, color='red', linestyle='--', label=f'V_STOP={V_STOP} km/h')
    ax.set_xlabel('Velocidade (km/h)')
    ax.set_ylabel('Acelera√ß√£o Linear Magnitude (m/s¬≤)')
    ax.set_title('Speed vs Acelera√ß√£o Linear - Segmentos Coloridos por Label')
    ax.legend()
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()
else:
    print("‚ö†Ô∏è Colunas necess√°rias n√£o encontradas para este gr√°fico")


In [None]:
# 2. Per-device timeline with label-coloured segments.
#
# For the first three devices present in `final_labeled_df`, draws three
# stacked panels sharing the time axis: speed, linear acceleration (when the
# column exists) and the labelled segments as shaded spans.

# Fixed colour per label; labels missing from the map are drawn in black.
colors_map = {
    'CARREGAMENTO': 'orange',
    'BASCULAMENTO': 'purple',
    'MOTOR_LIGADO': 'green',
    'MOTOR_DESLIGADO': 'gray',
    'MOVIMENTO': 'blue',
    'DESCONHECIDO': 'red'
}

for device_id in final_labeled_df['device_id'].unique()[:3]:  # First 3 devices
    device_data = df[df['device_id'] == device_id].copy()
    device_segments = final_labeled_df[final_labeled_df['device_id'] == device_id]

    if device_data.empty:
        continue

    fig, axes = plt.subplots(3, 1, figsize=(16, 10), sharex=True)

    # Panel 1: speed
    axes[0].plot(device_data['time'], device_data['speed_kmh'], 'b-', alpha=0.5, linewidth=0.5)
    axes[0].axhline(V_STOP, color='r', linestyle='--', label=f'V_STOP={V_STOP}')
    axes[0].set_ylabel('Velocidade (km/h)')
    axes[0].set_title(f'Timeline - Device {device_id}')
    axes[0].legend()
    axes[0].grid(True, alpha=0.3)

    # Panel 2: acceleration
    if 'linear_accel_magnitude' in device_data.columns:
        axes[1].plot(device_data['time'], device_data['linear_accel_magnitude'], 'g-', alpha=0.5, linewidth=0.5)
        axes[1].set_ylabel('Acelera√ß√£o Linear (m/s¬≤)')
        axes[1].grid(True, alpha=0.3)

    # Panel 3: shaded segments. Only the first span of each label gets a
    # legend entry; otherwise the legend would repeat the label once per
    # segment. '_nolegend_' is matplotlib's marker for "skip in legend".
    labels_in_legend = set()
    for _, seg in device_segments.iterrows():
        seg_label = seg['label']
        color = colors_map.get(seg_label, 'black')
        legend_label = '_nolegend_' if seg_label in labels_in_legend else seg_label
        labels_in_legend.add(seg_label)
        axes[2].axvspan(seg['t_start'], seg['t_end'], alpha=0.3, color=color, label=legend_label)

    axes[2].set_xlabel('Tempo')
    axes[2].set_ylabel('Segmentos')
    axes[2].set_title('Segmentos Rotulados')
    axes[2].legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    axes[2].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()


In [None]:
# 3. One example per class.
#
# For every label in `labeled_df`, picks the segment with the highest
# confidence and plots its acceleration (with detected peaks), pitch and roll
# over time, then prints the full rule trace of that segment.
from scipy import signal  # imported once here instead of on every iteration

print("üìä Exemplos de segmentos por classe:")

for label in labeled_df['label'].unique():
    label_segments = labeled_df[labeled_df['label'] == label]

    if len(label_segments) == 0:
        continue

    # Take the example with the highest confidence
    example = label_segments.loc[label_segments['confidence'].idxmax()]

    seg_data = df[
        (df['device_id'] == example['device_id']) &
        (df['time'] >= example['t_start']) &
        (df['time'] <= example['t_end'])
    ].copy()

    if len(seg_data) < 2:
        continue

    fig, axes = plt.subplots(3, 1, figsize=(14, 8), sharex=True)

    # Acceleration panel
    if 'linear_accel_magnitude' in seg_data.columns:
        axes[0].plot(seg_data['time'], seg_data['linear_accel_magnitude'], 'b-', linewidth=1)

        # Mark peaks that rise above mean + one standard deviation
        peaks_drawn = False
        if 'peak_count' in example and example['peak_count'] > 0:
            accel_vals = seg_data['linear_accel_magnitude'].values
            peaks, _ = signal.find_peaks(
                accel_vals,
                height=accel_vals.mean() + accel_vals.std()
            )
            if len(peaks) > 0:
                axes[0].plot(seg_data['time'].iloc[peaks], accel_vals[peaks], 'ro', markersize=8, label='Picos')
                peaks_drawn = True

        axes[0].set_ylabel('Acelera√ß√£o Linear (m/s¬≤)')
        axes[0].set_title(f"Exemplo: {label} (Confian√ßa: {example['confidence']:.2f})")
        # A legend without any labelled artist is empty and triggers a
        # matplotlib warning, so only draw it when peaks were plotted.
        if peaks_drawn:
            axes[0].legend()
        axes[0].grid(True, alpha=0.3)

    # Pitch panel
    if 'pitch' in seg_data.columns:
        axes[1].plot(seg_data['time'], seg_data['pitch'], 'g-', linewidth=1)
        axes[1].set_ylabel('Pitch (graus)')
        axes[1].grid(True, alpha=0.3)

    # Roll panel
    if 'roll' in seg_data.columns:
        axes[2].plot(seg_data['time'], seg_data['roll'], 'r-', linewidth=1)
        axes[2].set_ylabel('Roll (graus)')
        axes[2].set_xlabel('Tempo')
        axes[2].grid(True, alpha=0.3)

    # Show the rule trace in the figure title, shortened to 100 items; the
    # ellipsis is appended only when something was actually cut off.
    rule_trace = example['rule_trace']
    ellipsis = "..." if len(rule_trace) > 100 else ""
    fig.suptitle(f"{label} - {rule_trace[:100]}{ellipsis}", fontsize=10)

    plt.tight_layout()
    plt.show()

    print(f"\n{label}: {example['rule_trace']}")


In [None]:
# 4. Feature distribution per class.
#
# Draws one boxplot panel per available feature (up to three panels per row),
# with one box per label in `labeled_df`. Features missing from `labeled_df`
# are ignored; unused panels of the grid are hidden.
feature_cols_to_plot = ['accel_mean', 'accel_std', 'accel_rms', 'peak_count', 'pitch_delta_total']
available_feature_cols = [c for c in feature_cols_to_plot if c in labeled_df.columns]

if len(available_feature_cols) > 0:
    n_cols = min(3, len(available_feature_cols))
    n_rows = (len(available_feature_cols) + n_cols - 1) // n_cols

    # squeeze=False always yields a 2-D array of Axes. Without it a 1x1 grid
    # (a single available feature) returns a bare Axes object that has no
    # reshape/flatten methods.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 5*n_rows), squeeze=False)
    axes = axes.flatten()

    class_names = labeled_df['label'].unique()

    for idx, feat_col in enumerate(available_feature_cols):
        ax = axes[idx]

        # One box per label that has at least one non-null value
        data_to_plot = []
        labels_list = []

        for label in class_names:
            label_data = labeled_df[labeled_df['label'] == label][feat_col].dropna()
            if len(label_data) > 0:
                data_to_plot.append(label_data.values)
                labels_list.append(label)

        if len(data_to_plot) > 0:
            # Tick labels are set separately: the `labels` keyword of boxplot
            # is deprecated in recent matplotlib releases.
            ax.boxplot(data_to_plot)
            ax.set_xticklabels(labels_list)
            ax.set_ylabel(feat_col)
            ax.set_title(f'Distribui√ß√£o de {feat_col} por Label')
            ax.tick_params(axis='x', rotation=45)
            ax.grid(True, alpha=0.3)

    # Hide the unused panels
    for idx in range(len(available_feature_cols), len(axes)):
        axes[idx].axis('off')

    plt.tight_layout()
    plt.show()
else:
    print("‚ö†Ô∏è Features n√£o dispon√≠veis para plot")


## Célula 7: Clustering Exploratório


In [None]:
# Exploratory clustering of the labelled stop segments.
#
# Standardises the available numeric features, clusters them with HDBSCAN
# (falling back to KMeans when the hdbscan package is not installed), stores
# the result in labeled_df['cluster'] and prints a cluster-vs-label crosstab.
# `X_scaled` is kept for the visualisation cell that follows.
print("üîç Preparando dados para clustering...")

# Relevant numeric features
clustering_features = [
    'accel_mean', 'accel_std', 'accel_rms', 'accel_energy',
    'peak_count', 'peak_height_mean',
    'pitch_mean', 'pitch_std', 'pitch_delta_total',
    'roll_mean', 'roll_std',
    'energy_ratio_low', 'energy_ratio_high'
]

available_clustering_features = [c for c in clustering_features if c in labeled_df.columns]

if len(available_clustering_features) < 3:
    print("‚ö†Ô∏è Features insuficientes para clustering")
elif len(labeled_df) < 3:
    # With fewer than 3 rows the KMeans fallback would ask for 0 clusters and
    # HDBSCAN cannot satisfy min_cluster_size=3.
    print("Segmentos insuficientes para clustering (menos de 3)")
else:
    print(f"‚úÖ Usando {len(available_clustering_features)} features para clustering")

    # Build the matrix.
    # NOTE(review): missing values are replaced by 0 before standardisation,
    # which treats "not measured" as a real zero — confirm this is intended.
    X = labeled_df[available_clustering_features].fillna(0).values

    # Standardise
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Only the import is guarded, so an ImportError raised while fitting is
    # not mistaken for "hdbscan is not installed".
    try:
        import hdbscan
    except ImportError:
        hdbscan = None

    if hdbscan is not None:
        print("\nüîç Executando HDBSCAN...")
        clusterer = hdbscan.HDBSCAN(min_cluster_size=3, min_samples=2)
        cluster_labels = clusterer.fit_predict(X_scaled)

        labeled_df['cluster'] = cluster_labels

        print(f"\n‚úÖ Clustering conclu√≠do:")
        # Label -1 is HDBSCAN's noise bucket, not a cluster
        print(f"   Clusters encontrados: {len(set(cluster_labels)) - (1 if -1 in cluster_labels else 0)}")
        print(f"   Ru√≠do (cluster -1): {(cluster_labels == -1).sum()}")
        print(f"\nüìä Distribui√ß√£o de clusters:")
        print(pd.Series(cluster_labels).value_counts().sort_index())
    else:
        print("‚ö†Ô∏è HDBSCAN n√£o instalado. Tentando KMeans...")

        from sklearn.cluster import KMeans

        # At least 1 here because of the row-count guard above
        n_clusters = min(5, len(labeled_df) // 3)
        kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        cluster_labels = kmeans.fit_predict(X_scaled)

        labeled_df['cluster'] = cluster_labels

        print(f"\n‚úÖ KMeans conclu√≠do com {n_clusters} clusters")

    # Compare clusters against the rule-based labels
    print(f"\nüìä Compara√ß√£o Clusters vs Labels:")
    comparison = pd.crosstab(labeled_df['label'], labeled_df['cluster'])
    print(comparison)


In [None]:
# Cluster visualisation (needs 2+ features and 2+ segments).
#
# Projects `X_scaled` from the clustering cell onto its first two principal
# components and draws it twice: coloured by cluster id and coloured by the
# rule-based label. Rows of `X_scaled` are assumed to line up positionally
# with the rows of `labeled_df`, as produced by the clustering cell.
if (
    'cluster' in labeled_df.columns
    and len(available_clustering_features) >= 2
    and len(labeled_df) >= 2  # PCA with 2 components needs at least 2 samples
):
    from sklearn.decomposition import PCA

    # Reduce to 2D with PCA
    pca = PCA(n_components=2)
    X_2d = pca.fit_transform(X_scaled)

    fig, axes = plt.subplots(1, 2, figsize=(16, 6))

    # Panel 1: by cluster (numeric ids, so a colorbar is meaningful)
    scatter1 = axes[0].scatter(X_2d[:, 0], X_2d[:, 1], c=labeled_df['cluster'], cmap='tab10', alpha=0.6)
    axes[0].set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%} var)')
    axes[0].set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%} var)')
    axes[0].set_title('Clusters (HDBSCAN/KMeans)')
    plt.colorbar(scatter1, ax=axes[0])
    axes[0].grid(True, alpha=0.3)

    # Panel 2: by label. Labels are categorical, so each one is drawn as its
    # own series with a legend entry; a colorbar over arbitrary integer codes
    # would not show which colour belongs to which label.
    label_cmap = plt.get_cmap('Set1')
    for label_idx, label_name in enumerate(labeled_df['label'].unique()):
        rows = (labeled_df['label'] == label_name).to_numpy()
        axes[1].scatter(
            X_2d[rows, 0],
            X_2d[rows, 1],
            color=label_cmap(label_idx % label_cmap.N),
            alpha=0.6,
            label=label_name
        )
    axes[1].set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%} var)')
    axes[1].set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%} var)')
    axes[1].set_title('Labels (Regras)')
    axes[1].legend()
    axes[1].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    print(f"\nüìä Vari√¢ncia explicada por PC: {pca.explained_variance_ratio_}")


## Célula 8: Export e Persistência


In [None]:
# Prepare the final dataset for export: main columns first, features after.
print("üíæ Preparando export...")

# Main columns, in export order
export_cols = [
    'device_id', 't_start', 't_end', 'duration_s', 'is_moving',
    'label', 'confidence', 'rule_trace'
]

# Every remaining column is exported as a feature, in its current order
feature_export_cols = []
for column_name in final_labeled_df.columns:
    if column_name not in export_cols:
        feature_export_cols.append(column_name)
export_cols_final = export_cols + feature_export_cols

export_df = final_labeled_df[export_cols_final].copy()

n_rows_out = len(export_df)
n_cols_out = len(export_cols_final)
print(f"\n‚úÖ Dataset preparado: {n_rows_out} segmentos, {n_cols_out} colunas")
print(f"\nColunas principais:")
for column_name in export_cols:
    print(f"   - {column_name}")
print(f"\n... e {len(feature_export_cols)} features")


In [None]:
# Write the export to timestamped Parquet and CSV files.
import os
from datetime import datetime

output_dir = 'labeled_segments'
os.makedirs(output_dir, exist_ok=True)

# Both files share the same timestamped base name
timestamp_str = datetime.now().strftime('%Y%m%d_%H%M%S')
base_name = f'labeled_segments_{timestamp_str}'
parquet_path = os.path.join(output_dir, base_name + '.parquet')
csv_path = os.path.join(output_dir, base_name + '.csv')

export_df.to_parquet(parquet_path, index=False)
export_df.to_csv(csv_path, index=False)

moving_flag = export_df['is_moving']
stop_rows = export_df[~moving_flag]
moving_rows = export_df[moving_flag]

print(f"\n‚úÖ Arquivos salvos:")
print(f"   Parquet: {parquet_path}")
print(f"   CSV:     {csv_path}")
print(f"\nüìä Estat√≠sticas finais:")
print(f"   Total de segmentos: {len(export_df)}")
print(f"   Paradas rotuladas: {len(stop_rows)}")
print(f"   Movimento: {len(moving_rows)}")
print(f"\nüìä Distribui√ß√£o de labels:")
print(export_df['label'].value_counts())


In [None]:
# Final summary of the pipeline run.
separator = "=" * 60
total_segments = len(export_df)
moving_flag = export_df['is_moving']
confidence_values = export_df['confidence']

print("\n" + separator)
print("RESUMO DO PIPELINE")
print(separator)
print(f"\n‚úÖ Segmentos processados: {total_segments}")
print(f"‚úÖ Paradas rotuladas: {len(export_df[~moving_flag])}")
print(f"‚úÖ Movimento: {len(export_df[moving_flag])}")
print(f"\nüìä Labels encontrados:")
# Share of each label among all exported segments
label_counts = export_df['label'].value_counts()
for label, count in label_counts.items():
    pct = (count / total_segments) * 100
    print(f"   {label}: {count} ({pct:.1f}%)")
print(f"\nüìä Confian√ßa m√©dia: {confidence_values.mean():.2f}")
print(f"üìä Confian√ßa m√≠nima: {confidence_values.min():.2f}")
print(f"\nüíæ Arquivos salvos em: {output_dir}/")
print(separator)
