# Exploração de Dados - Telemetria Veicular

Este notebook documenta uma exploração completa dos dados de telemetria para identificar padrões que diferenciem estados operacionais do caminhão quando parado.

**Abordagem**: Tentativa e erro documentada, com gráficos e conclusões explícitas sobre o que funciona e o que não funciona.


## Parte 1: Conhecendo os Dados

### 1.1 Carregar dados e estatísticas gerais


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# Importar utilitários
import sys
sys.path.append('.')
from labeling_utils import (
    get_db_connection, discover_schema, check_data_availability,
    query_data
)

# Plot configuration: apply matplotlib's 'seaborn-v0_8-darkgrid' style sheet,
# then select seaborn's "husl" palette for subsequent plots.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
# IPython magic (notebook only): render figures inline in the cell output.
%matplotlib inline

# Confirmation that the import/setup cell ran to completion.
print("‚úÖ Bibliotecas importadas")


: 

In [None]:
# Ask the labeling utilities what data exists before loading anything, then
# print a summary: time range, record count, devices and critical columns.
critical_cols = ['linear_accel_magnitude', 'pitch', 'roll', 'speed_kmh']
availability = check_data_availability()

print("üìÖ DISPONIBILIDADE DE DADOS")
print("="*60)
print(f"Primeiro registro: {availability['min_time']}")
print(f"√öltimo registro:   {availability['max_time']}")
print(f"Total de registros: {availability['total_records']:,}")

# Total span covered by the data, expressed in hours.
duration = availability['max_time'] - availability['min_time']
hours = duration.total_seconds() / 3600
print(f"Dura√ß√£o total: {hours:.1f} horas")

# One line per device reported by the availability check.
print(f"\nüì± Devices dispon√≠veis ({len(availability['devices'])}):")
for dev in availability['devices']:
    print(f"   - {dev}")

# Flag each column the exploration relies on as present or missing.
print(f"\n‚úÖ Colunas cr√≠ticas dispon√≠veis:")
present_cols = availability['available_columns']
for col in critical_cols:
    if col in present_cols:
        status = "‚úÖ"
    else:
        status = "‚ùå"
    print(f"   {status} {col}")


: 

In [None]:
# Load the exploration sample: the last 2 days of data for the first 3 devices
# reported by the availability check.
device_ids = availability['devices'][:3]
t_end = availability['max_time']
t_start = t_end - timedelta(days=2)

print(f"\n‚è≥ Carregando dados...")
print(f"   Devices: {device_ids}")
print(f"   Per√≠odo: {t_start} at√© {t_end}")

df = query_data(device_ids, t_start, t_end)

print(f"\n‚úÖ Dados carregados:")
print(f"   Registros: {len(df):,}")
print(f"   Devices: {df['device_id'].nunique()}")
print(f"   Per√≠odo: {df['time'].min()} at√© {df['time'].max()}")

# Estimate the sampling frequency.
if len(df) > 1:
    # Intervals must be measured inside each device's own time-ordered stream.
    # A plain diff over the combined frame mixes rows from different devices
    # (and depends on row order), so it does not describe any real sensor rate.
    per_device_dt = (
        df.sort_values(['device_id', 'time'])
        .groupby('device_id')['time']
        .diff()
        .dt.total_seconds()
    )
    # The first row of every device has no predecessor (NaN) and is skipped by mean().
    dt_mean = per_device_dt.mean()
    # A NaN or non-positive interval (e.g. one row per device) yields fs = 0.
    fs = 1.0 / dt_mean if dt_mean > 0 else 0
    print(f"   Frequ√™ncia m√©dia: {fs:.2f} Hz (intervalo: {dt_mean:.2f}s)")


: 

In [None]:
# General statistics and data quality: null counts for the critical columns,
# followed by descriptive statistics for speed and linear acceleration.
print("\nüìä ESTAT√çSTICAS GERAIS")
print("="*60)

print("\nValores nulos por coluna cr√≠tica:")
critical_cols_check = ['speed_kmh', 'linear_accel_magnitude', 'pitch', 'roll', 'accel_magnitude']
for col in critical_cols_check:
    # Report a missing column and move on; nothing to count for it.
    if col not in df.columns:
        print(f"   {col:25s}: ‚ùå COLUNA N√ÉO ENCONTRADA")
        continue
    null_count = df[col].isna().sum()
    null_pct = (null_count / len(df)) * 100
    print(f"   {col:25s}: {null_count:6,} nulos ({null_pct:5.1f}%)")

print("\nEstat√≠sticas descritivas:")
# (column, heading) pairs; a column absent from df is silently skipped.
described_columns = (
    ('speed_kmh', "\nVelocidade (km/h):"),
    ('linear_accel_magnitude', "\nAcelera√ß√£o Linear Magnitude (m/s¬≤):"),
)
for column, heading in described_columns:
    if column in df.columns:
        print(heading)
        print(df[column].describe())


### 1.2 Visualizar séries temporais completas


In [None]:
# Plot the complete time series of every device, one 4-panel figure per device,
# then print how many of its records sit at or below 0.5 km/h.
#
# Each panel is (column, matplotlib format/color, y-axis label), top to bottom.
# Every panel is drawn only when its column exists, so a missing signal leaves
# an empty panel instead of raising KeyError. Previously only 'speed_kmh' was
# unguarded, even though the summary below already allowed for its absence.
panels = [
    ('speed_kmh', 'b-', 'Velocidade (km/h)'),
    ('linear_accel_magnitude', 'g-', 'Acelera√ß√£o Linear (m/s¬≤)'),
    ('pitch', 'orange', 'Pitch (graus)'),
    ('roll', 'purple', 'Roll (graus)'),
]

for device_id in df['device_id'].unique():
    device_data = df[df['device_id'] == device_id].copy().sort_values('time')

    fig, axes = plt.subplots(len(panels), 1, figsize=(16, 12), sharex=True)

    for ax, (column, fmt, ylabel) in zip(axes, panels):
        if column not in device_data.columns:
            continue
        ax.plot(device_data['time'], device_data[column], fmt, alpha=0.6, linewidth=0.5)
        ax.set_ylabel(ylabel)
        ax.grid(True, alpha=0.3)

    # Speed panel extras: reference line at the suggested stop threshold.
    if 'speed_kmh' in device_data.columns:
        axes[0].axhline(0.5, color='r', linestyle='--', label='0.5 km/h (threshold sugerido)')
        axes[0].legend()
    axes[0].set_title(f'Device {device_id} - S√©ries Temporais Completas')

    # The x axis is shared, so label the bottom panel regardless of which
    # signals are present (it used to be labelled only when 'roll' existed).
    axes[-1].set_xlabel('Tempo')

    plt.tight_layout()
    plt.show()

    print(f"\nüìä Device {device_id}:")
    print(f"   Registros: {len(device_data):,}")
    if 'speed_kmh' in device_data.columns:
        low_speed = (device_data['speed_kmh'] <= 0.5).sum()
        print(f"   Registros com speed <= 0.5 km/h: {low_speed:,} ({low_speed/len(device_data)*100:.1f}%)")


## Parte 2: Segmentação Básica - Parado vs Movimento

### 2.1 Experimento: Threshold de Velocidade

**Objetivo**: Encontrar o melhor threshold para considerar o veículo "parado".


In [None]:
# Low-speed distribution and threshold comparison on a 2x2 grid:
#   top-left     histogram of speeds <= 5 km/h with candidate thresholds marked
#   top-right    zoom on speeds <= 2 km/h
#   bottom-left  number of records at or below each candidate threshold
#   bottom-right the same counts as a percentage of all records
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Top-left: overall histogram of low speeds.
low_speed_data = df.loc[df['speed_kmh'] <= 5.0, 'speed_kmh']
ax = axes[0, 0]
ax.hist(low_speed_data, bins=50, edgecolor='black', alpha=0.7)
candidate_lines = (
    (0.5, 'r', '0.5 km/h'),
    (1.0, 'orange', '1.0 km/h'),
    (2.0, 'yellow', '2.0 km/h'),
)
for x_mark, line_color, line_label in candidate_lines:
    ax.axvline(x_mark, color=line_color, linestyle='--', linewidth=2, label=line_label)
ax.set_xlabel('Velocidade (km/h)')
ax.set_ylabel('Frequ√™ncia')
ax.set_title('Distribui√ß√£o de Velocidades Baixas (‚â§ 5 km/h)')
ax.legend()
ax.grid(True, alpha=0.3)

# Top-right: zoom on very low speeds.
very_low_speed_data = df.loc[df['speed_kmh'] <= 2.0, 'speed_kmh']
ax = axes[0, 1]
ax.hist(very_low_speed_data, bins=40, edgecolor='black', alpha=0.7)
ax.axvline(0.5, color='r', linestyle='--', linewidth=2, label='0.5 km/h')
ax.set_xlabel('Velocidade (km/h)')
ax.set_ylabel('Frequ√™ncia')
ax.set_title('Zoom: Velocidades ‚â§ 2 km/h')
ax.legend()
ax.grid(True, alpha=0.3)

# Records that each candidate threshold would classify as "stopped",
# as absolute counts and as a share of the whole frame.
thresholds = [0.5, 1.0, 2.0, 3.0]
threshold_counts = [(df['speed_kmh'] <= thresh).sum() for thresh in thresholds]
threshold_pcts = [c/len(df)*100 for c in threshold_counts]

# Bottom row: both bar charts share positions, tick labels and colors.
positions = range(len(thresholds))
tick_labels = [f'{t} km/h' for t in thresholds]
bar_colors = ['red', 'orange', 'yellow', 'green']
bar_panels = (
    (axes[1, 0], threshold_counts, 'N√∫mero de Registros',
     'Registros Considerados "Parado" por Threshold'),
    (axes[1, 1], threshold_pcts, 'Percentual do Total (%)',
     'Percentual de Registros "Parado" por Threshold'),
)
for ax, heights, ylabel, title in bar_panels:
    ax.bar(positions, heights, alpha=0.7, color=bar_colors)
    ax.set_xticks(positions)
    ax.set_xticklabels(tick_labels)
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

print("\nüìä CONCLUS√ÉO - Threshold de Velocidade:")
print("="*60)
for i, thresh in enumerate(thresholds):
    print(f"   {thresh} km/h: {threshold_counts[i]:,} registros ({threshold_pcts[i]:.1f}%)")
print("\nüí° Observa√ß√£o: Threshold muito baixo pode perder paradas reais,")
print("   threshold muito alto pode incluir movimento lento.")


In [None]:
# Timeline for a single device (the one in the first row of df): one panel per
# candidate threshold, shading the samples at or below that threshold.
device_id = df['device_id'].iloc[0]
device_data = df[df['device_id'] == device_id].copy().sort_values('time')

n_panels = len(thresholds)
fig, axes = plt.subplots(n_panels, 1, figsize=(16, 3*n_panels), sharex=True)

# Values that do not change from one panel to the next.
timestamps = device_data['time']
speed = device_data['speed_kmh']
shade_top = speed.max()  # shaded bands span from 0 up to the maximum speed

for ax, thresh in zip(axes, thresholds):
    # Speed trace.
    ax.plot(timestamps, speed, 'b-', alpha=0.5, linewidth=0.5)

    # Shade the stretches classified as stopped under this threshold.
    is_stopped = speed <= thresh
    ax.fill_between(timestamps, 0, shade_top,
                    where=is_stopped, alpha=0.3, color='red', label=f'Parado (‚â§{thresh} km/h)')

    ax.axhline(thresh, color='r', linestyle='--', linewidth=2)
    ax.set_ylabel(f'Velocidade\n(km/h)')
    ax.set_title(f'Threshold = {thresh} km/h')
    ax.legend()
    ax.grid(True, alpha=0.3)

axes[-1].set_xlabel('Tempo')
plt.tight_layout()
plt.show()

print("\nüí° Visualmente, podemos ver como diferentes thresholds capturam per√≠odos de parada.")


### 2.2 Experimento: Duração Mínima de Parada

**Objetivo**: Encontrar a duração mínima que faz sentido para considerar um período como "parada" (evitar ruído).


In [None]:
from labeling_utils import find_stop_segments

# Experiment: how does the minimum stop duration change the detected stop
# segments? The speed threshold is fixed at 0.5 km/h and several minimum
# durations are compared.
V_STOP = 0.5
min_durations = [10, 30, 60, 120]  # seconds

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Segments found for each minimum duration, kept so the 30 s histogram below
# can reuse them instead of running the segmentation a second time.
segments_by_min_dur = {}
segment_stats = []

for min_dur in min_durations:
    segments = find_stop_segments(df, speed_col='speed_kmh', v_stop=V_STOP,
                                  min_stop_sec=min_dur, gap_sec=5.0)
    segments_by_min_dur[min_dur] = segments

    # With no segments, every statistic is reported as 0 (the 'duration_s'
    # column is not touched, in case an empty result does not carry it).
    has_segments = len(segments) > 0
    durations = segments['duration_s'] if has_segments else None
    segment_stats.append({
        'min_duration': min_dur,
        'n_segments': len(segments),
        'mean_duration': durations.mean() if has_segments else 0,
        'median_duration': durations.median() if has_segments else 0,
        'min_duration_actual': durations.min() if has_segments else 0,
        'max_duration': durations.max() if has_segments else 0,
        'total_time': durations.sum() if has_segments else 0
    })

stats_df = pd.DataFrame(segment_stats)

# Charts 1-3: one bar per minimum duration. Bars sit at categorical positions
# (0, 1, 2, ...) with the duration as tick label. Placing them at the numeric
# x values (10, 30, 60, 120) with the default bar width of 0.8 data units
# renders them as barely visible slivers.
positions = list(range(len(stats_df)))
tick_labels = [str(d) for d in stats_df['min_duration']]
bar_panels = [
    (axes[0, 0], stats_df['n_segments'], 'blue',
     'N√∫mero de Segmentos', 'N√∫mero de Segmentos de Parada por Dura√ß√£o M√≠nima'),
    (axes[0, 1], stats_df['mean_duration'], 'green',
     'Dura√ß√£o M√©dia (segundos)', 'Dura√ß√£o M√©dia dos Segmentos'),
    (axes[1, 0], stats_df['total_time']/3600, 'orange',
     'Tempo Total (horas)', 'Tempo Total em Parada'),
]
for ax, heights, bar_color, ylabel, title in bar_panels:
    ax.bar(positions, heights, alpha=0.7, color=bar_color)
    ax.set_xticks(positions)
    ax.set_xticklabels(tick_labels)
    ax.set_xlabel('Dura√ß√£o M√≠nima (segundos)')
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.grid(True, alpha=0.3, axis='y')

# Chart 4: distribution of segment durations for min_duration = 30 s.
# Reuse the loop result when 30 is among the tested durations.
segments_30s = segments_by_min_dur.get(30)
if segments_30s is None:
    segments_30s = find_stop_segments(df, speed_col='speed_kmh', v_stop=V_STOP,
                                      min_stop_sec=30, gap_sec=5.0)
if len(segments_30s) > 0:
    axes[1, 1].hist(segments_30s['duration_s'], bins=30, edgecolor='black', alpha=0.7)
    axes[1, 1].axvline(30, color='r', linestyle='--', linewidth=2, label='M√≠nimo (30s)')
    axes[1, 1].set_xlabel('Dura√ß√£o do Segmento (segundos)')
    axes[1, 1].set_ylabel('Frequ√™ncia')
    axes[1, 1].set_title('Distribui√ß√£o de Dura√ß√µes (min=30s)')
    axes[1, 1].legend()
    axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("\nüìä CONCLUS√ÉO - Dura√ß√£o M√≠nima:")
print("="*60)
print(stats_df.to_string(index=False))
print("\nüí° Observa√ß√£o: Dura√ß√£o muito baixa captura ru√≠do,")
print("   dura√ß√£o muito alta pode perder paradas curtas leg√≠timas.")


In [None]:
# Isolate stop segments (speed <= 0.5 km/h for at least 30 s) and, for the
# first 10, plot speed and linear acceleration side by side with basic
# acceleration statistics.
V_STOP = 0.5
MIN_STOP_SEC = 30
stop_segments = find_stop_segments(df, speed_col='speed_kmh', v_stop=V_STOP,
                                   min_stop_sec=MIN_STOP_SEC, gap_sec=5.0)

print(f"üìä Segmentos de parada encontrados: {len(stop_segments)}")

# Show at most the first 10 segments.
n_segments_to_show = min(10, len(stop_segments))

# Number segments by position (1, 2, 3, ...) rather than by the DataFrame index
# label: the label is only a usable counter when the index happens to be a
# 0-based integer range, which the segment table is not guaranteed to have.
shown_segments = stop_segments.head(n_segments_to_show)
for seg_number, (_, seg) in enumerate(shown_segments.iterrows(), start=1):
    # Rows of this device that fall inside the segment's time window.
    seg_data = df[
        (df['device_id'] == seg['device_id']) &
        (df['time'] >= seg['t_start']) &
        (df['time'] <= seg['t_end'])
    ].copy().sort_values('time')

    # Skip segments with too few samples or without acceleration data.
    if len(seg_data) < 2 or 'linear_accel_magnitude' not in seg_data.columns:
        continue

    accel = seg_data['linear_accel_magnitude']

    fig, axes = plt.subplots(2, 1, figsize=(14, 6), sharex=True)

    # Top panel: speed with the stop threshold.
    axes[0].plot(seg_data['time'], seg_data['speed_kmh'], 'b-', linewidth=1)
    axes[0].axhline(V_STOP, color='r', linestyle='--', label=f'Threshold ({V_STOP} km/h)')
    axes[0].set_ylabel('Velocidade (km/h)')
    axes[0].set_title(f'Segmento {seg_number} - Device {seg["device_id"]} - Dura√ß√£o: {seg["duration_s"]:.1f}s')
    axes[0].legend()
    axes[0].grid(True, alpha=0.3)

    # Bottom panel: linear acceleration magnitude.
    axes[1].plot(seg_data['time'], accel, 'g-', linewidth=1)
    axes[1].set_ylabel('Acelera√ß√£o Linear (m/s¬≤)')
    axes[1].set_xlabel('Tempo')
    axes[1].grid(True, alpha=0.3)

    # Mean and a +/- 1 standard deviation band drawn over the acceleration.
    accel_mean = accel.mean()
    accel_std = accel.std()
    axes[1].axhline(accel_mean, color='orange', linestyle='--', label=f'M√©dia: {accel_mean:.3f}')
    axes[1].axhline(accel_mean + accel_std, color='red', linestyle=':', alpha=0.5, label=f'¬±1 std')
    axes[1].axhline(accel_mean - accel_std, color='red', linestyle=':', alpha=0.5)
    axes[1].legend()

    plt.tight_layout()
    plt.show()

    print(f"\n   Segmento {seg_number}:")
    print(f"      Dura√ß√£o: {seg['duration_s']:.1f}s")
    print(f"      Accel m√©dia: {accel_mean:.3f} m/s¬≤")
    print(f"      Accel std: {accel_std:.3f} m/s¬≤")
    print(f"      Accel min: {accel.min():.3f} m/s¬≤")
    print(f"      Accel max: {accel.max():.3f} m/s¬≤")


### 3.2 Experimento: Estatísticas de Aceleração

**Objetivo**: Verificar se estatísticas simples (mean, std, RMS, energia) conseguem separar diferentes tipos de parada.
