## 1️⃣ Setup - Clonar Repositório e Verificar GPU

In [None]:
# Clonar reposit√≥rio do GitHub
!git clone https://github.com/gabrielamds/k-means-1d.git
%cd k-means-1d

# Verificar GPU dispon√≠vel
!nvidia-smi --query-gpu=name,memory.total,compute_cap --format=csv

## 2️⃣ Gerar Datasets

Gera os mesmos dados usados no Windows (seeds 42, 43, 44):

In [None]:
# Gerar os 3 datasets
%cd data
!python3 generate_data.py --N 10000 --K 4 --output dados_pequeno --seed 42
!python3 generate_data.py --N 100000 --K 8 --output dados_medio --seed 43
!python3 generate_data.py --N 1000000 --K 16 --output dados_grande --seed 44

# Confirmar que foram criados
print("\nDatasets criados:")
!ls -lh *.csv
%cd ..

## 3️⃣ Compilar CUDA

In [None]:
%%bash
# Build the pure-CUDA k-means binary inside cuda/.
# set -e aborts at the first failing command, so the success message below is
# printed only when nvcc actually produced the binary.
set -e
cd cuda
# The target architecture defaults to sm_75; set the CUDA_ARCH environment
# variable (e.g. `%env CUDA_ARCH=sm_80` in an earlier cell) to build for another GPU.
nvcc -O2 -arch="${CUDA_ARCH:-sm_75}" kmeans_1d_cuda.cu -o kmeans_1d_cuda
echo "✓ CUDA compilado com sucesso"
ls -lh kmeans_1d_cuda

---

# 📊 PARTE 1: CUDA PURO

## Benchmark CUDA - 3 Datasets

In [None]:
%%bash
# Pure-CUDA benchmark over the three generated datasets.
#
# run_cuda <stem> <banner>
#   Prints an empty line and <banner>, then runs the CUDA binary on
#   data/dados_<stem>.csv with its matching initial-centroid file and the fixed
#   trailing arguments 50 1e-6 256 (the last one is presumably threads per
#   block, per the sweep cell that varies it -- confirm against the binary).
run_cuda() {
    echo ""
    echo "$2"
    cuda/kmeans_1d_cuda "data/dados_${1}.csv" "data/dados_${1}_centroides_init.csv" 50 1e-6 256
}

rule="========================================="
echo "$rule"
echo "BENCHMARK: CUDA"
echo "$rule"

run_cuda pequeno "--- Dataset PEQUENO (10K, K=4) ---"
run_cuda medio   "--- Dataset M√âDIO (100K, K=8) ---"
run_cuda grande  "--- Dataset GRANDE (1M, K=16) ---"

## Testar Diferentes Configurações (Threads per Block)

In [None]:
%%bash
# Sweep the threads-per-block argument on the large dataset and show only the
# output lines containing "Tempo" for each setting.
echo "=== Variando Threads per Block (Dataset Grande) ==="

points=data/dados_grande.csv
centroids=data/dados_grande_centroides_init.csv
tpb_values=(128 256 512 1024)

for tpb in "${tpb_values[@]}"; do
    echo ""
    echo "Threads/Block: $tpb"
    cuda/kmeans_1d_cuda "$points" "$centroids" 50 1e-6 "$tpb" | grep "Tempo"
done

---

# 📊 PARTE 2: HYBRID (OpenMP + CUDA)

## Compilar OpenMP + CUDA

In [None]:
%%bash
# Build the OpenMP + CUDA hybrid binary inside hybrid/.
# set -e aborts at the first failing command, so the success message below is
# printed only when nvcc actually produced the binary.
set -e
cd hybrid
# -Xcompiler -fopenmp forwards the OpenMP flag to the host compiler.
# The target architecture defaults to sm_75; set the CUDA_ARCH environment
# variable (e.g. `%env CUDA_ARCH=sm_80` in an earlier cell) to build for another GPU.
nvcc -O2 -arch="${CUDA_ARCH:-sm_75}" -Xcompiler -fopenmp kmeans_1d_omp_cuda.cu -o kmeans_1d_omp_cuda
echo "✓ OpenMP + CUDA compilado"
ls -lh kmeans_1d_omp_cuda

## Benchmark OpenMP + CUDA

In [None]:
%%bash
# OpenMP + CUDA benchmark: each dataset is run with 1, 2, 4 and 8 OpenMP threads
# (set through OMP_NUM_THREADS); only output lines containing "Tempo" are shown.
rule="========================================="
echo "$rule"
echo "BENCHMARK: HYBRID (OpenMP + CUDA)"
echo "$rule"

# Each entry is "<file stem>:<N label>:<K label>"; the labels are display-only.
for entry in "pequeno:10K:4" "medio:100K:8" "grande:1M:16"; do
    stem=${entry%%:*}
    labels=${entry#*:}
    size=${labels%%:*}
    clusters=${labels#*:}

    echo ""
    echo "=== Dataset: ${stem^^} (N=$size, K=$clusters) ==="

    for omp_threads in 1 2 4 8; do
        echo "  OpenMP $omp_threads threads + CUDA:"
        OMP_NUM_THREADS=$omp_threads hybrid/kmeans_1d_omp_cuda \
            "data/dados_${stem}.csv" "data/dados_${stem}_centroides_init.csv" \
            50 1e-6 256 static 0 | grep "Tempo"
    done
done

---

# 📊 PARTE 3: HYBRID (OpenMP + MPI) - CPU apenas

## Instalar OpenMPI

In [None]:
# Install the OpenMPI runtime and development packages; this must run before
# the cells below, which call mpicc and mpirun.
!apt-get update -qq
!apt-get install -y openmpi-bin libopenmpi-dev -qq
print("✓ OpenMPI instalado")

## Compilar OpenMP + MPI

In [None]:
%%bash
# Build the CPU-only OpenMP + MPI binary inside hybrid/.
# set -e aborts at the first failing command, so the success message below is
# printed only when mpicc actually produced the binary.
set -e
cd hybrid
mpicc -O2 -fopenmp kmeans_1d_omp_mpi.c -o kmeans_1d_omp_mpi -lm
echo "✓ OpenMP + MPI compilado"
ls -lh kmeans_1d_omp_mpi

## Benchmark OpenMP + MPI

In [None]:
%%bash
# OpenMP + MPI benchmark (CPU only). Requires the binary built in the previous
# cell. Only output lines containing "Tempo" are shown.
echo "========================================="
echo "BENCHMARK: HYBRID (OpenMP + MPI)"
echo "========================================="

# Each entry is "<file stem>:<N label>:<K label>"; the labels are display-only.
datasets=("pequeno:10K:4" "medio:100K:8" "grande:1M:16")

for ds in "${datasets[@]}"; do
    IFS=':' read -r name N K <<< "$ds"
    echo ""
    echo "=== Dataset: ${name^^} (N=$N, K=$K) ==="

    # Try different MPI-process x OpenMP-thread combinations.
    # --oversubscribe lets mpirun start more ranks than it has slots for.
    echo ""
    echo "  1 processo MPI x 2 threads OpenMP:"
    OMP_NUM_THREADS=2 mpirun -np 1 --allow-run-as-root --oversubscribe \
        hybrid/kmeans_1d_omp_mpi \
        data/dados_$name.csv data/dados_${name}_centroides_init.csv \
        50 1e-6 | grep "Tempo"

    echo "  2 processos MPI x 1 thread OpenMP:"
    OMP_NUM_THREADS=1 mpirun -np 2 --allow-run-as-root --oversubscribe \
        hybrid/kmeans_1d_omp_mpi \
        data/dados_$name.csv data/dados_${name}_centroides_init.csv \
        50 1e-6 | grep "Tempo"

    echo "  2 processos MPI x 2 threads OpenMP:"
    OMP_NUM_THREADS=2 mpirun -np 2 --allow-run-as-root --oversubscribe \
        hybrid/kmeans_1d_omp_mpi \
        data/dados_$name.csv data/dados_${name}_centroides_init.csv \
        50 1e-6 | grep "Tempo"
done

---

# 📊 PARTE 4: HYBRID (MPI + CUDA)

## Compilar MPI + CUDA

In [None]:
%%bash
# Build the MPI + CUDA hybrid binary inside hybrid/.
# set -e aborts at the first failing command, so the success message below is
# printed only when nvcc actually produced the binary.
set -e
cd hybrid
# The -I / -L paths point at the OpenMPI headers and libraries under
# /usr/lib/x86_64-linux-gnu/openmpi; adjust them if OpenMPI lives elsewhere.
# The target architecture defaults to sm_75; set the CUDA_ARCH environment
# variable (e.g. `%env CUDA_ARCH=sm_80` in an earlier cell) to build for another GPU.
nvcc -O2 -arch="${CUDA_ARCH:-sm_75}" -I/usr/lib/x86_64-linux-gnu/openmpi/include \
    kmeans_1d_mpi_cuda.cu -o kmeans_1d_mpi_cuda \
    -L/usr/lib/x86_64-linux-gnu/openmpi/lib -lmpi
echo "✓ MPI + CUDA compilado"
ls -lh kmeans_1d_mpi_cuda

## Benchmark MPI + CUDA

In [None]:
%%bash
# MPI + CUDA benchmark: each dataset is run with 1, 2 and 4 MPI processes;
# only output lines containing "Tempo" are shown.
echo "========================================="
echo "BENCHMARK: HYBRID (MPI + CUDA)"
echo "========================================="

# Each entry is "<file stem>:<N label>:<K label>"; the labels are display-only.
datasets=("pequeno:10K:4" "medio:100K:8" "grande:1M:16")

for ds in "${datasets[@]}"; do
    IFS=':' read -r name N K <<< "$ds"
    echo ""
    echo "=== Dataset: ${name^^} (N=$N, K=$K) ==="

    for PROCS in 1 2 4; do
        echo "  MPI $PROCS processos + CUDA:"
        # --oversubscribe matches every other mpirun call in this notebook:
        # without it mpirun refuses to start more ranks than available slots,
        # which makes the 2- and 4-process runs fail on a small VM.
        mpirun -np $PROCS --allow-run-as-root --oversubscribe hybrid/kmeans_1d_mpi_cuda \
            data/dados_$name.csv data/dados_${name}_centroides_init.csv \
            50 1e-6 256 | grep "Tempo"
    done
done

---

# 📊 COMPARAÇÃO FINAL

In [None]:
import subprocess
import re
import pandas as pd

def extract_time(cmd):
    """Run ``cmd`` in a shell and return the time it reports, in milliseconds.

    The value is taken from the first place in the command's stdout that
    matches ``Tempo...: <number> ms``.

    Args:
        cmd: Shell command line to execute.

    Returns:
        The reported time as a float, or None when stdout contains no parsable
        "Tempo ... ms" value. In that case a one-line warning with the exit
        status (and the last stderr line, if any) is printed so that a failed
        run is visible instead of silently disappearing from the results.
    """
    result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
    match = re.search(r'Tempo.*?:\s*([\d.]+)\s*ms', result.stdout)
    if match:
        try:
            return float(match.group(1))
        except ValueError:
            # [\d.]+ also matches malformed text such as "1.2.3"; report it
            # as "no time found" below rather than crashing the whole cell.
            pass

    last_stderr = result.stderr.strip().splitlines()[-1:]
    detail = f" | stderr: {last_stderr[0]}" if last_stderr else ""
    print(f"AVISO: sem tempo em stdout (status {result.returncode}): {cmd}{detail}")
    return None

# Final comparison on the large dataset: time each implementation once with
# extract_time() and tabulate whatever produced a result.
print("="*70)
print("RESUMO COMPARATIVO - Dataset GRANDE (1M pontos, K=16)")
print("="*70 + "\n")

results = []

# Pure CUDA
print("Testando CUDA...")
time_cuda = extract_time("cuda/kmeans_1d_cuda data/dados_grande.csv data/dados_grande_centroides_init.csv 50 1e-6 256")
results.append(('CUDA (256 TPB)', time_cuda))

# OpenMP + CUDA
# The OpenMP thread count is set only through OMP_NUM_THREADS. The trailing
# arguments mirror the OpenMP + CUDA benchmark cell ("256 static 0"); the
# previous "{t} 256" put the thread count in the threads-per-block position.
# NOTE(review): confirm the argument order against kmeans_1d_omp_cuda's usage.
for t in [1, 2, 4, 8]:
    print(f"Testando OpenMP({t}t) + CUDA...")
    time_hybrid = extract_time(
        f"OMP_NUM_THREADS={t} hybrid/kmeans_1d_omp_cuda "
        f"data/dados_grande.csv data/dados_grande_centroides_init.csv "
        f"50 1e-6 256 static 0"
    )
    results.append((f'OpenMP({t}t) + CUDA', time_hybrid))

# OpenMP + MPI (CPU only): (OpenMP threads, MPI processes) combinations.
for t, p in [(2, 1), (1, 2), (2, 2)]:
    print(f"Testando OpenMP({t}t) + MPI({p}p)...")
    time_hybrid = extract_time(
        f"OMP_NUM_THREADS={t} mpirun -np {p} --allow-run-as-root --oversubscribe "
        f"hybrid/kmeans_1d_omp_mpi "
        f"data/dados_grande.csv data/dados_grande_centroides_init.csv 50 1e-6"
    )
    results.append((f'OpenMP({t}t) + MPI({p}p)', time_hybrid))

# MPI + CUDA
for p in [1, 2, 4]:
    print(f"Testando MPI({p}p) + CUDA...")
    time_hybrid = extract_time(
        f"mpirun -np {p} --allow-run-as-root --oversubscribe "
        f"hybrid/kmeans_1d_mpi_cuda "
        f"data/dados_grande.csv data/dados_grande_centroides_init.csv 50 1e-6 256"
    )
    results.append((f'MPI({p}p) + CUDA', time_hybrid))

# Build the table; runs for which no time could be extracted (None) are dropped.
df = pd.DataFrame(results, columns=['Implementação', 'Tempo (ms)'])
df = df[df['Tempo (ms)'].notna()]

print("\n" + "="*70)
print(df.to_string(index=False))
print("="*70)

# Report the fastest implementation, if at least one run produced a time.
if not df.empty:
    fastest = df.loc[df['Tempo (ms)'].idxmin()]
    print(f"\n🏆 Mais rápido: {fastest['Implementação']} com {fastest['Tempo (ms)']:.2f} ms")

---

## ✅ Benchmark Completo!

Todos os testes foram executados com:
- ✅ CUDA puro (GPU)
- ✅ OpenMP + CUDA (CPU multi-thread + GPU)
- ✅ OpenMP + MPI (CPU apenas)
- ✅ MPI + CUDA (Distribuído + GPU)

**Próximos passos:**
1. Analisar os resultados
2. Comparar com resultados do Windows (Serial, OpenMP, MPI)
3. Gerar gráficos de speedup
4. Documentar no relatório