#### **Tratamento de Dados `Visibilímetro`**

> Port of Santos Center of Excellence
- NebulaPredictor


In [76]:
from tqdm import tqdm
import pandas as pd
import PyPDF2
import os

In [77]:
# Collect the PDF reports found in the 'raw' folder. os.listdir() returns names
# in arbitrary order, so the listing is sorted to keep the processing order (and
# therefore the row order of the tables built from it) the same on every run.
pdf_files = sorted(f for f in os.listdir('raw') if f.endswith('.pdf'))

# Split the reports into groups according to the tag contained in the file name.
cpsp_files = [f for f in pdf_files if 'CPSP' in f]
palmas_files = [f for f in pdf_files if 'PALMAS' in f]
praticagem_files = [f for f in pdf_files if 'PRATICAGEM' in f]

In [78]:
def process_pdf_files(files, raw_dir='raw'):
    """Extract ``[date_time, visibility]`` pairs from a list of PDF reports.

    Note: a later cell of this notebook defines ``process_pdf_files`` again;
    once that cell has run, the later definition is the one in effect.

    Args:
        files: File names (not paths) of the PDFs to read, looked up in
            ``raw_dir``.
        raw_dir: Folder that contains the PDFs. Defaults to ``'raw'``.

    Returns:
        A list of ``[date_time, visibility_value]`` lists, in reading order.
        ``date_time`` is the text before the last space of the line, cut to
        at most 19 characters; ``visibility_value`` is the text after the last
        space converted to ``float``, or NaN when it is not a number.
    """
    all_data = []
    for pdf_file in files:
        with open(os.path.join(raw_dir, pdf_file), 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            page_texts = [
                page.extract_text() or ""
                for page in tqdm(pdf_reader.pages, desc=f"R :: {pdf_file}")
            ]

        # Join pages with a newline: gluing them directly could fuse the last
        # line of one page with the first line of the next one.
        content = "\n".join(page_texts)

        # The first line of the document is dropped (treated as a header).
        rows = content.strip().split('\n')[1:]

        for row in tqdm(rows, desc=f"P :: {pdf_file}"):
            # A line without any space cannot be split into the two fields;
            # unpacking it would raise ValueError, so it is skipped instead.
            if ' ' not in row:
                continue

            date_time, visibility = row.rsplit(' ', 1)
            date_time = date_time.strip()[:19]

            try:
                visibility_value = float(visibility)
            except ValueError:
                visibility_value = float('nan')

            all_data.append([date_time, visibility_value])

    return all_data

In [79]:
def create_df(data):
    """Build a DataFrame of visibility readings with calendar columns.

    Note: a later cell of this notebook defines ``create_df`` again; once that
    cell has run, the later definition is the one in effect.

    Args:
        data: Rows of ``[date_time, visibility]``; they become the
            ``DateTime`` and ``Visibilidade_m`` columns.

    Returns:
        A DataFrame with the columns ``DateTime``, ``Ano``, ``Mes``, ``Dia``,
        ``Hora``, ``Minuto``, ``Segundo`` and ``Visibilidade_m``, in that order.

    Raises:
        Whatever ``pd.to_datetime`` raises for a value it cannot parse; no
        ``errors=`` override is passed here.
    """
    frame = pd.DataFrame(data, columns=['DateTime', 'Visibilidade_m'])
    frame['DateTime'] = pd.to_datetime(frame['DateTime'], format='mixed', dayfirst=True)

    # Output column name -> attribute of the .dt accessor that feeds it.
    calendar_parts = {
        'Ano': 'year',
        'Mes': 'month',
        'Dia': 'day',
        'Hora': 'hour',
        'Minuto': 'minute',
        'Segundo': 'second',
    }
    stamps = frame['DateTime'].dt
    for column, attribute in calendar_parts.items():
        frame[column] = getattr(stamps, attribute)

    ordered_columns = ['DateTime', *calendar_parts, 'Visibilidade_m']
    return frame[ordered_columns]

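---

> **Note:** the two cells below redefine `process_pdf_files` and `create_df` with extra validation (malformed rows are skipped and rows whose date cannot be parsed are dropped). These later definitions are the ones used by the rest of the notebook.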

In [80]:
def process_pdf_files(files, raw_dir='raw'):
    """Extract validated ``[date_time, visibility]`` pairs from PDF reports.

    Args:
        files: File names (not paths) of the PDFs to read, looked up in
            ``raw_dir``.
        raw_dir: Folder that contains the PDFs. Defaults to ``'raw'``.

    Returns:
        A list of ``[date_time, visibility_value]`` lists, in reading order.
        ``date_time`` is the text before the last space of the line, cut to
        at most 19 characters; ``visibility_value`` is the text after the last
        space converted to ``float``, or NaN when it is not a number. Lines
        that have no space, or whose date part is shorter than 10 characters,
        are skipped.
    """
    min_date_len = 10   # a date needs at least DD/MM/YYYY
    max_stamp_len = 19  # presumably DD/MM/YYYY HH:MM:SS; anything beyond is cut

    all_data = []
    for pdf_file in files:
        with open(os.path.join(raw_dir, pdf_file), 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            page_texts = [
                page.extract_text() or ""
                for page in tqdm(pdf_reader.pages, desc=f"Reading {pdf_file}")
            ]

        # Join pages with a newline. Gluing them directly could fuse the last
        # line of a page with the first line of the next; the truncation below
        # would then pair a timestamp with the following line's value.
        content = "\n".join(page_texts)

        # The first line of the document is dropped (treated as a header).
        rows = content.strip().split('\n')[1:]

        for row in tqdm(rows, desc=f"Processing {pdf_file}"):
            # A line needs at least one space to hold both fields.
            if ' ' not in row:
                continue

            date_time, visibility = row.rsplit(' ', 1)
            date_time = date_time.strip()

            # Too short to be a date (this also rejects a lone 'm' fragment).
            if len(date_time) < min_date_len:
                continue

            date_time = date_time[:max_stamp_len]

            try:
                visibility_value = float(visibility)
            except ValueError:
                visibility_value = float('nan')

            all_data.append([date_time, visibility_value])

    return all_data

In [81]:
def create_df(data):
    """Build a DataFrame of visibility readings with calendar columns.

    Values of the date column that cannot be parsed become NaT
    (``errors='coerce'``) and their rows are dropped. Empty input goes through
    the same path as any other input, so the resulting empty frame has a real
    datetime ``DateTime`` column instead of untyped object columns.

    Args:
        data: Rows of ``[date_time, visibility]``; they become the
            ``DateTime`` and ``Visibilidade_m`` columns. May be empty.

    Returns:
        A DataFrame with the columns ``DateTime``, ``Ano``, ``Mes``, ``Dia``,
        ``Hora``, ``Minuto``, ``Segundo`` and ``Visibilidade_m``, in that
        order, indexed 0..n-1.
    """
    df = pd.DataFrame(data, columns=['DateTime', 'Visibilidade_m'])
    df['DateTime'] = pd.to_datetime(df['DateTime'], format='mixed', dayfirst=True, errors='coerce')

    # Drop rows without a usable timestamp and renumber the remaining ones so
    # the index has no gaps where rows were removed.
    df = df.dropna(subset=['DateTime']).reset_index(drop=True)

    stamps = df['DateTime'].dt
    df['Ano'] = stamps.year
    df['Mes'] = stamps.month
    df['Dia'] = stamps.day
    df['Hora'] = stamps.hour
    df['Minuto'] = stamps.minute
    df['Segundo'] = stamps.second

    return df[['DateTime', 'Ano', 'Mes', 'Dia', 'Hora', 'Minuto', 'Segundo', 'Visibilidade_m']]

In [82]:
# Parse the CPSP reports into raw rows, then build the table from those rows.
cpsp_rows = process_pdf_files(cpsp_files)
df_cpsp = create_df(cpsp_rows)

Reading VISIBIL_CPSP_012024.pdf: 100%|██████████| 100/100 [00:46<00:00,  2.17it/s]
Processing VISIBIL_CPSP_012024.pdf: 100%|██████████| 6000/6000 [00:00<00:00, 91628.71it/s]
Reading VISIBIL_CPSP_022024.pdf: 100%|██████████| 100/100 [00:15<00:00,  6.45it/s]
Processing VISIBIL_CPSP_022024.pdf: 100%|██████████| 6000/6000 [00:00<00:00, 17633.17it/s]
Reading VISIBIL_CPSP_032024.pdf: 100%|██████████| 100/100 [00:10<00:00,  9.79it/s]
Processing VISIBIL_CPSP_032024.pdf: 100%|██████████| 5700/5700 [00:00<00:00, 120259.82it/s]
Reading VISIBIL_CPSP_042024.pdf: 100%|██████████| 100/100 [00:10<00:00,  9.79it/s]
Processing VISIBIL_CPSP_042024.pdf: 100%|██████████| 5700/5700 [00:00<00:00, 147852.99it/s]
Reading VISIBIL_CPSP_052024.pdf: 100%|██████████| 100/100 [00:12<00:00,  7.91it/s]
Processing VISIBIL_CPSP_052024.pdf: 100%|██████████| 6000/6000 [00:00<00:00, 116790.15it/s]
Reading VISIBIL_CPSP_062024.pdf: 100%|██████████| 100/100 [00:10<00:00,  9.12it/s]
Processing VISIBIL_CPSP_062024.pdf: 100%|███

In [83]:
# Parse the PALMAS reports into raw rows, then build the table from those rows.
palmas_rows = process_pdf_files(palmas_files)
df_palmas = create_df(palmas_rows)

Reading VISIBIL_PALMAS_062024.pdf: 100%|██████████| 100/100 [00:23<00:00,  4.24it/s]
Processing VISIBIL_PALMAS_062024.pdf: 100%|██████████| 11298/11298 [00:00<00:00, 138363.79it/s]
Reading VISIBIL_PALMAS_072024.pdf: 100%|██████████| 100/100 [00:26<00:00,  3.81it/s]
Processing VISIBIL_PALMAS_072024.pdf: 100%|██████████| 10998/10998 [00:00<00:00, 244856.23it/s]
Reading VISIBIL_PALMAS_082024.pdf: 100%|██████████| 100/100 [00:23<00:00,  4.32it/s]
Processing VISIBIL_PALMAS_082024.pdf: 100%|██████████| 11097/11097 [00:00<00:00, 98658.65it/s]
Reading VISIBIL_PALMAS_092024.pdf: 100%|██████████| 100/100 [00:20<00:00,  4.76it/s]
Processing VISIBIL_PALMAS_092024.pdf: 100%|██████████| 9996/9996 [00:00<00:00, 108590.54it/s]
Reading VISIBIL_PALMAS_102024.pdf: 100%|██████████| 100/100 [00:33<00:00,  2.99it/s]
Processing VISIBIL_PALMAS_102024.pdf: 100%|██████████| 11299/11299 [00:00<00:00, 118991.16it/s]


In [84]:
# Parse the PRATICAGEM reports into raw rows, then build the table from those rows.
praticagem_rows = process_pdf_files(praticagem_files)
df_praticagem = create_df(praticagem_rows)

Reading VISIBIL_PRATICAGEM_012024.pdf: 100%|██████████| 100/100 [00:21<00:00,  4.73it/s]
Processing VISIBIL_PRATICAGEM_012024.pdf: 100%|██████████| 11099/11099 [00:00<00:00, 277647.38it/s]
Reading VISIBIL_PRATICAGEM_022024.pdf: 100%|██████████| 100/100 [00:23<00:00,  4.27it/s]
Processing VISIBIL_PRATICAGEM_022024.pdf: 100%|██████████| 12198/12198 [00:00<00:00, 358974.48it/s]
Reading VISIBIL_PRATICAGEM_032024.pdf: 100%|██████████| 100/100 [00:22<00:00,  4.45it/s]
Processing VISIBIL_PRATICAGEM_032024.pdf: 100%|██████████| 11699/11699 [00:00<00:00, 292761.46it/s]
Reading VISIBIL_PRATICAGEM_042024.pdf: 100%|██████████| 100/100 [00:23<00:00,  4.19it/s]
Processing VISIBIL_PRATICAGEM_042024.pdf: 100%|██████████| 11899/11899 [00:00<00:00, 360813.06it/s]
Reading VISIBIL_PRATICAGEM_052024.pdf: 100%|██████████| 100/100 [00:16<00:00,  5.89it/s]
Processing VISIBIL_PRATICAGEM_052024.pdf: 100%|██████████| 10899/10899 [00:00<00:00, 340694.59it/s]
Reading VISIBIL_PRATICAGEM_062024.pdf: 100%|██████████|

In [85]:
# Write every table to CSV and Excel inside the 'data' folder.
# Paths are built with os.path.join rather than a hard-coded backslash, which
# is only a path separator on Windows; elsewhere it would become part of the
# file name. The folder is created first so the writes do not fail when it is
# missing.
output_dir = 'data'
os.makedirs(output_dir, exist_ok=True)

# Output file stem -> table to export.
exports = {
    'Visibil CPSP 01-10 (2024)': df_cpsp,
    'Visibil PALM 06-10 (2024)': df_palmas,
    'Visibil PRAT 01-10 (2024)': df_praticagem,
}

for stem, frame in exports.items():
    frame.to_csv(os.path.join(output_dir, f'{stem}.csv'), index=False, encoding='UTF-8')
    frame.to_excel(os.path.join(output_dir, f'{stem}.xlsx'), index=False, engine='openpyxl')

In [87]:
# Show the PRATICAGEM table as the cell output for a quick visual check.
df_praticagem

Unnamed: 0,DateTime,Ano,Mes,Dia,Hora,Minuto,Segundo,Visibilidade_m
0,2024-01-01 00:00:00,2024,1,1,0,0,0,2000.0
1,2024-01-01 00:01:00,2024,1,1,0,1,0,2000.0
2,2024-01-01 00:02:00,2024,1,1,0,2,0,2000.0
3,2024-01-01 00:03:00,2024,1,1,0,3,0,2000.0
4,2024-01-01 00:04:00,2024,1,1,0,4,0,2000.0
...,...,...,...,...,...,...,...,...
115475,2024-10-09 04:56:00,2024,10,9,4,56,0,2000.0
115476,2024-10-09 04:57:00,2024,10,9,4,57,0,2000.0
115477,2024-10-09 04:58:00,2024,10,9,4,58,0,2000.0
115478,2024-10-09 04:59:00,2024,10,9,4,59,0,2000.0
