#### **Tratamento de Dados `Visibilímetro`**

> Port of Santos Center of Excellence
- NebulaPredictor


In [3]:
from tqdm import tqdm
import pandas as pd
import PyPDF2
import os

In [12]:
pdf_files = [f for f in os.listdir('raw') if f.endswith('.pdf')]

cpsp_files = [f for f in pdf_files if 'CPSP' in f]
palmas_files = [f for f in pdf_files if 'PALMAS' in f]
brasiltp_files = [f for f in pdf_files if 'BTP' in f]
praticagem_files = [f for f in pdf_files if 'PRATICAGEM' in f]

In [13]:
def process_pdf_files(files):
    all_data = []
    for pdf_file in files:
        with open(os.path.join('raw', pdf_file), 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            content = ""
            for page in tqdm(pdf_reader.pages, desc=f"R :: {pdf_file}"): content += page.extract_text()
        
        rows = content.strip().split('\n')[1:]

        for row in tqdm(rows, desc=f"P :: {pdf_file}"):
            date_time, visibility = row.rsplit(' ', 1)
            date_time = date_time.strip()
            if len(date_time) > 19: date_time=date_time[:19]
            try: visibility_value = float(visibility)
            except ValueError: visibility_value = float('nan')
            all_data.append([date_time, visibility_value])
            
    return all_data

In [14]:
def create_df(data):
    df = pd.DataFrame(data, columns=['DateTime', 'Visibilidade_m'])
    df['DateTime'] = pd.to_datetime(df['DateTime'], format='mixed', dayfirst=True)
    
    df['Ano'] = df['DateTime'].dt.year
    df['Mes'] = df['DateTime'].dt.month 
    df['Dia'] = df['DateTime'].dt.day
    df['Hora'] = df['DateTime'].dt.hour
    df['Minuto'] = df['DateTime'].dt.minute
    df['Segundo'] = df['DateTime'].dt.second
    
    return df[['DateTime', 'Ano', 'Mes', 'Dia', 'Hora', 'Minuto', 'Segundo', 'Visibilidade_m']]

---

In [15]:
def process_pdf_files(files):
    all_data = []
    for pdf_file in files:
        with open(os.path.join('raw', pdf_file), 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            content = ""
            for page in tqdm(pdf_reader.pages, desc=f"Reading {pdf_file}"): 
                content += page.extract_text()
        
        rows = content.strip().split('\n')[1:]

        for row in tqdm(rows, desc=f"Processing {pdf_file}"):
            try: # Validar se a linha tem pelo menos um espaço
                if ' ' not in row: continue
                    
                date_time, visibility = row.rsplit(' ', 1)
                date_time = date_time.strip()
                
                # Validar formato da data (deve ter pelo menos 10 caracteres: DD/MM/YYYY)
                if len(date_time) < 10 or date_time.lower() == 'm': continue
                    
                if len(date_time) > 19: date_time=date_time[:19]
                
                try: visibility_value=float(visibility)
                except ValueError: visibility_value=float('nan')
                    
                all_data.append([date_time, visibility_value])
            
            except Exception: continue
    return all_data

In [16]:
def create_df(data):
    if not data: return pd.DataFrame(columns=['DateTime', 'Ano', 'Mes', 'Dia', 'Hora', 'Minuto', 'Segundo', 'Visibilidade_m'])
        
    df = pd.DataFrame(data, columns=['DateTime', 'Visibilidade_m'])
    df['DateTime'] = pd.to_datetime(df['DateTime'], format='mixed', dayfirst=True, errors='coerce')
    df = df.dropna(subset=['DateTime'])
    
    df['Ano'] = df['DateTime'].dt.year
    df['Mes'] = df['DateTime'].dt.month 
    df['Dia'] = df['DateTime'].dt.day
    df['Hora'] = df['DateTime'].dt.hour
    df['Minuto'] = df['DateTime'].dt.minute
    df['Segundo'] = df['DateTime'].dt.second
    
    return df[['DateTime', 'Ano', 'Mes', 'Dia', 'Hora', 'Minuto', 'Segundo', 'Visibilidade_m']]

In [82]:
df_cpsp = create_df(process_pdf_files(cpsp_files))

Reading VISIBIL_CPSP_012024.pdf: 100%|██████████| 100/100 [00:46<00:00,  2.17it/s]
Processing VISIBIL_CPSP_012024.pdf: 100%|██████████| 6000/6000 [00:00<00:00, 91628.71it/s]
Reading VISIBIL_CPSP_022024.pdf: 100%|██████████| 100/100 [00:15<00:00,  6.45it/s]
Processing VISIBIL_CPSP_022024.pdf: 100%|██████████| 6000/6000 [00:00<00:00, 17633.17it/s]
Reading VISIBIL_CPSP_032024.pdf: 100%|██████████| 100/100 [00:10<00:00,  9.79it/s]
Processing VISIBIL_CPSP_032024.pdf: 100%|██████████| 5700/5700 [00:00<00:00, 120259.82it/s]
Reading VISIBIL_CPSP_042024.pdf: 100%|██████████| 100/100 [00:10<00:00,  9.79it/s]
Processing VISIBIL_CPSP_042024.pdf: 100%|██████████| 5700/5700 [00:00<00:00, 147852.99it/s]
Reading VISIBIL_CPSP_052024.pdf: 100%|██████████| 100/100 [00:12<00:00,  7.91it/s]
Processing VISIBIL_CPSP_052024.pdf: 100%|██████████| 6000/6000 [00:00<00:00, 116790.15it/s]
Reading VISIBIL_CPSP_062024.pdf: 100%|██████████| 100/100 [00:10<00:00,  9.12it/s]
Processing VISIBIL_CPSP_062024.pdf: 100%|███

In [83]:
df_palmas = create_df(process_pdf_files(palmas_files))

Reading VISIBIL_PALMAS_062024.pdf: 100%|██████████| 100/100 [00:23<00:00,  4.24it/s]
Processing VISIBIL_PALMAS_062024.pdf: 100%|██████████| 11298/11298 [00:00<00:00, 138363.79it/s]
Reading VISIBIL_PALMAS_072024.pdf: 100%|██████████| 100/100 [00:26<00:00,  3.81it/s]
Processing VISIBIL_PALMAS_072024.pdf: 100%|██████████| 10998/10998 [00:00<00:00, 244856.23it/s]
Reading VISIBIL_PALMAS_082024.pdf: 100%|██████████| 100/100 [00:23<00:00,  4.32it/s]
Processing VISIBIL_PALMAS_082024.pdf: 100%|██████████| 11097/11097 [00:00<00:00, 98658.65it/s]
Reading VISIBIL_PALMAS_092024.pdf: 100%|██████████| 100/100 [00:20<00:00,  4.76it/s]
Processing VISIBIL_PALMAS_092024.pdf: 100%|██████████| 9996/9996 [00:00<00:00, 108590.54it/s]
Reading VISIBIL_PALMAS_102024.pdf: 100%|██████████| 100/100 [00:33<00:00,  2.99it/s]
Processing VISIBIL_PALMAS_102024.pdf: 100%|██████████| 11299/11299 [00:00<00:00, 118991.16it/s]


In [18]:
df_brasiltp = create_df(process_pdf_files(brasiltp_files))

Reading VISIBIL_BTP_012024.pdf: 100%|██████████| 100/100 [00:10<00:00,  9.95it/s]
Processing VISIBIL_BTP_012024.pdf: 100%|██████████| 6000/6000 [00:00<00:00, 91624.70it/s]
Reading VISIBIL_BTP_022024.pdf: 100%|██████████| 100/100 [00:21<00:00,  4.69it/s]
Processing VISIBIL_BTP_022024.pdf: 100%|██████████| 6000/6000 [00:00<00:00, 545825.36it/s]
Reading VISIBIL_BTP_032024.pdf: 100%|██████████| 100/100 [00:08<00:00, 11.34it/s]
Processing VISIBIL_BTP_032024.pdf: 100%|██████████| 5700/5700 [00:00<00:00, 563936.71it/s]
Reading VISIBIL_BTP_042024.pdf: 100%|██████████| 100/100 [00:06<00:00, 14.56it/s]
Processing VISIBIL_BTP_042024.pdf: 100%|██████████| 5700/5700 [00:00<00:00, 195645.86it/s]
Reading VISIBIL_BTP_052024.pdf: 100%|██████████| 100/100 [00:11<00:00,  8.56it/s]
Processing VISIBIL_BTP_052024.pdf: 100%|██████████| 6000/6000 [00:00<00:00, 386530.23it/s]
Reading VISIBIL_BTP_062024.pdf: 100%|██████████| 100/100 [00:10<00:00,  9.98it/s]
Processing VISIBIL_BTP_062024.pdf: 100%|██████████| 60

In [84]:
df_praticagem = create_df(process_pdf_files(praticagem_files))

Reading VISIBIL_PRATICAGEM_012024.pdf: 100%|██████████| 100/100 [00:21<00:00,  4.73it/s]
Processing VISIBIL_PRATICAGEM_012024.pdf: 100%|██████████| 11099/11099 [00:00<00:00, 277647.38it/s]
Reading VISIBIL_PRATICAGEM_022024.pdf: 100%|██████████| 100/100 [00:23<00:00,  4.27it/s]
Processing VISIBIL_PRATICAGEM_022024.pdf: 100%|██████████| 12198/12198 [00:00<00:00, 358974.48it/s]
Reading VISIBIL_PRATICAGEM_032024.pdf: 100%|██████████| 100/100 [00:22<00:00,  4.45it/s]
Processing VISIBIL_PRATICAGEM_032024.pdf: 100%|██████████| 11699/11699 [00:00<00:00, 292761.46it/s]
Reading VISIBIL_PRATICAGEM_042024.pdf: 100%|██████████| 100/100 [00:23<00:00,  4.19it/s]
Processing VISIBIL_PRATICAGEM_042024.pdf: 100%|██████████| 11899/11899 [00:00<00:00, 360813.06it/s]
Reading VISIBIL_PRATICAGEM_052024.pdf: 100%|██████████| 100/100 [00:16<00:00,  5.89it/s]
Processing VISIBIL_PRATICAGEM_052024.pdf: 100%|██████████| 10899/10899 [00:00<00:00, 340694.59it/s]
Reading VISIBIL_PRATICAGEM_062024.pdf: 100%|██████████|

In [19]:
df_cpsp.to_csv(r'data\Visibil CPSP 01-10 (2024).csv', index=False, encoding='UTF-8')
df_cpsp.to_excel(r'data\Visibil CPSP 01-10 (2024).xlsx', index=False, engine='openpyxl')

df_palmas.to_csv(r'data\Visibil PALM 06-10 (2024).csv', index=False, encoding='UTF-8')
df_palmas.to_excel(r'data\Visibil PALM 06-10 (2024).xlsx', index=False, engine='openpyxl')

df_praticagem.to_csv(r'data\Visibil PRAT 01-10 (2024).csv', index=False, encoding='UTF-8')
df_praticagem.to_excel(r'data\Visibil PRAT 01-10 (2024).xlsx', index=False, engine='openpyxl')

df_brasiltp.to_csv(r'data\Visibil BRTP 01-10 (2024).csv', index=False, encoding='UTF-8')
df_brasiltp.to_excel(r'data\Visibil BRTP 01-10 (2024).xlsx', index=False, engine='openpyxl')

In [53]:
df_brasiltp.describe()

Unnamed: 0,DateTime,Ano,Mes,Dia,Hora,Minuto,Segundo,Visibilidade_m
count,58500,58500.0,58500.0,58500.0,58500.0,58500.0,58500.0,58500.0
mean,2024-05-16 23:27:21.959487232,2024.0,5.471795,1.548444,11.152855,29.415453,15.001538,1998.14776
min,2024-01-01 00:00:00,2024.0,1.0,1.0,0.0,0.0,0.0,1997.8
25%,2024-03-01 22:15:22.500000,2024.0,3.0,1.0,5.0,14.0,0.0,1998.08
50%,2024-05-03 01:36:15,2024.0,5.0,2.0,11.0,29.0,30.0,1998.14
75%,2024-08-01 23:30:07.500000,2024.0,8.0,2.0,17.0,44.0,30.0,1998.2
max,2024-10-03 00:22:30,2024.0,10.0,3.0,23.0,59.0,30.0,1999.02
std,,0.0,2.870381,0.559059,7.064304,17.301114,15.000128,0.123013


In [52]:
df_brasiltp.iloc[55500:55560, :]

Unnamed: 0,DateTime,Ano,Mes,Dia,Hora,Minuto,Segundo,Visibilidade_m
55500,2024-10-01 22:53:30,2024,10,1,22,53,30,1998.14
55501,2024-10-01 22:54:00,2024,10,1,22,54,0,1998.11
55502,2024-10-01 22:54:30,2024,10,1,22,54,30,1998.11
55503,2024-10-01 22:55:00,2024,10,1,22,55,0,1998.05
55504,2024-10-01 22:55:30,2024,10,1,22,55,30,1998.08
55505,2024-10-01 22:56:00,2024,10,1,22,56,0,1998.11
55506,2024-10-01 22:56:30,2024,10,1,22,56,30,1998.08
55507,2024-10-01 22:57:00,2024,10,1,22,57,0,1998.08
55508,2024-10-01 22:57:30,2024,10,1,22,57,30,1998.14
55509,2024-10-01 22:58:00,2024,10,1,22,58,0,1998.11
