In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime, timedelta

In [20]:
def fetch_ogimet_data(station_id, start_date, end_date):
    all_data = []
    diagnostics = []
    headers = []

    current_date = start_date
    while current_date < end_date:
        next_date = current_date + timedelta(days=50)
        if next_date > end_date:
            next_date = end_date
        
        url = f'https://www.ogimet.com/cgi-bin/gsynres?lang=en&ind={station_id}&ndays={(next_date - current_date).days}&ano={current_date.year}&mes={current_date.month:02d}&day={current_date.day:02d}&hora=13&ord=REV&Send=Send'
        response = requests.get(url)
        
        print(f"URL: {url}")
        print(f"Response Status Code: {response.status_code}")
        
        if response.status_code != 200:
            diagnostics.append(f"Failed to fetch data for the period starting {current_date}. Status code: {response.status_code}")
            current_date = next_date
            continue
        
        soup = BeautifulSoup(response.text, 'html.parser')
        tables = soup.find_all('table')
        table_found = False
        
        for i, table in enumerate(tables):
            rows = table.find_all('tr')
            if not rows:
                continue
            
            print(f"Checking table {i}")
            
            if not headers and rows[0].find_all('th'):
                headers = [th.text.strip() for th in rows[0].find_all('th')]
                print(f"Headers: {headers}")

            if 'Date' in headers:
                table_found = True
                for row in rows[1:]:
                    cols = row.find_all('td')
                    data_row = [col.text.strip() for col in cols]
                    # Filter out rows that do not match the header length
                    if len(data_row) >= len(headers):
                        all_data.append(data_row[:len(headers)])
                        print(f"Data row added: {data_row[:len(headers)]}")
                    else:
                        print(f"Data row length mismatch: {len(data_row)} vs {len(headers)}. Data row: {data_row}")
                diagnostics.append(f"Data fetched for the period starting {current_date}")
                break
        if not table_found:
            diagnostics.append(f"No data found for the period starting {current_date}")
        
        current_date = next_date
    
    if all_data:
        df = pd.DataFrame(all_data, columns=headers)
    else:
        df = pd.DataFrame(columns=headers)
    
    return df, diagnostics

In [21]:
def main():
    station_id = '40270'
    start_date = datetime(2023, 1, 1)
    end_date = datetime(2023, 3, 1)

    print(f"Starting data fetch for station {station_id} from {start_date} to {end_date}")
    df, diagnostics = fetch_ogimet_data(station_id, start_date, end_date)
    df.to_csv('ogimet_data_long_table_test.csv', index=False)
    
    print("Diagnostics:")
    for message in diagnostics:
        print(message)
    
    print("Data saved to ogimet_data_long_table_test.csv")


In [22]:
# Run the main function
main()

Starting data fetch for station 40270 from 2023-01-01 00:00:00 to 2023-03-01 00:00:00
URL: https://www.ogimet.com/cgi-bin/gsynres?lang=en&ind=40270&ndays=50&ano=2023&mes=01&day=01&hora=13&ord=REV&Send=Send
Response Status Code: 200
Checking table 0
Checking table 1
Checking table 2
Headers: ['Date', 'Temperature(C)', 'TdAvg(C)', 'Hr.Avg(%)', 'Wind(km/h)', 'Pres.e.lev(Hp)', 'Prec.(mm)', 'TotClOct', 'lowClOct', 'VisKm', 'Dailyweather summary']
Data row length mismatch: 0 vs 11. Data row: []
Data row added: ['01/01', '9.3', '7.8', '8.3', '5.8', '84.1', 'E', '10.4', '935.8', '----', '7.2']
Data row added: ['12/31', '12.7', '6.0', '9.0', '3.7', '70.0', 'E', '10.4', '935.2', '0.0', '3.8']
Data row added: ['12/30', '12.4', '3.8', '8.0', '4.3', '78.7', 'ENE', '11.1', '935.3', '0.0', '1.9']
Data row added: ['12/29', '13.5', '4.2', '7.6', '5.4', '86.9', 'ENE', '2.1', '935.4', '0.0', '2.6']
Data row added: ['12/28', '13.0', '6.7', '9.6', '5.7', '77.3', 'NW', '3.5', '935.4', '0.0', '3.8']
Data row

In [23]:
import pandas as pd

# Load the data from the CSV file
file_path = 'ogimet_data_long_table_test.csv'
df = pd.read_csv(file_path)

# Display the first few rows of the DataFrame
print(df.head())

# Display the column names and the number of rows
print("Columns:", df.columns)
print("Number of rows:", len(df))

    Date Temperature(C) TdAvg(C) Hr.Avg(%)  Wind(km/h)  Pres.e.lev(Hp)  \
0  01/01            9.3      7.8       8.3         5.8            84.1   
1  12/31           12.7      6.0       9.0         3.7            70.0   
2  12/30           12.4      3.8       8.0         4.3            78.7   
3  12/29           13.5      4.2       7.6         5.4            86.9   
4  12/28           13.0      6.7       9.6         5.7            77.3   

  Prec.(mm)  TotClOct  lowClOct VisKm  Dailyweather summary  
0         E      10.4     935.8  ----                   7.2  
1         E      10.4     935.2   0.0                   3.8  
2       ENE      11.1     935.3   0.0                   1.9  
3       ENE       2.1     935.4   0.0                   2.6  
4        NW       3.5     935.4   0.0                   3.8  
Columns: Index(['Date', 'Temperature(C)', 'TdAvg(C)', 'Hr.Avg(%)', 'Wind(km/h)',
       'Pres.e.lev(Hp)', 'Prec.(mm)', 'TotClOct', 'lowClOct', 'VisKm',
       'Dailyweather summary'],

In [23]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime, timedelta

def fetch_ogimet_data(station_id, start_date, end_date, filename):
    url_template = "https://www.ogimet.com/cgi-bin/gsynres?lang=en&ind={}&ndays={}&ano={}&mes={}&day={}&hora=13&ord=REV&Send=Send"
    all_data = []
    current_date = start_date

    required_columns = ['Date', 'Temperature(C)', 'TdAvg(C)', 'Hr.Avg(%)', 'Wind(km/h)', 'Pres.e.lev(Hp)', 'Prec.(mm)', 'TotClOct', 'lowClOct', 'VisKm']
    
    while current_date < end_date:
        next_date = min(current_date + timedelta(days=50), end_date)
        url = url_template.format(station_id, (next_date - current_date).days, current_date.year, current_date.month, current_date.day)
        
        print(f"URL: {url}")
        response = requests.get(url)
        print(f"Response Status Code: {response.status_code}")
        
        if response.status_code != 200:
            print(f"Failed to fetch data for {current_date}")
            break
        
        soup = BeautifulSoup(response.content, 'html.parser')
        tables = soup.find_all('table')

        if len(tables) > 2:
            table = tables[2]  # The table we need
            headers = [th.get_text(strip=True) for th in table.find_all('th') if th.get_text(strip=True)]
            print(f"Headers: {headers}")

            rows = table.find_all('tr')
            for row in rows:
                cols = [td.get_text(strip=True) for td in row.find_all('td')]
                # Filter and only keep the columns we need
                if len(cols) >= len(required_columns):  
                    filtered_row = cols[:len(required_columns)]
                    filtered_row[0] = f"{filtered_row[0]}/{current_date.year}"
                    all_data.append(filtered_row)
        
        current_date = next_date
    
    # Convert to DataFrame
    if all_data:
        df = pd.DataFrame(all_data, columns=required_columns)
        df.to_csv(filename, index=False)
        print(f"Data saved to {filename}")
    else:
        print("No data to save")

# Example usage
start_date = datetime(2019, 1, 1)
end_date = datetime(2023, 1, 1)
station_id = '40341'
file_out = 'ogimet_data_' + station_id+ '_2019_2022.csv'
# fetch_ogimet_data(station_id, start_date, end_date, 'ogimet_data_2019_2022.csv')
fetch_ogimet_data(station_id, start_date, end_date, file_out)


URL: https://www.ogimet.com/cgi-bin/gsynres?lang=en&ind=40341&ndays=50&ano=2019&mes=1&day=1&hora=13&ord=REV&Send=Send
Response Status Code: 200
Headers: ['No valid data found in database for 40341']
URL: https://www.ogimet.com/cgi-bin/gsynres?lang=en&ind=40341&ndays=50&ano=2019&mes=2&day=20&hora=13&ord=REV&Send=Send
Response Status Code: 200
Headers: ['No valid data found in database for 40341']
URL: https://www.ogimet.com/cgi-bin/gsynres?lang=en&ind=40341&ndays=50&ano=2019&mes=4&day=11&hora=13&ord=REV&Send=Send
Response Status Code: 200
Headers: ['No valid data found in database for 40341']
URL: https://www.ogimet.com/cgi-bin/gsynres?lang=en&ind=40341&ndays=50&ano=2019&mes=5&day=31&hora=13&ord=REV&Send=Send
Response Status Code: 200
Headers: ['No valid data found in database for 40341']
URL: https://www.ogimet.com/cgi-bin/gsynres?lang=en&ind=40341&ndays=50&ano=2019&mes=7&day=20&hora=13&ord=REV&Send=Send
Response Status Code: 200
Headers: ['No valid data found in database for 40341']
U

In [24]:
# Example usage
start_date = datetime(2023, 1, 1)
end_date = datetime(2024, 6, 1)
# station_id = 40270
file_out = 'ogimet_data_' + station_id+ '_2023_2024.csv'
fetch_ogimet_data(station_id, start_date, end_date, file_out)
# fetch_ogimet_data(station_id, start_date, end_date, 'ogimet_data_2023_2024.csv')


URL: https://www.ogimet.com/cgi-bin/gsynres?lang=en&ind=40341&ndays=50&ano=2023&mes=1&day=1&hora=13&ord=REV&Send=Send
Response Status Code: 200
Headers: ['No valid data found in database for 40341']
URL: https://www.ogimet.com/cgi-bin/gsynres?lang=en&ind=40341&ndays=50&ano=2023&mes=2&day=20&hora=13&ord=REV&Send=Send
Response Status Code: 200
Headers: ['No valid data found in database for 40341']
URL: https://www.ogimet.com/cgi-bin/gsynres?lang=en&ind=40341&ndays=50&ano=2023&mes=4&day=11&hora=13&ord=REV&Send=Send
Response Status Code: 200
Headers: ['No valid data found in database for 40341']
URL: https://www.ogimet.com/cgi-bin/gsynres?lang=en&ind=40341&ndays=50&ano=2023&mes=5&day=31&hora=13&ord=REV&Send=Send
Response Status Code: 200
Headers: ['No valid data found in database for 40341']
URL: https://www.ogimet.com/cgi-bin/gsynres?lang=en&ind=40341&ndays=50&ano=2023&mes=7&day=20&hora=13&ord=REV&Send=Send
Response Status Code: 200
Headers: ['No valid data found in database for 40341']
U