## API to collect weather data

This notebook uses the Open-Meteo.com archive API to collect the specified weather data that accompanies the paragliding flight logs and metadata. If any requests return errors, re-run the "Retry" code block until the weather data for every thermal has been fetched successfully from Open-Meteo.com.

In [None]:
# 0. Load Libraries
import os
import re
import time
import json
import random
from pathlib import Path
from datetime import timedelta
from time import sleep

import pandas as pd
import numpy as np
import requests
import requests_cache
import openmeteo_requests
from retry_requests import retry
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
from bs4 import BeautifulSoup

In [None]:
# 1. Test: API Call for a single weather request
#
# Smoke test: request the hourly archive data for one known thermal and show
# the hourly record whose timestamp is closest to the thermal's start time.

# Test thermal example metadata
thermal_time = pd.to_datetime("2022-02-13 12:15:24")
thermal_lat = 47.356699
thermal_lon = 9.973421

# API request (start_date == end_date, i.e. only the thermal's own day is requested)
url = "https://archive-api.open-meteo.com/v1/archive"
params = {
    "latitude": thermal_lat,
    "longitude": thermal_lon,
    "start_date": thermal_time.strftime("%Y-%m-%d"),
    "end_date": thermal_time.strftime("%Y-%m-%d"),
    "hourly": [
        # Surface
        "temperature_2m",
        "dewpoint_2m",
        "windspeed_10m",
        "winddirection_10m",
        "cloudcover_low",
        "cloudcover_mid",
        "cloudcover_high",
        "shortwave_radiation",

        # Convective / BL
        "boundary_layer_height",
    ],
}

# Bound the wait (same 15 s as the batch fetcher) and fail loudly on an HTTP
# error instead of running into a confusing KeyError on data["hourly"] below.
response = requests.get(url, params=params, timeout=15)
response.raise_for_status()
data = response.json()

# Build hourly DataFrame
df_weather = pd.DataFrame(data["hourly"])
df_weather["time"] = pd.to_datetime(df_weather["time"])

# Find nearest row to thermal start time: smallest absolute time difference
nearest = df_weather.iloc[(df_weather["time"] - thermal_time).abs().argsort().iloc[0]]

# Show the first 14 hourly rows and the nearest row separately
display(df_weather.head(14))
print("\nNearest weather record to thermal start time:")
display(nearest)

Unnamed: 0,time,temperature_2m,dewpoint_2m,windspeed_10m,winddirection_10m,cloudcover_low,cloudcover_mid,cloudcover_high,shortwave_radiation,boundary_layer_height
0,2022-02-13 00:00:00,-7.2,-14.5,3.8,163,0,1,100,0.0,35.0
1,2022-02-13 01:00:00,-6.8,-15.2,3.7,169,0,3,100,0.0,35.0
2,2022-02-13 02:00:00,-6.8,-15.1,3.0,166,0,94,100,0.0,35.0
3,2022-02-13 03:00:00,-6.5,-15.6,1.9,158,0,77,100,0.0,30.0
4,2022-02-13 04:00:00,-6.3,-16.7,1.9,158,0,77,100,0.0,30.0
5,2022-02-13 05:00:00,-6.6,-17.4,2.3,162,0,35,4,0.0,30.0
6,2022-02-13 06:00:00,-7.4,-18.1,1.9,158,0,4,0,0.0,30.0
7,2022-02-13 07:00:00,-7.4,-18.0,1.6,153,0,7,0,10.0,30.0
8,2022-02-13 08:00:00,-6.4,-16.5,1.6,153,0,23,0,127.0,35.0
9,2022-02-13 09:00:00,-0.4,-11.8,2.3,141,0,5,0,288.0,50.0



Nearest weather record to thermal start time:


time                     2022-02-13 12:00:00
temperature_2m                           2.5
dewpoint_2m                             -9.1
windspeed_10m                            2.1
winddirection_10m                        329
cloudcover_low                             0
cloudcover_mid                             0
cloudcover_high                            0
shortwave_radiation                    551.0
boundary_layer_height                  255.0
Name: 12, dtype: object

In [None]:
# 2. Script to load thermals sequentially in batches (runs for several hours)

# 1. LOAD THERMALS
MASTER_PATH = "/Users/moritzknodler/Documents/00_Lectures/0_Fall 2025/ML/z_Project/Flight data/thermals/thermals_master.csv"
OUTPUT_PATH = MASTER_PATH.replace(".csv", "_with_weather.csv")

# A flight contains many thermals that all share one flight_id, so a thermal
# is identified by its flight plus its start timestamp.
THERMAL_KEY = ["flight_id", "start_datetime"]

master = pd.read_csv(MASTER_PATH, parse_dates=["start_datetime"])

print(f"Loaded {len(master)} thermals from {MASTER_PATH}")

# Resume progress if output file already exists
try:
    # Parse start_datetime exactly like the master file so that (a) the keys
    # compare equal and (b) resumed rows and freshly fetched rows carry the
    # same type in `results`.
    existing = pd.read_csv(OUTPUT_PATH, parse_dates=["start_datetime"])
except FileNotFoundError:
    results = []
    print("Starting fresh")
else:
    # Skip only the thermals that are already present in the output file.
    # Filtering on flight_id alone would also drop every not-yet-fetched
    # thermal of a flight as soon as one of its thermals had been saved.
    # NOTE(review): rows saved with a fetch error are in the output file too
    # and therefore count as done here -- confirm that is intended.
    done_keys = pd.MultiIndex.from_frame(existing[THERMAL_KEY])
    already_done = pd.MultiIndex.from_frame(master[THERMAL_KEY]).isin(done_keys)
    master = master[~already_done]
    results = existing.to_dict("records")
    print(f"Resuming: {len(existing)} done, {len(master)} remaining")

# 2. FETCH FUNCTION
def fetch_weather(row):
    """Return one thermal's fields plus the hourly weather record nearest to its start.

    Reads ``lat_center``, ``lon_center`` and ``start_datetime`` from ``row``,
    requests that calendar day of hourly data from the Open-Meteo archive
    endpoint, and selects the hourly entry with the smallest absolute time
    difference to ``start_datetime``.

    Returns:
        A dict with every field of ``row``. On success the selected hourly
        values are merged in and ``"error"`` is ``None``. On any exception no
        weather values are added and ``"error"`` holds the exception text, so
        a failed thermal is reported in the result instead of being raised.
    """
    endpoint = "https://archive-api.open-meteo.com/v1/archive"
    hourly_variables = [
        "temperature_2m", "dewpoint_2m", "windspeed_10m",
        "winddirection_10m", "cloudcover_low", "cloudcover_mid",
        "cloudcover_high", "shortwave_radiation", "boundary_layer_height"
    ]

    try:
        latitude = row["lat_center"]
        longitude = row["lon_center"]
        start = row["start_datetime"]
        day = start.strftime("%Y-%m-%d")

        query = {
            "latitude": latitude,
            "longitude": longitude,
            "start_date": day,
            "end_date": day,
            "hourly": hourly_variables,
        }
        reply = requests.get(endpoint, params=query, timeout=15)
        if reply.status_code != 200:
            raise ValueError(f"HTTP {reply.status_code}")

        hourly = pd.DataFrame(reply.json()["hourly"])
        hourly["time"] = pd.to_datetime(hourly["time"])

        # Position of the hourly timestamp closest to the thermal start
        gaps = (hourly["time"] - start).abs()
        closest = hourly.iloc[gaps.argsort().iloc[0]]

        return {**row.to_dict(), **closest.to_dict(), "error": None}

    except Exception as exc:
        # Best effort: keep the thermal in the result set and record why it failed
        return {**row.to_dict(), "error": str(exc)}


# 3. MAIN LOOP (1000 at a time)
BATCH_LIMIT = 1000  # maximum number of thermals fetched per run of this cell
SAVE_EVERY = 10     # write a checkpoint after this many processed rows
to_process = master.iloc[:BATCH_LIMIT]
total = len(to_process)
print(f"Processing {len(to_process)} thermals in this run (limit {BATCH_LIMIT})...")

# Count processed rows with enumerate(): the label yielded by iterrows() is the
# DataFrame index, which no longer runs 0..n-1 once already-fetched thermals
# have been filtered out of `master`, so it cannot drive the checkpoint logic.
progress = tqdm(to_process.iterrows(), total=total, desc="üå§ Fetching weather")
for done, (_, row) in enumerate(progress, start=1):
    results.append(fetch_weather(row))

    # Save progress every SAVE_EVERY rows and after the last row
    if done % SAVE_EVERY == 0 or done == total:
        pd.DataFrame(results).to_csv(OUTPUT_PATH, index=False)
        print(f"Saved after {done} rows")

    # Tiny sleep to stay under rate limit (~10 req/sec = safe)
    time.sleep(random.uniform(0.05, 0.15))


# 4. FINAL SAVE
df = pd.DataFrame(results)
df.to_csv(OUTPUT_PATH, index=False)
print(f"\nFinished. Saved {len(df)} rows to {OUTPUT_PATH}")
# `results` may have no "error" field at all (nothing processed, or only rows
# resumed from a file without that column); report zero instead of a KeyError.
error_count = int(df["error"].notna().sum()) if "error" in df.columns else 0
print(f"Errors: {error_count} rows.")


Loaded 142 thermals from /Users/moritzknodler/Documents/00_Lectures/0_Fall 2025/ML/z_Project/Flight data/thermals/thermals_missing.csv
Starting fresh
Processing 142 thermals in this run (limit 1000)...


üå§ Fetching weather:   7%|‚ñã         | 10/142 [00:08<01:40,  1.32it/s]

Saved after 10 rows


üå§ Fetching weather:  14%|‚ñà‚ñç        | 20/142 [00:15<01:30,  1.35it/s]

Saved after 20 rows


üå§ Fetching weather:  21%|‚ñà‚ñà        | 30/142 [00:23<01:24,  1.32it/s]

Saved after 30 rows


üå§ Fetching weather:  28%|‚ñà‚ñà‚ñä       | 40/142 [00:31<01:17,  1.31it/s]

Saved after 40 rows


üå§ Fetching weather:  35%|‚ñà‚ñà‚ñà‚ñå      | 50/142 [00:39<01:09,  1.33it/s]

Saved after 50 rows


üå§ Fetching weather:  42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 60/142 [00:46<01:01,  1.32it/s]

Saved after 60 rows


üå§ Fetching weather:  49%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 70/142 [00:54<00:57,  1.25it/s]

Saved after 70 rows


üå§ Fetching weather:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 80/142 [01:02<00:44,  1.39it/s]

Saved after 80 rows


üå§ Fetching weather:  63%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 90/142 [01:10<00:40,  1.29it/s]

Saved after 90 rows


üå§ Fetching weather:  70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 100/142 [01:17<00:31,  1.33it/s]

Saved after 100 rows


üå§ Fetching weather:  77%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 110/142 [01:26<00:25,  1.25it/s]

Saved after 110 rows


üå§ Fetching weather:  85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 120/142 [01:33<00:16,  1.32it/s]

Saved after 120 rows


üå§ Fetching weather:  92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 130/142 [01:41<00:10,  1.17it/s]

Saved after 130 rows


üå§ Fetching weather:  99%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 140/142 [01:49<00:01,  1.38it/s]

Saved after 140 rows


üå§ Fetching weather: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 142/142 [01:50<00:00,  1.28it/s]

Saved after 142 rows

‚úÖ Finished. Saved 142 rows to /Users/moritzknodler/Documents/00_Lectures/0_Fall 2025/ML/z_Project/Flight data/thermals/thermals_missing_with_weather.csv
‚ö†Ô∏è Errors: 0 rows.





In [None]:
# 3. Check for missing rows
#
# Compares the master thermal list with the weather-enriched file: row/column
# counts, columns unique to either file, how many master thermals have a row
# in the enriched file, and NaN counts per column of the enriched file.

base = "/Users/moritzknodler/Library/Mobile Documents/com~apple~CloudDocs/Documents/00_Lectures/0_Fall 2025/ML/z_Project/Flight data/thermals"

# File paths
file1 = os.path.join(base, "thermals_master.csv")
file2 = os.path.join(base, "thermals_master_with_weather.csv")

# Load
df1 = pd.read_csv(file1)
df2 = pd.read_csv(file2)

print("üìä Basic Info")
print(f"thermals_master.csv ‚Üí {len(df1):,} rows, {df1.shape[1]} columns")
print(f"thermals_master_with_weather.csv ‚Üí {len(df2):,} rows, {df2.shape[1]} columns\n")

# Check column overlap
print("üß© Columns only in one file:")
print("‚Üí Only in master:", set(df1.columns) - set(df2.columns))
print("‚Üí Only in master_with_weather:", set(df2.columns) - set(df1.columns), "\n")

# Identify overlap based on (flight_id, start_datetime).
# The master file has no 'lon' / 'lat' / 'time' columns, so a check keyed on
# those names is silently skipped. Float coordinates are also avoided as join
# keys because they may not compare exactly after a CSV round trip.
key_cols = ["flight_id", "start_datetime"]
if all(col in df1.columns and col in df2.columns for col in key_cols):
    keys1 = df1[key_cols].copy()
    keys2 = df2[key_cols].copy()
    # Compare real timestamps rather than their text representation
    keys1["start_datetime"] = pd.to_datetime(keys1["start_datetime"])
    keys2["start_datetime"] = pd.to_datetime(keys2["start_datetime"])

    # De-duplicate the right side so the left merge keeps one row per master thermal
    merged = keys1.merge(keys2.drop_duplicates(), on=key_cols, how="left", indicator=True)
    n_found = int((merged["_merge"] == "both").sum())
    overlap_pct = n_found / len(df1) * 100 if len(df1) else 0.0
    print(f"üîç Overlapping thermals (based on flight_id, start_datetime): {n_found:,} ({overlap_pct:.1f}% of master)\n")
    print(f"Master thermals without a weather row: {len(df1) - n_found:,}\n")

# Missing values summary (NaN count per column, computed once for both views)
print("üìâ Missing values (thermals_master_with_weather.csv):")
missing = df2.isna().sum()
print(missing[missing > 0].sort_values(ascending=False).head(20))

# Missing values check
# All columns, sorted by NaN count in descending order
nan_counts = missing.sort_values(ascending=False)

# Display as a nice table
print("üîç Missing values per column:")
display(nan_counts.to_frame("NaN_Count"))

üìä Basic Info
thermals_master.csv ‚Üí 7,883 rows, 17 columns
thermals_master_with_weather.csv ‚Üí 7,883 rows, 28 columns

üß© Columns only in one file:
‚Üí Only in master: set()
‚Üí Only in master_with_weather: {'temperature_2m', 'boundary_layer_height', 'cloudcover_low', 'winddirection_10m', 'cloudcover_mid', 'cloudcover_high', 'time', 'windspeed_10m', 'thermal_id', 'dewpoint_2m', 'shortwave_radiation'} 

üìâ Missing values (thermals_master_with_weather.csv):
Series([], dtype: int64)
üîç Missing values per column:


Unnamed: 0,NaN_Count
thermal_id,0
flight_id,0
shortwave_radiation,0
cloudcover_high,0
cloudcover_mid,0
cloudcover_low,0
winddirection_10m,0
windspeed_10m,0
dewpoint_2m,0
temperature_2m,0


In [None]:
# 4. Clean, sort, re-index and save data file

# Drop bookkeeping columns if they exist. A leftover 'thermal_id' (this cell
# re-run, or rows resumed from an already cleaned output file) would make
# df.insert() below fail with "cannot insert thermal_id, already exists".
df = df.drop(columns=["error", "thermal_id"], errors="ignore")

# Fill missing boundary layer heights with 0 (warning-safe version)
# NOTE(review): after this step a missing value can no longer be told apart
# from a reported height of 0 -- confirm that is acceptable downstream.
df["boundary_layer_height"] = df["boundary_layer_height"].fillna(0)

# Sort thermals by start_datetime.
# Convert to real timestamps first: rows reloaded from CSV may hold strings
# while freshly fetched rows hold Timestamps, and mixed types cannot be sorted.
# A stable sort keeps thermals with identical start times in their current
# order, so the ids assigned below are reproducible.
df["start_datetime"] = pd.to_datetime(df["start_datetime"])
df = df.sort_values(by="start_datetime", kind="mergesort").reset_index(drop=True)

# Add unique thermal_id (1..N in chronological order) at the beginning of the DataFrame
df.insert(0, "thermal_id", range(1, len(df) + 1))

# Display top rows with all columns
pd.set_option("display.max_columns", None)
display(df.head())

# Save to CSV
df.to_csv(OUTPUT_PATH, index=False)
print(f"‚úÖ Sorted and saved {len(df)} thermals to {OUTPUT_PATH}")

Unnamed: 0,thermal_id,flight_id,start_datetime,start_time,end_time,duration_s,avg_climb,max_climb,climb_std,avg_turn_radius,avg_turn,turn_std,lat_center,lon_center,entry_alt,exit_alt,alt_gain,thermal_quality,time,temperature_2m,dewpoint_2m,windspeed_10m,winddirection_10m,cloudcover_low,cloudcover_mid,cloudcover_high,shortwave_radiation,boundary_layer_height
0,1,3007255,2022-02-13 12:15:01,2735,3030,251,1.445575,2.6,0.631151,450955.312477,6.04778,6.973388,47.399467,9.940124,1668,2044,362.0,117.236782,2022-02-13 12:00:00,1.8,-9.1,3.7,349.0,0.0,0.0,0.0,545.0,355.0
1,2,3007255,2022-02-13 12:21:20,3114,3147,23,0.823119,1.6,0.354334,29.269215,22.050214,4.545925,47.398643,9.951039,1973,1996,20.0,13.788889,2022-02-13 12:00:00,0.9,-10.0,3.7,349.0,0.0,0.0,0.0,545.0,355.0
2,3,3007255,2022-02-13 12:25:37,3371,3463,67,1.743646,5.2,1.185881,23.452654,17.5303,6.111728,47.383997,9.965077,1802,1936,121.0,90.514286,2022-02-13 12:00:00,2.0,-8.9,3.7,349.0,0.0,0.0,0.0,545.0,255.0
3,4,3007255,2022-02-13 12:31:56,3750,3866,99,1.078976,2.4,0.563373,24.250999,10.943775,6.024201,47.356863,9.972804,1592,1701,105.0,90.455,2022-02-13 13:00:00,2.6,-11.0,3.3,6.0,0.0,0.0,0.0,522.0,250.0
4,5,3007255,2022-02-13 12:34:43,3917,4072,99,0.863319,1.0,0.307944,-36.463323,-17.01264,4.143534,47.356699,9.973421,1675,1751,86.0,7.818182,2022-02-13 13:00:00,2.6,-11.0,3.3,6.0,0.0,0.0,0.0,522.0,250.0


‚úÖ Sorted and saved 7883 thermals to /Users/moritzknodler/Documents/00_Lectures/0_Fall 2025/ML/z_Project/Flight data/thermals/thermals_master_with_weather.csv
