## Gathering data for temperature

In [2]:
import os
from datetime import datetime, timedelta

import pandas as pd
import requests

# --- Credentials --------------------------------------------------------------
# The API key is read from the environment so the secret is never stored in the
# notebook or its version history. Set it before launching Jupyter, e.g.
#   export DMI_API_KEY="<your key>"
# NOTE(review): the key that used to be hardcoded here has been exposed to
# everyone with access to this notebook and should be revoked.
api_key = os.environ.get('DMI_API_KEY')
if not api_key:
    raise RuntimeError(
        "DMI_API_KEY is not set. Export your DMI API key as the DMI_API_KEY "
        "environment variable before running this notebook."
    )

# Base URL for the DMI API
server_url = 'https://dmigw.govcloud.dk/v2/metObs/collections/observation/items?'
headers = {"X-Gravitee-Api-Key": api_key}

# Station and parameter to download (dry-bulb temperature)
station_id = "06116"  # Example station ID
parameter = ["temp_dry"]

# Start and end of the requested period
start_date = datetime(2021, 1, 1)
end_date = datetime(2024, 8, 21)

# Raw GeoJSON features collected from every request
all_data = []

# Walk through the period in 30-day windows so each response stays below `limit`
current_date = start_date
while current_date < end_date:
    # Clamp the last window to the end of the requested period
    next_date = min(current_date + timedelta(days=30), end_date)

    # Interval string "<start>Z/<end>Z" expected by the `datetime` query parameter
    datetime_range = f"{current_date.isoformat()}Z/{next_date.isoformat()}Z"

    params = {
        "stationId": station_id,
        "datetime": datetime_range,
        "parameterId": parameter,
        "limit": 10000  # Adjust based on how much data the API returns in one call
    }

    # A timeout keeps a stalled connection from hanging the notebook forever;
    # a timeout raises instead of silently leaving a gap in the data.
    response = requests.get(server_url, headers=headers, params=params, timeout=30)

    if response.status_code == 200:
        data = response.json().get('features', [])
        if len(data) >= params["limit"]:
            # A completely full page means the window probably held more rows
            # than `limit`, i.e. the response was truncated.
            print(f"Warning: {current_date} to {next_date} returned {len(data)} records (the limit); data may be truncated")
        all_data.extend(data)
    else:
        print(f"Failed to retrieve data for {current_date} to {next_date}: {response.status_code}, {response.text}")

    # Move to the next time interval
    current_date = next_date

# Flatten each feature's `properties` into one record per observation
processed_data = []
for observation in all_data:
    properties = observation.get('properties', {})
    processed_data.append({
        # Try multiple possible keys for the timestamp
        "timestamp": properties.get('observed') or properties.get('observationTime') or 'N/A',
        "station": properties.get('stationId', 'N/A'),
        # The measured value is read from the 'value' property
        "temp_dry": properties.get('value', 'N/A'),
    })

# Passing `columns` keeps the expected columns even when nothing was downloaded,
# so the steps below do not fail with a KeyError on an empty result.
df_temp = pd.DataFrame(processed_data, columns=['timestamp', 'station', 'temp_dry'])

# Parse the 'Z'-suffixed timestamp strings; unparsable values ('N/A') become NaT
df_temp['timestamp'] = pd.to_datetime(df_temp['timestamp'], format='%Y-%m-%dT%H:%M:%SZ', errors='coerce')

# Consecutive windows share an endpoint, so an observation stamped exactly on a
# window boundary can be returned by two requests. Drop exact repeats.
df_temp = df_temp.drop_duplicates()

# Sort the DataFrame by the timestamp column
df_temp = df_temp.sort_values(by='timestamp').reset_index(drop=True)

# Making a CSV file
df_temp.to_csv('temp_raw', index=False)

# Display the first few rows of the DataFrame
print(df_temp.head())

            timestamp station  temp_dry
0 2021-01-01 00:00:00   06116       1.8
1 2021-01-01 00:10:00   06116       1.7
2 2021-01-01 00:20:00   06116       1.7
3 2021-01-01 00:30:00   06116       1.7
4 2021-01-01 00:40:00   06116       1.6


In [3]:
# Keep one reading per day: the observation stamped exactly 12:00:00.
# The timestamps were parsed from 'Z'-suffixed strings, so this is presumably
# 12:00 UTC rather than local time.
noon = datetime.strptime("12:00", "%H:%M").time()
is_noon = df_temp['timestamp'].dt.time == noon

# .copy() makes the filtered rows an independent frame, so the column assignment
# below does not trigger pandas' SettingWithCopyWarning.
# NOTE: df_temp is overwritten here, so this cell must be run exactly once after
# the download cell; re-running it on its own output will not work.
df_temp = df_temp.loc[is_noon].copy()

# Remove the time portion, keeping only the date
df_temp['timestamp'] = df_temp['timestamp'].dt.date

# Making a CSV file
df_temp.to_csv('temp_final', index=False)

df_temp

Unnamed: 0,timestamp,station,temp_dry
72,2021-01-01,06116,2.0
216,2021-01-02,06116,1.3
360,2021-01-03,06116,3.0
504,2021-01-04,06116,2.2
648,2021-01-05,06116,2.4
...,...,...,...
180542,2024-08-16,06116,19.2
180686,2024-08-17,06116,19.3
180830,2024-08-18,06116,20.2
180974,2024-08-19,06116,19.9


## Gathering data for rain

In [4]:
# Station and parameter to download (the precip_dur_past10min parameter)
station_id = "06116"  # Example station ID
parameter = ["precip_dur_past10min"]

# Start and end of the requested period
start_date = datetime(2021, 1, 1)
end_date = datetime(2024, 8, 21)

# Raw GeoJSON features collected from every request
all_data = []

# Walk through the period in 30-day windows so each response stays below `limit`
current_date = start_date
while current_date < end_date:
    # Clamp the last window to the end of the requested period
    next_date = min(current_date + timedelta(days=30), end_date)

    # Interval string "<start>Z/<end>Z" expected by the `datetime` query parameter
    datetime_range = f"{current_date.isoformat()}Z/{next_date.isoformat()}Z"

    params = {
        "stationId": station_id,
        "datetime": datetime_range,
        "parameterId": parameter,
        "limit": 10000  # Adjust based on how much data the API returns in one call
    }

    # A timeout keeps a stalled connection from hanging the notebook forever;
    # a timeout raises instead of silently leaving a gap in the data.
    response = requests.get(server_url, headers=headers, params=params, timeout=30)

    if response.status_code == 200:
        data = response.json().get('features', [])
        if len(data) >= params["limit"]:
            # A completely full page means the window probably held more rows
            # than `limit`, i.e. the response was truncated.
            print(f"Warning: {current_date} to {next_date} returned {len(data)} records (the limit); data may be truncated")
        all_data.extend(data)
    else:
        print(f"Failed to retrieve data for {current_date} to {next_date}: {response.status_code}, {response.text}")

    # Move to the next time interval
    current_date = next_date

# Flatten each feature's `properties` into one record per observation
processed_data = []
for observation in all_data:
    properties = observation.get('properties', {})
    processed_data.append({
        # Try multiple possible keys for the timestamp
        "timestamp": properties.get('observed') or properties.get('observationTime') or 'N/A',
        "station": properties.get('stationId', 'N/A'),
        # The measured value is read from the 'value' property
        "precip_dur_past10min": properties.get('value', 'N/A'),
    })

# Passing `columns` keeps the expected columns even when nothing was downloaded,
# so the steps below do not fail with a KeyError on an empty result.
df_precip_dur_past10min = pd.DataFrame(
    processed_data, columns=['timestamp', 'station', 'precip_dur_past10min']
)

# Parse the 'Z'-suffixed timestamp strings; unparsable values ('N/A') become NaT
df_precip_dur_past10min['timestamp'] = pd.to_datetime(df_precip_dur_past10min['timestamp'], format='%Y-%m-%dT%H:%M:%SZ', errors='coerce')

# Consecutive windows share an endpoint, so an observation stamped exactly on a
# window boundary can be returned by two requests. Drop exact repeats so they are
# not counted twice when the values are summed per day later on.
df_precip_dur_past10min = df_precip_dur_past10min.drop_duplicates()

# Sort the DataFrame by the timestamp column
df_precip_dur_past10min = df_precip_dur_past10min.sort_values(by='timestamp').reset_index(drop=True)

# Making a CSV file
df_precip_dur_past10min.to_csv('rain_raw', index=False)

# Display the first few rows of the DataFrame
print(df_precip_dur_past10min.head())

            timestamp station  precip_dur_past10min
0 2021-01-01 16:10:00   06116                   1.0
1 2021-01-06 05:40:00   06116                   1.0
2 2021-01-06 10:10:00   06116                   1.0
3 2021-01-06 10:20:00   06116                   1.0
4 2021-01-06 17:20:00   06116                   1.0


In [5]:
# Tag every record with its calendar date (adds a 'date' column to the raw frame)
df_precip_dur_past10min['date'] = df_precip_dur_past10min['timestamp'].dt.date

# One row per date holding the sum of precip_dur_past10min over that date's records.
# Dates without any record do not appear in the result.
df_rain_final = df_precip_dur_past10min.groupby('date', as_index=False)['precip_dur_past10min'].sum()

# Persist the daily totals
df_rain_final.to_csv('rain_final', index=False)

# Show the aggregated DataFrame
df_rain_final

Unnamed: 0,date,precip_dur_past10min
0,2021-01-01,1.0
1,2021-01-06,9.0
2,2021-01-07,30.0
3,2021-01-08,7.0
4,2021-01-09,3.0
...,...,...
990,2024-08-17,0.0
991,2024-08-18,0.0
992,2024-08-19,0.0
993,2024-08-20,0.0


## Gathering data for sun

In [6]:
# Station and parameter to download (the sun_last10min_glob parameter)
station_id = "06116"  # Example station ID
parameter = ["sun_last10min_glob"]

# Start and end of the requested period
start_date = datetime(2021, 1, 1)
end_date = datetime(2024, 8, 21)

# Raw GeoJSON features collected from every request
all_data = []

# Walk through the period in 30-day windows so each response stays below `limit`
current_date = start_date
while current_date < end_date:
    # Clamp the last window to the end of the requested period
    next_date = min(current_date + timedelta(days=30), end_date)

    # Interval string "<start>Z/<end>Z" expected by the `datetime` query parameter
    datetime_range = f"{current_date.isoformat()}Z/{next_date.isoformat()}Z"

    params = {
        "stationId": station_id,
        "datetime": datetime_range,
        "parameterId": parameter,
        "limit": 10000  # Adjust based on how much data the API returns in one call
    }

    # A timeout keeps a stalled connection from hanging the notebook forever;
    # a timeout raises instead of silently leaving a gap in the data.
    response = requests.get(server_url, headers=headers, params=params, timeout=30)

    if response.status_code == 200:
        data = response.json().get('features', [])
        if len(data) >= params["limit"]:
            # A completely full page means the window probably held more rows
            # than `limit`, i.e. the response was truncated.
            print(f"Warning: {current_date} to {next_date} returned {len(data)} records (the limit); data may be truncated")
        all_data.extend(data)
    else:
        print(f"Failed to retrieve data for {current_date} to {next_date}: {response.status_code}, {response.text}")

    # Move to the next time interval
    current_date = next_date

# Flatten each feature's `properties` into one record per observation
processed_data = []
for observation in all_data:
    properties = observation.get('properties', {})
    processed_data.append({
        # Try multiple possible keys for the timestamp
        "timestamp": properties.get('observed') or properties.get('observationTime') or 'N/A',
        "station": properties.get('stationId', 'N/A'),
        # The measured value is read from the 'value' property
        "sun_last10min_glob": properties.get('value', 'N/A'),
    })

# Passing `columns` keeps the expected columns even when nothing was downloaded,
# so the steps below do not fail with a KeyError on an empty result.
df_sun_last10min_glob = pd.DataFrame(
    processed_data, columns=['timestamp', 'station', 'sun_last10min_glob']
)

# Parse the 'Z'-suffixed timestamp strings; unparsable values ('N/A') become NaT
df_sun_last10min_glob['timestamp'] = pd.to_datetime(df_sun_last10min_glob['timestamp'], format='%Y-%m-%dT%H:%M:%SZ', errors='coerce')

# Consecutive windows share an endpoint, so an observation stamped exactly on a
# window boundary can be returned by two requests. Drop exact repeats so they are
# not counted twice when the values are summed per day later on.
df_sun_last10min_glob = df_sun_last10min_glob.drop_duplicates()

# Sort the DataFrame by the timestamp column
df_sun_last10min_glob = df_sun_last10min_glob.sort_values(by='timestamp').reset_index(drop=True)

# Making a CSV file
df_sun_last10min_glob.to_csv('sun_raw', index=False)

# Display the first few rows of the DataFrame
print(df_sun_last10min_glob.head())

            timestamp station  sun_last10min_glob
0 2021-01-01 00:00:00   06116                 0.0
1 2021-01-01 00:10:00   06116                 0.0
2 2021-01-01 00:20:00   06116                 0.0
3 2021-01-01 00:30:00   06116                 0.0
4 2021-01-01 00:40:00   06116                 0.0


In [7]:
# Tag every record with its calendar date (adds a 'date' column to the raw frame)
df_sun_last10min_glob['date'] = df_sun_last10min_glob['timestamp'].dt.date

# Sum sun_last10min_glob over each date's records, then turn the date index back
# into a regular column. Dates without any record do not appear in the result.
sun_by_day = df_sun_last10min_glob.groupby('date')['sun_last10min_glob']
df_sun_final = sun_by_day.sum().reset_index()

# Persist the daily totals
df_sun_final.to_csv('sun_final', index=False)

# Show the aggregated DataFrame
df_sun_final

Unnamed: 0,date,sun_last10min_glob
0,2021-01-01,0.0
1,2021-01-02,0.0
2,2021-01-03,234.0
3,2021-01-04,148.0
4,2021-01-05,4.5
...,...,...
1092,2024-08-16,160.0
1093,2024-08-17,551.0
1094,2024-08-18,366.0
1095,2024-08-19,584.0


In [8]:
# Step 1: Ensure the join keys are datetime64 in all three frames.
# (The earlier cells left them as plain `date` objects.)
df_temp['timestamp'] = pd.to_datetime(df_temp['timestamp'])
df_rain_final['date'] = pd.to_datetime(df_rain_final['date'])
df_sun_final['date'] = pd.to_datetime(df_sun_final['date'])

# Step 2: Left-join the daily rain and sun totals onto the noon temperatures.
# Renaming 'date' to 'timestamp' gives all frames the same key name, so the merge
# does not create redundant date_x / date_y columns that must be dropped again.
# validate='many_to_one' asserts that each daily-total frame has one row per date.
merged_df = df_temp.merge(
    df_rain_final.rename(columns={'date': 'timestamp'}),
    how='left', on='timestamp', validate='many_to_one'
).merge(
    df_sun_final.rename(columns={'date': 'timestamp'}),
    how='left', on='timestamp', validate='many_to_one'
)

# Step 3: A date with no rain/sun row gets a total of 0. Only these two columns
# are filled, so a missing value in any other column (e.g. temp_dry) is never
# replaced by 0.
total_columns = ['precip_dur_past10min', 'sun_last10min_glob']
merged_df[total_columns] = merged_df[total_columns].fillna(0)

merged_df.to_csv('weather_data_final', index=False)

In [9]:
# Display the merged daily dataset (timestamp, station, temp_dry and the two daily totals)
merged_df

Unnamed: 0,timestamp,station,temp_dry,precip_dur_past10min,sun_last10min_glob
0,2021-01-01,06116,2.0,1.0,0.0
1,2021-01-02,06116,1.3,0.0,0.0
2,2021-01-03,06116,3.0,0.0,234.0
3,2021-01-04,06116,2.2,0.0,148.0
4,2021-01-05,06116,2.4,0.0,4.5
...,...,...,...,...,...
1223,2024-08-16,06116,19.2,173.0,160.0
1224,2024-08-17,06116,19.3,0.0,551.0
1225,2024-08-18,06116,20.2,0.0,366.0
1226,2024-08-19,06116,19.9,0.0,584.0
