In [1]:
import numpy as np
import pandas as pd
import requests
import time
import pvlib

# disable warnings from pandas
pd.options.mode.chained_assignment = None  # default='warn'

# Load Metadata

In [2]:
portugal_metadata = pd.read_csv("data/PortugalPhotovoltaicDataset/photovoltaic_dataset_metadata.csv")
uk_open_climate_fix_metadata = pd.read_csv("data/UKOpenClimateFix/uk_open_climate_fixed_metadata.csv")

# Load Data

In [3]:
uk_open_climate_fix_df = pd.read_csv("data/UKOpenClimateFix/uk_open_climate_hourly_dataset_raw.csv")
portugal_df = pd.read_csv("data/PortugalPhotovoltaicDataset/photovoltaic_dataset_raw.csv")

# Align Metadata

In [5]:
def rename_portugal_metadata_columns(metadata):
    return metadata.rename(columns={
        'PV Serial Number': 'serial',
        'Latitude': 'latitude',
        'Longitude': 'longitude',
        'optimal_tilt': 'tilt',
        'optimal_azimuth': 'azimuth',
        'Installed Power (kWp)': 'kwp',
        'From date': 'operational_date'
    })

def rename_uk_metadata_columns(metadata):
    return metadata.rename(columns={
        'ss_id': 'serial',
        'latitude_rounded': 'latitude',
        'longitude_rounded': 'longitude',
        'orientation': 'azimuth',
        'tilt': 'tilt',
        'kwp': 'kwp',
        'operational_at': 'operational_date'
    })

uk_open_climate_fix_metadata_selected = uk_open_climate_fix_metadata[uk_open_climate_fix_metadata["ss_id"].isin(uk_open_climate_fix_df["serial"])]

metadata_uk = rename_uk_metadata_columns(uk_open_climate_fix_metadata_selected)
metadata_pt = rename_portugal_metadata_columns(portugal_metadata)

metadata_uk.to_csv("data/UKOpenClimateFix/uk_pv_metadata_aligned.csv", index=False)
metadata_pt.to_csv("data/PortugalPhotovoltaicDataset/pv_plants_metadata_aligned.csv", index=False)

In [25]:
metadata_uk = pd.read_csv("data/UKOpenClimateFix/uk_pv_metadata_aligned.csv")
metadata_pt = pd.read_csv("data/PortugalPhotovoltaicDataset/pv_plants_metadata_aligned.csv")

# Fetch Weather

In [111]:
# Base URL for the Open Meteo API
open_meteo_base_url = "https://archive-api.open-meteo.com/v1/archive"

# Template for fixed parameters
fixed_params = {
    "hourly": ",".join([
        "temperature_2m",
        "apparent_temperature",
        "relative_humidity_2m",
        "dew_point_2m",
        "pressure_msl",
        "surface_pressure",
        "precipitation",
        "cloud_cover",
        "et0_fao_evapotranspiration",
        "wind_speed_10m",
        "wind_direction_10m",
        "shortwave_radiation",
        "diffuse_radiation",
        "direct_radiation",
        "direct_normal_irradiance",
        "terrestrial_radiation",
        "is_day",
        "sunshine_duration"
    ]),
}


def fetch_weather_data(latitude, longitude, azimuth, tilt, start_date, end_date):
    # Create a copy of the fixed parameters and add location and date-specific parameters
    params = fixed_params.copy()
    params.update({
        "latitude": latitude,
        "longitude": longitude,
        "start_date": start_date,
        "end_date": end_date,
        "azimuth": azimuth,
        "tilt": tilt
    })

    # Send the request to the Open Meteo API
    response = requests.get(open_meteo_base_url, params)

    # Check if the request was successful
    if response.status_code == 200:
        # Convert response to JSON
        data = response.json()
        return data  # You could print or process the data as needed
    else:
        # Print an error message if the request failed
        print(f"Error: {response.status_code} - {response.text}")
        return None

## Fetch Portugal Weather Data

In [7]:
portugal_weather_df = []

for _, solar_plant in portugal_metadata.iterrows():
    weather_data = fetch_weather_data(latitude=solar_plant["Latitude"], longitude=solar_plant["Longitude"], start_date=solar_plant["From date"], end_date=solar_plant["To date"], azimuth=solar_plant["optimal_azimuth"], tilt=solar_plant["optimal_tilt"])
    hourly_data = weather_data["hourly"]
    hourly_data["serial"] = solar_plant["PV Serial Number"]
    portugal_weather_df.append(pd.DataFrame(hourly_data))
    time.sleep(30) # to avoid minutely limit

portugal_weather_df = pd.concat(portugal_weather_df, ignore_index=True)

# Fetch UK Weather Data

In [9]:
uk_open_climate_fix_df['serial'] = uk_open_climate_fix_df['serial'].astype(str)
uk_open_climate_fix_metadata['ss_id'] = uk_open_climate_fix_metadata['ss_id'].astype(str)

uk_open_climate_fix_weather_df = []

def get_start_end_dates(serial):
        # Filter the DataFrame for the specific solar station ID
    temp = uk_open_climate_fix_df[uk_open_climate_fix_df['serial'] == serial].copy()
    
    # Ensure the DataFrame is not empty
    if temp.empty:
        return None, None  # Return None if no data is found for the given ss_id
    
    # Create a datetime column from year, month, and day
    temp['datetime'] = pd.to_datetime(temp[['year', 'month', 'day', 'hour', 'minute']])

    # Determine the start and end dates
    start_date = temp['datetime'].min().date()
    end_date = temp['datetime'].max().date()
    
    return start_date, end_date


uk_open_climate_fix_metadata_selected = uk_open_climate_fix_metadata[uk_open_climate_fix_metadata["ss_id"].isin(uk_open_climate_fix_df["serial"])]
    

for _, solar_plant in uk_open_climate_fix_metadata_selected.iterrows():
    start_date, end_date = get_start_end_dates(solar_plant["ss_id"])
    
    weather_data = fetch_weather_data(latitude=solar_plant["latitude_rounded"], longitude=solar_plant["longitude_rounded"], start_date=start_date, end_date=end_date, azimuth=solar_plant["orientation"], tilt=solar_plant["tilt"])
    hourly_data = weather_data["hourly"]
    hourly_data["serial"] = solar_plant["ss_id"]
    uk_open_climate_fix_weather_df.append(pd.DataFrame(hourly_data))
    time.sleep(30) # to avoid minutely limit

uk_open_climate_fix_weather_df = pd.concat(uk_open_climate_fix_weather_df, ignore_index=True)

# Save Weather Data

In [112]:
portugal_weather_df.to_csv("data/PortugalPhotovoltaicDataset/weather.csv", index=False)
uk_open_climate_fix_weather_df.to_csv("data/UKOpenClimateFix/weather.csv", index=False)

# Merge Weather Data

In [113]:
# PORTUGAL: merge weather data with solar datasets 
solar_data = portugal_df
weather_data = portugal_weather_df

solar_data['time'] = pd.to_datetime(solar_data[['year', 'month', 'day', 'hour', 'minute']])

# Ensure 'serial' columns have the same type
solar_data['serial'] = solar_data['serial'].astype(str)
weather_data['serial'] = weather_data['serial'].astype(str)

# Ensure the keys are formatted consistently
solar_data['time'] = pd.to_datetime(solar_data['time']).dt.tz_localize(None)  # Remove timezone info
weather_data['time'] = pd.to_datetime(weather_data['time'])  # Keep timezone if needed

# Merge on 'serial' and 'datetime'
portugal_combined_data = pd.merge(
    solar_data,
    weather_data,
    left_on=['serial', 'time'],  # solar_data keys
    right_on=['serial', 'time'],  # weather_data keys
    how='inner'  # Inner join to keep only matches
)

portugal_combined_data.drop(columns={"time"}, inplace=True)

portugal_combined_data.to_csv("data/PortugalPhotovoltaicDataset/combined_dataset_weather.csv", index=False)

In [114]:
# UK Open Climate: merge weather data with solar datasets 
solar_data = uk_open_climate_fix_df
weather_data = uk_open_climate_fix_weather_df

solar_data['time'] = pd.to_datetime(solar_data[['year', 'month', 'day', 'hour', 'minute']])

# Ensure 'serial' columns have the same type
solar_data['serial'] = solar_data['serial'].astype(str)
weather_data['serial'] = weather_data['serial'].astype(str)

# Ensure the keys are formatted consistently
solar_data['time'] = pd.to_datetime(solar_data['time']).dt.tz_localize(None)  # Remove timezone info
weather_data['time'] = pd.to_datetime(weather_data['time'])  # Keep timezone if needed

# Merge on 'serial' and 'datetime'
uk_open_climate_fix_combined_data = pd.merge(
    solar_data,
    weather_data,
    left_on=['serial', 'time'],  # solar_data keys
    right_on=['serial', 'time'],  # weather_data keys
    how='inner'  # Inner join to keep only matches
)

uk_open_climate_fix_combined_data.drop(columns={"time"}, inplace=True)

uk_open_climate_fix_combined_data.to_csv("data/UKOpenClimateFix/combined_dataset_weather.csv", index=False)

# Fetch PVGIS Data

In [115]:
def fetch_pvgis_data(lat, lon, peakpower, loss, outputformat="json", startyear=None, endyear=None,  pvcalculation=0):
    """
    Fetch data from the PVGIS API using required and selected optional parameters.

    Parameters:
        lat (float): Latitude in decimal degrees.
        lon (float): Longitude in decimal degrees.
        peakpower (float): Nominal power of the PV system in kW.
        loss (float): Sum of system losses in percent.
        outputformat (str, optional): Output format, "csv" or "json". Defaults to "json".
        startyear (int, optional): First year of the output. Defaults to None.
        endyear (int, optional): Last year of the output. Defaults to None.
        pvcalculation (int, optional): 1 for PV production estimation, 0 for solar radiation only. Defaults to 0.

    Returns:
        dict or str: Response data in JSON format if "json" is selected, otherwise CSV text.
    """
    base_url = "https://re.jrc.ec.europa.eu/api/seriescalc"
    params = {
        "lat": lat,
        "lon": lon,
        "peakpower": peakpower,
        "loss": loss,
        "outputformat": outputformat,
        "startyear": startyear,
        "endyear": endyear,
        "pvcalculation": pvcalculation,
    }

    # Remove None values to keep the request clean
    params = {key: value for key, value in params.items() if value is not None}

    try:
        response = requests.get(base_url, params=params)
        response.raise_for_status()  # Raise exception for HTTP errors
        if outputformat == "json":
            return response.json()
        else:
            return response.text
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return None

In [116]:
# Initialize an empty list to store data for all stations
portugal_pvgis = []

# Loop through each solar plant in the metadata
for _, solar_plant in portugal_metadata.iterrows():
    # Fetch data for the current solar plant
    data = fetch_pvgis_data(
        lat=solar_plant["Latitude"],
        lon=solar_plant["Longitude"],
        peakpower=solar_plant["Installed Power (kWp)"],
        loss=10,  # Assume 10% loss
        outputformat="json",  # Ensure we get structured data
        startyear=2019,
        endyear=2022,
        pvcalculation=1
    )
    
    # Process the fetched data
    if data:
        # Extract hourly data
        hourly_data = pd.DataFrame(data.get("outputs", {}).get("hourly", []))
        if not hourly_data.empty:
            # Add metadata for the solar plant
            hourly_data["serial"] = solar_plant["PV Serial Number"]
            hourly_data["latitude"] = solar_plant["Latitude"]
            hourly_data["longitude"] = solar_plant["Longitude"]
            hourly_data["kwp"] = solar_plant["Installed Power (kWp)"]
            
            # Convert datetime format from "20190101:0010" to "YYYY-MM-DD HH:MM"
            hourly_data["time"] = pd.to_datetime(hourly_data["time"], format="%Y%m%d:%H%M")
            hourly_data["time"] = hourly_data["time"].dt.round('h')  # Round to the nearest hour
            
            hourly_data['year'] = hourly_data['time'].dt.year
            hourly_data['month'] = hourly_data['time'].dt.month
            hourly_data['day'] = hourly_data['time'].dt.day
            hourly_data['hour'] = hourly_data['time'].dt.hour
            hourly_data['minute'] = hourly_data['time'].dt.minute
            
            hourly_data.drop(columns=["time"], inplace=True)
            
            # Append to the list of all data
            portugal_pvgis.append(hourly_data)
        else:
            print(f"No hourly data found for station {solar_plant['PV Serial Number']}")
    else:
        print(f"Failed to fetch data for station {solar_plant['PV Serial Number']}")

# Combine all station data into a single DataFrame
portugal_pvgis_data = pd.concat(portugal_pvgis, ignore_index=True)
# convent to kwh
portugal_pvgis_data["P"] = round((portugal_pvgis_data["P"] / 1000).astype(float), 2)

# Save to a single CSV file
pvgis_data_file_name = "data/PortugalPhotovoltaicDataset/pvgis_data.csv"
portugal_pvgis_data.to_csv(pvgis_data_file_name, index=False)
print(f"Combined data saved to {pvgis_data_file_name}")

Combined data saved to data/PortugalPhotovoltaicDataset/pvgis_data.csv


In [117]:
# Initialize an empty list to store data for all stations
uk_open_climate_pvgis = []

uk_open_climate_fix_metadata['ss_id'] = uk_open_climate_fix_metadata['ss_id'].astype(str)
uk_open_climate_fix_df['serial'] = uk_open_climate_fix_df['serial'].astype(str)

uk_open_climate_fix_metadata_selected = uk_open_climate_fix_metadata[uk_open_climate_fix_metadata["ss_id"].isin(uk_open_climate_fix_df["serial"])]

# Loop through each solar plant in the metadata
for _, solar_plant in uk_open_climate_fix_metadata_selected.iterrows():
    start_date, end_date = get_start_end_dates(solar_plant["ss_id"])
    # Fetch data for the current solar plant
    data = fetch_pvgis_data(
        lat=solar_plant["latitude_rounded"],
        lon=solar_plant["longitude_rounded"],
        peakpower=solar_plant["kwp"],
        loss=10,  # Assume 10% loss
        outputformat="json",  # Ensure we get structured data
        startyear=start_date.year,
        endyear=end_date.year,
        pvcalculation=1
    )
    
    # Process the fetched data
    if data:
        # Extract hourly data
        hourly_data = pd.DataFrame(data.get("outputs", {}).get("hourly", []))
        if not hourly_data.empty:
            # Add metadata for the solar plant
            hourly_data["serial"] = solar_plant["ss_id"]
            hourly_data["latitude"] = solar_plant["latitude_rounded"]
            hourly_data["longitude"] = solar_plant["longitude_rounded"]
            hourly_data["kwp"] = solar_plant["kwp"]
            
            # Convert datetime format from "20190101:0010" to "YYYY-MM-DD HH:MM"
            hourly_data["time"] = pd.to_datetime(hourly_data["time"], format="%Y%m%d:%H%M")
            hourly_data["time"] = hourly_data["time"].dt.round('h')  # Round to the nearest hour
            
            hourly_data['year'] = hourly_data['time'].dt.year
            hourly_data['month'] = hourly_data['time'].dt.month
            hourly_data['day'] = hourly_data['time'].dt.day
            hourly_data['hour'] = hourly_data['time'].dt.hour
            hourly_data['minute'] = hourly_data['time'].dt.minute
            
            hourly_data.drop(columns=["time"], inplace=True)
            
            # Append to the list of all data
            uk_open_climate_pvgis.append(hourly_data)
        else:
            print(f"No hourly data found for station {solar_plant['PV Serial Number']}")
    else:
        print(f"Failed to fetch data for station {solar_plant['PV Serial Number']}")

# Combine all station data into a single DataFrame
uk_open_climate_pvgis_data = pd.concat(uk_open_climate_pvgis, ignore_index=True)
# convent to kwh
uk_open_climate_pvgis_data["P"] = round((uk_open_climate_pvgis_data["P"] / 1000).astype(float), 2)

# Save to a single CSV file
pvgis_data_file_name = "data/UKOpenClimateFix/pvgis_data.csv"
uk_open_climate_pvgis_data.to_csv(pvgis_data_file_name, index=False)
print(f"Combined data saved to {pvgis_data_file_name}")

Combined data saved to data/UKOpenClimateFix/pvgis_data.csv


# Merge PVGIS Data

In [118]:
portugal_pvgis_data = pd.read_csv("data/PortugalPhotovoltaicDataset/pvgis_data.csv")
uk_open_climate_pvgis_data = pd.read_csv("data/UKOpenClimateFix/pvgis_data.csv")

In [119]:
portugal_combined_data['serial'] = portugal_combined_data['serial'].astype(str)
portugal_pvgis_data['serial'] = portugal_pvgis_data['serial'].astype(str)

# Merge the real and predicted data for Portugal
portugal_combined_with_pvgis = pd.merge(
    portugal_combined_data,
    portugal_pvgis_data[["year", "month", "day", "hour", "serial", "P"]],
    left_on=["serial", "year", "month", "day", "hour"], # portugal_data_with_weather keys
    right_on=["serial", "year", "month", "day", "hour"], # portugal_pvgis_data keys
    how="inner"
)

# portugal_combined_with_pvgis.drop(columns={"Serial", "time", "PV Serial Number"}, inplace=True)
portugal_combined_with_pvgis.rename(columns={"P": "pvgis predicted energy"}, inplace=True)

print(f"Number of matching rows: {len(portugal_combined_with_pvgis)}")

portugal_combined_with_pvgis.to_csv("data/PortugalPhotovoltaicDataset/combined_dataset_weather_pvgis.csv", index=False)

Number of matching rows: 315567


In [120]:
uk_open_climate_fix_combined_data['serial'] = uk_open_climate_fix_combined_data['serial'].astype(str)
uk_open_climate_pvgis_data['serial'] = uk_open_climate_pvgis_data['serial'].astype(str)

uk_open_climate_fix_combined_data['serial'] = uk_open_climate_fix_combined_data['serial'].str.strip()
uk_open_climate_pvgis_data['serial'] = uk_open_climate_pvgis_data['serial'].str.strip()

# Merge the real and predicted data for Portugal
uk_open_climate_combined_with_pvgis = pd.merge(
    uk_open_climate_fix_combined_data,
    uk_open_climate_pvgis_data[["year", "month", "day", "hour", "serial", "P"]],
    left_on=["serial", "year", "month", "day", "hour"], # portugal_data_with_weather keys
    right_on=["serial", "year", "month", "day", "hour"], # portugal_pvgis_data keys
    how="inner"
)

uk_open_climate_combined_with_pvgis.rename(columns={"P": "pvgis predicted energy"}, inplace=True)

print(f"Number of matching rows: {len(uk_open_climate_combined_with_pvgis)}")

uk_open_climate_combined_with_pvgis.to_csv("data/UKOpenClimateFix/combined_dataset_weather_pvgis.csv", index=False)

Number of matching rows: 803134


# Derived Features

In [12]:
def add_solar_position(df, metadata):
    """
    Add solar position (zenith, azimuth, elevation) to the DataFrame.
    
    Args:
        df (pd.DataFrame): Input DataFrame containing datetime and serial.
        metadata (pd.DataFrame): Metadata DataFrame containing latitude and longitude for each solar plant.
    
    Returns:
        pd.DataFrame: DataFrame with solar position added.
    """
    df["serial"] = df["serial"].astype(str)
    metadata["serial"] = metadata["serial"].astype(str)

    # Ensure datetime is in the correct format and set as index
    df["datetime"] = pd.to_datetime(df[["year", "month", "day", "hour", "minute"]])
    df.set_index("datetime", inplace=True)

    all_results = []

    for _, solar_plant in metadata.iterrows():
        plant_df = df[df["serial"] == solar_plant["serial"]]

        # Calculate solar position
        solar_position = pvlib.solarposition.get_solarposition(
            time=plant_df.index,
            latitude=solar_plant["latitude"],
            longitude=solar_plant["longitude"]
        )


        # Add solar position to plant data
        plant_df["solar_zenith"] = solar_position["apparent_zenith"].round(2)
        plant_df["solar_azimuth"] = solar_position["azimuth"].round(2)
        plant_df["solar_elevation"] = solar_position["elevation"].round(2)
        
        all_results.append(plant_df)

    combined_results = pd.concat(all_results)
    combined_results.reset_index(inplace=True)  # Reset the datetime index if needed
    return combined_results

In [13]:
def add_poa_irradiance(df, metadata):
    """
    Add Plane of Array (POA) irradiance to the DataFrame based on solar position and metadata.
    
    Args:
        df (pd.DataFrame): DataFrame containing solar position (zenith, azimuth).
        metadata (pd.DataFrame): Metadata DataFrame containing tilt and azimuth for each solar plant.
    
    Returns:
        pd.DataFrame: DataFrame with POA irradiance added.
    """
    all_results = []

    for _, solar_plant in metadata.iterrows():
        plant_df = df[df["serial"] == solar_plant["serial"]]
        print(solar_plant)

        # Calculate POA irradiance
        poa_irradiance = pvlib.irradiance.get_total_irradiance(
            surface_tilt=solar_plant["tilt"],
            surface_azimuth=solar_plant["azimuth"],
            dni=plant_df["direct_normal_irradiance"],
            ghi=plant_df["shortwave_radiation"],
            dhi=plant_df["diffuse_radiation"],
            solar_zenith=plant_df["solar_zenith"],
            solar_azimuth=plant_df["solar_azimuth"]
        )

        # Add POA to plant data
        plant_df["poa"] = poa_irradiance["poa_global"].round(2)

        all_results.append(plant_df)

    combined_results = pd.concat(all_results)
    combined_results.reset_index(inplace=True, drop=True)
    return combined_results

In [14]:
def add_lagged_features(data, feature='poa', hours_before=1, hours_after=1, calculate_average=False, serial_col='serial', datetime_col='datetime', decimals=2):
    """
    Add lagged and averaged features to the dataset.
    
    Parameters:
    - data (DataFrame): The input data containing 'serial' and 'datetime'.
    - feature (str): The feature for which lagged values will be created.
    - hours_before (int): Number of hours to shift backward.
    - hours_after (int): Number of hours to shift forward.
    
    Returns:
    - DataFrame: The data with lagged and average features added.
    """
    
    # Step 1: Sort data by serial and datetime to ensure correct shifts
    data[datetime_col] = pd.to_datetime(data[datetime_col])
    
    data = data.sort_values(by=[serial_col, datetime_col]).reset_index(drop=True)

    # Step 2: Add lagged features for specified hours before and after
    for i in range(1, hours_before + 1):
        data[f'{feature}_hour_minus_{i}'] = data.groupby(serial_col)[feature].shift(i)

    for i in range(1, hours_after + 1):
        data[f'{feature}_hour_plus_{i}'] = data.groupby(serial_col)[feature].shift(-i)

    # Step 3: Calculate the average of neighboring values
    lagged_cols = [f'{feature}_hour_minus_{i}' for i in range(1, hours_before + 1)] + \
                  [feature] + \
                  [f'{feature}_hour_plus_{i}' for i in range(1, hours_after + 1)]
    
    if calculate_average:
        data[f'{feature}_avg_neighbor'] = data[lagged_cols].mean(axis=1).round(decimals)

    # Step 4: Handle NaN values
    data.dropna(subset=lagged_cols, inplace=True)

    return data

In [15]:
def add_moving_average(data, column, datetime_col, window_size, metadata, serial_col='serial'):
    """
    Add a moving average column to the dataframe.
    """
    data[datetime_col] = pd.to_datetime(data[datetime_col])
    new_data = []
    
    for _, row in metadata.iterrows():
        serial = row[serial_col]
        
        selected_data = data[data[serial_col] == serial].copy()
        
        if selected_data is None:
            continue
        
        column_name = f"{column}_{window_size}_moving_average"
        selected_data[column_name] = selected_data.rolling(
            f'{window_size}h', 
            on=datetime_col, 
            closed='right'
        )[column].mean().round(2)
        
        # drop NaN values after calculating the moving average
        selected_data.dropna(inplace=True)
        
        new_data.append(selected_data)
    
    return pd.concat(new_data)

In [16]:
def add_clear_sky_index(data, metadata):
    updated_data = []
    
    for _, row in metadata.iterrows():
        latitude = row['latitude']
        longitude = row['longitude']
        
        selected_data = data[data["serial"] == row["serial"]].copy()
        
        if selected_data.empty:
            continue
        
        location = pvlib.location.Location(latitude=latitude, longitude=longitude)
        
        times = pd.date_range(start=selected_data['datetime'].min(), end=selected_data['datetime'].max(), freq='1h')
        
        cs = location.get_clearsky(times, model='ineichen')
        cs = cs.set_index(times)  # Set times as index to match selected_data
        cs = cs.reindex(selected_data.set_index('datetime').index)  # Reindex to ensure alignment
        
        selected_data = selected_data.set_index('datetime')
        selected_data['clearsky_index'] = pvlib.irradiance.clearsky_index(selected_data["shortwave_radiation"], cs['ghi'])
        
        selected_data.reset_index(inplace=True)
        
        updated_data.append(selected_data)
        
    return pd.concat(updated_data)

In [17]:
def calculate_cell_temperature(poa, temperature_2m, wind_speed, decimals=2):
    # since we don't have the module parameters, we use the default parameters for now
    temperature_model_params = pvlib.temperature.TEMPERATURE_MODEL_PARAMETERS['sapm']['open_rack_glass_polymer']  
    cell_temperature = pvlib.temperature.sapm_cell(
        poa_global=poa,
        temp_air=temperature_2m,
        wind_speed=wind_speed,
        a=temperature_model_params['a'],
        b=temperature_model_params['b'],
        deltaT=temperature_model_params['deltaT']
    )
    
    return cell_temperature.round(decimals)

In [18]:
def add_cyclic_encoding(df, column, period, decimals=None):
    """
    Adds cyclic encoding for a specified column with optional rounding.

    Parameters:
    df (pd.DataFrame): The input DataFrame.
    column (str): The column to apply cyclic encoding to.
    period (int): The period for cyclic encoding (e.g., 24 for hours, 365 for days of year).
    decimals (int or None): The number of decimal places to round to. If None, no rounding.

    Returns:
    pd.DataFrame: The DataFrame with added sine and cosine columns.
    """
    df[f'{column}_sin'] = np.sin(2 * np.pi * df[column] / period)
    df[f'{column}_cos'] = np.cos(2 * np.pi * df[column] / period)
    
    if decimals is not None:
        df[f'{column}_sin'] = df[f'{column}_sin'].round(decimals)
        df[f'{column}_cos'] = df[f'{column}_cos'].round(decimals)
    
    return df

In [19]:
def add_joules_hourly(data, irradiance_col):
    """
    Calculates the energy in Joules for each hourly measurement in the dataset.

    Parameters:
    data (pd.DataFrame): The input dataset containing irradiance values.
    irradiance_col (str): The column name containing the irradiance values in W/m².

    Returns:
    pd.DataFrame: The input DataFrame with an additional column 'joules' representing the calculated energy.
    """
    # 1 hour = 3600 seconds
    seconds_in_hour = 3600

    # Calculate Joules for each row: W/m² * 3600 seconds
    data['joules'] = data[irradiance_col] * seconds_in_hour
    
    return data

In [20]:
def categories_cloud_cover(data):
    """
    Categorizes cloud cover into bins: 'low', 'medium', 'high', 'very high'.

    Parameters:
    - data (pd.DataFrame): The input DataFrame containing 'cloud_cover' column.

    Returns:
    - pd.DataFrame: The DataFrame with a new column 'cloud_cover_category'.
    """
    data['cloud_cover_category'] = pd.cut(
        data['cloud_cover'], 
        bins=[0, 25, 50, 75, 100], 
        labels=['low', 'medium', 'high', 'very high'], 
        include_lowest=True  # Ensures 0 is included in the 'low' bin
    )
    return data

In [21]:
def add_physical_model_prediction(data, poa_column, temperature_column, kwp_column, inverter_efficiency):
    """
    Add a column with the predicted energy output using a physical model.

    Parameters:
    - data (pd.DataFrame): The input DataFrame.
    - poa_column (str): The column name for the Plane of Array (POA) irradiance.
    - temperature_column (str): The column name for the cell temperature.
    - kwp_column (str): The column name for the installed power in kWp.
    - inverter_efficiency (float): The efficiency of the inverter.

    Returns:
    - pd.DataFrame: The DataFrame with an additional column 'physical_model_prediction'.
    """
    # Calculate the energy output using the physical model
    data['physical_model_prediction'] = pvlib.pvsystem.pvwatts_dc(
        g_poa_effective=data[poa_column],
        temp_cell=data[temperature_column],
        pdc0=data[kwp_column],
        gamma_pdc=-0.004,
    )
    
    # Apply the inverter efficiency
    data['physical_model_prediction'] *= inverter_efficiency
    data['physical_model_prediction'] = data['physical_model_prediction'].round(2)
    
    data['relative_physical_model_prediction'] = data['physical_model_prediction'] / data[kwp_column]
    
    return data


In [22]:
def add_region(data, region):
    data["region"] = region

    return data

In [23]:
def add_relative_output(data, production_col, kwp_col):
    data['relative_output'] = data[production_col] / data[kwp_col]
    
    return data

# Add derived features

In [5]:
portugal_combined_with_pvgis = pd.read_csv("data/PortugalPhotovoltaicDataset/combined_dataset_weather_pvgis.csv")

In [7]:
uk_open_climate_fix_with_pvgis = pd.read_csv("data/UKOpenClimateFix/combined_dataset_weather_pvgis.csv")

## Portugal

In [31]:
portugal_df_featured = portugal_combined_with_pvgis.copy()

portugal_df_featured["datetime"] = pd.to_datetime(portugal_df_featured[["year", "month", "day", "hour", "minute"]])
portugal_df_featured["serial"] = portugal_df_featured["serial"].astype(str)
portugal_df_featured["day_of_year"] = portugal_df_featured["datetime"].dt.dayofyear

portugal_df_featured = add_solar_position(portugal_df_featured, metadata_pt)

portugal_df_featured = add_poa_irradiance(portugal_df_featured, metadata_pt)
portugal_df_featured = add_lagged_features(portugal_df_featured, "poa", hours_before=1, hours_after=0, serial_col='serial', datetime_col='datetime')

portugal_df_featured = add_clear_sky_index(portugal_df_featured, metadata_pt)
portugal_df_featured = add_lagged_features(portugal_df_featured, "cloud_cover", hours_before=1, hours_after=0, serial_col='serial', datetime_col='datetime')
portugal_df_featured = add_moving_average(portugal_df_featured, "cloud_cover", "datetime", 3, metadata_pt)

portugal_df_featured = add_joules_hourly(portugal_df_featured, 'shortwave_radiation')

portugal_df_featured = add_cyclic_encoding(portugal_df_featured, 'hour', 24, decimals=3)
portugal_df_featured = add_cyclic_encoding(portugal_df_featured, 'day_of_year', 365, decimals=3)
portugal_df_featured = add_cyclic_encoding(portugal_df_featured, 'month', 12, decimals=3)

portugal_df_featured["cell_temp"] = calculate_cell_temperature(portugal_df_featured['poa'], portugal_df_featured['temperature_2m'], portugal_df_featured['wind_speed_10m'], decimals=2)

portugal_df_featured = add_physical_model_prediction(portugal_df_featured, 'poa', 'cell_temp', 'kwp', 0.96)

portugal_df_featured = add_relative_output(portugal_df_featured, "produced energy", "kwp")

portugal_df_featured = add_region(portugal_df_featured, "Portugal")


portugal_df_featured

serial                      84071567
Location                      Lisbon
latitude                      38.728
longitude                     -9.138
operational_date          2019-01-01
To date                   2022-12-31
kwp                             46.0
Connection Power (kWn)          40.0
tilt                            35.0
azimuth                        170.0
Name: 0, dtype: object
serial                      84071569
Location                      Lisbon
latitude                      38.833
longitude                     -9.191
operational_date          2019-01-01
To date                   2022-12-31
kwp                            16.32
Connection Power (kWn)          15.0
tilt                            35.0
azimuth                        170.0
Name: 1, dtype: object
serial                      84071568
Location                     Setubal
latitude                      38.577
longitude                     -8.872
operational_date          2019-01-01
To date                   202

Unnamed: 0,datetime,serial,name,produced energy,kwp,year,month,day,hour,minute,...,hour_cos,day_of_year_sin,day_of_year_cos,month_sin,month_cos,cell_temp,physical_model_prediction,relative_physical_model_prediction,relative_output,region
175311,2019-01-01 02:00:00,84071567,Lisbon_1,0.0,46.00,2019,1,1,2,0,...,0.866,0.017,1.0,0.5,0.866,8.8,0.0,0.0,0.0,Portugal
175312,2019-01-01 03:00:00,84071567,Lisbon_1,0.0,46.00,2019,1,1,3,0,...,0.707,0.017,1.0,0.5,0.866,8.5,0.0,0.0,0.0,Portugal
175313,2019-01-01 04:00:00,84071567,Lisbon_1,0.0,46.00,2019,1,1,4,0,...,0.500,0.017,1.0,0.5,0.866,8.3,0.0,0.0,0.0,Portugal
175314,2019-01-01 05:00:00,84071567,Lisbon_1,0.0,46.00,2019,1,1,5,0,...,0.259,0.017,1.0,0.5,0.866,8.0,0.0,0.0,0.0,Portugal
175315,2019-01-01 06:00:00,84071567,Lisbon_1,0.0,46.00,2019,1,1,6,0,...,0.000,0.017,1.0,0.5,0.866,7.7,0.0,0.0,0.0,Portugal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140243,2022-12-31 19:00:00,73061935,Loule,0.0,46.25,2022,12,31,19,0,...,0.259,0.000,1.0,-0.0,1.000,14.8,0.0,0.0,0.0,Portugal
140244,2022-12-31 20:00:00,73061935,Loule,0.0,46.25,2022,12,31,20,0,...,0.500,0.000,1.0,-0.0,1.000,14.7,0.0,0.0,0.0,Portugal
140245,2022-12-31 21:00:00,73061935,Loule,0.0,46.25,2022,12,31,21,0,...,0.707,0.000,1.0,-0.0,1.000,14.9,0.0,0.0,0.0,Portugal
140246,2022-12-31 22:00:00,73061935,Loule,0.0,46.25,2022,12,31,22,0,...,0.866,0.000,1.0,-0.0,1.000,14.9,0.0,0.0,0.0,Portugal


## UK Open Climate Fix

In [32]:
uk_df_featured = uk_open_climate_fix_with_pvgis.copy()

uk_df_featured["datetime"] = pd.to_datetime(uk_df_featured[["year", "month", "day", "hour", "minute"]])
uk_df_featured["serial"] = uk_df_featured["serial"].astype(str)

uk_df_featured = add_solar_position(uk_df_featured, metadata_uk)
uk_df_featured = add_poa_irradiance(uk_df_featured, metadata_uk)
uk_df_featured = add_lagged_features(uk_df_featured, "poa", hours_before=1, hours_after=0, serial_col='serial', datetime_col='datetime')

uk_df_featured = add_clear_sky_index(uk_df_featured, metadata_uk)
uk_df_featured = add_lagged_features(uk_df_featured, "cloud_cover", hours_before=1, hours_after=0, serial_col='serial', datetime_col='datetime')
uk_df_featured = add_moving_average(uk_df_featured, "cloud_cover", "datetime", 3, metadata_uk)

uk_df_featured = add_joules_hourly(uk_df_featured, 'shortwave_radiation')

uk_df_featured = add_cyclic_encoding(uk_df_featured, 'hour', 24, decimals=3)
uk_df_featured["day_of_year"] = uk_df_featured["datetime"].dt.dayofyear
uk_df_featured = add_cyclic_encoding(uk_df_featured, 'day_of_year', 365, decimals=3)
uk_df_featured = add_cyclic_encoding(uk_df_featured, 'month', 12, decimals=3)

uk_df_featured["cell_temp"] = calculate_cell_temperature(uk_df_featured['poa'], uk_df_featured['temperature_2m'], uk_df_featured['wind_speed_10m'], decimals=2)

uk_df_featured = add_physical_model_prediction(uk_df_featured, 'poa', 'cell_temp', 'kwp', 0.96)

uk_df_featured = add_relative_output(uk_df_featured, "produced energy", "kwp")

uk_df_featured = add_region(uk_df_featured, "UK")
uk_df_featured

serial                    3062
latitude                 53.87
longitude                 -3.0
llsoacd              E01025577
azimuth                  175.0
tilt                      40.0
kwp                        4.0
operational_date    2011-11-02
Name: 0, dtype: object
serial                    3800
latitude                 52.32
longitude                  0.5
llsoacd              E01029945
azimuth                  180.0
tilt                      18.0
kwp                      49.82
operational_date    2012-01-18
Name: 1, dtype: object
serial                    7115
latitude                 54.87
longitude                -1.44
llsoacd              E01008772
azimuth                  180.0
tilt                      30.0
kwp                       3.92
operational_date    2013-03-17
Name: 2, dtype: object
serial                    7497
latitude                 50.19
longitude                 -5.3
llsoacd              E01018872
azimuth                  245.0
tilt                      10.0
k

Unnamed: 0,datetime,serial,produced energy,kwp,year,month,day,hour,minute,temperature_2m,...,day_of_year,day_of_year_sin,day_of_year_cos,month_sin,month_cos,cell_temp,physical_model_prediction,relative_physical_model_prediction,relative_output,region
371296,2011-11-02 02:00:00,3062,0.0,4.0,2011,11,2,2,0,9.2,...,306,-0.85,0.527,-0.5,0.866,9.2,0.0,0.0,0.0,UK
371297,2011-11-02 03:00:00,3062,0.0,4.0,2011,11,2,3,0,9.3,...,306,-0.85,0.527,-0.5,0.866,9.3,0.0,0.0,0.0,UK
371298,2011-11-02 04:00:00,3062,0.0,4.0,2011,11,2,4,0,9.4,...,306,-0.85,0.527,-0.5,0.866,9.4,0.0,0.0,0.0,UK
371299,2011-11-02 05:00:00,3062,0.0,4.0,2011,11,2,5,0,9.7,...,306,-0.85,0.527,-0.5,0.866,9.7,0.0,0.0,0.0,UK
371300,2011-11-02 06:00:00,3062,0.0,4.0,2011,11,2,6,0,10.1,...,306,-0.85,0.527,-0.5,0.866,10.1,0.0,0.0,0.0,UK
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
371290,2021-09-30 19:00:00,26754,0.0,3.5,2021,9,30,19,0,12.1,...,273,-1.00,-0.013,-1.0,-0.000,12.1,0.0,0.0,0.0,UK
371291,2021-09-30 20:00:00,26754,0.0,3.5,2021,9,30,20,0,11.7,...,273,-1.00,-0.013,-1.0,-0.000,11.7,0.0,0.0,0.0,UK
371292,2021-09-30 21:00:00,26754,0.0,3.5,2021,9,30,21,0,11.5,...,273,-1.00,-0.013,-1.0,-0.000,11.5,0.0,0.0,0.0,UK
371293,2021-09-30 22:00:00,26754,0.0,3.5,2021,9,30,22,0,11.6,...,273,-1.00,-0.013,-1.0,-0.000,11.6,0.0,0.0,0.0,UK


In [33]:
portugal_df_featured.to_csv("data/PortugalPhotovoltaicDataset/dataset_feature_engineering.csv", index=False)

In [34]:
uk_df_featured.to_csv("data/UKOpenClimateFix/dataset_feature_engineering.csv", index=False)