# Download Plant Data from Zenodo

## Improvements
1. Clean up and refactor into functions/classes
2. Format for easy reading into OpenOA

In [1]:
import requests
import os
import hashlib

from pathlib import Path
from zipfile import ZipFile
import pandas as pd

In [2]:
def download_file(url,outfile):
    """Stream the resource at ``url`` into the local file ``outfile``.

    The body is fetched in 1 MiB chunks so large archives never have to fit
    in memory, and a running chunk count is printed as a progress indicator.

    Args:
        url: address of the file to download.
        outfile: path of the file to write (overwritten if it exists).

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
    """
    chunk_size = 1024 * 1024

    # The context manager releases the connection even if writing fails.
    with requests.get(url, stream=True) as response:

        # Check the status before opening the output file, so that an error
        # page is never saved as data and an existing file is not truncated
        # by a failed request.
        response.raise_for_status()

        chunk_number = 0
        with open(outfile, 'wb') as f:

            for chunk in response.iter_content(chunk_size=chunk_size):

                if not chunk:  # filter out keep-alive chunks
                    continue

                f.write(chunk)

                # Count only chunks that carried data.
                chunk_number += 1
                print(str(chunk_number) + ' MB downloaded', end='\r')

In [3]:
def _md5_hexdigest(path, chunk_size=8192):
    """Return the hex MD5 digest of the file at ``path``, read in chunks."""
    file_hash = hashlib.md5()
    with open(path, 'rb') as f_check:
        while chunk := f_check.read(chunk_size):
            file_hash.update(chunk)
    return file_hash.hexdigest()


def download_zenodo_data(record_id,outfile_path):
    """Download every file of a Zenodo record into ``outfile_path``.

    Outputs written to ``outfile_path`` (created if missing):
      1. ``record_details.json`` - the raw Zenodo API response for the record
      2. all files listed under the record's ``files`` entry

    A file that is already present is downloaded again only when its MD5
    digest differs from the checksum reported by the API.

    Args:
        record_id: Zenodo record identifier (anything convertible with str()).
        outfile_path: destination directory; a trailing separator is optional.

    Raises:
        requests.HTTPError: if the Zenodo API request returns an error status.
    """
    url_zenodo = r'https://zenodo.org/api/records/'

    record_id = str(record_id)

    r = requests.get(url_zenodo + record_id)

    # Stop here on an unknown record or API error instead of failing later
    # with a confusing KeyError on the error payload.
    r.raise_for_status()

    r_json = r.json()

    print('======')
    print('Title: ' + r_json['metadata']['title'])
    print('Version: ' + r_json['metadata']['version'])
    print('URL: ' + r_json['links']['latest_html'])
    print('Record DOI: ' + r_json['doi'])
    print('License: ' + r_json['metadata']['license']['id'])
    print('======\n')

    # create outfile_path if it does not exist (exist_ok avoids the
    # check-then-create race)
    os.makedirs(outfile_path, exist_ok=True)

    # save record details to json file; os.path.join keeps the path correct
    # whether or not outfile_path ends with a separator
    outfile = os.path.join(outfile_path, 'record_details.json')

    with open(outfile, 'wb') as f:
        f.write(r.content)

    # download all files
    for file_entry in r_json['files']:

        url_file = file_entry['links']['self']
        file_name = file_entry['key']
        outfile = os.path.join(outfile_path, file_name)

        already_present = os.path.exists(outfile)

        if already_present:
            # The first four characters of the reported checksum are dropped
            # before comparing (presumably an "md5:" prefix - TODO confirm).
            if file_entry['checksum'][4:] == _md5_hexdigest(outfile):
                print('File already exists: ' + file_name)
                continue

        # Reached for missing files and for files with a stale checksum.
        # A leading blank line is printed only for files that were missing.
        print(('' if already_present else '\n') + 'Downloading: ' + file_name)
        print('File size: ' + str(round(file_entry['size']/(1024*1024),2)) + 'MB')

        download_file(url_file,outfile)

        print('Saved to: ' + outfile + '\n')

In [4]:
def download_asset_data(asset="kelmarsh",outfile_path="data/kelmarsh/"):
    """Download a known open-data asset from Zenodo by name.

    The asset name is matched case-insensitively against the records known
    to this function and the matching record is downloaded into
    ``outfile_path``.

    Raises:
        NameError: if no Zenodo record id is defined for ``asset``.
    """
    known_record_ids = {
        "kelmarsh": 7212475,
        "penmanshiel": 5946808,
    }

    asset_key = asset.lower()

    if asset_key not in known_record_ids:
        raise NameError("Zenodo record id undefined for: " + asset)

    download_zenodo_data(known_record_ids[asset_key],outfile_path)

In [5]:
def extract_all_data(path="data/kelmarsh/"):
    """
    Unpack every ``*.zip`` archive found under ``path`` (searched
    recursively) directly into ``path`` itself.
    """
    print("Extracting compressed data files")

    for archive_path in Path(path).rglob('*.zip'):
        with ZipFile(archive_path) as archive:
            # Members always land in `path`, wherever the archive was found.
            archive.extractall(path)

In [6]:
def get_scada_headers(SCADA_files):
    """
    Collect the metadata preamble of each SCADA file into one table.

    From every file, four "name: value" lines (after skipping the first two
    lines) are read. The result has one row per file: a 'File' column holding
    the entry from SCADA_files, plus one column per preamble name with the
    leading '# ' removed.
    """
    read_options = dict(
        index_col=0,
        skiprows=2,
        nrows=4,
        delimiter=': ',
        header=None,
        engine='python',
    )

    # One single-column frame per file, the column labelled with the file.
    per_file = []
    for csv_file in SCADA_files:
        preamble = pd.read_csv(csv_file, **read_options)
        per_file.append(preamble.rename(columns={1: csv_file}))

    header_table = pd.concat(per_file, axis=1)
    header_table.index = header_table.index.str.replace('# ','')

    # Flip to one row per file and expose the file label as a regular column.
    by_file = header_table.transpose().reset_index()

    return by_file.rename(columns={'index':'File'})

In [7]:
def get_scada_df(SCADA_headers):
    """
    Load the selected SCADA signals of every turbine into one long table.

    SCADA_headers must provide 'Turbine' and 'File' columns. For each distinct
    turbine its files are read (skipping the first nine lines of each),
    stacked, and tagged with a 'Turbine' column; the parsed date index is
    named 'Timestamp'. The per-turbine tables are then stacked in turn.
    """
    wanted_columns = [
        '# Date and time',
        'Power (kW)',
        'Wind speed (m/s)',
        'Wind direction (°)',
        'Nacelle position (°)',
        'Nacelle ambient temperature (°C)',
        'Blade angle (pitch position) A (°)',
    ]

    read_options = dict(
        index_col='# Date and time',
        parse_dates=True,
        skiprows=9,
        usecols=wanted_columns,
    )

    per_turbine = []
    for turbine in SCADA_headers['Turbine'].unique():
        belongs_to_turbine = SCADA_headers['Turbine'] == turbine
        turbine_files = SCADA_headers.loc[belongs_to_turbine, 'File'].tolist()

        turbine_frame = pd.concat([pd.read_csv(path, **read_options) for path in turbine_files])
        turbine_frame['Turbine'] = turbine
        turbine_frame.index.names = ['Timestamp']

        per_turbine.append(turbine_frame)

    return pd.concat(per_turbine)

In [8]:
def get_curtailment_df(SCADA_headers):
    """
    Build plant-level energy, downtime-loss and curtailment-loss series.

    SCADA_headers must provide 'Turbine' and 'File' columns. The three energy
    columns are read from every listed file (skipping the first nine lines of
    each) and summed across all turbines per timestamp.

    Returns a DataFrame indexed by 'Timestamp' that contains only the three
    numeric energy columns.
    """
    usecolumns = ['# Date and time', 'Lost Production to Curtailment (Total) (kWh)', "Lost Production to Downtime (kWh)", "Energy Export (kWh)"]

    csv_params = {'index_col':'# Date and time','parse_dates':True,'skiprows':9,'usecols':usecolumns}

    SCADA_lst = list()
    for turbine in SCADA_headers['Turbine'].unique():
        turbine_files = SCADA_headers.loc[SCADA_headers['Turbine'] == turbine, 'File']
        # No 'Turbine' label is attached here: the rows are summed across
        # turbines below, and on pandas >= 2.0 a string column would be
        # concatenated by sum() and leak into the result.
        SCADA_lst.append(pd.concat(pd.read_csv(f,**csv_params) for f in turbine_files))

    SCADA = pd.concat(SCADA_lst)
    SCADA.index.names = ['Timestamp']

    # Collapse all turbines into one plant-level row per timestamp.
    return SCADA.groupby(['Timestamp']).sum()

In [9]:
# Name of the wind farm dataset used in this notebook.
# NOTE(review): the cells visible below pass the literal "kelmarsh" and
# 'data/kelmarsh/' paths rather than this variable - confirm whether it is
# meant to drive them.
asset = 'kelmarsh'

In [10]:
# Fetch the Kelmarsh Zenodo record into data/kelmarsh/; files that are already
# present with a matching checksum are reported and not downloaded again.
download_asset_data(asset="kelmarsh",outfile_path="data/kelmarsh/")    

Title: Kelmarsh wind farm data
Version: 0.0.4
URL: https://zenodo.org/record/7212475
Record DOI: 10.5281/zenodo.7212475
License: CC-BY-4.0

File already exists: Kelmarsh_12.3MW_6xSenvion_MM92.kmz
File already exists: Kelmarsh_era5.zip
File already exists: Kelmarsh_Grid_3088.zip
File already exists: Kelmarsh_merra2.zip
File already exists: Kelmarsh_PMU_3089.zip
File already exists: Kelmarsh_SCADA_2016_3082.zip
File already exists: Kelmarsh_SCADA_2017_3083.zip
File already exists: Kelmarsh_SCADA_2018_3084.zip
File already exists: Kelmarsh_SCADA_2019_3085.zip
File already exists: Kelmarsh_SCADA_2020_3086.zip
File already exists: Kelmarsh_SCADA_2021_3087.zip
File already exists: Kelmarsh_WT_dataSignalMapping.csv
File already exists: Kelmarsh_WT_static.csv


In [11]:
# Unpack every downloaded zip archive into data/kelmarsh/.
extract_all_data(path="data/kelmarsh/")

Extracting compressed data files


In [12]:
# Turbine SCADA exports found anywhere below the data folder. rglob() yields
# a one-shot generator in filesystem order, so the matches are materialised
# as a sorted list: the cell result can then be reused and the file order is
# the same on every run.
SCADA_files = sorted(Path('data/kelmarsh/').rglob('Turbine_Data*.csv'))

In [13]:
# One row per SCADA file: its 'File' entry plus the metadata read from the
# file preamble.
SCADA_headers = get_scada_headers(SCADA_files)

In [14]:
# Long-format SCADA table: selected signals for all turbines, indexed by
# 'Timestamp' and labelled with a 'Turbine' column.
SCADA = get_scada_df(SCADA_headers)

In [15]:
# Bare expression: renders the SCADA table as the cell output.
SCADA

Unnamed: 0_level_0,Wind speed (m/s),Wind direction (°),Nacelle position (°),Power (kW),Nacelle ambient temperature (°C),Blade angle (pitch position) A (°),Turbine
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2016-01-03 00:00:00,,,,,,,Kelmarsh 1
2016-01-03 00:10:00,,,,,,,Kelmarsh 1
2016-01-03 00:20:00,,,,,,,Kelmarsh 1
2016-01-03 00:30:00,,,,,,,Kelmarsh 1
2016-01-03 00:40:00,,,,,,,Kelmarsh 1
...,...,...,...,...,...,...,...
2021-06-30 23:10:00,3.17,33.330002,34.240002,9.880000,,,Kelmarsh 6
2021-06-30 23:20:00,3.64,33.540001,34.240002,52.250000,,,Kelmarsh 6
2021-06-30 23:30:00,3.53,36.900002,34.240002,40.950001,,,Kelmarsh 6
2021-06-30 23:40:00,3.10,29.350000,34.240002,17.230000,,,Kelmarsh 6


In [16]:
# Energy export, downtime losses and curtailment losses summed over all
# turbines for each timestamp.
curtailment = get_curtailment_df(SCADA_headers)

In [17]:
# Bare expression: renders the curtailment table as the cell output.
curtailment

Unnamed: 0_level_0,Energy Export (kWh),Lost Production to Downtime (kWh),Lost Production to Curtailment (Total) (kWh)
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2016-01-03 00:00:00,0.0,0.0,0.0
2016-01-03 00:10:00,0.0,0.0,0.0
2016-01-03 00:20:00,0.0,0.0,0.0
2016-01-03 00:30:00,0.0,0.0,0.0
2016-01-03 00:40:00,0.0,0.0,0.0
...,...,...,...
2021-06-30 23:10:00,59.0,0.0,0.0
2021-06-30 23:20:00,104.0,0.0,0.0
2021-06-30 23:30:00,83.0,0.0,0.0
2021-06-30 23:40:00,47.0,0.0,0.0


In [18]:
# Static per-turbine metadata (coordinates, rated power, hub height, ...)
# shipped with the Zenodo record.
asset_data = pd.read_csv('data/kelmarsh/Kelmarsh_WT_static.csv')

In [19]:
# Bare expression: renders the turbine metadata table as the cell output.
asset_data

Unnamed: 0,Wind Farm,Title,Alternative Title,Identity,Manufacturer,Model,Rated power (kW),Hub Height (m),Rotor Diameter (m),Latitude,Longitude,Elevation (m),Country,Commercial Operations Date
0,Kelmarsh,Kelmarsh 1,KWF1,SEN 93420,Senvion,MM92,2050,78.5,92,52.400604,-0.947133,145.598,UK,15/04/2016
1,Kelmarsh,Kelmarsh 2,KWF2,SEN 93421,Senvion,MM92,2050,78.5,92,52.402551,-0.949527,156.577,UK,15/04/2016
2,Kelmarsh,Kelmarsh 3,KWF3,SEN 93422,Senvion,MM92,2050,68.5,92,52.403834,-0.94419,153.477,UK,15/04/2016
3,Kelmarsh,Kelmarsh 4,KWF4,SEN 93423,Senvion,MM92,2050,78.5,92,52.398781,-0.94115,146.313,UK,15/04/2016
4,Kelmarsh,Kelmarsh 5,KWF5,SEN 93424,Senvion,MM92,2050,78.5,92,52.402308,-0.940537,142.901,UK,15/04/2016
5,Kelmarsh,Kelmarsh 6,KWF6,SEN 93425,Senvion,MM92,2050,68.5,92,52.400687,-0.936093,135.039,UK,15/04/2016


In [20]:
# Load the energy export column from the PMU device export, indexed by the
# parsed '# Date and time' column.
usecolumns = ['# Date and time','GMS Energy Export (kWh)']

# NOTE(review): 10 lines are skipped here versus 9 in the turbine SCADA
# readers - presumably this export has a longer preamble; confirm against
# the file.
csv_params = {'index_col':'# Date and time','parse_dates':True,'skiprows':10,'usecols':usecolumns}

meter_data = pd.read_csv('data/kelmarsh/Device_Data_Kelmarsh_PMU_2016-01-09_-_2021-07-01_234.csv',**csv_params)

In [21]:
# Bare expression: renders the meter data table as the cell output.
meter_data

Unnamed: 0_level_0,GMS Energy Export (kWh)
# Date and time,Unnamed: 1_level_1
2016-01-09 17:10:00,
2016-01-09 17:20:00,
2016-01-09 17:30:00,
2016-01-09 17:40:00,
2016-01-09 17:50:00,
...,...
2021-06-30 23:10:00,341.260829
2021-06-30 23:20:00,341.260829
2021-06-30 23:30:00,341.260829
2021-06-30 23:40:00,341.260829


In [24]:
# Mapping from generic field names to the column names used in the Kelmarsh
# tables loaded above, grouped by data type (presumably the metadata layout
# OpenOA expects - TODO confirm against the OpenOA schema).
# NOTE(review): the "time" entries under "curtail" and "scada" name
# '# Date and time', but get_curtailment_df / get_scada_df rename that index
# to 'Timestamp' - confirm which name the consumer needs.
asset_json = {
    
  # columns of the static turbine table (asset_data)
  "asset": {
    "elevation": "Elevation (m)",
    "hub_height": "Hub Height (m)",
    "id": "Title",
    "latitude": "Latitude",
    "longitude": "Longitude",
    "rated_power": "Rated power (kW)",
    "rotor_diameter": "Rotor Diameter (m)"
  },
    
  # columns of the plant-level table built by get_curtailment_df
  "curtail": {
    "availability": "Lost Production to Downtime (kWh)",
    "curtailment": "Lost Production to Curtailment (Total) (kWh)",
    "frequency": "10T",
    "net_energy": "Energy Export (kWh)",
    "time": "# Date and time"
  },
    
  # plant location taken as the mean of the turbine coordinates
  "latitude": asset_data['Latitude'].mean(),
  "longitude": asset_data['Longitude'].mean(),
    
  # columns of the PMU meter table (meter_data)
  "meter": {
    "energy": "GMS Energy Export (kWh)",
    "time": "# Date and time"
  },
    
  # column names for the reanalysis products; those files are not read in the
  # cells visible here, so these names are unverified - TODO confirm
  "reanalysis": {
      
    "era5": {
      "frequency": "H",
      "surface_pressure": "surf_pres",
      "temperature": "t_2m",
      "time": "datetime",
      "windspeed_u": "u_100",
      "windspeed_v": "v_100"
    },
      
    "merra2": {
      "frequency": "H",
      "surface_pressure": "surface_pressure",
      "temperature": "temp_10m",
      "time": "datetime",
      "windspeed_u": "u_50",
      "windspeed_v": "v_50"
    }
  },
    
  # columns of the per-turbine table built by get_scada_df
  "scada": {
    "frequency": "10T",
    "id": "Turbine",
    "pitch": "Blade angle (pitch position) A (°)",
    "power": "Power (kW)",
    "temperature": "Nacelle ambient temperature (°C)",
    "time": "# Date and time",
    "wind_direction": "Wind direction (°)",
    "windspeed": "Wind speed (m/s)"
  }
}