For training we require
- turb_info
- obs_cf
- sim_cf (unc_cf or cor_cf)
- sim_ws (unc_ws or cor_ws)

For testing we require
- bc_factors
- turb_info

In [1]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import utm
from calendar import monthrange

### (1) Producing `turb_info`, a file consisting of all features of a turbine.
For Denmark's data, a lot of manual manipulation of the Excel file was required. I had to manually match the turbines that exist in the power curves file with Denmark's naming convention, and then match them to the IDs. `anlaeg.xlsx` is the raw file and `match_turb_dk.xlsx` is where the matching is done.

After this we are required to fill in missing turbine matches and also convert the coordinate system.

In [2]:
# Load the manually matched turbine register and keep only the columns used below.
df = pd.read_excel('../ninja-reimplementation/data/wind_data/DK/match_turb_dk.xlsx')
columns = ['Turbine identifier (GSRN)','Capacity (kW)','X (east) coordinate\nUTM 32 Euref89','Y (north) coordinate\nUTM 32 Euref89','Hub height (m)', 'Date of original connection to grid', 'turb_match']
rename_col = ['gsrn_id','capacity','x_east_32','y_north_32','height', 'date', 'model']
# Explicit copy so the in-place edits below act on an independent frame,
# not on a selection of the raw Excel frame.
df = df[columns].copy()
df.columns = rename_col
df = df.dropna()

# matching closest models to capacity
metadata = pd.read_csv('../ninja-reimplementation/data/turbine_info/models.csv')
metadata = metadata.sort_values('capacity')

# Rows whose model is 0 are treated as "no match" and turned into NaN.
# A single .loc write is used instead of chained indexing
# (df['model'][mask] = ...), which triggers SettingWithCopyWarning and does
# not modify df at all when pandas Copy-on-Write is active.
df.loc[df['model'] == 0, 'model'] = np.nan
df['capacity'] = df['capacity'].astype(int)
# merge_asof needs both sides sorted on the key; the fresh RangeIndex also
# makes the merge result line up row-for-row with df for the assignment below.
df = df.sort_values('capacity').reset_index(drop=True)
nearest_model = pd.merge_asof(df, metadata, left_on=["capacity"], right_on=["capacity"], direction="nearest")['model_y']
# Only rows still lacking a model receive the nearest-capacity model.
df.loc[df['model'].isna(), 'model'] = nearest_model

# convert coordinate system
def rule(row):
    """Convert one turbine row from UTM zone 32 coordinates to latitude/longitude.

    Parameters
    ----------
    row : mapping with the keys "x_east_32" (easting) and "y_north_32" (northing).

    Returns
    -------
    pd.Series with the entries "lat" and "lon".
    """
    easting = row["x_east_32"]
    northing = row["y_north_32"]
    # NOTE(review): 'W' is not the latitude band that covers Denmark; presumably
    # utm only uses the letter to pick the hemisphere, so the result is
    # unaffected -- confirm against the utm documentation.
    latitude, longitude = utm.to_latlon(easting, northing, 32, 'W')
    return pd.Series({"lat": latitude, "lon": longitude})

# Compute lat/lon per row and attach them to the turbine table via the row index.
latlon = df.apply(rule, axis=1)
df = df.merge(latlon, left_index=True, right_index=True)

# Keep the final feature columns and store the turbine identifier as a string.
feature_cols = ['gsrn_id','capacity','lat','lon','height', 'date', 'model']
df = df[feature_cols].assign(gsrn_id=lambda frame: frame['gsrn_id'].astype(str))

# Discard turbines whose hub height is below 1.
height_below_one = df['height'] < 1
turb_info = df[~height_below_one].reset_index(drop=True)
turb_info

Unnamed: 0,gsrn_id,capacity,lat,lon,height,date,model
0,571313103103601207,10,57.451597,9.840847,21.0,2014-11-06,Bonus.B23.150
1,570715000001496560,10,56.635184,8.495377,18.0,2015-12-08,Bonus.B23.150
2,570715000001496546,10,56.637878,8.499038,21.4,2015-12-30,Bonus.B23.150
3,570715000001488442,10,56.655522,8.541943,21.4,2015-11-21,Bonus.B23.150
4,570715000001451262,10,56.606620,8.524307,18.2,2015-05-15,Bonus.B23.150
...,...,...,...,...,...,...,...
5614,570715000001543059,8400,54.985531,13.014996,104.0,2021-03-25,Siemens.Gamesa.SG.8.0.167
5615,570715000001616975,8600,57.056983,8.883934,120.5,2018-10-05,MHI.Vestas.V164.8400
5616,570715000001663276,9500,57.073458,8.883321,130.0,2020-01-28,MHI.Vestas.V164.8400
5617,570715000001661395,10000,57.046510,8.883422,140.0,2020-02-27,Siemens.Gamesa.SG.10.0.193


### (3) Matching observational data with turbines/farms in `turb_info`

In [3]:
def prep_obs_denmark(year_star, year_end):
    """
    Read the yearly Danish observation workbooks and stack them into one table.

    One file ``Denmark_<year>.xlsx`` is read for every year from ``year_star``
    to ``year_end`` (both inclusive).

    Returns a DataFrame with the columns ``gsrn_id`` (as str), ``obs_1`` ...
    ``obs_12`` and ``year``, indexed 0..n-1.
    """
    month_cols = [f'obs_{month}' for month in range(1, 13)]
    # the slicing done here is file dependent please consider this when other files are used:
    # column 0 holds the identifier, columns 3-14 the twelve monthly values.
    wanted_cols = [0, *range(3, 15)]

    yearly_frames = []
    for year in range(year_star, year_end + 1):
        sheet = pd.read_excel(f'../ninja-reimplementation/data/wind_data/DK/observation/Denmark_{year}.xlsx')
        # The first three rows are skipped before the data rows start.
        sheet = sheet.iloc[3:, wanted_cols].reset_index(drop=True)
        sheet.columns = ['gsrn_id'] + month_cols
        sheet['gsrn_id'] = sheet['gsrn_id'].astype(str)
        sheet['year'] = year
        # The last row of every sheet is left out.
        yearly_frames.append(sheet.iloc[:-1])

    return pd.concat(yearly_frames).reset_index(drop=True)

obs_gen = prep_obs_denmark(2015,2019)

# converting obs_gen into obs_cf by turning power into capacity factor
# making sure turb info only consists of turbines we have observed
has_observation = turb_info['gsrn_id'].isin(obs_gen['gsrn_id'])
turb_info = turb_info[has_observation].reset_index(drop=True)

# Attach each turbine's capacity to its observations; rows with any missing
# value (including observations without a matching turbine) are discarded.
capacity_by_id = turb_info[['gsrn_id', 'capacity']]
df = (
    obs_gen
    .merge(capacity_by_id, how='left', on='gsrn_id')
    .dropna()
    .reset_index(drop=True)
)

def daysDuringMonth(yy, m):
    """Return the number of days in month ``m`` for every year in ``yy``.

    Parameters
    ----------
    yy : iterable of years.
    m : month number (1-12).

    Returns
    -------
    list of int, one entry per element of ``yy`` and in the same order.
    """
    # monthrange returns (weekday of the first day, number of days); only the
    # day count is needed. Built directly instead of using a comprehension
    # purely for its append side effects.
    return [monthrange(y, m)[1] for y in yy]

# Divide every monthly observation by (days in that month * capacity * 24).
for month in range(1, 13):
    month_col = f'obs_{month}'
    full_output = daysDuringMonth(df['year'], month) * df['capacity'] * 24
    df[month_col] = df[month_col] / full_output

# Columns 1-12 are the twelve obs_* columns; rows whose mean over them is at
# most 0.01 are removed.
df['cf_mean'] = df.iloc[:, 1:13].mean(axis=1)
mean_too_low = df['cf_mean'] <= 0.01
df = df[~mean_too_low]
obs_cf = df.drop(columns=['capacity', 'cf_mean']).reset_index(drop=True)

obs_cf

Unnamed: 0,gsrn_id,obs_1,obs_2,obs_3,obs_4,obs_5,obs_6,obs_7,obs_8,obs_9,obs_10,obs_11,obs_12,year
0,570714700000000027,0.264928,0.208611,0.157587,0.138852,0.1542,0.099599,0.129444,0.135741,0.160123,0.148698,0.209259,0.286195,2015
1,570714700000000034,0.264928,0.208611,0.157587,0.138852,0.1542,0.099599,0.129444,0.135741,0.160123,0.148698,0.209259,0.286195,2015
2,570714700000000041,0.24537,0.161771,0.14744,0.179567,0.091983,0.110428,0.142417,0.103468,0.135271,0.188461,0.216014,0.225862,2015
3,570714700000000058,0.24537,0.161771,0.14744,0.179567,0.091983,0.110428,0.142417,0.103468,0.135271,0.188461,0.216014,0.225862,2015
4,570714700000000065,0.24537,0.161771,0.14744,0.179567,0.091983,0.110428,0.142417,0.103468,0.135271,0.188461,0.216014,0.225862,2015
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25182,571313174116016241,0.097581,0.085714,0.11828,0.092222,0.064113,0.04125,0.072984,0.039382,0.099722,0.05672,0.092083,0.106183,2019
25183,571313174116025533,0.07414,0.031429,0.062634,0.04,0.034409,0.019222,0.019892,0.006774,0.037778,0.024946,0.036556,0.052849,2019
25184,571313174116253585,0.267465,0.194851,0.327062,0.138036,0.213049,0.147462,0.19441,0.092608,0.23151,0.15308,0.14344,0.257954,2019
25185,571313174116620486,0.087366,0.066845,0.129731,0.057944,0.109785,0.053556,0.102903,0.037151,0.1215,0.068118,0.057611,0.098817,2019


In [6]:
# Turbines that have an obs_cf row for the year 2019.
ids_2019 = obs_cf.loc[obs_cf['year'] == 2019, 'gsrn_id']
turb_info_test = turb_info[turb_info['gsrn_id'].isin(ids_2019)].reset_index(drop=True)
turb_info_test

Unnamed: 0,gsrn_id,capacity,lat,lon,height,date,model
0,571313103103601207,10,57.451597,9.840847,21.0,2014-11-06,Bonus.B23.150
1,570715000001496560,10,56.635184,8.495377,18.0,2015-12-08,Bonus.B23.150
2,570715000001496546,10,56.637878,8.499038,21.4,2015-12-30,Bonus.B23.150
3,570715000001488442,10,56.655522,8.541943,21.4,2015-11-21,Bonus.B23.150
4,570715000001451262,10,56.606620,8.524307,18.2,2015-05-15,Bonus.B23.150
...,...,...,...,...,...,...,...
5369,570715000001613844,8300,55.701501,7.683107,105.0,2018-12-23,MHI.Vestas.V164.8400
5370,570715000001613851,8300,55.679050,7.687010,105.0,2018-12-23,MHI.Vestas.V164.8400
5371,570715000001613813,8300,55.731253,7.684685,105.0,2018-12-23,MHI.Vestas.V164.8400
5372,570715000001613769,8300,55.661573,7.668805,105.0,2018-12-23,MHI.Vestas.V164.8400
