# Training method

This training method calculates bias-correction (BC) factors for each individual turbine, which is essentially the highest available resolution, and then averages them.

In [12]:
import numpy as np
import pandas as pd
import xarray as xr
import dask.dataframe as dd
from scipy import interpolate
from sklearn.cluster import KMeans

import itertools
import time
from calendar import monthrange
import matplotlib.pyplot as plt
import seaborn as sns

# from vwf.simulation import simulate_wind
from vwf.preprocessing import (
    prep_era5,
    prep_obs,
    prep_obs_test,
    prep_merra2_method_1
)

pd.options.mode.chained_assignment = None  # default='warn'

In [8]:
def add_times(data):
    """
    Add calendar ``year`` and ``month`` columns derived from ``data['time']``.

    The frame is modified in place and also returned. After the call,
    ``year`` sits at column position 1 and ``month`` at column position 2.
    """
    stamps = pd.DatetimeIndex(data['time'])
    data['year'] = stamps.year
    data['month'] = stamps.month

    # move the freshly written columns to fixed positions 1 and 2
    for slot, label in enumerate(('year', 'month'), start=1):
        data.insert(slot, label, data.pop(label))
    return data

def simulate_wind_speed(reanal_data, turb_info):
    """
    Estimate the wind speed at every turbine in ``turb_info``.

    ``reanal_data`` must expose ``wnd100m`` and ``roughness`` and be
    interpolable along ``x`` and ``y``; ``turb_info`` must have the columns
    ``ID``, ``lat``, ``lon`` and ``height``.

    Returns the interpolated wind speed with a ``turbine`` dimension
    labelled by ``turb_info['ID']``.
    """
    # one height coordinate per distinct hub height found in turb_info
    hub_heights = turb_info['height'].unique()
    gridded = reanal_data.assign_coords(height=('height', hub_heights))

    # scale the 100 m wind speed to each height with a logarithmic profile
    log_ratio = np.log(gridded.height / gridded.roughness) / np.log(100 / gridded.roughness)
    ws = gridded.wnd100m * log_ratio

    turbine_ids = turb_info['ID']

    def per_turbine(column):
        # target coordinate for the interpolation, indexed by turbine ID
        return xr.DataArray(turb_info[column], dims='turbine', coords={'turbine': turbine_ids})

    # NOTE(review): fill_value=None is forwarded to the underlying
    # interpolator; presumably this allows extrapolation outside the
    # grid -- confirm against the xarray/scipy versions in use.
    return ws.interp(
        x=per_turbine('lon'),
        y=per_turbine('lat'),
        height=per_turbine('height'),
        kwargs={"fill_value": None},
    )
    
# def speed_to_power(sim_ws, turb_info, powerCurveFile): 
#     # identifying the model assigned to this turbine ID to access the power curve
#     # and covert the speed into power
#     x = powerCurveFile['data$speed']
#     turb_name = turb_info.loc[turb_info['ID'] == sim_ws.turbine.data, 'model']
#     y = powerCurveFile[turb_name].to_numpy().flatten()
#     f = interpolate.Akima1DInterpolator(x, y)
#     return f(sim_ws.data)


def simulate_wind_train(turb_info, reanal_data, powerCurveFile, *args):
    """
    Simulate the mean power-curve output for the turbine(s) in ``turb_info``.

    Parameters
    ----------
    turb_info : DataFrame with ``ID``, ``model``, ``lat``, ``lon``, ``height``.
        The caller in this notebook passes a single-turbine slice.
    reanal_data : reanalysis data accepted by ``simulate_wind_speed``.
    powerCurveFile : DataFrame with a ``data$speed`` column and one column
        per turbine model.
    *args : exactly two values, ``(scalar, offset)``, applied as
        ``ws * scalar + offset``.

    Returns
    -------
    The mean of the power-curve values interpolated at the corrected speeds.
    """
    scalar, offset = args

    # simulate wind speed and apply bias correction factors
    sim_ws = simulate_wind_speed(reanal_data, turb_info)
    sim_ws = (sim_ws * scalar) + offset

    # Limit the corrected speed to [0, 40] before the power-curve lookup.
    # The original tested an undefined name (`raw_ws`) here, which raises
    # NameError on a fresh kernel; the condition must be evaluated on the
    # corrected speed itself.
    # NOTE(review): 40 is presumably the upper end of the power curve's
    # speed axis -- confirm against the power-curve file.
    sim_ws = sim_ws.where(sim_ws > 0, 0)
    sim_ws = sim_ws.where(sim_ws < 40, 40)

    # convert wind speed to power by interpolating the manufacturer power curve
    x = powerCurveFile['data$speed']
    turb_name = turb_info.loc[turb_info['ID'] == sim_ws.turbine.data, 'model']
    y = powerCurveFile[turb_name].to_numpy().flatten()
    f = interpolate.Akima1DInterpolator(x, y)

    sim_cf = f(sim_ws.data)

    return np.mean(sim_cf)


def find_offset(row,turb_info,reanal_data,powerCurveFile):
    """
    Search for the additive wind-speed offset for one turbine-month.

    ``row`` must expose ``ID``, ``year``, ``month``, ``sim``, ``obs`` and
    ``scalar``. Starting from an offset of 0, the offset is moved in steps
    of 0.64; whenever the simulated mean CF ends up on the side of
    ``row.obs`` that the step was heading towards, the step is reversed and
    halved. The search stops once the step magnitude is no larger than
    0.002, when the offset leaves [-20, 20], or when the simulated mean CF
    is exactly 0 (in which case 0 is returned).

    Returns the final offset.
    """
    # These inputs do not change between iterations, so select them once
    # instead of re-filtering and re-slicing on every step of the search.
    turbine = turb_info[turb_info["ID"]==row.ID]
    month_start = str(row.year)+'-'+str(row.month)+'-01'
    month_end = str(row.year)+'-'+str(row.month)+'-'+str(monthrange(row.year, row.month)[1])
    month_data = reanal_data.sel(time=slice(month_start, month_end))

    myOffset = 0

    # decide our initial search step size
    stepSize = -0.64
    if (row.sim > row.obs):
        stepSize = 0.64

    # Stop when step-size is smaller than our power curve's resolution
    while np.abs(stepSize) > 0.002:
        myOffset += stepSize

        # calculate the mean simulated CF using the new offset
        mean_cf = simulate_wind_train(
            turbine,
            month_data,
            powerCurveFile,
            row.scalar,
            myOffset
        )

        # a mean CF of exactly zero gives nothing to search on: give up
        if mean_cf == 0:
            myOffset = 0
            break

        # if we have overshot our target, then repeat, searching the other direction
        # ((guess < target & sign(step) < 0) | (guess > target & sign(step) > 0))
        if np.sign(mean_cf - row.obs) == np.sign(stepSize):
            stepSize = -stepSize / 2
        # If we have reached unreasonable places, stop
        if myOffset < -20 or myOffset > 20:
            break

    return myOffset


#### processing BCF
def format_bc_factors(num_clu, turb_info, bias_results_path='data/turbine_info/all_bias_results.csv'):
    """
    Cluster turbines by position and average the bias factors per cluster and month.

    Parameters
    ----------
    num_clu : number of k-means clusters.
    turb_info : DataFrame with at least ``ID``, ``lat`` and ``lon``. It is
        not modified; a copy with the added ``cluster`` column and ``ID``
        cast to ``str`` is returned instead.
    bias_results_path : CSV with ``ID``, ``year``, ``month``, ``scalar`` and
        ``offset`` columns. Defaults to the file written by the training step.

    Returns
    -------
    (factors, turb_info) where ``factors`` has one row per (cluster, month)
    with the mean ``scalar`` and ``offset`` plus the labels ``season``
    ('spring', 'summ', 'autum', 'wint') and ``two_month`` ('01'..'06').

    Raises
    ------
    ValueError if the bias results contain a month outside 1..12.
    """
    season_by_month = {
        3: 'spring', 4: 'spring', 5: 'spring',
        6: 'summ', 7: 'summ', 8: 'summ',
        9: 'autum', 10: 'autum', 11: 'autum',
        12: 'wint', 1: 'wint', 2: 'wint',
    }
    # months (1,2) -> '01', (3,4) -> '02', ..., (11,12) -> '06'
    two_month_by_month = {month: '{:02d}'.format((month + 1) // 2) for month in range(1, 13)}

    # work on a copy so repeated calls never alter the caller's frame
    turb_info = turb_info.copy()

    # cluster the turbines on their coordinates; fixed seed keeps labels reproducible
    kmeans = KMeans(
        init="random",
        n_clusters = num_clu,
        n_init = 10,
        max_iter = 300,
        random_state = 42
    )
    kmeans.fit(turb_info[['lat', 'lon']])
    turb_info['cluster'] = kmeans.labels_
    turb_info['ID'] = turb_info['ID'].astype(str)

    # attach each turbine's cluster to its per-month factors, then average
    bias = pd.read_csv(bias_results_path)
    bias['ID'] = bias['ID'].astype(str)
    bias = pd.merge(bias[['ID','year','month','scalar','offset']], turb_info[['cluster', 'ID']],  how='left', on='ID')
    factors = bias.groupby(['cluster', 'month'], as_index=False).agg({'scalar': 'mean', 'offset': 'mean'})

    unknown = ~factors['month'].isin(list(season_by_month))
    if unknown.any():
        raise ValueError(
            "unexpected month value(s) in bias results: "
            + str(sorted(factors.loc[unknown, 'month'].unique().tolist()))
        )

    # coarser time divisions used when selecting factors at test time
    factors['season'] = factors['month'].map(season_by_month)
    factors['two_month'] = factors['month'].map(two_month_by_month)

    return factors, turb_info


def closest_cluster(clus_info, turb_info):
    """
    Assign turbines not found in training data to closest cluster.

    Parameters
    ----------
    clus_info : DataFrame with ``ID``, ``cluster``, ``lat`` and ``lon`` for
        the clustered (training) turbines.
    turb_info : DataFrame with ``ID``, ``lat`` and ``lon`` for the turbines
        that need a cluster label.

    Returns
    -------
    A new frame with one row per row of ``turb_info`` (IDs as ``str``) and a
    ``cluster`` column. Turbines present in ``clus_info`` keep their label;
    the others get the label of the cluster whose mean (lat, lon) is closest
    in plain degree space. Neither input frame is modified.
    """
    # making sure ID column dtype is same (on copies, not on the callers' frames)
    clus_info = clus_info.assign(ID=clus_info['ID'].astype(str))
    turb_info = turb_info.assign(ID=turb_info['ID'].astype(str))

    avg = clus_info.groupby(['cluster'], as_index=False)[['lat','lon']].mean()
    turb_info = pd.merge(clus_info[['ID','cluster']], turb_info, on='ID', how='right')

    # Fill the gaps through a plain array and write the column back once.
    # Chained assignment (`frame.cluster[i] = ...`) is not reliable: under
    # pandas Copy-on-Write it never reaches the frame.
    labels = turb_info['cluster'].to_numpy(copy=True)
    missing = np.flatnonzero(turb_info['cluster'].isna().to_numpy())
    if missing.size:
        centre_lat = avg['lat'].to_numpy()
        centre_lon = avg['lon'].to_numpy()
        centre_label = avg['cluster'].to_numpy()
        lat = turb_info['lat'].to_numpy()
        lon = turb_info['lon'].to_numpy()
        for pos in missing:
            # smallest distance between the new turbine and the cluster centres
            dist = np.sqrt((centre_lat - lat[pos])**2 + (centre_lon - lon[pos])**2)
            labels[pos] = centre_label[np.argmin(dist)]
        turb_info['cluster'] = labels

    return turb_info.reset_index(drop=True)


#### TEST
def speed_to_power(sim_ws, turb_info, powerCurveFile):
    """
    Convert a wide table of wind speeds into power-curve values.

    ``sim_ws`` has one leading column that is left untouched, followed by
    one column per turbine, named by turbine ID. For each turbine column the
    model is looked up in ``turb_info`` (columns ``ID`` and ``model``) and
    the matching column of ``powerCurveFile`` is Akima-interpolated over
    ``powerCurveFile['data$speed']``.

    Returns a converted copy; ``sim_ws`` itself is not changed.
    """
    sim_cf = sim_ws.copy()
    curve_speeds = powerCurveFile['data$speed']

    # position 0 is skipped: every later column holds one turbine's speeds
    for pos in range(1, sim_cf.shape[1]):
        turbine_speeds = sim_cf.iloc[:, pos]
        model = turb_info.loc[turb_info['ID'] == turbine_speeds.name, 'model']
        curve_values = powerCurveFile[model].to_numpy().flatten()
        to_power = interpolate.Akima1DInterpolator(curve_speeds, curve_values)
        sim_cf.iloc[:, pos] = to_power(turbine_speeds)

    return sim_cf

def simulate_wind_test(reanal_data, turb_info, powerCurveFile, time_res='month',train=False, bias_correct=False, *args):
    """
    Simulate wind speed and power-curve output for every turbine.

    Parameters
    ----------
    reanal_data, turb_info : passed on to ``simulate_wind_speed``.
        ``turb_info`` also needs ``model`` and, when bias-correcting,
        ``cluster``.
    powerCurveFile : power-curve table used by ``speed_to_power``.
    time_res : 'year', 'season', 'two_month' or 'month'; the grouping at
        which the bias factors are averaged before being applied.
    train : not used by this function.
    bias_correct : when ``False`` the uncorrected results are returned.
    *args : when bias-correcting, ``args[0]`` is the factor table with
        ``cluster``, ``month``, ``two_month``, ``season``, ``scalar`` and
        ``offset`` columns.

    Returns
    -------
    (wind_speed, capacity_factor): two wide frames with a ``time`` column
    followed by one column per turbine ID.
    """
    unc_ws = simulate_wind_speed(reanal_data, turb_info).to_pandas().reset_index()

    # uncorrected path: convert straight to power and stop
    if bias_correct == False:
        return unc_ws, speed_to_power(unc_ws, turb_info[['ID','model']], powerCurveFile)

    bc_factors = args[0] # reading in bias correction factors
    mean_factors = {'scalar': 'mean', 'offset': 'mean'}

    # long format (one row per time and turbine) so the factors can be merged on
    long_ws = unc_ws.melt(id_vars=["time"], var_name="ID", value_name="ws")
    long_ws = add_times(long_ws)
    long_ws = pd.merge(long_ws, turb_info[['ID', 'cluster']], on='ID', how='left')

    # choose the keys that define the requested time resolution
    if time_res == 'year':
        group_keys = ['cluster']
    else:
        # bring the season / two_month labels in through the month
        long_ws = pd.merge(long_ws, bc_factors[['cluster', 'month','two_month','season']],  how='left', on=['cluster', 'month'])
        group_keys = ['cluster', time_res]

    time_factors = bc_factors.groupby(group_keys, as_index=False).agg(mean_factors)
    long_ws = pd.merge(long_ws, time_factors[group_keys + ['scalar', 'offset']],  how='left', on=group_keys)

    # apply the bias correction factors to wind speed: ws * scalar + offset
    # NOTE(review): the corrected speed is not limited to any range here
    # before the power-curve lookup -- confirm this is intended.
    long_ws['ws'] = (long_ws.ws * long_ws.scalar) + long_ws.offset

    # back to the wide layout expected by speed_to_power
    cor_ws = long_ws.pivot(index=['time'], columns='ID', values='ws').reset_index()
    cor_cf = speed_to_power(cor_ws,turb_info[['ID','model']], powerCurveFile)
    return cor_ws, cor_cf

In [6]:
# Configuration and training inputs. The names defined in this cell are
# used by the later training and test cells.
year_star = 2015 # start year of training period
year_end = 2019 # end year of training period
year_test = 2020 # year you wish to receive a time series for

# Power-curve table: the functions above read a 'data$speed' column and one
# column per turbine model from it.
powerCurveFileLoc = 'data/turbine_info/Wind Turbine Power Curves.csv'
powerCurveFile = pd.read_csv(powerCurveFileLoc)

# NOTE(review): prep_era5(True) presumably selects the training-period
# reanalysis data (the test cell calls prep_era5() without it) -- confirm
# against vwf.preprocessing.
era5_train = prep_era5(True)
# observed values and turbine metadata for the training years, country "DK"
obs_cf, turb_info_train = prep_obs("DK", year_star, year_end)

Number of turbines before preprocessing:  5682
Number of turbines used in training:  3712


In [None]:
# prep training data
# `simulate_wind` is not defined in this notebook (its import is commented
# out), so use the in-notebook `simulate_wind_test`, which takes the same
# three arguments and returns the uncorrected (wind speed, CF) pair.
unc_ws_train, unc_cf_train = simulate_wind_test(era5_train, turb_info_train, powerCurveFile)

# monthly mean simulated CF per turbine, reshaped to one row per (turbine, month)
unc_cf = unc_cf_train.groupby(pd.Grouper(key='time',freq='M')).mean().reset_index()
unc_cf = unc_cf.melt(id_vars=["time"],
                var_name="ID",
                value_name="sim")
unc_cf = add_times(unc_cf)
unc_cf['ID'] = unc_cf['ID'].astype(str)
unc_cf['month'] = unc_cf['month'].astype(int)
unc_cf['year'] = unc_cf['year'].astype(int)

# Observed CF in the same long layout. Work on a copy: renaming the columns
# of an alias would also rename them on `obs_cf`.
# NOTE(review): the rename assumes the columns arrive ordered as ID, the
# twelve months, then year -- confirm against prep_obs.
obs_cf2 = obs_cf.copy()
obs_cf2.columns = ['ID','1','2','3','4','5','6','7','8','9','10','11','12','year']
obs_cf2 = obs_cf2.melt(id_vars=["ID", "year"],
                var_name="month",
                value_name="obs")
obs_cf2['ID'] = obs_cf2['ID'].astype(str)
obs_cf2['month'] = obs_cf2['month'].astype(int)
obs_cf2['year'] = obs_cf2['year'].astype(int)

# pair simulated and observed CF and derive the multiplicative factor
train_cf = pd.merge(unc_cf, obs_cf2, on=['ID','month', "year"], how='left')
scalar_alpha = 0.6
scalar_beta = 0.2
train_cf['scalar'] = (scalar_alpha * (train_cf['obs']/train_cf['sim'])) + scalar_beta
train_cf = train_cf.drop(['time'], axis=1).reset_index(drop=True)

train_cf.head(5)

In [None]:
%%time
def find_offset_parallel(df):
    """Run ``find_offset`` on every row of one partition of the training table.

    ``turb_info_train``, ``era5_train`` and ``powerCurveFile`` are taken from
    the notebook's global scope.
    """
    return df.apply(find_offset, args=(turb_info_train,era5_train,powerCurveFile), axis=1)

# split the training table into 40 partitions so the offset search can be
# evaluated partition by partition
ddf = dd.from_pandas(train_cf, npartitions=40)
# meta declares the result as a float column named 'offset'
ddf["offset"] = ddf.map_partitions(find_offset_parallel, meta=('offset', 'f8'))
# writing the CSV triggers the computation, using the process-based scheduler;
# this file is read back by format_bc_factors
ddf.to_csv('data/turbine_info/all_bias_results.csv', single_file=True, compute_kwargs={'scheduler':'processes'})

## Test

In [4]:
# Inputs for the test year.
# NOTE(review): prep_era5() without arguments presumably loads the
# test-period reanalysis data -- confirm against vwf.preprocessing.
era5_test = prep_era5()
# turbine metadata for country "DK" in the test year
turb_info_test = prep_obs_test("DK", year_test)

In [11]:
time_res_list = ['year', 'season', 'two_month', 'month'] 
cluster_list = [3000,3711]#[1,2,3,4,5,6,8,10,15,20,30,50,100,150,200,300,400,500,750,1000,1500,2000,3000,3711] # [1,2,3,5,10,15,20,30,50,100,150,200,300,400]

# uncorrected baseline for the test year
unc_ws, unc_cf = simulate_wind_test(era5_test, turb_info_test, powerCurveFile)
unc_ws.to_csv('data/results/new/'+str(year_test)+'_unc_ws.csv', index = None)
unc_cf.to_csv('data/results/new/'+str(year_test)+'_unc_cf.csv', index = None)

for num_clu in cluster_list:
    # The clustering and the cluster assignment of the test turbines depend
    # only on the number of clusters, so compute them once per num_clu
    # instead of once per (num_clu, time_res) pair. The elapsed time printed
    # below therefore covers the simulation and saving only.
    bc_factors, clus_info = format_bc_factors(num_clu, turb_info_train)
    clus_info_test = closest_cluster(clus_info, turb_info_test)

    for time_res in time_res_list:
        # producing corrected results
        print("Test for ", year_test, " using ", num_clu, " clusters with time resolution: ", time_res, " is taking place.")
        start_time = time.time()

        cor_ws, cor_cf = simulate_wind_test(era5_test, clus_info_test, powerCurveFile, time_res, False, True, bc_factors)

        cor_ws.to_csv('data/results/new/'+str(year_test)+'_'+time_res+'_'+str(num_clu)+'_cor_ws.csv', index = None)
        cor_cf.to_csv('data/results/new/'+str(year_test)+'_'+time_res+'_'+str(num_clu)+'_cor_cf.csv', index = None)

        end_time = time.time()
        elapsed_time = end_time - start_time
        print("Results completed and saved. Elapsed time: {:.2f} seconds".format(elapsed_time))
        print(" ")

Test for  2020  using  3000  clusters with time resolution:  year  is taking place.
Results completed and saved. Elapsed time: 11.21 seconds
 
Test for  2020  using  3000  clusters with time resolution:  season  is taking place.
Results completed and saved. Elapsed time: 11.86 seconds
 
Test for  2020  using  3000  clusters with time resolution:  two_month  is taking place.
Results completed and saved. Elapsed time: 11.26 seconds
 
Test for  2020  using  3000  clusters with time resolution:  month  is taking place.
Results completed and saved. Elapsed time: 11.29 seconds
 
Test for  2020  using  3711  clusters with time resolution:  year  is taking place.
Results completed and saved. Elapsed time: 11.67 seconds
 
Test for  2020  using  3711  clusters with time resolution:  season  is taking place.
Results completed and saved. Elapsed time: 12.20 seconds
 
Test for  2020  using  3711  clusters with time resolution:  two_month  is taking place.
Results completed and saved. Elapsed time: 

In [None]:
# cor = cor_cf
# cor = cor.melt(id_vars=["time"], # adding in turbine ID for merging
#         var_name="ID", 
#         value_name="cor")
# cor_month = cor.groupby([pd.Grouper(key='time',freq='M')])['cor'].mean().reset_index()
    
# unc = unc_cf
# unc = unc.melt(id_vars=["time"], # adding in turbine ID for merging
#     var_name="ID", 
#     value_name="unc")
# unc_month = unc.groupby([pd.Grouper(key='time',freq='M')])['unc'].mean().reset_index()

# obs = pd.read_csv('data/wind_data/DK/obs_cf_test.csv', parse_dates=['time'])
# obs = obs.melt(id_vars=["time"], # adding in turbine ID for merging
#     var_name="ID", 
#     value_name="obs")
# obs_month = obs.groupby([pd.Grouper(key='time',freq='M')])['obs'].mean().reset_index()

# cf_month = obs_month.merge(unc_month,on=['time']).merge(cor_month,on=['time'])
# cf_month.columns = ["time", 'obs', 'unc', "cor"]
# cf_month["time"] = cf_month["time"].dt.month
# cf_month = cf_month.melt(id_vars=["time"],
#             var_name="model", 
#             value_name="CF")


# sns.lineplot(
#     data = cf_month,
#     x="time",
#     y="CF",
#     hue="model",
#     style="model",
#     legend = True,
# )

### Attempt to speed up the speed-to-power conversion with dask

In [None]:
# def f1(column, turb_info):
#     turb_name = turb_info.loc[turb_info['ID'] == column.columns, 'model']
#     x = powerCurveFile['data$speed']
#     y = powerCurveFile[turb_name]
#     f = interpolate.Akima1DInterpolator(x, y)
#     return df.asign(f(column))

# def f2(df):
#     return df.apply(f1, args=(turb_info_test), axis=0)

# try_cf = unc_ws.iloc[:,:1]
# ddf = dd.from_pandas(unc_ws.iloc[:,1:], npartitions=40)
# p = ddf.map_partitions(f2, meta=(None, 'f8'))

In [None]:
# %%time
# p.compute(scheduler='processes')