# Training method v2

This training method simulates the capacity factor (CF) of every turbine in the designated spatio-temporal cluster; the simulated CFs are then averaged to optimise the BC factors. This makes better use of the individual turbine power curves instead of assuming a single power curve for a whole cluster.

training data:
turb info (lat, lon, capacity, height, model)
cluster label for the turbine
year and time res of simulated CF and obs CF
the scalar calculated at the cluster+time res level

So I need to produce an uncorrected simulated CF and average it to the requested time resolution, per year and per cluster (for the chosen number of clusters), in order to produce the training data described above.
The farm offset search can be run with dask, as I did for v1, but it will select the reanalysis time period based on the `time_res` input. This requires changing how wind speed is calculated and converted to power; the new version should take inspiration from my test functions, which are already written to handle multiple turbines at a time.



In [1]:
import numpy as np
import pandas as pd
import xarray as xr
import dask.dataframe as dd
from scipy import interpolate
from sklearn.cluster import KMeans
import utm


import itertools
import time
from calendar import monthrange
import matplotlib.pyplot as plt
import seaborn as sns

# from vwf.simulation import simulate_wind
from vwf.extras import add_times
# from vwf.preprocessing import (
#     prep_era5,
#     prep_obs,
#     prep_obs_test,
#     prep_merra2_method_1
# )

pd.options.mode.chained_assignment = None  # default='warn'

## Preprocessing

In [13]:

def prep_obs(country, year_star, year_end):
    """
    Build the observed capacity factors and the turbine metadata used for training.

    Only Denmark ("DK") is implemented.

    For Denmark's data there had to be a lot of manual manipulation of the excel file.
    The turbines that exist in the power curves file were manually matched with
    Denmark's naming convention and then matched to the IDs.
    anlaeg.xlsx is the raw file and match_turb_dk.xlsx is where the matching is done.
    After this, missing turbine matches are filled in and the coordinate system is converted.
    The observational data was manually separated into yearly sheets from a megasheet.
    As the observational data is power output, it is converted to capacity factor
    using the capacity of the matched turbines. The IDs here are the GSRN IDs.

    Parameters
    ----------
    country : str
        Country code; only "DK" is supported.
    year_star : int
        First year of the observation period (inclusive).
    year_end : int
        Last year of the observation period (inclusive).

    Returns
    -------
    obs_cf : pandas.DataFrame
        Long table with columns ID, year, month, obs. ``month`` holds the
        string labels '1'..'12' taken from the melted column names.
    turb_info : pandas.DataFrame
        Columns ID, capacity, lat, lon, height, date, model for the turbines
        that remain in ``obs_cf``.

    Raises
    ------
    ValueError
        If ``country`` is not "DK" (previously the function silently returned None).
    """
    if country != "DK":
        raise ValueError(f"prep_obs only supports country 'DK', got {country!r}")

    ##############################
    # producing turb_info
    ##############################
    # reading in the messy denmark turbine info that we have matched
    df = pd.read_excel('data/wind_data/DK/raw/match_turb_dk.xlsx')
    columns = ['Turbine identifier (GSRN)','Capacity (kW)','X (east) coordinate\nUTM 32 Euref89','Y (north) coordinate\nUTM 32 Euref89','Hub height (m)', 'Date of original connection to grid', 'turb_match']
    df = df[columns]
    df.columns = ['ID','capacity','x_east_32','y_north_32','height', 'date', 'model']
    df = df.dropna().copy()

    # matching modelless turbines with closest model via capacity
    metadata = pd.read_csv('data/turbine_info/models.csv')
    metadata = metadata.sort_values('capacity')

    # A model of 0 marks "no match". Use a single .loc assignment: the chained
    # form df['model'][mask] = ... does not write back under pandas Copy-on-Write.
    df.loc[df['model'] == 0, 'model'] = np.nan
    df['capacity'] = df['capacity'].astype(int)
    # merge_asof needs the key sorted; the fresh RangeIndex lets the result
    # align row-for-row with df in the assignment below.
    df = df.sort_values('capacity').reset_index(drop=True)
    df.loc[df['model'].isna(), 'model'] = pd.merge_asof(df, metadata, left_on=["capacity"], right_on=["capacity"], direction="nearest")['model_y']

    # convert coordinate system (UTM zone 32 -> latitude/longitude)
    def rule(row):
        lat, lon = utm.to_latlon(row["x_east_32"], row["y_north_32"], 32, 'W')
        return pd.Series({"lat": lat, "lon": lon})

    df = df.merge(df.apply(rule, axis=1), left_index=True, right_index=True)
    df = df[['ID','capacity','lat','lon','height', 'date', 'model']]
    df['ID'] = df['ID'].astype(str)
    print("Number of turbines before preprocessing: ", len(df))
    # hub heights below 1 m are treated as invalid entries
    turb_info = df[~(df['height'] < 1)].reset_index(drop=True)

    ##############################
    # producing obs_cf
    ##############################
    # Load observation data and slice the observed generation for chosen years
    appended_data = []
    for year in range(year_star, year_end + 1):
        data = pd.read_excel('data/wind_data/DK/observation/Denmark_'+str(year)+'.xlsx')
        # the slicing done here is file dependent please consider this when other files are used
        data = data.iloc[3:, np.r_[0:1, 3:15]]
        data.columns = ['ID','1','2','3','4','5','6','7','8','9','10','11','12']
        data['ID'] = data['ID'].astype(str)
        data = data.reset_index(drop=True)
        data['year'] = year
        # the last row of each sheet is dropped, as in the original workflow
        appended_data.append(data[:-1])

    obs_gen = pd.concat(appended_data).reset_index(drop=True)
    obs_gen.columns = [f'obs_{c}' if c not in ['ID', 'year'] else f'{c}' for c in obs_gen.columns]

    # converting obs_gen into obs_cf by turning power into capacity factor
    df = pd.merge(obs_gen, turb_info[['ID', 'capacity']], how='left', on=['ID'])
    df = df.dropna().reset_index(drop=True)

    def days_in_month(years, month):
        """Number of days of `month` for each year in `years`."""
        return [monthrange(y, month)[1] for y in years]

    obs_cols = [f'obs_{m}' for m in range(1, 13)]
    for m, col in enumerate(obs_cols, start=1):
        days = np.asarray(days_in_month(df['year'], m))
        # generation / (days * capacity * 24 hours) -> capacity factor
        df[col] = df[col] / ((days * df['capacity']) * 24)

    df = df.drop(['capacity'], axis=1).reset_index(drop=True)
    # Sequential quality filters on the twelve monthly CFs of each turbine-year.
    df = df[~(df[obs_cols].max(axis=1) > 1)]
    df = df[~(df[obs_cols].min(axis=1) <= 0.01)]
    df = df[~(df[obs_cols].mean(axis=1) <= 0.01)]
    obs_cf = df.reset_index(drop=True)

    obs_cf = obs_cf.loc[obs_cf['ID'].isin(turb_info['ID'])].reset_index(drop=True)
    # keep only turbines with one valid row for every year in the period
    n_years = (year_end - year_star) + 1
    obs_cf = obs_cf[obs_cf.groupby('ID').ID.transform('count') == n_years].reset_index(drop=True)
    obs_cf.columns = ['ID','1','2','3','4','5','6','7','8','9','10','11','12','year']
    obs_cf = obs_cf.melt(id_vars=["ID", "year"],
                         var_name="month",
                         value_name="obs")
    # obs_cf.to_csv('data/wind_data/DK/obs_cf_train.csv', index = None)

    turb_info = turb_info.loc[turb_info['ID'].isin(obs_cf['ID'])].reset_index(drop=True)
    # turb_info.to_csv('data/wind_data/DK/turb_info_train.csv', index = None)

    print("Number of turbines used in training: ", len(turb_info))
    return obs_cf, turb_info

# def prep_era5(year_star, year_end, train=False):
def prep_era5(train=False):
    """
    Read the saved ERA5 files with 100m wind components and fsr.

    The u/v components are combined into a wind speed (``wnd100m``), ``fsr`` is
    renamed to ``roughness``, the data is averaged to daily resolution and the
    spatial coordinates are exposed as ``lon``/``lat`` rounded to 5 decimals.

    Parameters
    ----------
    train : bool, default False
        If true, read 'data/reanalysis/train/*.nc', otherwise
        'data/reanalysis/test/*.nc'.

    Returns
    -------
    xarray.Dataset
        In-memory (non-dask) dataset holding ``wnd100m`` and ``roughness``.
    """
    # Load the corresponding raw ERA5 files
    pattern = 'data/reanalysis/train/*.nc' if train else 'data/reanalysis/test/*.nc'
    # compute() pulls everything into memory (no dask chunks), so the file
    # handles can be released as soon as the load has finished.
    with xr.open_mfdataset(pattern) as lazy_ds:
        ds = lazy_ds.compute()

    ds["wnd100m"] = np.sqrt(ds["u100"] ** 2 + ds["v100"] ** 2).assign_attrs(
        units=ds["u100"].attrs["units"], long_name="100 metre wind speed"
    )

    ds = ds.drop_vars(["u100", "v100"])
    ds = ds.rename({"fsr": "roughness"})

    # turn hourly data into daily for speed of existing code
    ds = ds.resample(time='1D').mean()

    # Files may already use lon/lat; only rename the long names that are
    # actually present instead of swallowing every exception.
    renames = {
        old: new
        for old, new in (("longitude", "lon"), ("latitude", "lat"))
        if old in ds.variables or old in ds.dims
    }
    if renames:
        ds = ds.rename(renames)

    ds = ds.assign_coords(
        lon=np.round(ds.lon.astype(float), 5), lat=np.round(ds.lat.astype(float), 5)
    )
    return ds

def add_time_res(df):
    """
    Label every row with its bimonth, season and yearly time slice.

    Reads the integer ``month`` column and writes ``bimonth`` ('1/6'..'6/6'),
    ``season`` and a constant ``yearly`` column ('year'). The frame is
    modified in place and also returned.
    """
    # month number -> (bimonth label, season label)
    labels_by_month = {
        1: ('1/6', 'winter'),
        2: ('1/6', 'winter'),
        3: ('2/6', 'spring'),
        4: ('2/6', 'spring'),
        5: ('3/6', 'spring'),
        6: ('3/6', 'summer'),
        7: ('4/6', 'summer'),
        8: ('4/6', 'summer'),
        9: ('5/6', 'autumn'),
        10: ('5/6', 'autumn'),
        11: ('6/6', 'autumn'),
        12: ('6/6', 'winter'),
    }
    for month_number, (bimonth_label, season_label) in labels_by_month.items():
        in_month = df['month'] == month_number
        df.loc[in_month, ['bimonth', 'season']] = [bimonth_label, season_label]

    df['yearly'] = 'year'
    return df

    
def match_generation_data(reanalysis, obs_cf, turb_info, powerCurveFile):
    """
    Pair monthly simulated capacity factors with the observed ones.

    Parameters
    ----------
    reanalysis : xarray.Dataset
        Reanalysis data passed straight to ``simulate_wind``.
    obs_cf : pandas.DataFrame
        Observed CF with columns ID, year, month, obs. It is no longer
        modified in place; the key columns are cast on a copy.
    turb_info : pandas.DataFrame
        Turbine metadata passed to ``simulate_wind``.
    powerCurveFile : pandas.DataFrame
        Power curve table passed to ``simulate_wind``.

    Returns
    -------
    pandas.DataFrame
        One row per turbine and month holding ``sim`` and, where a match
        exists, ``obs`` (left join on ID, month, year). The ``time`` column
        is dropped.
    """
    _, sim_cf = simulate_wind(reanalysis, turb_info, powerCurveFile)

    # average the simulated CF to calendar months, then go to long format
    sim_cf = sim_cf.groupby(pd.Grouper(key='time', freq='M')).mean().reset_index()
    sim_cf = sim_cf.melt(id_vars=["time"],
                         var_name="ID",
                         value_name="sim")
    # add_times is expected to provide the 'month' and 'year' columns used below
    sim_cf = add_times(sim_cf)
    sim_cf = add_time_res(sim_cf)

    # Bring the join keys to a common dtype. astype returns new frames, so the
    # caller's obs_cf keeps its original dtypes.
    key_dtypes = {'ID': str, 'month': int, 'year': int}
    sim_cf = sim_cf.astype(key_dtypes)
    obs_keys = obs_cf.astype(key_dtypes)

    gen_cf = pd.merge(sim_cf, obs_keys, on=['ID', 'month', 'year'], how='left')
    gen_cf = gen_cf.drop(['time'], axis=1).reset_index(drop=True)

    return gen_cf

## Training

In [3]:
def calculate_scalar(time_res, gen_data):
    """
    Compute the obs/sim ratio for every (time slice, cluster, year) group.

    ``gen_data`` is grouped by the ``time_res`` column, ``cluster`` and
    ``year``; ``obs`` and ``sim`` are averaged per group and their ratio is
    stored as ``scalar``. The ``time_res`` column comes back as ``time_slice``.
    The damped variant (alpha * ratio + beta) is intentionally not applied here.
    """
    group_means = gen_data.groupby([time_res, 'cluster', 'year'])[['obs', 'sim']].mean()
    ratio = group_means['obs'] / group_means['sim']

    flat = group_means.assign(scalar=ratio).reset_index()
    # positional rename: the first column is whatever time_res was called
    flat.columns = ['time_slice', 'cluster', 'year', 'obs', 'sim', 'scalar']

    ordered_columns = ['year', 'time_slice', 'cluster', 'obs', 'sim', 'scalar']
    return flat[ordered_columns]


def cluster_turbines(num_clu, turb_info):
    """
    Assign every turbine to one of ``num_clu`` k-means clusters of (lat, lon).

    The labels are written to a ``cluster`` column on ``turb_info`` itself,
    the labelled table is saved under data/results/new_factors/clusters/ and
    the same frame is returned.
    """
    # plain (lat, lon) table with a fresh index, used only for fitting
    positions = pd.DataFrame(
        {'lat': list(turb_info['lat']), 'lon': list(turb_info['lon'])}
    )

    # fixed seed so the cluster labels are reproducible between runs
    model = KMeans(
        init="random",
        n_clusters=num_clu,
        n_init=10,
        max_iter=300,
        random_state=42,
    )
    model.fit(positions)

    turb_info['cluster'] = model.labels_
    turb_info.to_csv('data/results/new_factors/clusters/clus_info_'+str(num_clu)+'.csv')
    return turb_info
    
def train_data(time_res, gen_cf, clus_info):
    """
    Aggregate matched CFs per turbine and time slice, then derive cluster scalars.

    ``gen_cf`` is averaged per year, turbine and (unless ``time_res`` is
    'yearly') the ``time_res`` column; cluster labels and turbine metadata
    from ``clus_info`` are joined on ID and the result is handed to
    ``calculate_scalar``.
    """
    is_yearly = time_res == 'yearly'
    group_keys = ['year', 'ID'] if is_yearly else ['year', time_res, 'ID']

    gen_data = gen_cf.groupby(group_keys, as_index=False)[['obs', 'sim']].mean()
    if is_yearly:
        # constant label so the yearly case can be grouped like the others
        gen_data['yearly'] = 'year'

    turbine_columns = ['ID', 'cluster', 'lon', 'lat', 'capacity', 'height', 'model']
    gen_data = gen_data.merge(clus_info[turbine_columns], on='ID', how='left')

    return calculate_scalar(time_res, gen_data)

def simulate_wind_speed(reanalysis, turb_info):
    """
    Estimate the wind speed at every turbine's position and hub height.

    The 100 m wind speed is scaled to each distinct hub height with a
    logarithmic profile based on the surface roughness, then interpolated
    to the turbines' lon/lat/height. The result is indexed by the remaining
    reanalysis dimensions plus a ``turbine`` dimension labelled with
    ``turb_info['ID']``.
    """
    hub_heights = turb_info['height'].unique()
    reanalysis = reanalysis.assign_coords(height=('height', hub_heights))

    # log-law extrapolation from 100 m to the hub heights
    roughness = reanalysis.roughness
    height_factor = np.log(reanalysis.height / roughness) / np.log(100 / roughness)
    ws = reanalysis.wnd100m * height_factor

    turbine_ids = turb_info['ID']

    def per_turbine(column):
        """Wrap one turb_info column as a DataArray along the turbine dimension."""
        return xr.DataArray(turb_info[column], dims='turbine', coords={'turbine': turbine_ids})

    # point-wise interpolation to the turbine positions
    return ws.interp(
        lon=per_turbine('lon'),
        lat=per_turbine('lat'),
        height=per_turbine('height'),
        kwargs={"fill_value": None},
    )

def speed_to_power(column, powerCurveFile, turb_info):
    """
    Convert one turbine's wind speeds to capacity factor via its power curve.

    ``column.name`` is the turbine ID; its model is looked up in ``turb_info``
    and the matching power-curve column of ``powerCurveFile`` is interpolated
    (Akima) over the 'data$speed' grid and evaluated at the given speeds.
    """
    speed_grid = powerCurveFile['data$speed']

    is_this_turbine = turb_info['ID'] == column.name
    model_names = turb_info.loc[is_this_turbine, 'model']
    curve_values = powerCurveFile[model_names].to_numpy().flatten()

    power_curve = interpolate.Akima1DInterpolator(speed_grid, curve_values)
    return power_curve(column)


def simulate_wind_train(reanalysis, turb_info, powerCurveFile, scalar=1, offset=0):
    """
    Return the mean corrected capacity factor over all turbines and time steps.

    Parameters
    ----------
    reanalysis : xarray.Dataset
        Reanalysis slice to simulate.
    turb_info : pandas.DataFrame
        Turbines to simulate (ID, lat, lon, height, model).
    powerCurveFile : pandas.DataFrame
        Power curve table used by ``speed_to_power``.
    scalar, offset : float
        Linear correction applied to the wind speed: ``ws * scalar + offset``.

    Returns
    -------
    float
        Mean of all non-NaN capacity factors, or NaN when there are none.
    """
    # calculating wind speed from reanalysis data
    sim_ws = simulate_wind_speed(reanalysis, turb_info)
    unc_ws = sim_ws.to_pandas()
    cor_ws = (unc_ws * scalar) + offset

    # converting to power
    cor_cf = cor_ws.apply(speed_to_power, args=(powerCurveFile, turb_info), axis=0)

    # Reduce explicitly over every value. np.mean(DataFrame) only yields a
    # scalar on pandas >= 2.0; older versions return one mean per column,
    # which breaks the scalar comparisons done by the caller.
    values = cor_cf.to_numpy(dtype=float)
    if np.isnan(values).all():
        return float('nan')
    return float(np.nanmean(values))


def find_offset(row, turb_info, reanalysis, powerCurveFile):
    """
    Search for the wind-speed offset that makes a cluster's simulated CF match the observed CF.

    Parameters
    ----------
    row : pandas.Series
        One row of the bias table with ``year``, ``time_slice``, ``cluster``,
        ``obs``, ``sim`` and ``scalar``.
    turb_info : pandas.DataFrame
        Turbine metadata including the ``cluster`` label.
    reanalysis : xarray.Dataset
        Reanalysis data covering the training period.
    powerCurveFile : pandas.DataFrame
        Power curve table.

    Returns
    -------
    float
        The offset found by a step-halving search, or 0 when the simulated
        mean CF is exactly zero.
    """
    # months covered by every named time slice; anything else is a month number
    months_by_slice = {
        'spring': [3, 4, 5],
        'summer': [6, 7, 8],
        'autumn': [9, 10, 11],
        'winter': [1, 2, 12],
        '1/6': [1, 2],
        '2/6': [3, 4],
        '3/6': [5, 6],
        '4/6': [7, 8],
        '5/6': [9, 10],
        '6/6': [11, 12],
        'year': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
    }
    slice_label = row['time_slice']
    if slice_label in months_by_slice:
        time_slice = months_by_slice[slice_label]
    else:
        time_slice = [int(slice_label)]

    # The reanalysis period and the cluster's turbines do not change during
    # the search, so select them once instead of on every iteration.
    in_period = np.logical_and(
        reanalysis.time.dt.year == row.year,
        reanalysis.time.dt.month.isin(time_slice),
    )
    period_reanalysis = reanalysis.sel(time=in_period)
    cluster_turbs = turb_info.loc[turb_info['cluster'] == row.cluster]

    # decide our initial search step size
    stepSize = 0.64 if row.sim > row.obs else -0.64

    myOffset = 0
    while np.abs(stepSize) > 0.002:  # Stop when step-size is smaller than our power curve's resolution
        myOffset += stepSize

        # calculate the mean simulated CF using the new offset
        mean_sim_cf = simulate_wind_train(
            period_reanalysis,
            cluster_turbs,
            powerCurveFile,
            row.scalar,
            myOffset,
        )

        # A mean CF of exactly zero gives the search nothing to work with:
        # fall back to no offset. (This branch used to reference an undefined
        # name and raised NameError.)
        if mean_sim_cf == 0:
            myOffset = 0
            break

        # if we have overshot our target, then repeat, searching the other direction
        # ((guess < target & sign(step) < 0) | (guess > target & sign(step) > 0))
        if np.sign(mean_sim_cf - row.obs) == np.sign(stepSize):
            stepSize = -stepSize / 2
        # If we have reached unreasonable places, stop
        if myOffset < -20 or myOffset > 20:
            break

    return myOffset


######################## Test
def closest_cluster(clus_info, turb_info):
    """
    Assign turbines not found in training data to closest cluster.

    Parameters
    ----------
    clus_info : pandas.DataFrame
        Training turbines with ``ID``, ``cluster``, ``lat`` and ``lon``.
    turb_info : pandas.DataFrame
        Turbines to label, with ``ID``, ``lat`` and ``lon``.

    Returns
    -------
    pandas.DataFrame
        ``turb_info`` (in its original row order) with a ``cluster`` column.
        Known IDs keep their training label; unknown IDs get the label of the
        cluster whose mean (lat, lon) is nearest in plain degree distance.

    Notes
    -----
    The ``ID`` columns of both inputs are cast to ``str`` in place, as before.
    """
    # making sure ID column dtype is same
    clus_info['ID'] = clus_info['ID'].astype(str)
    turb_info['ID'] = turb_info['ID'].astype(str)

    centres = clus_info.groupby(['cluster'], as_index=False)[['lat', 'lon']].mean()
    merged = pd.merge(clus_info[['ID', 'cluster']], turb_info, on='ID', how='right')

    unassigned = merged['cluster'].isna().to_numpy()
    if unassigned.any():
        # distance from every unassigned turbine (rows) to every centre (columns)
        turb_lat = merged.loc[unassigned, 'lat'].to_numpy()[:, np.newaxis]
        turb_lon = merged.loc[unassigned, 'lon'].to_numpy()[:, np.newaxis]
        distance = np.sqrt(
            (centres['lat'].to_numpy() - turb_lat) ** 2
            + (centres['lon'].to_numpy() - turb_lon) ** 2
        )
        nearest = distance.argmin(axis=1)
        # single .loc write; element-wise chained assignment does not write
        # back under pandas Copy-on-Write
        merged.loc[unassigned, 'cluster'] = centres['cluster'].to_numpy()[nearest]

    return merged.reset_index(drop=True)


def simulate_wind(reanalysis, turb_info, powerCurveFile, *args):
    """
    Simulate wind speed and capacity factor for every turbine.

    Parameters
    ----------
    reanalysis : xarray.Dataset
        Reanalysis data to simulate from.
    turb_info : pandas.DataFrame
        Turbine metadata; must contain ``cluster`` when factors are applied.
    powerCurveFile : pandas.DataFrame
        Power curve table used by ``speed_to_power``.
    *args
        Optional. ``args[0]`` is a bias-correction table with columns
        ``cluster``, ``scalar``, ``offset`` and one time-resolution column.
        ``args[1]`` optionally names that time-resolution column; when it is
        omitted the column is detected among 'yearly', 'season', 'bimonth'
        and 'month'.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Wind speed and capacity factor, each with a ``time`` column followed
        by one column per turbine ID.

    Raises
    ------
    ValueError
        If factors are given but their time-resolution column cannot be
        determined unambiguously.
    """
    # calculating wind speed from reanalysis data
    sim_ws = simulate_wind_speed(reanalysis, turb_info).to_pandas()

    if len(args) >= 1:
        bc_factors = args[0]

        # The resolution used to come from a notebook-level `time_res`
        # variable; resolve it from the arguments so the function does not
        # depend on global state.
        if len(args) >= 2:
            res_col = args[1]
        else:
            known = ('yearly', 'season', 'bimonth', 'month')
            present = [name for name in known if name in bc_factors.columns]
            if len(present) != 1:
                raise ValueError(
                    "Cannot determine the time resolution column of bc_factors; "
                    f"found {present}. Pass it explicitly as the fifth argument."
                )
            res_col = present[0]

        sim_ws = sim_ws.reset_index()
        sim_ws = sim_ws.melt(id_vars=["time"],  # adding in turbine ID for merging
                             var_name="ID",
                             value_name="ws")

        sim_ws = add_times(sim_ws)
        sim_ws = add_time_res(sim_ws)

        # merge keys are compared as strings on both sides; the factors are
        # cast on a copy so the caller's table is left untouched
        sim_ws['month'] = sim_ws['month'].astype(str)
        factors = bc_factors.assign(**{res_col: bc_factors[res_col].astype(str)})

        sim_ws = pd.merge(sim_ws, turb_info[['ID', 'cluster']], on=['ID'], how='left')
        sim_ws = pd.merge(sim_ws, factors, on=['cluster', res_col], how='left')
        sim_ws['ws'] = (sim_ws.ws * sim_ws.scalar) + sim_ws.offset  # equation 2
        sim_ws = sim_ws.pivot(index=['time'], columns='ID', values='ws')

    sim_cf = sim_ws.apply(speed_to_power, args=(powerCurveFile, turb_info), axis=0)

    return sim_ws.reset_index(), sim_cf.reset_index()

## Full train
Currently trying to speed up the speed-to-power conversion so that the full training run becomes feasible.

In [4]:
# Training configuration. These notebook-level names are read by later cells.
year_star = 2015 # start year of training period
year_end = 2019 # end year of training period
year_test = 2020 # year you wish to receive a time series for

# Power curve table; speed_to_power reads its 'data$speed' column and one column per turbine model.
powerCurveFileLoc = 'data/turbine_info/Wind Turbine Power Curves.csv'
powerCurveFile = pd.read_csv(powerCurveFileLoc)

# Load the training reanalysis and the observations, then pair simulated with observed CF.
train_era5 = prep_era5(True)
train_obs_cf, train_turb_info = prep_obs("DK", year_star, year_end)
train_gen_cf = match_generation_data(train_era5, train_obs_cf, train_turb_info, powerCurveFile)

Number of turbines before preprocessing:  5682
Number of turbines used in training:  3712


In [5]:
# Grid of cluster counts and time resolutions to train correction factors for.
cluster_list = [1, 2, 3, 5, 10, 15, 20, 30, 50, 100, 150, 200, 300, 400]
time_res_list = ['yearly', 'season', 'bimonth', 'month']

# cluster_list = [1,2]
# time_res_list = ['yearly', 'season', 'bimonth', 'month']


def find_offset_parallel(df):
    """Run find_offset on every row of one dask partition."""
    return df.apply(find_offset, args=(train_clus_info, train_era5, powerCurveFile), axis=1)


for num_clu in cluster_list:
    # cluster labels only depend on the number of clusters, so fit once per count
    train_clus_info = cluster_turbines(num_clu, train_turb_info)

    for time_res in time_res_list:
        print("Train for ", num_clu, " clusters with time resolution: ", time_res, " is taking place.")
        start_time = time.time()

        bias_data = train_data(time_res, train_gen_cf, train_clus_info)

        # the offset search is spread over 40 partitions and run in processes
        ddf = dd.from_pandas(bias_data, npartitions=40)
        ddf["offset"] = ddf.map_partitions(find_offset_parallel, meta=('offset', 'float'))
        ddf.to_csv(
            f'data/results/new_factors/{time_res}_{num_clu}.csv',
            single_file=True,
            compute_kwargs={'scheduler': 'processes'},
        )

        end_time = time.time()
        elapsed_time = end_time - start_time
        print(f"Results completed and saved. Elapsed time: {elapsed_time:.2f} seconds")
        print(" ")

## this works
# for num_clu, time_res in itertools.product(cluster_list, time_res_list):
#     print("Train for ", num_clu, " clusters with time resolution: ", time_res, " is taking place.")
#     start_time = time.time()
#     gen_data, bias_data = train_data(num_clu, time_res, train_gen_cf, train_turb_info)
#     # ddf = dd.from_pandas(bias_data, npartitions=40)
#     # ddf["offset"] = ddf.map_partitions(find_offset_parallel, meta=('offset', 'float'))
#     # ddf.to_csv('data/results/new_factors/'+time_res+'_'+str(num_clu)+'.csv', single_file=True, compute_kwargs={'scheduler':'processes'})
#     end_time = time.time()
#     elapsed_time = end_time - start_time
#     print("Results completed and saved. Elapsed time: {:.2f} seconds".format(elapsed_time))
#     print(" ")

Train for  1  clusters with time resolution:  yearly  is taking place.
Results completed and saved. Elapsed time: 167.87 seconds
 
Train for  1  clusters with time resolution:  season  is taking place.
Results completed and saved. Elapsed time: 181.74 seconds
 
Train for  1  clusters with time resolution:  bimonth  is taking place.
Results completed and saved. Elapsed time: 186.63 seconds
 
Train for  1  clusters with time resolution:  month  is taking place.
Results completed and saved. Elapsed time: 367.23 seconds
 
Train for  2  clusters with time resolution:  yearly  is taking place.
Results completed and saved. Elapsed time: 114.87 seconds
 
Train for  2  clusters with time resolution:  season  is taking place.
Results completed and saved. Elapsed time: 108.25 seconds
 
Train for  2  clusters with time resolution:  bimonth  is taking place.
Results completed and saved. Elapsed time: 173.53 seconds
 
Train for  2  clusters with time resolution:  month  is taking place.
Results comp

## Test

In [7]:
from vwf.preprocessing import prep_obs_test

In [14]:
# Test-year setup: reanalysis and turbine metadata for the year to simulate.
year_test = 2020
test_era5 = prep_era5()
test_turb_info = prep_obs_test("DK", year_test)

cluster_list = [1, 2, 3, 5, 10, 15, 20, 30, 50, 100, 150, 200, 300, 400]
time_res_list = ['yearly', 'season', 'bimonth', 'month']
# cluster_list = [1,2]
# time_res_list = ['month']


# running the full test results
# uncorrected baseline: no correction factors are passed
unc_ws, unc_cf = simulate_wind(test_era5, test_turb_info, powerCurveFile)
unc_ws.to_csv(f'data/results/raw/{year_test}_unc_ws.csv', index=None)
unc_cf.to_csv(f'data/results/raw/{year_test}_unc_cf.csv', index=None)

for num_clu in cluster_list:
    train_clus_info = pd.read_csv(f'data/results/new_factors/clusters/clus_info_{num_clu}.csv')
    test_clus_info = closest_cluster(train_clus_info, test_turb_info)

    for time_res in time_res_list:
        print("Test for ", num_clu, " clusters with time resolution: ", time_res, " is taking place.")
        start_time = time.time()

        # average the trained factors over the training years per cluster and time slice
        bias_data = pd.read_csv(f'data/results/new_factors/{time_res}_{num_clu}.csv')
        bc_factors = (
            bias_data
            .groupby(['cluster', 'time_slice'], as_index=False)[['scalar', 'offset']]
            .mean()
            .rename(columns={'time_slice': time_res})
        )

        cor_ws, cor_cf = simulate_wind(test_era5, test_clus_info, powerCurveFile, bc_factors)
        result_stem = f'data/results/raw/{year_test}_{time_res}_{num_clu}'
        cor_ws.to_csv(f'{result_stem}_cor_ws.csv', index=None)
        cor_cf.to_csv(f'{result_stem}_cor_cf.csv', index=None)

        end_time = time.time()
        elapsed_time = end_time - start_time
        print(f"Results completed and saved. Elapsed time: {elapsed_time:.2f} seconds")
        print(" ")

Test for  1  clusters with time resolution:  yearly  is taking place.
Results completed and saved. Elapsed time: 8.35 seconds
 
Test for  1  clusters with time resolution:  season  is taking place.
Results completed and saved. Elapsed time: 7.95 seconds
 
Test for  1  clusters with time resolution:  bimonth  is taking place.
Results completed and saved. Elapsed time: 7.88 seconds
 
Test for  1  clusters with time resolution:  month  is taking place.
Results completed and saved. Elapsed time: 7.91 seconds
 
Test for  2  clusters with time resolution:  yearly  is taking place.
Results completed and saved. Elapsed time: 7.80 seconds
 
Test for  2  clusters with time resolution:  season  is taking place.
Results completed and saved. Elapsed time: 7.82 seconds
 
Test for  2  clusters with time resolution:  bimonth  is taking place.
Results completed and saved. Elapsed time: 7.76 seconds
 
Test for  2  clusters with time resolution:  month  is taking place.
Results completed and saved. Elaps

In [12]:
bc_factors

Unnamed: 0,cluster,yearly,scalar,offset
0,0,year,0.801406,0.8125


In [None]:
# Scratch cell: long-format uncorrected wind speed with the time-resolution labels.
# simulate_wind already returns `time` as a regular column. Resetting the index
# again would add an 'index' column that melt would then treat as a turbine.
sim_ws = unc_ws if 'time' in unc_ws.columns else unc_ws.reset_index()
sim_ws = sim_ws.melt(id_vars=["time"], # adding in turbine ID for merging
    var_name="ID",
    value_name="ws")
sim_ws = add_times(sim_ws)
sim_ws = add_time_res(sim_ws)
sim_ws

In [None]:
bc_factors

In [None]:
cor_cf

In [None]:
# Compare monthly mean CF: observed vs uncorrected vs corrected simulation.

def _to_long(frame, value_name):
    """Melt a wide (time + one column per turbine) frame into long format.

    The index is only reset when `time` is not already a column; otherwise the
    extra 'index' column would be melted as a turbine and distort the means.
    """
    if 'time' not in frame.columns:
        frame = frame.reset_index()
    return frame.melt(id_vars=["time"], # adding in turbine ID for merging
        var_name="ID",
        value_name=value_name)


def _monthly_mean(long_frame, value_name):
    """Average a long-format frame over all turbines per calendar month."""
    return long_frame.groupby([pd.Grouper(key='time',freq='M')])[value_name].mean().reset_index()


cor = _to_long(cor_cf, "cor")
cor_month = _monthly_mean(cor, 'cor')

unc = _to_long(unc_cf, "unc")
unc_month = _monthly_mean(unc, 'unc')

obs = pd.read_csv('data/wind_data/DK/obs_cf_test.csv', parse_dates=['time'])
obs = obs.melt(id_vars=["time"], # adding in turbine ID for merging
    var_name="ID",
    value_name="obs")
obs_month = _monthly_mean(obs, 'obs')

cf_month = obs_month.merge(unc_month,on=['time']).merge(cor_month,on=['time'])
cf_month.columns = ["time", 'obs', 'unc','cor']
cf_month["time"] = cf_month["time"].dt.month
cf_month = cf_month.melt(id_vars=["time"],
            var_name="model",
            value_name="CF")


sns.lineplot(
    data = cf_month,
    x="time",
    y="CF",
    hue="model",
    style="model",
    legend = True,
)

### Attempt at changing

In [77]:
import utm

def prep_obs(country, year_star, year_end):
    """
    Build the observed capacity factors and the turbine metadata used for training.

    Only Denmark ("DK") is implemented.

    For Denmark's data there had to be a lot of manual manipulation of the excel file.
    The turbines that exist in the power curves file were manually matched with
    Denmark's naming convention and then matched to the IDs.
    anlaeg.xlsx is the raw file and match_turb_dk.xlsx is where the matching is done.
    After this, missing turbine matches are filled in and the coordinate system is converted.
    The observational data was manually separated into yearly sheets from a megasheet.
    As the observational data is power output, it is converted to capacity factor
    using the capacity of the matched turbines. The IDs here are the GSRN IDs.

    Parameters
    ----------
    country : str
        Country code; only "DK" is supported.
    year_star : int
        First year of the observation period (inclusive).
    year_end : int
        Last year of the observation period (inclusive).

    Returns
    -------
    obs_cf : pandas.DataFrame
        Long table with columns ID, year, month, obs. ``month`` holds the
        string labels '1'..'12' taken from the melted column names.
    turb_info : pandas.DataFrame
        Columns ID, capacity, lat, lon, height, date, model for the turbines
        that remain in ``obs_cf``.

    Raises
    ------
    ValueError
        If ``country`` is not "DK" (previously the function silently returned None).
    """
    if country != "DK":
        raise ValueError(f"prep_obs only supports country 'DK', got {country!r}")

    ##############################
    # producing turb_info
    ##############################
    # reading in the messy denmark turbine info that we have matched
    df = pd.read_excel('data/wind_data/DK/raw/match_turb_dk.xlsx')
    columns = ['Turbine identifier (GSRN)','Capacity (kW)','X (east) coordinate\nUTM 32 Euref89','Y (north) coordinate\nUTM 32 Euref89','Hub height (m)', 'Date of original connection to grid', 'turb_match']
    df = df[columns]
    df.columns = ['ID','capacity','x_east_32','y_north_32','height', 'date', 'model']
    df = df.dropna().copy()

    # matching modelless turbines with closest model via capacity
    metadata = pd.read_csv('data/turbine_info/models.csv')
    metadata = metadata.sort_values('capacity')

    # A model of 0 marks "no match". Use a single .loc assignment: the chained
    # form df['model'][mask] = ... does not write back under pandas Copy-on-Write.
    df.loc[df['model'] == 0, 'model'] = np.nan
    df['capacity'] = df['capacity'].astype(int)
    # merge_asof needs the key sorted; the fresh RangeIndex lets the result
    # align row-for-row with df in the assignment below.
    df = df.sort_values('capacity').reset_index(drop=True)
    df.loc[df['model'].isna(), 'model'] = pd.merge_asof(df, metadata, left_on=["capacity"], right_on=["capacity"], direction="nearest")['model_y']

    # convert coordinate system (UTM zone 32 -> latitude/longitude)
    def rule(row):
        lat, lon = utm.to_latlon(row["x_east_32"], row["y_north_32"], 32, 'W')
        return pd.Series({"lat": lat, "lon": lon})

    df = df.merge(df.apply(rule, axis=1), left_index=True, right_index=True)
    df = df[['ID','capacity','lat','lon','height', 'date', 'model']]
    df['ID'] = df['ID'].astype(str)
    print("Number of turbines before preprocessing: ", len(df))
    # hub heights below 1 m are treated as invalid entries
    turb_info = df[~(df['height'] < 1)].reset_index(drop=True)

    ##############################
    # producing obs_cf
    ##############################
    # Load observation data and slice the observed generation for chosen years
    appended_data = []
    for year in range(year_star, year_end + 1):
        data = pd.read_excel('data/wind_data/DK/observation/Denmark_'+str(year)+'.xlsx')
        # the slicing done here is file dependent please consider this when other files are used
        data = data.iloc[3:, np.r_[0:1, 3:15]]
        data.columns = ['ID','1','2','3','4','5','6','7','8','9','10','11','12']
        data['ID'] = data['ID'].astype(str)
        data = data.reset_index(drop=True)
        data['year'] = year
        # the last row of each sheet is dropped, as in the original workflow
        appended_data.append(data[:-1])

    obs_gen = pd.concat(appended_data).reset_index(drop=True)
    obs_gen.columns = [f'obs_{c}' if c not in ['ID', 'year'] else f'{c}' for c in obs_gen.columns]

    # converting obs_gen into obs_cf by turning power into capacity factor
    df = pd.merge(obs_gen, turb_info[['ID', 'capacity']], how='left', on=['ID'])
    df = df.dropna().reset_index(drop=True)

    def days_in_month(years, month):
        """Number of days of `month` for each year in `years`."""
        return [monthrange(y, month)[1] for y in years]

    obs_cols = [f'obs_{m}' for m in range(1, 13)]
    for m, col in enumerate(obs_cols, start=1):
        days = np.asarray(days_in_month(df['year'], m))
        # generation / (days * capacity * 24 hours) -> capacity factor
        df[col] = df[col] / ((days * df['capacity']) * 24)

    df = df.drop(['capacity'], axis=1).reset_index(drop=True)
    # Sequential quality filters on the twelve monthly CFs of each turbine-year.
    df = df[~(df[obs_cols].max(axis=1) > 1)]
    df = df[~(df[obs_cols].min(axis=1) <= 0.01)]
    df = df[~(df[obs_cols].mean(axis=1) <= 0.01)]
    obs_cf = df.reset_index(drop=True)

    obs_cf = obs_cf.loc[obs_cf['ID'].isin(turb_info['ID'])].reset_index(drop=True)
    # keep only turbines with one valid row for every year in the period
    n_years = (year_end - year_star) + 1
    obs_cf = obs_cf[obs_cf.groupby('ID').ID.transform('count') == n_years].reset_index(drop=True)
    obs_cf.columns = ['ID','1','2','3','4','5','6','7','8','9','10','11','12','year']
    obs_cf = obs_cf.melt(id_vars=["ID", "year"],
                         var_name="month",
                         value_name="obs")
    # obs_cf.to_csv('data/wind_data/DK/obs_cf_train.csv', index = None)

    turb_info = turb_info.loc[turb_info['ID'].isin(obs_cf['ID'])].reset_index(drop=True)
    # turb_info.to_csv('data/wind_data/DK/turb_info_train.csv', index = None)

    print("Number of turbines used in training: ", len(turb_info))
    return obs_cf, turb_info


def add_time_res(df):
    """
    Label every row with its bimonth, season and yearly time slice.

    Reads the integer ``month`` column and writes ``bimonth`` ('1/6'..'6/6'),
    ``season`` and a constant ``year_res`` column ('year'). The frame is
    modified in place and also returned.
    """
    # month number -> (bimonth label, season label)
    labels_by_month = {
        1: ('1/6', 'winter'),
        2: ('1/6', 'winter'),
        3: ('2/6', 'spring'),
        4: ('2/6', 'spring'),
        5: ('3/6', 'spring'),
        6: ('3/6', 'summer'),
        7: ('4/6', 'summer'),
        8: ('4/6', 'summer'),
        9: ('5/6', 'autumn'),
        10: ('5/6', 'autumn'),
        11: ('6/6', 'autumn'),
        12: ('6/6', 'winter'),
    }
    for month_number, (bimonth_label, season_label) in labels_by_month.items():
        in_month = df['month'] == month_number
        df.loc[in_month, ['bimonth', 'season']] = [bimonth_label, season_label]

    df['year_res'] = 'year'
    return df

def match_generation_data(reanalysis, obs_cf, turb_info, powerCurveFile):
    """
    Pair monthly simulated capacity factors with the observed ones.

    ``simulate_wind_test`` (defined elsewhere in this notebook) supplies two
    tables that are treated here as wide frames: a ``time`` column plus one
    column per turbine ID, holding wind speed and capacity factor respectively.

    Returns a tuple ``(gen_cf, sim_ws)``:

    * ``gen_cf`` - one row per turbine and month with the simulated CF
      (``sim``) left-joined to the observed CF from ``obs_cf`` on
      ``ID``/``month``/``year``; the ``time`` column is dropped.
    * ``sim_ws`` - the wind speeds in long format (``time``, ``ID``, ``ws``)
      with the extra columns added by ``add_times`` and ``add_time_res``.

    Note that the ``ID``, ``month`` and ``year`` columns of the caller's
    ``obs_cf`` are re-typed in place (str / int / int).
    """
    wide_ws, wide_cf = simulate_wind_test(reanalysis, turb_info, powerCurveFile)

    # wind speeds: reshape to one row per (time, turbine), then tag with time labels
    # (add_times comes from vwf.extras; presumably it adds 'year'/'month' - confirm)
    sim_ws = add_time_res(
        add_times(wide_ws.melt(id_vars=["time"], var_name="ID", value_name="ws"))
    )

    # capacity factors: average within each month-end bin before reshaping
    monthly_cf = wide_cf.groupby(pd.Grouper(key='time', freq='M')).mean().reset_index()
    sim_cf = add_time_res(
        add_times(monthly_cf.melt(id_vars=["time"], var_name="ID", value_name="sim"))
    )

    # both sides of the merge need identical key dtypes for rows to match
    for table in (sim_cf, obs_cf):
        table['ID'] = table['ID'].astype(str)
        table['month'] = table['month'].astype(int)
        table['year'] = table['year'].astype(int)

    gen_cf = sim_cf.merge(obs_cf, on=['ID', 'month', 'year'], how='left')
    return gen_cf.drop(['time'], axis=1).reset_index(drop=True), sim_ws


def calculate_scalar(time_res, gen_data, scalar_alpha=0.6, scalar_beta=0.2):
    """
    Compute the bias-correction scalar for every (time slice, cluster, year).

    ``obs`` and ``sim`` are averaged over all rows of ``gen_data`` that share
    the same ``time_res`` value, ``cluster`` and ``year``; the scalar is then

        scalar = scalar_alpha * (mean_obs / mean_sim) + scalar_beta

    Parameters
    ----------
    time_res : str
        Name of the column in ``gen_data`` to use as the time slice
        (it is renamed to ``time_slice`` in the result).
    gen_data : pandas.DataFrame
        Must contain ``time_res``, ``cluster``, ``year``, ``obs`` and ``sim``.
    scalar_alpha, scalar_beta : float, optional
        Slope and intercept of the scalar formula. The defaults (0.6, 0.2)
        are the values that used to be hard-coded here.

    Returns
    -------
    pandas.DataFrame
        Columns ``year, time_slice, cluster, obs, sim, scalar``, one row per
        group. A group whose mean ``sim`` is 0 gets an infinite or NaN scalar
        because the ratio is not guarded.
    """
    bias_data = (
        gen_data.groupby([time_res, 'cluster', 'year'])[['obs', 'sim']]
        .mean()
        .reset_index()
        # rename by label rather than by position so the result cannot be
        # mislabelled if the column order ever changes
        .rename(columns={time_res: 'time_slice'})
    )
    bias_data['scalar'] = (scalar_alpha * (bias_data['obs'] / bias_data['sim'])) + scalar_beta
    return bias_data[['year', 'time_slice', 'cluster', 'obs', 'sim', 'scalar']]


def cluster_turbines(num_clu, turb_info):
    """
    Group the turbines into ``num_clu`` spatial clusters with k-means.

    Only the ``lat`` and ``lon`` columns of ``turb_info`` are used as
    features. The resulting labels are written to a new ``cluster`` column on
    ``turb_info`` itself, the labelled table is saved to
    ``data/results/new_factors/clusters/clus_info_<num_clu>.csv`` and the same
    (modified) table is returned.

    The k-means run uses a fixed ``random_state`` of 42.
    """
    # feature table holding just the coordinates, in (lat, lon) column order
    coordinates = pd.DataFrame({
        'lat': list(turb_info['lat']),
        'lon': list(turb_info['lon']),
    })

    model = KMeans(
        init="random",
        n_clusters=num_clu,
        n_init=10,
        max_iter=300,
        random_state=42,
    )
    turb_info['cluster'] = model.fit(coordinates).labels_

    out_path = 'data/results/new_factors/clusters/clus_info_' + str(num_clu) + '.csv'
    turb_info.to_csv(out_path)
    return turb_info
    
def train_data(time_res, gen_cf, clus_info):
    """
    Build the cluster-level training table for one time resolution.

    ``gen_cf`` is first averaged to one ``obs``/``sim`` pair per turbine
    (``ID``), ``year`` and ``time_res`` value. Each turbine is then given its
    cluster label and metadata from ``clus_info`` (left join on ``ID``), and
    the result is handed to ``calculate_scalar``, whose output is returned.
    """
    turbine_columns = ['ID', 'cluster', 'lon', 'lat', 'capacity', 'height', 'model']

    per_turbine = (
        gen_cf.groupby(['year', time_res, 'ID'], as_index=False)[['obs', 'sim']]
        .mean()
        .merge(clus_info[turbine_columns], on='ID', how='left')
    )
    return calculate_scalar(time_res, per_turbine)


    
def simulate_wind_speed(reanal_data, turb_info):
    """
    Estimate the wind speed at every turbine's position and hub height.

    ``reanal_data`` must expose ``wnd100m`` and ``roughness`` and be
    interpolable along ``lat`` and ``lon``. The 100 m wind speed is scaled to
    each distinct hub height ``h`` found in ``turb_info['height']`` with

        ws(h) = wnd100m * ln(h / roughness) / ln(100 / roughness)

    and the result is interpolated to each turbine's ``lon``/``lat``/``height``.
    The returned array has a ``turbine`` dimension labelled with
    ``turb_info['ID']``.
    """
    # one height level per distinct hub height present in the fleet
    hub_heights = turb_info['height'].unique()
    reanal_data = reanal_data.assign_coords(height=('height', hub_heights))

    # calculating wind speed from reanalysis dataset variables
    roughness = reanal_data.roughness
    ws = reanal_data.wnd100m * (
        np.log(reanal_data.height / roughness) / np.log(100 / roughness)
    )

    def per_turbine(column):
        # target coordinate along the shared 'turbine' dimension
        return xr.DataArray(
            turb_info[column], dims='turbine', coords={'turbine': turb_info['ID']}
        )

    # spatial interpolating to turbine positions; fill_value=None is passed
    # straight through to the underlying interpolator
    return ws.interp(
        lon=per_turbine('lon'),
        lat=per_turbine('lat'),
        height=per_turbine('height'),
        kwargs={"fill_value": None},
    )

def speed_to_power(column, powerCurveFile, turb_info):
    """
    Convert one turbine's wind speeds to power-curve output.

    ``column`` is a Series of wind speeds whose ``name`` is the turbine ID.
    The turbine's model is looked up in ``turb_info`` (columns ``ID`` and
    ``model``); the matching column of ``powerCurveFile`` is then interpolated
    against ``powerCurveFile['data$speed']`` with an Akima spline and
    evaluated at the given speeds. Returns the interpolated values as an array.
    """
    belongs_to_turbine = turb_info['ID'] == column.name
    model_names = turb_info.loc[belongs_to_turbine, 'model']

    curve_speed = powerCurveFile['data$speed']
    # selecting with a Series of names yields a frame; flatten it to 1-D
    curve_output = powerCurveFile[model_names].to_numpy().flatten()

    return interpolate.Akima1DInterpolator(curve_speed, curve_output)(column)


def simulate_wind_train(sim_ws, turb_info, powerCurveFile, scalar=1, offset=0):
    """
    Apply a linear wind-speed correction and convert the result to power-curve output.

    Parameters
    ----------
    sim_ws : pandas.DataFrame
        Long-format wind speeds with at least ``time``, ``ID`` and ``ws``.
        It is NOT modified: the correction is applied to a copy, so calling
        this repeatedly on the same frame does not compound the correction.
    turb_info : pandas.DataFrame
        Turbine table with ``ID`` and ``model``, used by ``speed_to_power``.
    powerCurveFile : pandas.DataFrame
        Power curves, passed through to ``speed_to_power``.
    scalar, offset : float, optional
        Corrected speed is ``ws * scalar + offset``.

    Returns
    -------
    pandas.DataFrame
        Wide table indexed by ``time`` with one column per turbine ``ID``
        holding the values produced by ``speed_to_power``.
    """
    # bias-correct the wind speeds without touching the caller's frame
    corrected = sim_ws.assign(ws=(sim_ws['ws'] * scalar) + offset)

    # one column per turbine so each can be run through its own power curve
    wide_ws = corrected.pivot(index=['time'], columns='ID', values='ws')
    sim_cf = wide_ws.apply(speed_to_power, args=(powerCurveFile, turb_info), axis=0)

    return sim_cf


def find_offset(row, turb_info, sim_ws, powerCurveFile, time_res):
    """
    Search for the additive wind-speed offset that makes simulated CF match observed CF.

    Parameters
    ----------
    row : row of the bias table
        Must provide ``year``, ``time_slice``, ``cluster``, ``obs``, ``sim``
        and ``scalar``.
    turb_info : pandas.DataFrame
        Turbine table with a ``cluster`` column; only the turbines of
        ``row.cluster`` are simulated.
    sim_ws : pandas.DataFrame
        Long-format wind speeds with ``year``, ``cluster`` and the
        ``time_res`` column; only rows matching ``row`` are simulated.
    powerCurveFile : pandas.DataFrame
        Power curves, passed through to ``simulate_wind_train``.
    time_res : str
        Name of the column of ``sim_ws`` that is compared with ``row.time_slice``.

    Returns
    -------
    The offset found. The search moves in steps starting at +/-0.64, reversing
    and halving the step each time the simulated mean crosses ``row.obs``,
    and stops once the step is no larger than 0.002. It also stops if the
    offset leaves [-20, 20], and returns 0 if the simulated mean is exactly 0.
    """
    # The slice being simulated never changes during the search, so filter the
    # (large) tables once instead of on every iteration.
    in_slice = (
        (sim_ws['year'] == row.year)
        & (sim_ws[time_res] == row.time_slice)
        & (sim_ws['cluster'] == row.cluster)
    )
    slice_ws = sim_ws.loc[in_slice]
    slice_turbines = turb_info.loc[turb_info['cluster'] == row.cluster]

    # decide our initial search step size
    stepSize = 0.64 if row.sim > row.obs else -0.64

    myOffset = 0
    while np.abs(stepSize) > 0.002:  # Stop when step-size is smaller than our power curve's resolution
        myOffset += stepSize

        # calculate the simulated CF using the new offset; a fresh copy is
        # handed over each time so the uncorrected speeds stay intact no
        # matter what the callee does with its input
        sim_cf = simulate_wind_train(
            slice_ws.copy(),
            slice_turbines,
            powerCurveFile,
            row.scalar,
            myOffset,
        )

        # Mean over every turbine and timestep, ignoring NaN. Done explicitly
        # because np.mean(DataFrame) only reduces to a single number on
        # pandas >= 2.0; older versions return a per-column Series, which
        # breaks the scalar comparisons below.
        cf_values = np.asarray(sim_cf, dtype=float)
        has_value = ~np.isnan(cf_values)
        mean_cf = cf_values[has_value].mean() if has_value.any() else np.nan

        if mean_cf == 0:
            myOffset = 0
            break

        # if we have overshot our target, then repeat, searching the other direction
        # ((guess < target & sign(step) < 0) | (guess > target & sign(step) > 0))
        if np.sign(mean_cf - row.obs) == np.sign(stepSize):
            stepSize = -stepSize / 2
        # If we have reached unreasonable places, stop
        if myOffset < -20 or myOffset > 20:
            break

    return myOffset

# ---- Training run configuration -------------------------------------------
year_star = 2015 # start year of training period
year_end = 2019 # end year of training period
year_test = 2020 # year you wish to receive a time series for

# table of turbine power curves; it is indexed by 'data$speed' plus one
# column per turbine model in speed_to_power
powerCurveFileLoc = 'data/turbine_info/Wind Turbine Power Curves.csv'
powerCurveFile = pd.read_csv(powerCurveFileLoc)

# NOTE(review): prep_era5 is not imported in the notebook's import cell (the
# vwf.preprocessing import is commented out) - presumably it is defined in an
# earlier cell; confirm.
train_era5 = prep_era5(True)
train_obs_cf, train_turb_info = prep_obs("DK", year_star, year_end)
train_gen_cf, train_all_sim_ws  = match_generation_data(train_era5, train_obs_cf, train_turb_info, powerCurveFile)
# numbers of spatial clusters to train for, and the time-resolution columns to
# group by ('year_res', 'season' and 'bimonth' are created by add_time_res)
cluster_list = [5]
time_res_list = ['year_res', 'season', 'bimonth', 'month'] 

# cluster_list = [1,2]
# time_res_list = ['yearly', 'season', 'bimonth', 'month']

# One output CSV is written per (number of clusters, time resolution) pair.
for num_clu in cluster_list:
    # label the turbines, then attach the labels and turbine metadata to every
    # wind-speed row so find_offset can filter by cluster
    train_clus_info = cluster_turbines(num_clu, train_turb_info)
    train_sim_ws = pd.merge(train_all_sim_ws, train_clus_info[['ID', 'cluster', 'lon', 'lat', 'capacity', 'height', 'model']], on='ID', how='left')

    for time_res in time_res_list:
        print("Train for ", num_clu, " clusters with time resolution: ", time_res, " is taking place.")
        start_time = time.time()
        
        # one row per (year, time slice, cluster) with obs, sim and scalar
        bias_data = train_data(time_res, train_gen_cf, train_clus_info)
        ddf = dd.from_pandas(bias_data, npartitions=40)
        
        # applied to each dask partition: runs the offset search row by row,
        # using the tables and time_res of the current loop iteration
        def find_offset_parallel(df):
            return df.apply(find_offset, args=(train_clus_info, train_sim_ws, powerCurveFile, time_res), axis=1)
            
        ddf["offset"] = ddf.map_partitions(find_offset_parallel, meta=('offset', 'float'))
        # writing the CSV is what triggers the computation, using the
        # 'processes' scheduler
        ddf.to_csv('data/results/new_factors/'+time_res+'_'+str(num_clu)+'.csv', single_file=True, compute_kwargs={'scheduler':'processes'})
        
        end_time = time.time()
        elapsed_time = end_time - start_time
        print("Results completed and saved. Elapsed time: {:.2f} seconds".format(elapsed_time))
        print(" ")

## Trying to improve .apply

In [None]:
# Scratch inputs for timing a single (year, season, cluster) slice.
time_res = 'season'
num_clu = 10

# Rows for autumn 2015 whose cluster label equals num_clu - 1.
# NOTE(review): gen_data is not assigned at notebook level in the cells shown
# here (it only appears as a local inside train_data) - confirm its origin.
test_gen = gen_data.loc[
    (gen_data['year'] == 2015)
    & (gen_data[time_res] == 'autumn')
    & (gen_data['cluster'] == num_clu - 1)
]

# Reanalysis time steps from September-November 2015 only.
test_era5 = train_era5.sel(
    time=(train_era5.time.dt.year == 2015)
    & train_era5.time.dt.month.isin([9, 10, 11])
)

In [None]:
%%time
# Time one call of simulate_wind_train on the test slice (scalar 0.8, offset 0).
# NOTE(review): simulate_wind_train reads sim_ws['ws'] and pivots on
# 'time'/'ID', and speed_to_power reads 'ID'/'model' from the second argument.
# Here the first argument is the .sel() result on the reanalysis and the
# second is a slice of gen_data - confirm this call still matches the
# function's current inputs.
cor_ws = simulate_wind_train(test_era5, test_gen, powerCurveFile, 0.8, 0)
# row = bias_data.iloc[0,:]
# find_offset(row, test_gen, test_era5, powerCurveFile)

In [None]:
# %%time
# cor_cf = cor_ws.copy()
# x = powerCurveFile['data$speed']
# for i in range(0, len(cor_cf.columns)):          
#     speed_single = cor_cf.iloc[:,i]
#     turb_name = test_gen.loc[test_gen['ID'] == speed_single.name, 'model']           
#     y = powerCurveFile[turb_name].to_numpy().flatten()
#     f = interpolate.Akima1DInterpolator(x, y)
#     cor_cf.iloc[:,i] = f(speed_single)

In [None]:
# %%time
# def to_power(column, powerCurveFile, test_gen):
#     x = powerCurveFile['data$speed']
#     turb_name = test_gen.loc[test_gen['ID'] == column.name, 'model']           
#     y = powerCurveFile[turb_name].to_numpy().flatten()
#     f = interpolate.Akima1DInterpolator(x, y)
#     return f(column)

# cor_ws.apply(to_power, args=(powerCurveFile, test_gen), axis=0)

In [None]:
# def f1(column, turb_info):
#     turb_name = turb_info.loc[turb_info['ID'] == column.columns, 'model']
#     x = powerCurveFile['data$speed']
#     y = powerCurveFile[turb_name]
#     f = interpolate.Akima1DInterpolator(x, y)
#     return df.asign(f(column))

# def f2(df):
#     return df.apply(f1, args=(turb_info_test), axis=0)

# try_cf = unc_ws.iloc[:,:1]
# ddf = dd.from_pandas(unc_ws.iloc[:,1:], npartitions=40)
# p = ddf.map_partitions(f2, meta=(None, 'f8'))

In [None]:
# %%time
# p.compute(scheduler='processes')