In [77]:
import warnings
import datetime
import re
import pandas as pd
import numpy as np
from shapely.geometry import Point, LineString
from scipy import spatial

In [78]:
# TYNDP projects that are skipped entirely ('project_id' values).
ignore_projects = [
    335,  # North Sea Wind Power Hub
]

# Individual investments that are skipped ('investment_id' values).
# TODO: is 'investment_id' unique? Otherwise, we must also check 'project_id'.
ignore_investments = [
    1652,  # line from new Bodelwyddan converter station to Bodelwyddan 2 ('project_id'=349)
]

In [79]:
# Load the TYNDP 2020 asset table and discard every row that belongs to an
# ignored project or to an ignored investment.
tyndp_raw = pd.read_csv('2020/tyndp_2020.csv')
is_ignored = (tyndp_raw['project_id'].isin(ignore_projects)
              | tyndp_raw['investment_id'].isin(ignore_investments))
df = tyndp_raw.loc[~is_ignored]
df.head()

Unnamed: 0,asset_type,substation_1,substation_2,x1,y1,x2,y2,status,specified_length_km,commissioning_year,ac_dc,underground,voltage,p_nom_max,investment_id,project_id,remarks,url,description
0,line,Pedralva (PT),Sobrado (PT),-8.322974,41.577311,,,planned_not_yet_permitting,67.0,2024,AC,False,400.0,,2,1,,https://tyndp2020-project-platform.azurewebsit...,New double circuit Pedralva (PT) - Sobrado (PT...
1,line,Vieira do Minho (PT),Ribeira de Pena (PT),-7.763214,41.58258,-7.793621,41.521024,in_permitting,131.0,2022,AC,False,400.0,,4,1,,https://tyndp2020-project-platform.azurewebsit...,New double-circuit 400kV OHL Vieira do Minho (...
2,line,Ribeira de Pena (PT),Feira (PT),-7.793621,41.521024,-8.389435,40.95916,in_permitting,131.0,2022,AC,False,400.0,,4,1,,https://tyndp2020-project-platform.azurewebsit...,New double-circuit 400kV OHL Vieira do Minho (...
3,substation,Ribeira de Pena (PT),,-7.793621,41.521024,,,in_permitting,,2021,AC,False,,,474,1,,https://tyndp2020-project-platform.azurewebsit...,New 400/60kV substation in Ribeira de Pena.
4,line,Beariz (ES),Fontefria (ES),-8.27387,42.467486,-7.949982,42.38391,in_permitting,30.0,2022,AC,False,400.0,,18,4,,https://tyndp2020-project-platform.azurewebsit...,New northern interconnection. New double circu...


In [80]:
# Split into lines and buses
asset_type = df['asset_type']
tyndp_buses = df.loc[asset_type == 'substation']
tyndp_lines = df.loc[asset_type == 'line']

# Ignore entries with missing coordinates
# (various reasons: incomplete project specification, outside of PyPSA-Eur area,...)
tyndp_buses = tyndp_buses.dropna(subset=['x1', 'y1'])
tyndp_lines_links = tyndp_lines.dropna(subset=['x1', 'y1', 'x2', 'y2'])

# 'line' assets are AC lines or DC links, depending on the 'ac_dc' column.
tyndp_lines = tyndp_lines_links.loc[tyndp_lines_links['ac_dc'] == 'AC']
tyndp_links = tyndp_lines_links.loc[tyndp_lines_links['ac_dc'] == 'DC']

# Extract all buses occurring in 'tyndp_lines' (both end points of every line)
endpoint_names = ['substation', 'x', 'y']

buses_1 = tyndp_lines[['substation_1', 'x1', 'y1']]
buses_1.columns = endpoint_names

buses_2 = tyndp_lines[['substation_2', 'x2', 'y2']]
buses_2.columns = endpoint_names

all_buses = pd.concat([buses_1, buses_2]).drop_duplicates()

# check if there are substations with same name but different coordinates:
# after dropping exact duplicates, a repeated name implies differing coordinates
duplicates = all_buses.duplicated(subset=['substation'], keep=False)
ambiguous_buses = all_buses.loc[duplicates, :]
if len(ambiguous_buses.index) > 0 and len(ambiguous_buses.columns) > 0:
    s = 'There are substations with multiple different coordinate values:\n'
    s += str(ambiguous_buses.sort_values('substation'))
    raise ValueError(s)

# Convert tyndp_buses to format of 'buses.csv'

In [81]:
# Columns that are only meaningful for lines/links are removed from the bus table.
line_only_columns = [
    'substation_2',
    'x2',
    'y2',
    'asset_type',
    'specified_length_km',
    'underground',
    'p_nom_max',
]

# Map the TYNDP column names onto the names used for the bus table.
bus_column_names = {
    'substation_1': 'name',
    'x1': 'x',
    'y1': 'y',
    'voltage': 'v_nom',
    'status': 'tyndp_status',
    'project_id': 'tyndp2020_proj_id',
    'investment_id': 'tyndp2020_invest_id',
}

tyndp_buses = (
    tyndp_buses
    .drop(columns=line_only_columns)
    .rename(columns=bus_column_names)
)

# Replace the textual 'ac_dc' column by a boolean 'dc' column
# (values other than 'AC'/'DC' become NaN).
is_dc = tyndp_buses['ac_dc'].map({'AC': False, 'DC': True})
tyndp_buses = tyndp_buses.drop(columns='ac_dc').assign(dc=is_dc)

# TODO: extract substation name and country to 'tags' column, set 'symbol' column to "Substation"

In [82]:
# Pattern for substation labels of the form "<name> (<CC>)" or "<name> [<CC>]",
# e.g. "Ponte de Lima (PT)": group 'name' captures the text before the
# bracket, group 'country' the two word characters inside the brackets.
# 'name' is greedy, so it may end with whitespace that callers have to strip.
reg = r'(?P<name>.+)\s?[\[(](?P<country>\w{2})[)\]]'
pat = re.compile(reg)

In [83]:
def extract_name(v):
    """Return the substation name contained in the label ``v``.

    If ``v`` matches ``pat`` the (stripped) 'name' group is returned,
    otherwise the stripped label itself.
    """
    match = pat.match(v)
    if match is None:
        return v.strip()
    return match.group('name').strip()

def extract_country(v):
    """Return the two-character country code of the label ``v``.

    Returns ``np.nan`` when ``v`` does not match ``pat``.
    """
    match = pat.match(v)
    if match is None:
        return np.nan
    return match.group('country').strip()

# Split the combined "<name> (<CC>)" label into separate columns.
# 'country' must be derived first, because 'name' is overwritten afterwards.
tyndp_buses['country'] = tyndp_buses['name'].apply(extract_country)
tyndp_buses['name'] = tyndp_buses['name'].apply(extract_name)

# create tags
tag_cols = [
    'name',
    'country',
    'url',
    'tyndp2020_proj_id',
    'tyndp2020_invest_id',
    'tyndp_status',
]


def create_tags(row):
    """Serialise the ``tag_cols`` entries of ``row`` as '"key"=>"value"' pairs.

    Keys that are not present in ``row`` are skipped; the pairs are joined
    by ', '.
    """
    pairs = []
    for key in tag_cols:
        if key in row.index:
            pairs.append(f'"{key}"=>"{row[key]}"')
    return ', '.join(pairs)


# The tag columns are folded into the single 'tags' column and then removed.
tyndp_buses.loc[:, 'tags'] = tyndp_buses.apply(create_tags, axis=1)
tyndp_buses = tyndp_buses.drop(columns=tag_cols)

In [84]:
tyndp_buses.head()

Unnamed: 0,x,y,commissioning_year,v_nom,remarks,description,dc,tags
3,-7.793621,41.521024,2021,,,New 400/60kV substation in Ribeira de Pena.,False,"""name""=>""Ribeira de Pena"", ""country""=>""PT"", ""u..."
7,-8.27387,42.467486,2022,400.0,,New northern interconnection. New 400kV substa...,False,"""name""=>""Beariz"", ""country""=>""ES"", ""url""=>""htt..."
8,-8.582626,41.764652,2022,150.0,,"New 400/150kV substation Ponte de Lima (PT), p...",False,"""name""=>""Ponte de Lima"", ""country""=>""PT"", ""url..."
31,-8.0041,54.6773,2030,275.0,,new substation in south Co. Donegal,False,"""name""=>""South Donegal"", ""country""=>""IE"", ""url..."
33,-7.2811,54.5781,2030,275.0,,A new 275 kV cross border link between a new s...,False,"""name""=>""Omagh South"", ""country""=>""NI"", ""url""=..."


# Find closest buses in 'buses.csv'

In [85]:
# Load buses.csv: fields are quoted with "'" and booleans are stored as 't'/'f'.
buses_file = r'entsoegridkit/buses.csv'
buses = pd.read_csv(
    buses_file,
    quotechar="'",
    true_values=['t'],
    false_values=['f'],
    dtype={'bus_id': 'str'},
)
# Index by the (string) bus id, drop 'station_id' and name the voltage column 'v_nom'.
buses = buses.set_index('bus_id')
buses = buses.drop(columns=['station_id'])
buses = buses.rename(columns={'voltage': 'v_nom'})

In [86]:
# For every TYNDP bus, look up the nearest bus of 'buses' within a maximum distance.
query_buses = tyndp_buses
# Maximum search distance, in the units of the x/y columns
# (presumably degrees of longitude/latitude -- TODO confirm).
distance_upper_bound=0.2

treecoords = buses.loc[:,('x', 'y')]
querycoords = query_buses.loc[:,('x','y')]

tree = spatial.KDTree(treecoords)
# KDTree.query reports a missing neighbour (none within the bound) by
# returning the number of tree points as index.
dist, ind = tree.query(querycoords, distance_upper_bound=distance_upper_bound)
# found_b: boolean mask of query buses that have a neighbour within the bound
# found_i: positions of those query buses within 'query_buses'
found_b = ind < buses.index.size
found_i = np.arange(query_buses.index.size)[found_b]

# Build a frame (distance D, matched bus id i) indexed like the matched query
# buses, keep only the closest match per index label, and write the matched
# bus id back. Assignment aligns on the index, so TYNDP buses without a match
# get NaN in 'closest_gridx_bus'.
tyndp_buses['closest_gridx_bus'] = pd.DataFrame(dict(D=dist[found_b],
                                                i=buses.index[ind[found_b] % buses.index.size]),
                                                index=query_buses.index[found_i]) \
                                                .sort_values(by='D')\
                                                [lambda ds: ~ds.index.duplicated(keep='first')] \
                                                .sort_index()['i']

## New buses

In [87]:
# TYNDP buses without a matched gridextract bus are treated as new buses.
has_no_match = tyndp_buses['closest_gridx_bus'].isna()
new_buses = tyndp_buses.loc[has_no_match].drop(columns='closest_gridx_bus')

## Existing buses

In [88]:
# For these, we need to add tag 'name', check whether other tags agree,
# and check whether their voltage has been updated.
# The frame is indexed by the id of the matched gridextract bus.
has_match = tyndp_buses['closest_gridx_bus'].notna()
existing_buses = tyndp_buses.loc[has_match].set_index('closest_gridx_bus')

In [89]:
# Columns whose values are reconciled between TYNDP and gridextract.
check_columns = ['v_nom', 'dc']

# a: TYNDP values, b: gridextract values of the matched buses (same index order).
a = existing_buses[check_columns]
b = buses.loc[existing_buses.index, check_columns]

In [90]:
# replace NaN values in a with values in b
c = a.combine_first(b)

# take larger values
# A comparison with NaN is always False, so a plain `c > b` would fall back to
# b (= NaN) whenever the gridextract value is missing and thereby discard a
# valid TYNDP value. Keep c explicitly in that case.
# TODO: for column 'dc', this would prefer a 'True' value in 'c'.
c = c.where((c > b) | b.isna(), b)
existing_buses.loc[:, c.columns] = c

In [91]:
existing_buses.head()

Unnamed: 0_level_0,x,y,commissioning_year,v_nom,remarks,description,dc,tags
closest_gridx_bus,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2887,-7.793621,41.521024,2021,380.0,,New 400/60kV substation in Ribeira de Pena.,False,"""name""=>""Ribeira de Pena"", ""country""=>""PT"", ""u..."
2854,-8.27387,42.467486,2022,400.0,,New northern interconnection. New 400kV substa...,False,"""name""=>""Beariz"", ""country""=>""ES"", ""url""=>""htt..."
2811,-8.582626,41.764652,2022,380.0,,"New 400/150kV substation Ponte de Lima (PT), p...",False,"""name""=>""Ponte de Lima"", ""country""=>""PT"", ""url..."
1322,15.3286,40.9749,2021,400.0,,New 30km single circuit 400kV OHL between the ...,False,"""name""=>""Bisaccia"", ""country""=>""IT"", ""url""=>""h..."
3264,26.7166,45.0762,2023,400.0,,To reinforce the cross-section between the Bla...,False,"""name""=>""Stalpu"", ""country""=>""RO"", ""url""=>""htt..."


In [92]:
# TODO: Handle counterparts which have no tags.
# Such counterparts are usually synthetic buses with symbol='joint'. See https://github.com/bdw/GridKit/tree/master/entsoe
# NOTE(review): the boolean mask is indexed by all of 'buses', while
# 'existing_buses' is indexed by the matched bus ids; this relies on pandas
# aligning the mask to the index of 'existing_buses' -- confirm that every
# matched id is present in 'buses.index'.
closest_gridx_without_tags = existing_buses.loc[buses.tags.isna()]

# which columns do we keep?
# unset symbol, update tags, then just take the gridx bus?
# TODO: which of the symbol value does PyPSA-Eur use / drop?
# Show the distinct 'symbol' values present in 'buses'.
buses.symbol.unique()

array(['Substation', 'Mixed fuels', 'Fossil fuel', 'Wind farm', 'joint',
       'Fossil gas', 'Solar photovoltaic', 'Fossil oil',
       'Substations + Power Plants', 'Hydro mixed pump storage',
       'Solar thermic', 'Hydro run of river & pondage',
       'Brown coal/Lignite', 'Other or not listed', 'Hard coal',
       'Hydro pure storage', 'Phase shifter',
       'Converter Station Back-to-back', 'Nuclear',
       'Hydro pure pump storage', 'Converter Station',
       'Other fossil fuels', 'Coal derived gas', 'Biomass', 'Oil shale',
       'Geothermal'], dtype=object)

In [93]:
# Check whether respective tyndp_bus columns agree with 'gridx_bus.tags'.
# 'gridx_bus.tags' is a string comprised of comma-separated entries
# of the form '"key"=>"value"'. This is the PostgreSQL hstore format.
# For further info, see the following links.
# https://github.com/bdw/GridKit/tree/master/entsoe
# https://www.postgresql.org/docs/current/hstore.html

# Splits a tag string after every '"key"=>"value",' entry; the capture group
# keeps the entry itself in the result of re.split.
split_regex = r'("\w+"\s*=>\s*"[^"]*"),'
tag_regex   = r'"(?P<key>\w+)"\s*=>\s*"(?P<value>[^"]*)"'  # Form: '"key"=>"value"'
tag_pattern = re.compile(tag_regex)

def tags_to_dict(row):
    """Parse the hstore-like 'tags' entry of ``row`` into a dict.

    Parameters
    ----------
    row : pandas.Series
        Row with a 'tags' entry of the form '"k1"=>"v1", "k2"=>"v2"'.

    Returns
    -------
    dict
        Mapping of tag keys to tag values (both strings). A missing 'tags'
        entry (NaN/None) yields an empty dict, consistent with the versions
        of this helper that are re-defined in later cells.

    Raises
    ------
    AttributeError
        If a piece of the tag string does not have the '"key"=>"value"' form.
    """
    raw_tags = row.loc['tags']
    if pd.isna(raw_tags):
        return {}
    # re.split also yields empty/whitespace-only pieces between entries; drop them.
    tags = list(filter(None, [s.strip() for s in re.split(split_regex, raw_tags)]))
    return dict(tag_pattern.match(t).groups() for t in tags)

# Inverse of tags_to_dict: serialise a dict as '"key"=>"value"' pairs joined by ', '.
dict_to_tags = lambda d: ', '.join([f'"{k}"=>"{v}"' for k,v in d.items()])

# Merge the TYNDP tags into the tags of the matched gridextract bus.
# Only matched buses whose gridextract counterpart has tags are considered.
for index, tyndp_bus in existing_buses.loc[~buses.tags.isna()].iterrows():
    gridx_bus = buses.loc[index]

    tyndp_tags = tags_to_dict(tyndp_bus)
    gridx_tags = tags_to_dict(gridx_bus)
    # keys present on both sides but with different values
    conflicting_tags  = [k for k in tyndp_tags if k in gridx_tags and tyndp_tags[k] != gridx_tags[k]]

    # check if there are conflicting tags
    if conflicting_tags:
        # Edge case due to buses in different countries that are close to each other.
        # Happens e.g. between Rüthi (CH) and Meiningen (AT), which are 1km apart.
        project_id    = tyndp_tags['tyndp2020_proj_id']
        investment_id = tyndp_tags['tyndp2020_invest_id']
        s = f"Inconsistent values for keys {conflicting_tags} between " \
            f"TYNDP bus with project_id={project_id}, investment_id={investment_id} " \
            f"and its geographically closest gridextract bus with index='{index}'.\n" \
             "Adding TYNDP bus as a new bus."
        warnings.warn(s)
        new_buses.loc[index] = tyndp_bus
        # The TYNDP bus is treated as a separate, new bus. Its tags must not
        # be merged into the (different) gridextract bus, so skip the merge.
        continue

    # add all new tags to gridx_bus (existing gridextract tags are kept as they are)
    not_in_gridx = {k:v for k, v in tyndp_tags.items() if k not in gridx_tags}
    gridx_tags   = gridx_tags | not_in_gridx

    buses.loc[index, 'tags'] = dict_to_tags(gridx_tags)

Adding TYNDP bus as a new bus.


In [94]:
# Free-text columns are not part of the bus table.
new_buses = new_buses.drop(columns=['remarks', 'description'])

# New buses get consecutive (string) ids following the largest existing bus id.
max_index = max(map(int, buses.index))
new_index = [str(i) for i in range(max_index + 1, max_index + 1 + len(new_buses))]
new_buses.index = new_index

# TODO: Also need this code in PyPSA-Eur?
buses = pd.concat([buses, new_buses])
# Buses without a commissioning year get the year of the Unix epoch.
# The epoch is evaluated in UTC: a naive `fromtimestamp(0)` uses the local
# timezone and yields 1969 on machines west of UTC.
beginning_of_time = datetime.datetime.fromtimestamp(0, tz=datetime.timezone.utc).year
buses.loc[buses.commissioning_year.isna(), 'commissioning_year'] = beginning_of_time
buses.commissioning_year = buses.commissioning_year.astype(int)

# Lines

In [95]:
# Columns that are not carried over to the line table.
unused_line_columns = [
    'asset_type',
    'ac_dc',
    'substation_1',
    'substation_2',
    'investment_id',
    'p_nom_max',
    'remarks',
    'description',
]

# Map the TYNDP column names onto the names used for the line table.
line_column_names = {
    'project_id': 'tyndp2020_proj_id',
    'specified_length_km': 'length',
    'voltage': 'v_nom',
    'status': 'tyndp_status',
}

tyndp_lines = (
    tyndp_lines
    .drop(columns=unused_line_columns)
    .rename(columns=line_column_names)
)

In [96]:
# generate tags
# TODO: what should 'country' be for cross-border lines?
tag_cols = ['url', 'tyndp2020_proj_id']


def create_tags(row):
    """Serialise the ``tag_cols`` entries of ``row`` as '"key"=>"value"' pairs.

    Keys that are not present in ``row`` are skipped; the pairs are joined
    by ', '.
    """
    pairs = []
    for key in tag_cols:
        if key in row.index:
            pairs.append(f'"{key}"=>"{row[key]}"')
    return ', '.join(pairs)


# Fold the tag columns into the single 'tags' column and remove them.
tyndp_lines.loc[:, 'tags'] = tyndp_lines.apply(create_tags, axis=1)
tyndp_lines = tyndp_lines.drop(columns=tag_cols)

In [97]:
# create bus0, bus1 columns
# Each line end point is assigned the id of the nearest bus in 'buses'
# (which at this point also contains the newly added TYNDP buses).
buses_tree = spatial.KDTree(buses.loc[:, ('x', 'y')])
_, ind0 = buses_tree.query(tyndp_lines.loc[:, ('x1', 'y1')])
_, ind1 = buses_tree.query(tyndp_lines.loc[:, ('x2', 'y2')])

# KDTree.query signals a missing neighbour with an index equal to the number
# of tree points. No distance bound is passed above, so these masks are
# expected to be all True.
ind0_b = ind0 < len(buses)
ind1_b = ind1 < len(buses)

tyndp_lines.loc[ind0_b, 'bus0'] = buses.index[ind0[ind0_b]]
tyndp_lines.loc[ind1_b, 'bus1'] = buses.index[ind1[ind1_b]]

In [98]:
# Look up the coordinates of the buses assigned to both line ends and label
# them like the end-point columns of 'tyndp_lines'.
bus0_coords = buses.loc[tyndp_lines['bus0'], ['x', 'y']].rename(columns={'x': 'x1', 'y': 'y1'})
bus1_coords = buses.loc[tyndp_lines['bus1'], ['x', 'y']].rename(columns={'x': 'x2', 'y': 'y2'})

# Rows are in the order of 'tyndp_lines'; re-label them with its index so that
# a later index-aligned assignment lines up.
bus0_coords.index = tyndp_lines.index
bus1_coords.index = tyndp_lines.index

In [99]:
# Replace the TYNDP end-point coordinates by the coordinates of the assigned buses.
tyndp_lines.loc[:, ('x1', 'y1')] = bus0_coords 
tyndp_lines.loc[:, ('x2', 'y2')] = bus1_coords

In [100]:
def coords_to_linestring(row):
    """Return the string form of the two-point LineString from (x1, y1) to (x2, y2)."""
    start = [row.x1, row.y1]
    end = [row.x2, row.y2]
    return str(LineString([start, end]))


# Store the geometry as text and drop the now redundant coordinate columns.
tyndp_lines.loc[:, 'geometry'] = tyndp_lines.apply(coords_to_linestring, axis=1)
tyndp_lines = tyndp_lines.drop(columns=['x1', 'y1', 'x2', 'y2'])

In [101]:
tyndp_lines.head()

Unnamed: 0,tyndp_status,length,commissioning_year,underground,v_nom,tags,bus0,bus1,geometry
1,in_permitting,131.0,2022,False,400.0,"""url""=>""https://tyndp2020-project-platform.azu...",2900,2887,"LINESTRING (-7.72338899999999 41.614416, -7.82..."
2,in_permitting,131.0,2022,False,400.0,"""url""=>""https://tyndp2020-project-platform.azu...",2887,1727,"LINESTRING (-7.82363900000001 41.554839, -8.38..."
4,in_permitting,30.0,2022,False,400.0,"""url""=>""https://tyndp2020-project-platform.azu...",2854,2861,"LINESTRING (-8.33175699999999 42.436634, -7.94..."
5,in_permitting,140.21,2022,False,400.0,"""url""=>""https://tyndp2020-project-platform.azu...",2861,2811,"LINESTRING (-7.94998199999999 42.383908, -8.63..."
6,in_permitting,140.21,2022,False,400.0,"""url""=>""https://tyndp2020-project-platform.azu...",2811,8734,"LINESTRING (-8.63250700000001 41.681118, -8.67..."


In [102]:
# set num_parallel = 1
# add line ids 

## Split into existing and new lines

In [103]:
# Load lines.csv: fields are quoted with "'" and booleans are stored as 't'/'f'.
lines_file = r'entsoegridkit/lines.csv'
line_dtypes = {
    'line_id': 'str',
    'bus0': 'str',
    'bus1': 'str',
    'underground': 'bool',
    'under_construction': 'bool',
}
lines = pd.read_csv(lines_file, quotechar="'", true_values=['t'], false_values=['f'],
                    dtype=line_dtypes)
lines = lines.set_index('line_id')
lines = lines.rename(columns={'voltage': 'v_nom', 'circuits': 'num_parallel'})

# Scale the length by 1/1000 (presumably metres to kilometres -- TODO confirm).
lines['length'] = lines['length'] / 1e3

# Keep only lines whose two end buses are both known.
both_ends_known = lines['bus0'].isin(buses.index) & lines['bus1'].isin(buses.index)
lines = pd.DataFrame(lines.loc[both_ends_known])

In [104]:
# Lines are undirected: also offer every TYNDP line with swapped end points,
# so that it matches an existing line regardless of its bus order.
tyndp_lines_rev = tyndp_lines.rename(columns={'bus0': 'bus1', 'bus1': 'bus0'})
tyndp_undir     = pd.concat([tyndp_lines, tyndp_lines_rev])

# Carry the original TYNDP index through the merge in a helper column.
# (Recovering it afterwards from the (bus0, bus1) pair is ambiguous when several
# TYNDP lines connect the same pair of buses: only one of them would be found
# and the others would wrongly be classified as new lines.)
tyndp_tagged = tyndp_undir.reset_index(drop=True)
tyndp_tagged['__tyndp_index__'] = tyndp_undir.index.to_numpy()

# like JOIN based on column values
line_pairs = lines.loc[:, ['bus0', 'bus1']].drop_duplicates()
matched = tyndp_tagged.merge(line_pairs, how='inner', on=['bus0', 'bus1'])

existing_ind   = matched['__tyndp_index__'].tolist()
existing_lines = matched.drop(columns='__tyndp_index__')

# Every TYNDP line that matched no existing line is new (original order is kept).
matched_labels = set(existing_ind)
new_ind        = [label for label in tyndp_lines.index if label not in matched_labels]

new_lines = tyndp_lines.loc[new_ind]

In [105]:
# Update indices of 'existing_lines' to those of the corresponding line in 'lines'
# Map every distinct (bus0, bus1) pair of 'lines' to the id of its first line.
line_pairs = lines.loc[:, ['bus0', 'bus1']].drop_duplicates()
row_to_ind = {
    (bus0, bus1): ind
    for ind, bus0, bus1 in zip(line_pairs.index, line_pairs['bus0'], line_pairs['bus1'])
}


def lookup_line_id(row):
    """Return the id of the existing line that connects the bus pair of ``row``."""
    return row_to_ind[tuple(row)]


existing_lines.loc[:, 'line_id'] = existing_lines.loc[:, ['bus0', 'bus1']].apply(lookup_line_id, axis=1)
existing_lines = existing_lines.set_index('line_id')
existing_lines.head()

Unnamed: 0_level_0,tyndp_status,length,commissioning_year,underground,v_nom,tags,bus0,bus1,geometry
line_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
14637,under_construction,205.0,2020,False,330.0,"""url""=>""https://tyndp2020-project-platform.azu...",6273,6282,"LINESTRING (25.166931 58.185909, 24.271545 56...."
5735,under_construction,15.0,2020,False,330.0,"""url""=>""https://tyndp2020-project-platform.azu...",6282,6283,"LINESTRING (24.271545 56.917998, 24.264679 56...."
6031,in_permitting,32.0,2024,False,380.0,"""url""=>""https://tyndp2020-project-platform.azu...",5757,5759,"LINESTRING (5.763702 52.619725, 6.144103999999..."
5123,in_permitting,35.0,2023,False,380.0,"""url""=>""https://tyndp2020-project-platform.azu...",5091,5093,"LINESTRING (4.592285 51.916321, 4.833984000000..."
8591,in_permitting,80.0,2025,False,330.0,"""url""=>""https://tyndp2020-project-platform.azu...",5547,5566,"LINESTRING (25.132599 54.591958, 25.415497 54...."


## Update existing lines

In [106]:
# Columns whose values are reconciled between TYNDP and the existing lines.
check_columns = ['v_nom', 'underground']

# a: TYNDP values, b: values of the matched existing lines (same index order)
a = existing_lines.loc[:, check_columns]
b = lines.loc[existing_lines.index, check_columns]

# Fill missing TYNDP values from the existing line, then take the larger value.
# A comparison with NaN is always False, so a plain `c > b` would fall back to
# b (= NaN) whenever the existing value is missing and thereby discard a valid
# TYNDP value. Keep c explicitly in that case.
c = a.combine_first(b)
c = c.where((c > b) | b.isna(), b)

existing_lines.loc[:, c.columns] = c

In [107]:
# NOTE: same helper as defined for the buses in an earlier cell, re-defined here.
# Splits a tag string after every '"key"=>"value",' entry; the capture group
# keeps the entry itself in the result of re.split.
split_regex = r'("\w+"\s*=>\s*"[^"]*"),'
tag_regex   = r'"(?P<key>\w+)"\s*=>\s*"(?P<value>[^"]*)"'  # Form: '"key"=>"value"'
tag_pattern = re.compile(tag_regex)

def tags_to_dict(row):
    """Parse the hstore-like 'tags' entry of ``row`` into a dict.

    Parameters
    ----------
    row : pandas.Series
        Row with a 'tags' entry of the form '"k1"=>"v1", "k2"=>"v2"'.

    Returns
    -------
    dict
        Mapping of tag keys to tag values (both strings); empty if the
        'tags' entry is missing.

    Raises
    ------
    AttributeError
        If a piece of the tag string does not have the '"key"=>"value"' form.
    """
    # TODO: for buses we dealt differently with nan tags. Is this relevant here?
    raw_tags = row.loc['tags']
    # pd.isna instead of `is np.nan`: an identity check only recognises the
    # np.nan singleton and misses other NaN objects, None and pd.NA.
    if pd.isna(raw_tags):
        return {}
    # re.split also yields empty/whitespace-only pieces between entries; drop them.
    tags = list(filter(None, [s.strip() for s in re.split(split_regex, raw_tags)]))
    return dict(tag_pattern.match(t).groups() for t in tags)

# Merge the tags of every matched TYNDP line with those of its existing
# counterpart; on duplicate keys the TYNDP value wins.
# The merged tags are written back as a tag string, one entry per row:
# 'iterrows' yields copies, so assigning to 'row' would leave
# 'existing_lines' unchanged.
existing_lines['tags'] = [
    dict_to_tags(tags_to_dict(lines.loc[ind]) | tags_to_dict(row))
    for ind, row in existing_lines.iterrows()
]

In [108]:
# TODO: merge existing and new lines, export to csv for importing in PyPSA-Eur?

# Links

In [109]:
# Columns that are not carried over to the link table.
unused_link_columns = [
    'asset_type',
    'ac_dc',
    'substation_1',
    'substation_2',
    'remarks',
    'description',
]

# Map the TYNDP column names onto the names used for the link table.
link_column_names = {
    'project_id': 'tyndp2020_proj_id',
    'investment_id': 'tyndp2020_invest_id',
    'specified_length_km': 'length',
    'voltage': 'v_nom',
    'status': 'tyndp_status',
    'p_nom_max': 'p_nom',
}

tyndp_links = (
    tyndp_links
    .drop(columns=unused_link_columns)
    .rename(columns=link_column_names)
)
tyndp_links.head()

Unnamed: 0,x1,y1,x2,y2,tyndp_status,length,commissioning_year,underground,v_nom,p_nom,tyndp2020_invest_id,tyndp2020_proj_id,url
9,-2.880286,43.349776,-0.435896,45.069715,in_permitting,370.0,2027,True,,2000.0,38,16,https://tyndp2020-project-platform.azurewebsit...
13,14.13,42.4034,18.79211,42.320594,under_construction,445.0,2026,True,500.0,600.0,1503,28,https://tyndp2020-project-platform.azurewebsit...
20,6.72506,58.65811,9.3469,53.9233,under_construction,623.0,2020,True,500.0,,142,37,https://tyndp2020-project-platform.azurewebsit...
40,6.4581,50.8716,5.6681,50.7544,under_construction,90.0,2020,True,380.0,,146,92,https://tyndp2020-project-platform.azurewebsit...
45,-8.3195,51.9541,-4.19358,48.44426,in_permitting,500.0,2026,True,400.0,,810,107,https://tyndp2020-project-platform.azurewebsit...


In [110]:
def create_tags(row):
    """Serialise the ``tag_cols`` entries of ``row`` as '"key"=>"value"' pairs.

    ``tag_cols`` is read from the notebook namespace at call time. Keys that
    are not present in ``row`` are skipped; the pairs are joined by ', '.
    """
    pairs = []
    for key in tag_cols:
        if key in row.index:
            pairs.append(f'"{key}"=>"{row[key]}"')
    return ', '.join(pairs)


def create_bus0_bus1(df, buses):
    """Assign the nearest bus to both end points of every row in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with end-point coordinates in columns 'x1', 'y1', 'x2', 'y2'.
    buses : pandas.DataFrame
        Frame with bus coordinates in columns 'x', 'y'; its index holds the bus ids.

    Returns
    -------
    pandas.DataFrame
        Frame indexed like ``df`` with columns 'bus0' (bus nearest to
        (x1, y1)) and 'bus1' (bus nearest to (x2, y2)).
    """
    n_buses = len(buses)
    tree = spatial.KDTree(buses.loc[:, ['x', 'y']])

    def nearest_bus_ids(coord_cols):
        # Positions of the nearest tree point per row; positions equal to
        # n_buses (KDTree's marker for "no neighbour") are filtered out.
        _, positions = tree.query(df.loc[:, coord_cols])
        return buses.index[positions[positions < n_buses]]

    start_bus = pd.DataFrame(nearest_bus_ids(['x1', 'y1']), index=df.index, columns=['bus0'])
    end_bus = pd.DataFrame(nearest_bus_ids(['x2', 'y2']), index=df.index, columns=['bus1'])

    return start_bus.join(end_bus)

In [111]:
# Columns that are folded into the 'tags' column of the links.
tag_cols = [
    'url',
    'tyndp2020_proj_id',
    'tyndp2020_invest_id',
    'tyndp_status',
]
tyndp_links.loc[:, 'tags'] = tyndp_links.apply(create_tags, axis=1)
# TODO: set as 'under_construction'
tyndp_links = tyndp_links.drop(columns=tag_cols)

In [112]:
# Attach the ids of the buses nearest to both link end points as columns
# 'bus0' and 'bus1'. 'create_bus0_bus1' returns a frame indexed like
# 'tyndp_links', so the join aligns row by row.
tyndp_links = tyndp_links.join(create_bus0_bus1(tyndp_links, buses))

## Split into existing and new links

In [113]:
# Load links.csv: fields are quoted with "'" and booleans are stored as 't'/'f'.
links_file = r'entsoegridkit/links.csv'
link_dtypes = {
    'link_id': 'str',
    'bus0': 'str',
    'bus1': 'str',
    'under_construction': 'bool',
}
links = pd.read_csv(links_file, quotechar="'", true_values=['t'], false_values=['f'],
                    dtype=link_dtypes)
links = links.set_index('link_id')

# Scale the length by 1/1000 (presumably metres to kilometres -- TODO confirm).
links['length'] = links['length'] / 1e3

# Skagerrak Link is connected to 132kV bus which is removed in _load_buses_from_eg.
# Connect to neighboring 380kV bus
links.loc[links['bus1'] == '6396', 'bus1'] = '6398'

# Keep only links whose two end buses are both known.
both_ends_known = links['bus0'].isin(buses.index) & links['bus1'].isin(buses.index)
links = pd.DataFrame(links.loc[both_ends_known])

In [114]:
def split_existing_new(df, og):
    """Split the rows of ``df`` into those that match a row of ``og`` and the rest.

    Two rows match if they connect the same pair of buses, irrespective of
    the order of 'bus0' and 'bus1'.

    Parameters
    ----------
    df : pandas.DataFrame
        Candidate branches with columns 'bus0' and 'bus1'.
    og : pandas.DataFrame
        Original branches with columns 'bus0' and 'bus1'.

    Returns
    -------
    existing : pandas.DataFrame
        The matching rows of ``df`` with a fresh default index. 'bus0'/'bus1'
        are oriented like the matched pair in ``og``.
    new : pandas.DataFrame
        The rows of ``df`` without a match, in their original order and with
        their original index.
    """
    # Branches are undirected: also offer every row with swapped end points.
    df_rev   = df.rename(columns={'bus0': 'bus1', 'bus1': 'bus0'})
    df_undir = pd.concat([df, df_rev])

    # Carry the original index through the merge in a helper column.
    # (Recovering it afterwards from the (bus0, bus1) pair is ambiguous when
    # several rows of 'df' connect the same pair of buses: only one of them
    # would be found and the others would wrongly be returned as new.)
    index_col = '__orig_index__'
    tagged = df_undir.reset_index(drop=True)
    tagged[index_col] = df_undir.index.to_numpy()

    # like JOIN based on column values
    og_pairs = og.loc[:, ['bus0', 'bus1']].drop_duplicates()
    matched  = tagged.merge(og_pairs, how='inner', on=['bus0', 'bus1'])

    existing     = matched.drop(columns=index_col)
    existing_ind = matched[index_col]

    return existing, df.loc[~df.index.isin(existing_ind)]

In [115]:
def get_og_index(existing, og):
    """Return, for every row of ``existing``, the index label of its counterpart in ``og``.

    Parameters
    ----------
    existing : pandas.DataFrame
        Rows with columns 'bus0' and 'bus1'; every (bus0, bus1) pair must
        occur in ``og``.
    og : pandas.DataFrame
        Original branches with columns 'bus0' and 'bus1'.

    Returns
    -------
    pandas.Index
        One label of ``og.index`` per row of ``existing`` (in row order),
        named like ``og.index``. If several rows of ``og`` share a bus pair,
        the label of the first one is used. Empty if ``existing`` is empty.

    Raises
    ------
    KeyError
        If a (bus0, bus1) pair of ``existing`` does not occur in ``og``.
    """
    og_pairs = og.loc[:, ['bus0', 'bus1']].drop_duplicates()
    row_to_ind = {
        (bus0, bus1): ind
        for ind, bus0, bus1 in zip(og_pairs.index, og_pairs['bus0'], og_pairs['bus1'])
    }
    # A plain comprehension also works for an empty 'existing'; a row-wise
    # DataFrame.apply returns a frame instead of a series in that case.
    vals = [row_to_ind[pair] for pair in zip(existing['bus0'], existing['bus1'])]
    og_index = pd.Index(vals, name=og.index.name)
    return og_index

In [116]:
# Split the TYNDP links into those that match a link of 'links' (same pair of
# buses) and new ones, then index the matched ones by the id of their
# counterpart in 'links'.
existing_links, new_links = split_existing_new(tyndp_links, links)
existing_links.index      = get_og_index(existing_links, links)

## Update existing links

In [None]:
# merge tags 

# NOTE: same helpers as defined in earlier cells, re-defined here.
# Splits a tag string after every '"key"=>"value",' entry; the capture group
# keeps the entry itself in the result of re.split.
split_regex = r'("\w+"\s*=>\s*"[^"]*"),'
tag_regex   = r'"(?P<key>\w+)"\s*=>\s*"(?P<value>[^"]*)"'  # Form: '"key"=>"value"'
tag_pattern = re.compile(tag_regex)

def tags_to_dict(row):
    """Parse the hstore-like 'tags' entry of ``row`` into a dict.

    Parameters
    ----------
    row : pandas.Series
        Row with a 'tags' entry of the form '"k1"=>"v1", "k2"=>"v2"'.

    Returns
    -------
    dict
        Mapping of tag keys to tag values (both strings); empty if the
        'tags' entry is missing.

    Raises
    ------
    AttributeError
        If a piece of the tag string does not have the '"key"=>"value"' form.
    """
    # TODO: for buses we dealt differently with nan tags. Is this relevant here?
    raw_tags = row.loc['tags']
    # pd.isna instead of `is np.nan`: an identity check only recognises the
    # np.nan singleton and misses other NaN objects, None and pd.NA.
    if pd.isna(raw_tags):
        return {}
    # re.split also yields empty/whitespace-only pieces between entries; drop them.
    tags = list(filter(None, [s.strip() for s in re.split(split_regex, raw_tags)]))
    return dict(tag_pattern.match(t).groups() for t in tags)

# Inverse of tags_to_dict: serialise a dict as '"key"=>"value"' pairs joined by ', '.
dict_to_tags = lambda d: ', '.join([f'"{k}"=>"{v}"' for k,v in d.items()])

# Merge the tags of every matched TYNDP link with those of its original
# counterpart in 'links'; on duplicate keys the TYNDP value wins.
# The column is assigned row by row (positionally): a label-based
# `.loc[ind, 'tags'] = ...` would overwrite every row sharing the label 'ind'
# when several TYNDP links are matched to the same original link.
existing_links['tags'] = [
    dict_to_tags(tags_to_dict(links.loc[ind]) | tags_to_dict(row))
    for ind, row in existing_links.iterrows()
]

# 
# # df_l and df_r must have same index
# def merge_tags(tyndp_df, og_df):
#     for ind, tyndp_row in tyndp_df.loc[~og_df.tags.isna()].iterrows():
#         og_row = og_df.loc[ind]
# 
#         tyndp_tags = tags_to_dict(tyndp_row)
#         og_tags    = tags_to_dict(og_row)
# 
#         conflicting_tags = [k for k in tyndp_tags if k in og_tags and tyndp_tags[k] != og_tags[k]]
# 
#         if conflicting_tags:
#             # TODO: save these values in tags before! 
#             project_id    = tyndp_tags['project_id']
#             investment_id = tyndp_tags['investment_id']
# 
#             s = f"Inconsistent values for keys {conflicting_tags} between " \
#                 f"TYNDP bus with project_id={project_id}, investment_id={investment_id} " \
#                 f"and its geographically closest gridextract bus with index='{ind}'.\n" \
#                  "Adding TYNDP bus as a new bus."
#             warnings.warn(s)
# 
# 
#     pass

In [None]:
def update_vals(existing, og, cols):
    """Reconcile the columns ``cols`` of ``existing`` with their counterparts in ``og``.

    Parameters
    ----------
    existing : pandas.DataFrame
        Updated rows; every index label must occur in ``og.index``.
    og : pandas.DataFrame
        Original rows.
    cols : list
        Names of the columns to reconcile; present in both frames.

    Returns
    -------
    pandas.DataFrame
        Frame indexed like ``existing`` with the columns ``cols``. Every
        entry is the larger of the two values; if one of them is missing
        (NaN), the other one is used.
    """
    a = existing.loc[:, cols]
    b = og.loc[existing.index, cols]
    # fill missing values of 'existing' from 'og'
    c = a.combine_first(b)
    # Take the larger value. A comparison with NaN is always False, so a plain
    # `c > b` would fall back to b (= NaN) whenever the original value is
    # missing and thereby discard a valid value of 'existing'.
    c = c.where((c > b) | b.isna(), b)
    return c

In [None]:
# write out links to files 