In [1]:
# from geopy.geocoders import Nominatim
from geopy import *
import geopy.distance
import pandas as pd
import numpy as np
import math
import itertools
import re
import Levenshtein

In [2]:
# TODO: what about voltage level?
# TODO: somehow map statuses to some sort of enum
# TODO: map column names to fixed names (see example csv)

#### 2020 ####
column_semantics_2020 = {
    'Investment number': 'investment_id',
    'Commissioning Year': 'commissioning_year',
    'Status ID\n1 : Under Consideration,\n2 : In Planning but not permitting,\n3 : In permitting,\n4 : Under Construction': 'status',
    'Type of Element': 'asset_type',
    'Substation From': 'substation_1',
    'Substation To': 'substation_2',
    'Technology': 'ac_dc',
    'Total route length (km)': 'specified_length_km'
}

status_map_2020 = {
    1: 1, # under consideration
    2: 2, # planning, not permitting
    3: 3, # in permitting
    4: 4  # under construction
}
# TODO: 'OffshoreDCTransmissionCable', 'OffshoreACTransmissionCable'
asset_type_map_2020 = {
    'ACTransmissionLine': 'line',
    'DCTransmissionLine': 'line',
    'OnshoreSubstation': 'substation',
    'OffshoreSubstation': 'substation'
}

params_2020 = {
    'excel': '2020/transmission.xlsx',
    'sheet': 'Trans.Investments',
    'header_row': 1,
    'status_column': 'Status ID\n1 : Under Consideration,\n2 : In Planning but not permitting,\n3 : In permitting,\n4 : Under Construction',
    'status_map': status_map_2020,
    'asset_type_map': asset_type_map_2020,
    'column_semantics': column_semantics_2020
}

In [3]:
#### 2018 ####
column_semantics_2018 = {
    'Investment ID': 'investment_id',
    'ExpectedCommissioningYear': 'commissioning_year',
    'Status': 'status',
    'ElementsType': 'asset_type',
    'From': 'substation_1',
    'To': 'substation_2',
    'TechnologyType': 'ac_dc',
    'TotalRouteLength (km)': 'specified_length_km'
}

status_map_2018 = {
    'under consideration': 1,
    'planned but not yet permitting': 2,
    'permitting': 3,
    'under construction': 4
}
asset_type_map_2018 = {
    'Overhead Line': 'line',
    'Substation': 'substation'
}

params_2018 = {
    'excel': r'2018/TYNDP_2018_Project_List.xlsx',
    'sheet': 'Sheet1',
    'header_row': 0,
    'status_column': 'Status',
    'status_map': status_map_2018,
    'asset_type_map': asset_type_map_2018,
    'column_semantics': column_semantics_2018
}

In [4]:
#### 2016 ####
column_semantics_2016 = {
    'TYNDP 2016 Investment Index': 'investment_id',
    'TYNDP 2016 expected commissioning date': 'commissioning_year',
    'TYNDP 2016 status name': 'status',
    'Element type': 'asset_type',
    'From Substation name': 'substation_1',
    'To Substation name': 'substation_2',
    'AC or DC': 'ac_dc',
    'Line length [km]': 'specified_length_km'
}
# TODO: commissioned, planning, design
# TODO: rather specify rows we want to keep? E.g. via list
# TODO: map values to strings (semantics!), then choose only those that are permitting or under construction
status_map_2016 = {
    'under consideration': 1,
    'Design & Permitting': 2,
    'permitting': 3,
    'under construction': 4
}
asset_type_map_2016 = {
    'Overhead Line': 'line',
    'Substation': 'substation'
}

params_2016 = {
    'excel': r'2016/TYNDP2016_Projects data.xlsx',
    'sheet': 'Investments',
    'header_row': 0,
    'status_column': 'Status',
    'status_map': status_map_2016,
    'asset_type_map': asset_type_map_2016,
    'column_semantics': column_semantics_2016
}

In [5]:
#### 2014 ####
# TODO: Problems
# - no length column: either not specified or given within paragraph of text
# - no asset type column. But if only substation 1 is specified it's a substation. If both are, it's a line.


In [6]:
#### 2012 ####
# TODO: Same problems as with 2014

In [7]:
#### 2010 ####
# TODO: Same problems as with 2014

In [8]:
params = params_2020

In [9]:
excel = params['excel']
sheet = params['sheet']

column_semantics = params['column_semantics']

wanted_columns = column_semantics.keys()
status_column  = [k for (k,v) in column_semantics.items() if v == 'status'][0]
status_map     = params['status_map']
asset_type_map = params['asset_type_map']
header_row     = params['header_row']

wanted = pd.read_excel(excel, sheet_name=sheet, header=header_row)[wanted_columns]

# map columns to specified names (-> consistency & semantics)
wanted.columns = [column_semantics[c] for c in wanted.columns]

if wanted['status'].dtype == pd.StringDtype:
    wanted['status'] = wanted['status'].str.lower()

# replace status with numerical values as specified in status_map
wanted = wanted.loc[wanted['status'].isin(status_map.keys())]
wanted = wanted.replace({'status': status_map})

# only choose those in permitting or under construction
wanted = wanted.loc[wanted['status'].astype(int) >= 3]

wanted = wanted.loc[wanted['asset_type'].isin(asset_type_map.keys())]
wanted = wanted.replace({'asset_type': asset_type_map})

wanted.head()

Unnamed: 0,investment_id,commissioning_year,status,asset_type,substation_1,substation_2,ac_dc,specified_length_km
1,4,2022,3,line,V.Minho (by Ribeira de Pena),Feira (by Ribeira de Pena),AC,131.0
2,474,2021,3,substation,Ribeira de Pena (PT),-,AC,0.0
3,18,2022,3,line,Beariz (ES),Fontefria (ES),AC,30.0
4,496,2022,3,line,Fontefria (ES),Vila Nova de Famalicão (PT) (By Ponte de Lima),AC,140.21
6,499,2022,3,substation,Beariz (ES),Beariz (ES),AC,0.0


In [23]:
lines     = wanted.loc[wanted['asset_type'] == 'line']
new_subst = wanted.loc[wanted['asset_type'] == 'substation']
lines     = lines.query("substation_1 not in @new_subst")
lines     = lines.query("substation_2 not in @new_subst")

# Use bus names from buses.csv (v0.1.0)
See https://github.com/PyPSA/pypsa-eur/blob/v0.1.0rc/data/entsoegridkit/buses.csv. Data is from 2017 (newer gridkit extracts do not contain 'tags' with substation names).

In [60]:
buses_file = 'buses_v0.1.0.csv'

# see base_network.py in PyPSA-Eur repository
buses = (pd.read_csv(buses_file, quotechar="'",
                     true_values='t', false_values='f',
                     dtype=dict(bus_id="str"))
        .set_index("bus_id")
        .drop(['station_id'], axis=1)
        .rename(columns=dict(voltage='v_nom')))

In [61]:
buses = buses.query('tags.notnull()', engine='python')
buses = buses.query("symbol == 'Substation'")

# Extract 'name_eng' and 'country' from tags in  buses

In [62]:
split_regex = r'("\w+"=>"[^"]*"),' # Form: 'key => value, key => value, ...'

tag_regex   = r'"(?P<key>\w+)"=>"(?P<value>[^"]*)"' # Form: 'key => value'
tag_pattern = re.compile(tag_regex)

rows = []

for index, row in buses.iterrows():
    name    = ''
    country = ''
    x = row['x']
    y = row['y']
    
    tags_string = row['tags']
    
    tags = re.split(split_regex, tags_string)
    
    # Remove whitespaces at front and end, remove None values
    tags = [s.strip() for s in tags]
    tags = list(filter(None, tags))
    
    for tag in tags:
        m = tag_pattern.match(tag)
            
        if m is None:
            print(tag)
            
        # see group names in tag_regex
        key   = m.group('key')
        value = m.group('value')
        
        if key == 'name_eng':
            name = value.strip()
        elif key == 'country':
            country = value.strip()
    
    if name == 'unknown' or not name:
        continue
        
    rows.append((name, country, x, y))

In [63]:
curated_buses = pd.DataFrame.from_records(rows, columns=['name', 'country', 'x', 'y'])

## Remove duplicate rows

In [64]:
curated_buses = curated_buses.loc[~curated_buses.duplicated()]

## There are substations which share the same name but have different coordinates
- large deviation between coordinates => substations are most likely in different countries 
    - BUT: it does occur that different places in the same country get the same name
- small deviation between coordinates => reference to same substation (error in gridextract?)

In [65]:
# TODO: added 'NI' although Northern Ireland probably appears in PyPSA as 'GB'. Find a better solution.
pypsa_countries = ['AL', 'AT', 'BA', 'BE', 'BG', 'CH', 'CZ', 'DE', 'DK', 'EE', 'ES', 'FI', 'FR', 'GB', 'GR', 'HR', 'HU', 'IE', 'IT', 'LT', 'LU', 'LV', 'ME', 'MK', 'NI', 'NL', 'NO', 'PL', 'PT', 'RO', 'RS', 'SE', 'SI', 'SK']

### List of all duplicates

In [66]:
duplicated = curated_buses.loc[curated_buses.name.duplicated()]
duplicated = duplicated.query("country in @pypsa_countries")

# for name in duplicated.name.unique():
#     print(name)
#     for index, row in curated_buses.query('name == @name').iterrows():
#         print(f"({row['x']}, {row['y']}), {row['country']}")
#     print('----')

### Same name and country, large deviations

In [67]:
curated_buses.query("name == 'Yuzhnaya'")

Unnamed: 0,name,country,x,y
2720,Yuzhnaya,RU,44.817352,48.155093
3851,Yuzhnaya,RU,50.674438,52.002638
3905,Yuzhnaya,RU,36.268616,51.642737
3927,Yuzhnaya,RU,38.685608,51.843414
5378,Yuzhnaya,RU,59.824677,56.576128


### Same name, different country, large deviation

In [68]:
curated_buses.query("name == 'Saida'")

Unnamed: 0,name,country,x,y
617,Saida,LB,35.400696,33.587167
833,Saida,DZ,0.146942,34.908458


In [69]:
curated_buses.query("name == 'Titan'")

Unnamed: 0,name,country,x,y
1986,Titan,AL,19.786377,41.619549
2825,Titan,UA,33.767853,46.195993
5825,Titan,RU,34.026031,67.451763


## (TODO) Add new substations

In [70]:
# on_subst

# extract country if it matches regex
# otherwise, np.NAN

## Remove '(\<Country Code\>) ' from tyndp substation name strings, add new column instead
Otherwise, this could negatively impact the Levenshtein distance.

In [71]:
subst_regex   = r'(?P<place>.+)\s?[\[(](?P<country>\w{2})[)\]]' # Form: 'Glorenza (IT)'
subst_pattern = re.compile(subst_regex)

# TODO: does it make sense to "throw away" information here?
# use this if other pattern does not match to remove comments in parentheses
# e.g. 'Molai (through Sklavouna Terminal)'
alt_regex   = r'(?P<place>.+)\s?[\[(].*[)\]]'
alt_pattern = re.compile(alt_regex)

fr_names     = []
fr_countries = []
to_names     = []
to_countries = []

for index, row in lines.iterrows():    
    fr = row['substation_1']
    to = row['substation_2']
    
    # default values if regex does not match
    fr_name = fr
    to_name = to    
    fr_country = np.NAN
    to_country = np.NAN
    
    fr_match = subst_pattern.match(fr)
    to_match = subst_pattern.match(to)
    
    if fr_match:
        fr_name    = fr_match.group('place').strip()
        fr_country = fr_match.group('country').strip()
    else:
        fr_alt_match = alt_pattern.match(fr)
        if fr_alt_match:
            fr_name = fr_alt_match.group('place')
        
    if to_match:
        to_name    = to_match.group('place').strip()
        to_country = to_match.group('country').strip()
    else:        
        to_alt_match = alt_pattern.match(to)
        if to_alt_match:
            to_name = to_alt_match.group('place')
    
    fr_names.append(fr_name)
    fr_countries.append(fr_country)
    to_names.append(to_name)
    to_countries.append(to_country)

In [72]:
lines['substation_1'] = fr_names
lines['substation_2'] = to_names
lines['country_1'] = fr_countries
lines['country_2'] = to_countries
lines.head()

Unnamed: 0,investment_id,commissioning_year,status,asset_type,substation_1,substation_2,ac_dc,specified_length_km,country_1,country_2
1,4,2022,3,line,V.Minho,Feira,AC,131.0,,
3,18,2022,3,line,Beariz,Fontefria,AC,30.0,ES,ES
4,496,2022,3,line,Fontefria,Vila Nova de Famalicão,AC,140.21,ES,PT
9,60,2022,4,line,Avelin/Mastaing,Horta,AC,80.0,FR,BE
10,614,2023,4,line,Nauders,Glorenza,AC,26.0,AT,IT


## create mapping from all unique tyndp substation names to substation names from 'buses'

In [73]:
tyndp_subs   = set(lines['substation_1']).union(set(lines['substation_2']))
tyndp_to_bus = {}

for tyndp in tyndp_subs:
    buses_subs = curated_buses.name.values
    
    closest = min([(bus, Levenshtein.distance(bus.lower(), tyndp.lower())) for bus in buses_subs], key=lambda t: t[1])[0]
    print()
    
    tyndp_to_bus[tyndp] = closest

























































































































































































In [74]:
# a, b = 'Turleenan', 'Guillena'
# a, b = 'Pyhanselka', 'Pyhänselkä'
# a, b = 'Tuomela B', 'Tudela'
# a, b =  'Heviz (HU) \\ Zerjavinec', 'Žerjavinec'
# Levenshtein.distance(a.lower(), b.lower())

In [75]:
# tyndp_to_bus

### Helper functions: Out of all possible pairs of locations from two lists, get the pair whose distance is closest to the specified (line) length
Deals with problem of multiple places in same country sharing a name.

In [76]:
# TODO: keep name!
def extract_coords(rows):
    coordinates = []
    for _, row in rows.iterrows():
        coordinates.append((row['x'], row['y']))
    return coordinates

In [77]:
def match_pair_with_length(s1_rows, s2_rows, length):
    s1_coords = extract_coords(s1_rows)
    s2_coords = extract_coords(s2_rows)
    
    combinations  = list(itertools.product(s1_coords, s2_coords))
    with_distance = [(a, b, geopy.distance.distance(a,b).km) for (a,b) in combinations]
    
    best_match = min(with_distance, key=lambda t: abs(length - t[2]))
    return best_match

# Match start- and endpoints of lines to substations from buses.csv

In [78]:
fr_to_tuples  = {}
error_rows = {}

for index, row in lines.iterrows():
    # TODO: should we keep the original name here?
    fr = row['substation_1']
    to = row['substation_2']
    
    fr_country = row['country_1']
    to_country = row['country_2']
            
    s1 = tyndp_to_bus[fr]
    s2 = tyndp_to_bus[to]
    
    # Extract respective rows in buses to determine coordinates
    buses_s1 = curated_buses.loc[curated_buses.name == s1]
    buses_s2 = curated_buses.loc[curated_buses.name == s2]
    
    # If we were able to extract country from name, restrict chosen rows to this country.
    if not pd.isna(fr_country):
        buses_s1 = buses_s1.loc[buses_s1['country'] == fr_country]
    if not pd.isna(to_country):
        buses_s2 = buses_s2.loc[buses_s2['country'] == to_country]
    
    if buses_s1.empty or buses_s2.empty:
        error_rows[index] = row
        continue
    
    # Choose pair which matches length best
    length = row['specified_length_km']
    (x1, y1), (x2, y2), coord_dist = match_pair_with_length(buses_s1, buses_s2, length)
        
    tpl = (s1, x1, y1, s2, x2, y2, coord_dist)
    
    # TODO: how to choose an appropriate tolerance?
    if math.isclose(coord_dist, length, rel_tol=0.45):
        fr_to_tuples[index] = tpl
    else:
        error_rows[index] = row

In [79]:
coordinates = pd.DataFrame(index=fr_to_tuples.keys(), data=fr_to_tuples.values(), columns=['s1', 'x1', 'y1', 's2', 'x2', 'y2', 'coord_dist'])

result = lines.copy()
result = result.join(coordinates)

percentage = coordinates.index.size / lines.index.size
print(f'{percentage * 100}% of lines are probably correct.')

# print('Lines where we probably found the correct coordinates:')
# result.loc[~result.s1.isna()]

53.333333333333336% of lines are probably correct.


In [80]:
error_lines = result.loc[result.s1.isna()]
error_subst = set(error_lines['substation_1']).union(error_lines['substation_2'])

# print('')
# {(k,tyndp_to_bus[k]) for k in error_subst}

# Determine coordinates using geopy

In [81]:
def match_pair_with_length_geopy(s1_locations, s2_locations, length):
    s1_first_name = s1_locations[0][0]
    s2_first_name = s2_locations[0][0]

    # Only take locations which at least include name of the first location in list (assumption: best name-based match).
    s1_locations = [l for l in s1_locations if s1_first_name in l[0]]
    s2_locations = [l for l in s2_locations if s2_first_name in l[0]]

    return match_coord_pairs_with_length(s1_locations, s2_locations, length)

In [82]:
def lat_lon(loc):
    return (loc.latitude, loc.longitude)

def match_coord_pairs_with_length(s1_coords, s2_coords, length):
    combinations  = list(itertools.product(s1_coords, s2_coords))
    with_distance = [(a, b, geopy.distance.distance(lat_lon(a),lat_lon(b)).km) for (a,b) in combinations]
    
    best_match = min(with_distance, key=lambda t: abs(length - t[2]))
    return best_match

In [83]:
# locator = Nominatim(user_agent='esm_group')
# geocode = RateLimiter(locator.geocode, min_delay_seconds=0.01)
locator = AlgoliaPlaces(user_agent='esm_group')
geocode = locator.geocode

In [84]:
fr_to_tuples_geopy = {}
error_tuples_geopy = {}

for index, row in error_lines.iterrows():
    fr   = row['substation_1']
    to   = row['substation_2']
    dist = row['specified_length_km']

    fr_country = row['country_1']
    to_country = row['country_2']

    # TODO: is it possible to get several matching locations?
    fr_locs = geocode(fr, exactly_one=False) if pd.isna(fr_country) else geocode(fr, exactly_one=False, countries=[fr_country])
    to_locs = geocode(to, exactly_one=False) if pd.isna(to_country) else geocode(to, exactly_one=False, countries=[to_country])
    
    if fr_locs is None or to_locs is None:
        continue
        
    (s1, (x1, y1)), (s2, (x2, y2)), coord_dist = match_pair_with_length_geopy(fr_locs, to_locs, dist)
    tpl = (s1, x1, y1, s2, x2, y2, coord_dist)

    if not math.isclose(coord_dist, dist, rel_tol=0.45):
        error_tuples_geopy[index] = tpl
    else:
        fr_to_tuples_geopy[index] = tpl

In [85]:
coordinates_geopy = pd.DataFrame(index=fr_to_tuples_geopy.keys(), data=fr_to_tuples_geopy.values(), columns=['s1', 'x1', 'y1', 's2', 'x2', 'y2', 'coord_dist'])

coordinates = coordinates.append(coordinates_geopy)

result = lines.copy()
result = result.join(coordinates)

percentage = coordinates.index.size / lines.index.size
print(f'{percentage * 100}% of lines are probably correct.')

print('Lines where we probably found the correct coordinates:')
result.loc[~result.s1.isna()]

77.14285714285715% of lines are probably correct.
Lines where we probably found the correct coordinates:


Unnamed: 0,investment_id,commissioning_year,status,asset_type,substation_1,substation_2,ac_dc,specified_length_km,country_1,country_2,s1,x1,y1,s2,x2,y2,coord_dist
1,4,2022,3,line,V.Minho,Feira,AC,131.00,,,V. Minho,-7.761841,41.582580,Feira,-8.368835,40.953974,96.475112
3,18,2022,3,line,Beariz,Fontefria,AC,30.00,ES,ES,Beariz,-8.242493,42.375793,Fontefria,-8.393555,42.174600,27.754446
4,496,2022,3,line,Fontefria,Vila Nova de Famalicão,AC,140.21,ES,PT,Fontefría,42.476200,-7.789700,Vila Nova de Famalicão,41.407900,-8.519800,133.210461
9,60,2022,4,line,Avelin/Mastaing,Horta,AC,80.00,FR,BE,Mastaing,3.353577,50.316531,Horta,3.574677,51.015483,81.423032
10,614,2023,4,line,Nauders,Glorenza,AC,26.00,AT,IT,Nauders,46.891700,10.502600,Glurns - Glorenza,46.671400,10.553900,24.801352
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
283,1686,2024,4,line,Wahle,Mecklar,AC,210.00,DE,DE,Wahle,10.453491,52.258071,Mecklar,9.621277,50.859710,178.809584
284,1688,2028,3,line,Nuojuankangas,Huutokoski,AC,290.00,,,Isokangas,25.694275,65.279688,Huutokoski,27.766571,62.262171,377.925106
294,1716,2020,4,line,Chania I,Molai,AC,179.00,,,Chania International Airport,35.539800,24.140200,Μολάοι,36.804300,22.853300,181.908109
303,645,2027,3,line,Laino,Altomonte,AC,50.00,IT,IT,Laino,15.948029,40.078071,Altomonte,16.020813,39.778991,33.011910


In [86]:
error_coordinates_geopy = pd.DataFrame(index=error_tuples_geopy.keys(), data=error_tuples_geopy.values(), columns=['s1', 'x1', 'y1', 's2', 'x2', 'y2', 'coord_dist'])


In [87]:
error_coordinates_geopy.join(lines)

Unnamed: 0,s1,x1,y1,s2,x2,y2,coord_dist,investment_id,commissioning_year,status,asset_type,substation_1,substation_2,ac_dc,specified_length_km,country_1,country_2
24,Gabčíkovo,47.8951,17.5781,Gönyű,47.7331,17.8279,25.969975,1500,2020,4,line,Gabcikovo,Gonyu,AC,110.0,SK,HU
28,Riga,56.946,24.1059,Riga,56.946,24.1059,0.0,1062,2020,4,line,Riga CHP2,Riga HPP,AC,15.0,LV,LV
45,Keminmaa,65.803,24.5209,Messure Crescent,43.491,-79.8845,6286.587625,396,2025,3,line,Keminmaa,Messaure,AC,200.0,,
69,Via Pizzocchera Salgareda,45.7093,12.5183,Beričevo,46.0883,14.6085,167.58379,616,2028,3,line,Salgareda,Bericevo,DC,0.0,IT,SI
72,Vilnius kalea/Calle Vilnius,42.845,-2.65328,Néris-les-Bains,46.2879,2.66137,569.512881,382,2025,3,line,Vilnius,Neris,AC,80.0,,
76,Bălți,47.7631,27.9293,Tartupis,54.5966,23.838,811.93419,1012,2023,3,line,Balti,Tartu,AC,168.0,,
77,Virú,-8.41429,-78.7522,Tsirguliina,57.864,26.1947,11677.913915,1013,2025,3,line,Viru,Tsirguliina,AC,243.0,,
86,Bitėnai,55.1518,25.3803,Nong Khae,14.3406,100.867,7788.976319,1634,2025,3,line,Bitenai,KHAE,AC,234.0,,
103,Güterweg Seyring,48.2387,13.4455,Zayante,37.0919,-122.044,9578.538532,886,2022,4,line,Seyring,Zaya,AC,60.0,,
201,Liefkenshoek,51.1365,4.7596,Mercatorstraße,50.2902,6.0877,132.895031,604,2025,3,line,Liefkenshoek,Mercator,AC,19.0,BE,BE


In [88]:
# TODO: plot error tuples and matches separately.
# TODO: write out data to csv

In [89]:
import dateutil.parser

remove_whitespace = lambda s: s.replace(' ', '')
to_year           = lambda d: dateutil.parser.parse(remove_whitespace(d)).year
result['commissioning_year'].apply(to_year)

1      2022
3      2022
4      2022
9      2022
10     2023
       ... 
294    2020
295    2022
303    2027
304    2030
307    2022
Name: commissioning_year, Length: 105, dtype: int64

In [90]:
# Some problematic formats from excel files
format_1 = r'\d{4}-(\d{4})' # e.g. '2022-2023'
format_2 = r'\d{1},\d{3}' # e.g. '2,022'