In [29]:
from geopy import *
import pandas as pd
import numpy as np

import tyndp_to_pypsa

In [30]:
#### 2020 ####
# TODO: voltages given within paragraph in column 'Description of the investment'

# Spreadsheet column header -> internal column name for the 2020 edition.
column_semantics_2020 = dict([
    ('Investment number', 'investment_id'),
    ('Commissioning Year', 'commissioning_year'),
    (
        'Status ID\n1 : Under Consideration,\n2 : In Planning but not permitting,\n3 : In permitting,\n4 : Under Construction',
        'status',
    ),
    ('Type of Element', 'asset_type'),
    ('Substation From', 'substation_1'),
    ('Substation To', 'substation_2'),
    ('Technology', 'ac_dc'),
    ('Total route length (km)', 'specified_length_km'),
])

# Identity map over the numeric status IDs named in the column header above:
# 1 = under consideration, 2 = planning (not permitting),
# 3 = in permitting,       4 = under construction
status_map_2020 = {status_id: status_id for status_id in (1, 2, 3, 4)}

# No need to separate AC and DC as there is a separate column
# in the TYNDP data.
asset_type_map_2020 = {
    **dict.fromkeys(('ACTransmissionLine', 'DCTransmissionLine'), 'line'),
    **dict.fromkeys(('OnshoreSubstation', 'OffshoreSubstation'), 'substation'),
    # 'cable' will be mapped to 'line' after adding column 'underground' of binary values.
    **dict.fromkeys(('OffshoreDCTransmissionCable', 'OffshoreACTransmissionCable'), 'cable'),
}

# TODO: remove this dict
params_2020 = dict(
    excel='2020/transmission.xlsx',
    sheet='Trans.Investments',
    header_row=1,
    status_map=status_map_2020,
    asset_type_map=asset_type_map_2020,
    column_semantics=column_semantics_2020,
)

In [31]:
#### 2018 ####

# Spreadsheet column header -> internal column name for the 2018 edition.
column_semantics_2018 = dict([
    ('Investment ID', 'investment_id'),
    ('ExpectedCommissioningYear', 'commissioning_year'),
    ('Status', 'status'),
    ('ElementsType', 'asset_type'),
    ('From', 'substation_1'),
    ('To', 'substation_2'),
    ('TechnologyType', 'ac_dc'),
    ('VoltageLevel (kV)', 'voltage'),
    ('TotalRouteLength (km)', 'specified_length_km'),
])

# Status labels numbered 1..4 in the order listed.
status_map_2018 = {
    label: code
    for code, label in enumerate(
        (
            'under consideration',
            'planned but not yet permitting',
            'permitting',
            'under construction',
        ),
        start=1,
    )
}

# Both cable kinds collapse to the single asset type 'cable'.
asset_type_map_2018 = {
    'Overhead Line': 'line',
    'Substation': 'substation',
    **dict.fromkeys(('Subsea Cable', 'Underground Cable'), 'cable'),
}

params_2018 = dict(
    excel=r'2018/TYNDP_2018_Project_List.xlsx',
    sheet='Sheet1',
    header_row=0,
    status_map=status_map_2018,
    asset_type_map=asset_type_map_2018,
    column_semantics=column_semantics_2018,
)

In [32]:
#### 2016 ####

# Spreadsheet column header -> internal column name for the 2016 edition.
column_semantics_2016 = dict([
    ('TYNDP 2016 Investment Index', 'investment_id'),
    ('TYNDP 2016 expected commissioning date', 'commissioning_year'),
    ('TYNDP 2016 status name', 'status'),
    ('Element type', 'asset_type'),
    ('From Substation name', 'substation_1'),
    ('To Substation name', 'substation_2'),
    ('AC or DC', 'ac_dc'),
    ('Voltage Level', 'voltage'),
    ('Line length [km]', 'specified_length_km'),
])

# TODO: commissioned, planning, design
# TODO: rather specify rows we want to keep? E.g. via list
# TODO: map values to strings (semantics!), then choose only those that are 'in permitting' or 'under construction'
# Status labels numbered 1..4 in the order listed.
status_map_2016 = {
    label: code
    for code, label in enumerate(
        (
            'under consideration',
            'Design & Permitting',
            'permitting',
            'under construction',
        ),
        start=1,
    )
}

# Both cable kinds collapse to the single asset type 'cable'.
asset_type_map_2016 = {
    'Overhead Line': 'line',
    'Substation': 'substation',
    **dict.fromkeys(('Subsea Cable', 'Underground Cable'), 'cable'),
}

params_2016 = dict(
    excel=r'2016/TYNDP2016_Projects data.xlsx',
    sheet='Investments',
    header_row=0,
    status_map=status_map_2016,
    asset_type_map=asset_type_map_2016,
    column_semantics=column_semantics_2016,
)

In [33]:
#### 2014 ####
# TODO: Problems
# - no length column: either not specified or given within paragraph of text
# - no asset type column. But if only substation 1 is specified it's a substation. If both are, it's a line.


In [34]:
#### 2012 ####
# TODO: Same problems as with 2014

In [35]:
#### 2010 ####
# TODO: Same problems as with 2014

In [36]:
# Select the TYNDP edition to process. Every argument below is read from this
# one dict, so switching editions is a single-line change and cannot mix the
# mappings of one edition with the spreadsheet of another.
params = params_2016
# NOTE(review): params['header_row'] is not passed on here — confirm how
# prepare_tyndp_data determines the header row for editions where it is not 0.
wanted = tyndp_to_pypsa.prepare_tyndp_data(
    params['excel'],
    params['sheet'],
    params['status_map'],
    params['asset_type_map'],
    params['column_semantics'],
)
wanted.head()

Unnamed: 0,investment_id,commissioning_year,status,asset_type,substation_1,substation_2,ac_dc,voltage,specified_length_km,underground
1,474.0,31/ 12/ 2020,3,substation,Ribeira de Pena (PT),,,400.0,,False
2,476.0,31/ 12/ 2020,3,line,V. P. Aguiar (by Carrapatelo),Estarreja (by Carrapatelo),AC,220.0,121.7,False
3,941.0,31/ 12/ 2022,3,substation,Fridão,,,400.0,,False
5,2.0,31/ 12/ 2022,3,line,Pedralva (PT),Sobrado (PT),AC,400.0,66.5,False
6,3.0,31/ 12/ 2016,4,line,Pedralva (PT),Ponte de Lima (PT),AC,400.0,37.6,False


In [37]:
# Split the prepared investments into transmission lines and newly built substations.
new_subst = wanted.loc[wanted['asset_type'] == 'substation']

# A substation row carries its name in 'substation_1'. The membership test must be
# made against these names: testing against the DataFrame itself compares with its
# column labels and therefore never filters anything out.
# Missing names are dropped so that a line with an unspecified endpoint is not
# mistaken for one that ends at a new substation.
new_subst_names = new_subst['substation_1'].dropna()

is_line = wanted['asset_type'] == 'line'
touches_new_subst = (
    wanted['substation_1'].isin(new_subst_names)
    | wanted['substation_2'].isin(new_subst_names)
)

# Keep only lines whose two endpoints are both not newly built substations.
# .copy() makes `lines` an independent frame so the assignment below does not
# write to a slice of `wanted`.
lines = wanted.loc[is_line & ~touches_new_subst].copy()

if params == params_2020:
    # TODO: temporary hack.
    lines['voltage'] = 380

# Use bus names from buses.csv (v0.1.0)
See https://github.com/PyPSA/pypsa-eur/blob/v0.1.0rc/data/entsoegridkit/buses.csv. Data is from 2017 (newer gridkit extracts do not contain 'tags' with substation names).

In [38]:
# Load the reference bus table via the project helper; the preview shows the
# columns name, country, x and y.
curated_buses = tyndp_to_pypsa.extract_name_country()
curated_buses.head()

Unnamed: 0,name,country,x,y
0,Jeddah,SA,39.716949,21.985075
1,Shoaibah,SA,39.726562,21.66381
2,Jeddah,SA,39.711456,22.038549
3,Makkah,SA,40.303345,22.093275
4,Al Mukarramah,SA,40.403595,21.945593


## Some substations share the same name but have different coordinates
- large deviation between coordinates => the substations are most likely in different countries
    - BUT: different places within the same country can also share a name
- small deviation between coordinates => probably the same substation listed twice (error in the grid extract?)

In [39]:
# Country codes used to restrict the duplicate-name check.
# TODO: added 'NI' although Northern Ireland probably appears in PyPSA as 'GB'. Find a better solution.
pypsa_countries = [
    'AL', 'AT', 'BA', 'BE', 'BG', 'CH', 'CZ', 'DE', 'DK', 'EE',
    'ES', 'FI', 'FR', 'GB', 'GR', 'HR', 'HU', 'IE', 'IT', 'LT',
    'LU', 'LV', 'ME', 'MK', 'NI', 'NL', 'NO', 'PL', 'PT', 'RO',
    'RS', 'SE', 'SI', 'SK',
]

### List of all duplicates

In [40]:
# Rows whose name has already appeared further up in `curated_buses` (the first
# occurrence of each name is not flagged), restricted to the PyPSA countries.
name_seen_before = curated_buses['name'].duplicated()
in_pypsa_country = curated_buses['country'].isin(pypsa_countries)
duplicated = curated_buses.loc[name_seen_before & in_pypsa_country]

# Debug helper: list the coordinates and country of every bus per duplicated name.
# for name in duplicated.name.unique():
#     print(name)
#     for index, row in curated_buses.query('name == @name').iterrows():
#         print(f"({row['x']}, {row['y']}), {row['country']}")
#     print('----')

### Same name and country, large deviations

In [41]:
# Example: several buses named 'Yuzhnaya', all in RU, at widely separated coordinates.
curated_buses.query("name == 'Yuzhnaya'")

Unnamed: 0,name,country,x,y
2720,Yuzhnaya,RU,44.817352,48.155093
3851,Yuzhnaya,RU,50.674438,52.002638
3905,Yuzhnaya,RU,36.268616,51.642737
3927,Yuzhnaya,RU,38.685608,51.843414
5378,Yuzhnaya,RU,59.824677,56.576128


### Same name, different country, large deviation

In [42]:
# Example: 'Saida' appears once in LB and once in DZ.
curated_buses.query("name == 'Saida'")

Unnamed: 0,name,country,x,y
617,Saida,LB,35.400696,33.587167
833,Saida,DZ,0.146942,34.908458


In [43]:
# Example: 'Titan' appears in AL, UA and RU.
curated_buses.query("name == 'Titan'")

Unnamed: 0,name,country,x,y
1986,Titan,AL,19.786377,41.619549
2825,Titan,UA,33.767853,46.195993
5825,Titan,RU,34.026031,67.451763


## (TODO) Add new substations

In [44]:
# on_subst

# extract country if it matches regex
# otherwise, np.NAN

## Remove '(\<Country Code\>) ' from TYNDP substation names and store the code in new columns instead
Otherwise, the country code would distort the Levenshtein distance used to compare names.

In [45]:
# NOTE: `lines` is overwritten with the helper's result, so re-running this cell
# passes already-processed names back into the helper.
# The preview shows the additional columns country_1 and country_2.
lines = tyndp_to_pypsa.prepare_substation_names(lines)
lines.head()

Unnamed: 0,investment_id,commissioning_year,status,asset_type,substation_1,substation_2,ac_dc,voltage,specified_length_km,underground,country_1,country_2
2,476.0,31/ 12/ 2020,3,line,V. P. Aguiar,Estarreja,AC,220.0,121.7,False,,
5,2.0,31/ 12/ 2022,3,line,Pedralva,Sobrado,AC,400.0,66.5,False,PT,PT
6,3.0,31/ 12/ 2016,4,line,Pedralva,Ponte de Lima,AC,400.0,37.6,False,PT,PT
7,4.0,31/ 12/ 2021,3,line,V.Minho,Feira,AC,400.0,132.0,False,,
9,478.0,31/ 12/ 2019,3,line,Penela,Paraimo / Batalha,AC,400.0,15.0,False,PT,PT


## Create a mapping from each unique TYNDP substation name to a substation name from 'buses'

In [46]:
# Name lookup built from the line endpoints and the curated buses; it is passed
# to match_tyndp_with_buses in the matching step.
tyndp_to_bus = tyndp_to_pypsa.tyndp_to_substation(lines, curated_buses)

In [47]:
# a, b = 'Turleenan', 'Guillena'
# a, b = 'Pyhanselka', 'Pyhänselkä'
# a, b = 'Tuomela B', 'Tudela'
# a, b =  'Heviz (HU) \\ Zerjavinec', 'Žerjavinec'
# Levenshtein.distance(a.lower(), b.lower())

# Match start- and endpoints of lines to substations from buses.csv

In [48]:
# Look up both endpoints of the lines among the curated buses. The preview shows
# one row per line with the endpoints (s1/x1/y1, s2/x2/y2) and coord_dist_km.
# NOTE(review): the row labels appear to be those of `lines` (later cells compare
# results.index with lines.index) — confirm in the helper.
results = tyndp_to_pypsa.match_tyndp_with_buses(lines, tyndp_to_bus, curated_buses)
results.head()

Unnamed: 0,s1,x1,y1,s2,x2,y2,coord_dist_km,commissioning_year,status,ac_dc,voltage,underground
7,V. Minho,-7.761841,41.58258,Feira,-8.368835,40.953974,96.475112,31/ 12/ 2021,3,AC,400.0,False
21,Beariz,-8.242493,42.375793,Fontefria,-8.393555,42.1746,27.754446,1/ 1/ 2017,3,AC,400.0,False
29,Grande-Ile,6.04248,45.463983,Piossasco,7.322388,44.837369,157.590522,1/ 1/ 2019,4,DC,320.0,True
65,Bjæverskov,11.961365,55.445374,Bentwisch,12.273102,54.067448,153.907505,1/ 1/ 2018,4,AC,220.0,True
70,Audorf,9.759979,54.305307,Kassø,9.295807,55.074436,98.827047,31/ 12/ 2020,3,AC,400.0,False


In [49]:
# TODO: only join with result to compare original location and substation names with inferred
# result = lines.copy()
# result = result.join(coordinates)

# Share of lines for which the bus-name matching returned a row.
percentage = len(results.index) / len(lines.index)
print(f'{percentage * 100}% of lines are probably correct.')

# print('Lines where we probably found the correct coordinates:')
# result.loc[~result.s1.isna()]

51.470588235294116% of lines are probably correct.


# Determine remaining coordinates using geopy

In [50]:
# Lines whose row label is absent from `results`, i.e. those the bus-name
# matching did not return.
error_lines = lines.loc[~lines.index.isin(results.index)]

In [51]:
# Second attempt for the lines the bus-name matching did not return.
# NOTE(review): in the preview, x1/y1 look like (latitude, longitude), e.g.
# (41.56, -8.32), whereas the bus-based results look like (longitude, latitude),
# e.g. (-7.76, 41.58). Confirm the axis order in match_tyndp_with_geopy before
# both tables are combined.
geopy_results = tyndp_to_pypsa.match_tyndp_with_geopy(error_lines)
geopy_results.head()

Unnamed: 0,s1,x1,y1,s2,x2,y2,coord_dist_km,commissioning_year,status,ac_dc,voltage,underground
5,Pedralva,41.5599,-8.321,Sobrados,41.2407,-7.6181,68.642054,31/ 12/ 2022,3,AC,400.0,False
6,Pedralva,41.5599,-8.321,Ponte de Lima,41.7675,-8.5831,31.751471,31/ 12/ 2016,4,AC,400.0,False
12,Seia,40.4201,-7.70281,Penela,40.0294,-8.39024,72.837954,31/ 12/ 2020,3,AC,400.0,False
15,Fontefría,42.4762,-7.7897,Vila Nova de Famalicão,41.4079,-8.5198,133.210461,31/ 12/ 2018,3,AC,400.0,False
48,Villanova,43.7403,12.934,Obala Donja Lastva,42.446,18.6869,489.839413,31/ 12/ 2019,4,DC,500.0,True


In [52]:
# Combine the bus-matched lines with the geopy-based results.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is
# the supported replacement and likewise keeps the original row labels.
# NOTE: `results` is overwritten, so re-running this cell adds geopy_results again.
# NOTE(review): the previews suggest the two tables use opposite x/y axis order
# (lon/lat vs. lat/lon) — confirm before relying on the combined coordinates.
results = pd.concat([results, geopy_results])

# Share of lines that now have coordinates from either method.
percentage = results.index.size / lines.index.size
print(f'{percentage * 100}% of lines are probably correct.')

79.41176470588235% of lines are probably correct.


# Map commissioning dates to yearly values

In [53]:
# Replace the raw commissioning date strings (e.g. '31/ 12/ 2020') with the
# yearly values returned by the project helper.
results = results.assign(
    commissioning_year=tyndp_to_pypsa.commissioning_dates_to_year(results['commissioning_year'])
)

In [54]:
# Display the combined table; commissioning_year now holds yearly values.
results

Unnamed: 0,s1,x1,y1,s2,x2,y2,coord_dist_km,commissioning_year,status,ac_dc,voltage,underground
7,V. Minho,-7.761841,41.58258,Feira,-8.368835,40.953974,96.475112,2022,3,AC,400.0,False
21,Beariz,-8.242493,42.375793,Fontefria,-8.393555,42.1746,27.754446,2017,3,AC,400.0,False
29,Grande-Ile,6.04248,45.463983,Piossasco,7.322388,44.837369,157.590522,2019,4,DC,320.0,True
65,Bjæverskov,11.961365,55.445374,Bentwisch,12.273102,54.067448,153.907505,2018,4,AC,220.0,True
70,Audorf,9.759979,54.305307,Kassø,9.295807,55.074436,98.827047,2021,3,AC,400.0,False
85,Isar,12.292328,48.659222,St. Peter,13.08197,48.25577,97.731905,2020,3,AC,400.0,False
86,St. Peter,13.08197,48.25577,Tauern,12.671356,47.342545,109.032907,2021,3,AC,400.0,False
107,Endrup,8.849487,55.48897,Eemshaven,6.811523,53.346452,326.54087,2019,4,DC,320.0,True
113,Horta,3.574677,51.015483,Stevin,3.100891,51.285112,60.354006,2018,4,AC,380.0,False
137,Vierraden,14.252014,53.089076,Krajnik,14.441528,53.164887,22.506552,2019,4,AC,400.0,False
