In [2]:
from geopy.geocoders import Nominatim
from geopy import distance
import pandas as pd
import numpy as np
import math
import re
from Levenshtein import *

In [3]:
excel = 'transmission.xlsx'
sheet = 'Trans.Investments'

df = pd.read_excel(excel, sheet_name=sheet)

# TODO: might not work in later versions, use sanity check.
df.columns = df.iloc[0]
df         = df.drop(index=df.index[0])

# TODO: also contains new substations!

wanted_columns = ['Investment number',
                  'This investment belongs to project number…',
                  'Commissioning Year',
                  'Status ID\n1 : Under Consideration,\n2 : In Planning but not permitting,\n3 : In permitting,\n4 : Under Construction',
                  'Type of Element',
                  'Substation From',
                  'Substation To',
                  'Technology',
                  'Total route length (km)']
df[wanted_columns]

Unnamed: 0,Investment number,This investment belongs to project number…,Commissioning Year,"Status ID\n1 : Under Consideration,\n2 : In Planning but not permitting,\n3 : In permitting,\n4 : Under Construction",Type of Element,Substation From,Substation To,Technology,Total route length (km)
1,2,1,2024,2,ACTransmissionLine,Pedralva (PT),Sobrado (PT),AC,67
2,4,1,2022,3,ACTransmissionLine,V.Minho (by Ribeira de Pena),Feira (by Ribeira de Pena),AC,131
3,474,1,2021,3,OnshoreSubstation,Ribeira de Pena (PT),-,AC,0
4,18,4,2022,3,ACTransmissionLine,Beariz (ES),Fontefria (ES),AC,30
5,496,4,2022,3,ACTransmissionLine,Fontefria (ES),Vila Nova de Famalicão (PT) (By Ponte de Lima),AC,140.21
...,...,...,...,...,...,...,...,...,...
317,1747,1077,2036,1,DCTransmissionLine,Thessaloniki,Dubrovo,DC,110
318,1748,1077,2036,1,DCTransmissionLine,Dubrovo,Ch. Mogila,DC,170
319,1745,1081,2036,1,OffshoreDCTransmissionCable,Tobruk,Arachtos,DC,1070
320,1749,1081,2036,1,DCTransmissionLine,Arachtos,Elbasan,DC,180


In [4]:
wanted   = df[wanted_columns]
ac_lines = wanted.loc[wanted['Type of Element'] == 'ACTransmissionLine']
ac_lines = ac_lines.reindex(columns=ac_lines.columns.tolist() + ['x1', 'y1', 'x2', 'y2', 'coord_distance'])
# ac_lines = ac_lines.loc[]

In [51]:
ac_lines.loc[ac_lines['Status ID%'] >= 3]

KeyError: 'Status ID%'

# Use bus names from buses.csv (v0.1.0)

In [88]:
buses_file = 'buses_v0.1.0.csv'

# see base_network.py in PyPSA-Eur repository
buses = (pd.read_csv(buses_file, quotechar="'",
                     true_values='t', false_values='f',
                     dtype=dict(bus_id="str"))
        .set_index("bus_id")
        .drop(['station_id'], axis=1)
        .rename(columns=dict(voltage='v_nom')))

In [89]:
no_tags = buses[buses['tags'].isna()].index.size
print(f'{no_tags} buses have no tags.')

yes_tags = buses[~buses['tags'].isna()].index.size
print(f'{yes_tags} buses have tags.')

1238 buses have no tags.
6773 buses have tags.


In [90]:
buses = buses.loc[~buses['tags'].isna()]
buses = buses.loc[buses.symbol == 'Substation']

## Extract 'name_eng' and 'country' from tags in  buses

In [146]:
split_regex = r'("\w+"=>"[^"]*"),'

tag_regex   = r'"(?P<key>\w+)"=>"(?P<value>[^"]*)"'
tag_pattern = re.compile(tag_regex)

rows = []

for index, row in buses.iterrows():
    name    = ''
    country = ''
    x = row['x']
    y = row['y']
    
    tags_string = row['tags']
    
    tags = re.split(split_regex, tags_string)
    
    # Remove whitespaces at front and end, remove None values
    tags = [s.strip() for s in tags]
    tags = list(filter(None, tags))
    
    for tag in tags:
        m = tag_pattern.match(tag)
            
        if m is None:
            print(tag)
            
        # see group names in tag_regex
        key   = m.group('key')
        value = m.group('value')
        
        if key == 'name_eng':
            name = value.strip()
        elif key == 'country':
            country = value.strip()
    
    if name == 'unknown' or not name:
        continue
        
    rows.append((name, country, x, y))

In [147]:
curated_buses = pd.DataFrame.from_records(rows, columns=['name', 'country', 'x', 'y'])

### Remove duplicate rows

In [148]:
curated_buses = curated_buses.loc[~curated_buses.duplicated()]

### There are substation names which appear more than once with different coordinates

In [150]:
curated_buses.loc[curated_buses.name.duplicated()]

Unnamed: 0,name,country,x,y
2,Jeddah,SA,39.711456,22.038549
9,Al Madina,SA,39.508209,24.718143
281,I. Baroud,EG,30.890808,30.855079
321,Local,LY,24.134216,31.960318
580,Sahab,JO,36.478729,32.261588
...,...,...,...,...
5474,Luch,RU,36.441650,55.939202
5476,Yartsevo,RU,37.613068,56.526169
5784,Severnaya,RU,57.594452,65.232554
5825,Titan,RU,34.026031,67.451763


In [152]:
curated_buses.loc[curated_buses['name'] == 'Yartsevo']

Unnamed: 0,name,country,x,y
5462,Yartsevo,RU,37.657013,56.398705
5476,Yartsevo,RU,37.613068,56.526169


## create mapping from all unique tyndp substation names to substation names from 'buses'

In [94]:
tyndp_subs   = set(ac_lines['Substation From']).union(set(ac_lines['Substation To']))
tyndp_to_bus = {}

for tyndp in tyndp_subs:
    buses_subs = curated_buses.name.values
    
    closest = min([(bus, distance(bus, tyndp)) for bus in buses_subs], key=lambda t: t[1])[0]
    
    tyndp_to_bus[tyndp] = closest

In [95]:
tyndp_to_bus

{'SS Sándorfalva': 'Sándorfalva',
 'Altomonte (IT)': 'Altomonte',
 'Creys (FR)': 'Creys',
 'Djerdap': 'Djerdap 1',
 'Oldstreet 400kV': 'Oldstreet',
 'Ekhyddan (SE)': 'Ekudden',
 'Zell-Ziller (AT)': 'Zell/Ziller',
 'Visegrad (BA)': 'Visegrad',
 'Konjsko (HR)': 'Konjsko',
 'Conneforde (DE)': 'Conneforde',
 'Sajoivanka (HU)': 'Sajóivánka',
 'Massenhoven': 'Massenhoven',
 'Kilingi-Nomme (EE)': 'Kilingi-Nomme',
 'Darbenai': 'Darna',
 'Chamoson': 'Chamoson',
 'Tsirguliina ': 'Tsirguliina',
 'Bajina Basta (RS)': 'Bajina Basta',
 'Center (TBD)': 'Centenario',
 'Vilnius': 'Vilnius',
 'Beariz (ES)': 'Beariz',
 'SS Subotica 3': 'Subotica',
 'SS Srbobran': 'Srbobran',
 'Wahle (DE)': 'Wahle',
 'Cernavoda (RO)': 'Craiova Nord',
 'Baczyna': 'Baczyna',
 'Montalto di Castro (IT) Terna station 400 kV AC': 'Montezic poste et centrale',
 'Razbojiste': 'La Boisse',
 'Resita (RO)': 'Relizane',
 'Wullenstetten (DE)': 'Eichstetten',
 'Ladce (SK)': 'Lac Noir',
 'Klixbüll (DE)': 'Tripoli SE',
 'Woodland (IE)': 

In [64]:
for index, row in ac_lines.iteritems():
    
    
    
    break

Investment number


In [9]:
fr = ac_lines.iloc[0]['Substation From']
to = ac_lines.iloc[0]['Substation To']

# Determine coordinates using geopy

In [94]:
locator = Nominatim(user_agent='esm_group')
geocode = RateLimiter(locator.geocode, min_delay_seconds=0.05)

In [95]:
x1 = []
y1 = []
x2 = []
y2 = []

error_rows = []

for index, row in ac_lines.iterrows():
    fr   = row['Substation From']
    to   = row['Substation To']
    dist = row['Total route length (km)']

    fr_loc = geocode(fr)
    to_loc = geocode(to)

    if fr_loc is None or to_loc is None:
        error_rows.append([row.values])
        continue
        
    fr_coords  = fr_loc.latitude, fr_loc.longitude
    to_coords  = to_loc.latitude, to_loc.longitude
    coord_dist = distance.distance(fr_coords, to_coords).km

    if math.isclose(coord_dist, dist, rel_tol=0.25):
        x1.append(fr_coords[0])
        y1.append(fr_coords[1])
        
        x2.append(to_coords[0])
        y2.append(to_coords[1])
    else:
        error_rows.append([row.values])
        # x1.append(np.NAN)
        # y1.append(np.NAN)
        
        # x2.append(np.NAN)
        # y2.append(np.NAN)

In [99]:
len(x1), len(error_rows)

(55, 122)

In [92]:
error_rows

[array([4, 1, '2022', 3, 'ACTransmissionLine',
        'V.Minho (by Ribeira de Pena)', 'Feira (by Ribeira de Pena)', 'AC',
        131, nan, nan, nan, nan, nan], dtype=object),
 array([496, 4, '2022', 3, 'ACTransmissionLine', 'Fontefria (ES)',
        'Vila Nova de Famalicão (PT) (By Ponte de Lima)', 'AC', 140.21,
        nan, nan, nan, nan, nan], dtype=object),
 array([60, 23, '2022', 4, 'ACTransmissionLine', 'Avelin/Mastaing (FR)',
        'Horta (BE)', 'AC', 80, nan, nan, nan, nan, nan], dtype=object)]