In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
from dotenv import load_dotenv
import json
import os
from typing import List
import geopandas as gpd

load_dotenv()

True

## TODO
1. Load Aleph entities
2. Load MapStand data
3. Add missing geometries to the MapStand data and drop rows that are not used
4. Merge MapStand data with Aleph entities
5. Clean it up

In [5]:
# Data locations are read from the environment; each is None when the
# variable is not set. Both are later used as plain string prefixes.
PATH_ALEPH = os.getenv('PATH_ALEPHDATA')
PATH_RAW = os.getenv('PATH_RAW')

In [6]:
def parse_json(entities: List) -> pd.DataFrame:
    '''Flatten Aleph entity dicts into a DataFrame, one row per entity.

    Each entity is expected to be a dict with an ``id`` and a ``properties``
    mapping. Property values that are lists are collapsed into a single
    comma-separated string (items are converted with ``str``); any other
    value is kept as-is. The entity id is stored in an ``id`` column and
    overrides a property of the same name.

    The input entities are not modified.

    Args:
        entities: iterable of entity dicts, as decoded from Aleph JSON.

    Returns:
        DataFrame with one column per property name seen, plus ``id``.
    '''
    rows = []

    for entity in entities:
        # `properties` may be absent or null; treat that as "no properties"
        # instead of failing on None.
        properties = entity.get('properties') or {}

        # Build a fresh dict so the caller's entity is left untouched.
        row = {
            key: ','.join(map(str, value)) if isinstance(value, list) else value
            for key, value in properties.items()
        }
        row['id'] = entity.get('id')
        rows.append(row)

    return pd.DataFrame(rows)


def load_entities(path: str, entity: str) -> pd.DataFrame:
    '''Load one Aleph entity export (downloaded through alephclient).

    Reads ``{path}{entity}.json``, decodes one JSON document per line and
    passes the resulting list to ``parse_json``. ``path`` is used as a plain
    string prefix, so it must already end with a path separator.

    Args:
        path: directory prefix, including the trailing separator.
        entity: file name without the ``.json`` extension.

    Returns:
        The DataFrame produced by ``parse_json``.

    Raises:
        FileNotFoundError: if the file does not exist.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    '''
    # Decode as UTF-8 explicitly rather than relying on the platform default.
    with open(f'{path}{entity}.json', 'r', encoding='utf-8') as file:
        # Skip blank lines (e.g. an extra newline at the end of the file)
        # instead of failing on them.
        entities = [json.loads(line) for line in file if line.strip()]

    return parse_json(entities)

## Import Aleph entities

In [7]:
# One DataFrame per Aleph entity type.
companies = load_entities(PATH_ALEPH, 'companies')
# Assets without a description are dropped right after loading.
assets = load_entities(PATH_ALEPH, 'assets').dropna(subset='description')
ownerships = load_entities(PATH_ALEPH, 'ownerships')

## Import geometries 

We have one GeoJSON file with edited data on capacity and installation year, but it is not complete (I know...). So let's also load the newest installed and planned wind farm datasets from MapStand to fill in the missing geometries.


In [9]:
# Import geometries with edited data
# (the hand-edited MapStand GeoJSON, reprojected to CRS 4326).

ms = gpd.GeoDataFrame.from_file(PATH_RAW + 'mapstand_final.geojson', geometry='geometry')
ms = ms.to_crs(4326)

# The edited file is incomplete, so also load the newest "installed" and
# "planned" MapStand exports to recover the missing geometries.

msn = gpd.GeoDataFrame.from_file(PATH_RAW + 'mapstand_final_newest_installed.geojson')
msp = gpd.GeoDataFrame.from_file(PATH_RAW + 'mapstand_final_newest_planned.geojson')

# Stack installed + planned into one frame (row order: installed first).
# NOTE(review): both files are concatenated before any reprojection, so this
# assumes they share the same CRS — confirm.
msn = pd.concat([msn, msp])

# Names of the wind farms whose rows are taken from the newest exports.
lookup = ['HIRTSHALS HARBOUR', 
          'WP Q10 / ENECO LUCHTERDUINEN', 
          'UNITECH ZEFYROS (HYWIND DEMO/KARMOY) - METCENTRE',
          'AFLANDSHAGE']

# Keep only the rows named in `lookup`, then drop the LAST matching row by
# position ([:-1]).
# NOTE(review): the reason for dropping that row is not visible here
# (presumably an unwanted duplicate) and it depends on the row order of the
# concat above — confirm which row is meant to be excluded.
msn = msn[msn.name.isin(lookup)][:-1].copy()
msn = msn.to_crs(4326)

# Merge the missing with the right ones
# (append the recovered rows to the edited dataset).

ms = pd.concat([ms, msn])

# Create selection of relevant columns
# (`mps_uuid` is the key used to merge with the Aleph assets further down).

cols = ['mps_uuid', 'name', 'cost_in_million', 'ppas', 'year', 'capacity_mw', 'description', 'remarks',
        'mps_est_coast_distance_km',  'installation_year', 'mps_est_elevation_max_m', 'mps_est_elevation_min_m', 'geometry']

# Independent copy so later edits do not touch `ms`.
selection = ms[cols].copy()

In [None]:
# Outer-join the Aleph assets with the MapStand selection: an asset's
# `description` is matched against the MapStand `mps_uuid`. Rows present on
# only one side are kept, with the other side's columns left empty.
df = assets.merge(
    selection,
    how='outer',
    left_on='description',
    right_on='mps_uuid',
)

In [None]:
# Clean MW
# `amount` is a string column carrying a ' MW' suffix: strip the literal
# suffix (regex=False -> plain substring match on every pandas version),
# convert to float, and use it to fill gaps in `capacity_mw`.
df['amount'] = df['amount'].str.replace(' MW', '', regex=False).astype('float')
df['capacity_mw'] = df['capacity_mw'].fillna(df['amount'])

# Clean costs
# `cost_in_million` is scaled by 1,000,000 before it fills missing `amountEur`.
# NOTE(review): `amountEur` is not converted to a numeric dtype here; if it
# arrives as strings, the filled column will hold mixed types — confirm.
df['amountEur'] = df['amountEur'].fillna(df['cost_in_million'] * 1_000_000)

# The helper columns have been folded into capacity_mw / amountEur.
df = df.drop(columns=['cost_in_million', 'amount'])

In [None]:
# List `name_y` for every merged row whose `name_x` is missing
# (name_x / name_y are the merge-suffixed `name` columns).
df.loc[df['name_x'].isna(), 'name_y'].tolist()