In [None]:
from bw2data import projects, databases, Database
from bw2io import Migration
import re

In [None]:
projects.set_current("EXIOBASE regionalized case study")

In [None]:
# Drop any existing "Oil seeds" entry from the `databases` registry before
# rebuilding it (presumably so the notebook can be re-run from scratch).
if "Oil seeds" in databases:
    del databases['Oil seeds']

# Extracting data

In [None]:
from bw2io.extractors import ExcelExtractor
from copy import deepcopy
from bw2io.importers.base_lci import LCIImporter

In [None]:
import pandas as pd

In [None]:
# Read the "direct" foreground sheet: the first four spreadsheet columns form
# the row MultiIndex and the first seven rows form the column MultiIndex.
df = pd.read_excel("data/Foreground direct.xlsx", index_col=[0,1,2,3], header=[0,1,2,3,4,5,6])
df

In [None]:
# Name the four row-index levels ("Product ...") and the seven column-header
# levels ("Process ..."), then drop the header level labelled "Delete".
# These names are what `melt()` and `df_to_database()` look up later.
df.index.names = ["Product Compartment", "Product Name", "Product Sub-compartment", "Product Unit"]
df.columns.names = ["Process Name", "Process Unit", "Process Amount", "Delete", "Process Category", "Process Comment", "Process Type"]
df.columns = df.columns.droplevel("Delete")
df

In [None]:
# Un-pivot to remove NaN values: melt() turns each column-header level into a
# regular column and stores the cell values in "Exchange Amount" (the row
# MultiIndex is kept); dropna() then discards every row that contains a NaN.
df_long = df.melt(ignore_index=False, value_name="Exchange Amount").dropna()
df_long

In [None]:
def df_to_database(df):
    """Convert a long-format exchange table into a list of process dicts.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per exchange. The row index is read positionally as
        ``(compartment, exchange name, sub-compartment, unit)``. The columns
        "Process Name", "Process Unit", "Process Amount", "Process Category",
        "Process Comment", "Process Type" and "Exchange Amount" are read.

    Returns
    -------
    list of dict
        One dict per unique "Process Name", with the keys ``code``, ``name``,
        ``location``, ``exchanges``, ``unit``, ``category``, ``kind`` and,
        when a comment is present, ``comment``. ``exchanges`` starts with a
        production exchange for the process itself, followed by one exchange
        per row. An exchange whose compartment is not NaN is typed
        "biosphere" and gets ``categories`` (the compartment alone, or a
        ``(compartment, sub-compartment)`` tuple); all others stay
        "technosphere".
    """

    def split_name_and_location(string):
        """Split a raw label into ``(name, location)``; location defaults to "GLO"."""
        # "name/location" form; labels mentioning "g/l" use the slash as a unit.
        if string.count("/") == 1 and "g/l" not in string:
            name, location = string.split("/")
            return name, location
        # "name {location}" form. A "{" without a closing "}" falls through
        # to the default instead of failing on a missing match.
        if "{" in string:
            match = re.search("{(.*?)}", string)
            if match:
                return re.sub(" {.*?}", "", string), match.group(1)
        return string, "GLO"

    db = []
    for name in df["Process Name"].unique():

        # parse process metadata
        process = {"code": name}
        process["name"], process["location"] = split_name_and_location(name)

        # parse exchanges
        # Boolean mask rather than an interpolated query() string, which
        # breaks when the process name contains a quote character.
        df_ex = df[df["Process Name"] == name]
        process["exchanges"] = []
        for index, row in df_ex.iterrows():

            # more process metadata, taken from the first row of the process
            if not process["exchanges"]:
                process["unit"] = row["Process Unit"]
                process["category"] = row["Process Category"]
                # "Unnamed: 4_level_5" is presumably the placeholder pandas
                # assigns to an empty header cell, i.e. "no comment".
                if row["Process Comment"] != "Unnamed: 4_level_5":
                    process["comment"] = row["Process Comment"]
                process["kind"] = row["Process Type"]
                process["exchanges"].append({
                    "type": "production",
                    "amount": row["Process Amount"],
                    "code": process["code"],
                    "name": process["name"],
                    "location": process["location"],
                    "unit": process["unit"],
                })

            # exchange data
            ex = {
                "code": index[1],
                "amount": row["Exchange Amount"],
                "type": "technosphere",
                "unit": index[-1],
            }
            ex["name"], ex["location"] = split_name_and_location(index[1])

            # specific to biosphere exchanges
            compartment, sub_compartment = index[0], index[2]
            if not pd.isna(compartment):
                ex["type"] = "biosphere"
                # NOTE(review): a lone compartment is stored as a plain string
                # (unchanged from before); confirm downstream matching
                # does not expect a 1-tuple here.
                ex["categories"] = compartment
                if not pd.isna(sub_compartment):
                    # The former `(categories, + sub)` applied a unary plus to
                    # the sub-compartment, which raises TypeError for strings.
                    ex["categories"] = (compartment, sub_compartment)

            process["exchanges"].append(ex)

        db.append(process)

    return db

# Build the process list from the long-format table of the "direct" sheet
# and show the first process as a sanity check.
db = df_to_database(df_long)
db[0]

In [None]:
# Read the "intermediate" foreground sheet with the same layout as the
# "direct" one: four index columns and seven header rows.
df_inter = pd.read_excel("data/Foreground intermediate.xlsx", index_col=[0,1,2,3], header=[0,1,2,3,4,5,6])
df_inter

In [None]:
# Name the row-index and column-header levels exactly as for `df`, then drop
# the header level labelled "Delete".
df_inter.index.names = ["Product Compartment", "Product Name", "Product Sub-compartment", "Product Unit"]
df_inter.columns.names = ["Process Name", "Process Unit", "Process Amount", "Delete", "Process Category", "Process Comment", "Process Type"]
df_inter.columns = df_inter.columns.droplevel("Delete")
df_inter

# un-pivot to remove NaN values
# NOTE: this rebinds `df_long`, replacing the long table built from `df`;
# from here on `df_long` refers to the intermediate data.
df_long = df_inter.melt(ignore_index=False, value_name="Exchange Amount").dropna()
df_long

In [None]:
# Build the process list for the intermediate sheet (`df_long` now holds the
# melted `df_inter`) and show the first process.
db_inter = df_to_database(df_long)
db_inter[0]

# Creating an `importer` object

In [None]:
# Create the importer for the "Oil seeds" database and feed it the processes
# from both sheets.
ei = LCIImporter("Oil seeds")
ei.data = db + db_inter
# Remove the first two entries of the importer's strategy list.
# NOTE(review): which strategies these are depends on the LCIImporter
# defaults of the installed bw2io version; confirm before upgrading.
del ei.strategies[0]
del ei.strategies[0]

In [None]:
from bw2io.strategies import normalize_units

In [None]:
ei.apply_strategy(normalize_units)

We have one more unit to fix - `ha a`, or the occupation of one hectare for one year. Normally we could do `ei.migrate("default-units")`, which would convert this unit to what our base flow list expects (occupation of square meter - year), but this would change all our tonnes to kilograms, meaning we would have the wrong unit to link to exiobase. So we treat this as a special case migration.

('ha a', 'square meter-year', 1e4)

In [None]:
# Register and apply a migration that only touches the unit `ha a`: it is
# relabelled `square meter-year` with a multiplier of 1e4 (1 ha = 10,000 m2).
# NOTE(review): the migration name is spelled "hecatare-units"; the spelling
# is the same in write() and migrate(), so the lookup works.
Migration("hecatare-units").write({
    'fields': ['unit'],
    'data': [
        (
            ('ha a',),
            {'unit': 'square meter-year', 'multiplier': 1e4}
        )
    ]
}, 'Change only `ha a` units')

ei.migrate("hecatare-units")

In [None]:
{ds['unit'] for ds in ei.data}

EXIOBASE uses `Meuro` instead of `MEUR2011`, let's change this.

In [None]:
databases

In [None]:
{ds['unit'] for ds in Database("EXIOBASE 3.3.18 hybrid")}

In [None]:
# Register and apply a migration that relabels the unit `MEUR2011` as `Meuro`.
# No multiplier is given, so only the label is replaced.
Migration("MEUR2011").write({
    'fields': ['unit'],
    'data': [
        (
            # First element is input data in the order of `fields` above
            ('MEUR2011',),
            # Second element is new values to substitute
            {
                'unit': 'Meuro',
            }
        )
    ]
}, 'Change Euro unit label')

ei.migrate("MEUR2011")

In [None]:
# Register and apply unit conversions:
#   megajoule     -> TJ      (x 1e-6)
#   kilowatt hour -> TJ      (x 3.6/1e6, since 1 kWh = 3.6 MJ)
#   kilogram      -> tonnes  (x 1e-3)
#   ton           -> tonnes  (label only, no multiplier)
Migration("units").write({
    'fields': ['unit'],
    'data': [
        (
            ('megajoule',),
            {'unit': 'TJ', 'multiplier': 1e-6}
        ),
        (
            ('kilowatt hour',),
            {'unit': 'TJ', 'multiplier': 3.6/1e6}
        ),
                (
            ('kilogram',),
            {'unit': 'tonnes', 'multiplier': 1e-3}
        ),
        (
            ('ton',),
            {'unit': 'tonnes'}
        ),
    ]
}, 'Change MJ, kWh, kg, ton')

ei.migrate("units")

In [None]:
{ds['unit'] for ds in ei.data}

## `database`

We can label each dataset with our chosen database name.

In [None]:
from bw2io.strategies import add_database_name
from functools import partial

In [None]:
ei.apply_strategy(partial(add_database_name, name=ei.db_name))

In [None]:
db[0]

# Internal linking 

We can now apply the "generic" default strategies.

In [None]:
ei.apply_strategies()

We are ready to start thinking about internal linking. Let's see if the codes will match up, first by checking whether they are unique, and then by looking at them manually

In [None]:
# Compare on `code`, the field that `match_database(fields=['code'])` links on
# in the next cell (the sets were previously built from `name`, despite their
# variable names). The output is: number of distinct dataset codes, number of
# distinct exchange codes, and how many codes the two sets share.
dataset_codes = {ds['code'] for ds in ei.data}
exchange_codes = {exc.get('code') for ds in ei.data for exc in ds['exchanges']}
len(dataset_codes), len(exchange_codes), len(dataset_codes.intersection(exchange_codes))

In [None]:
ei.match_database(fields=['code'])

In [None]:
ei.statistics()

## `biosphere` exchanges

How many can we link without changing anything?

In [None]:
ei.match_database("biosphere3", fields=['name', 'categories'])

In [None]:
ei.statistics()

Sweet, only 3 to fix manually. Let's look at them.

In [None]:
[ex for ex in ei.unlinked if ex["type"]=="biosphere"]

`Carbon dioxide` and `Methane`, both of which need to be labeled fossil (or not) to get a match. `Occupation, arable` is a shortened version of what we have in our master flow list:

In [None]:
[x for x in Database("biosphere3") if x['name'].lower().startswith('occupation, arable')]

In [None]:
# Register and apply a name-based migration for three flow names:
# `Carbon dioxide` and `Methane` get a ", fossil" suffix, and
# `occupation, arable` is renamed and given explicit categories.
# NOTE(review): the key is written in lower case ('occupation, arable');
# confirm that it matches the capitalisation used in the data.
Migration("oil-bio").write({
    'fields': ['name'],
    'data': [
        (('Carbon dioxide',), {'name': 'Carbon dioxide, fossil',}),
        (('Methane',), {'name': 'Methane, fossil',}),
        (('occupation, arable',), 
         {'name': 'Occupation, arable land, unspecified use', 'categories': ('natural resource', 'land')}),
    ]
}, 'Change some biosphere flows in oil seeds foreground')

ei.migrate("oil-bio")

In [None]:
ei.match_database("biosphere3", fields=['name', 'categories'])

In [None]:
ei.statistics()

This isn't actually correct - there are still unlinked biosphere exchanges, they are just mislabelled for now :)

## `name`

Let's clean up some names, starting with this pattern: `_64 Manufacture of rubber and plastic products (25)  (product market, hybrid units)`.

In [None]:
test_string = '_64 Manufacture of rubber and plastic products (25)  (product market, hybrid units)'

In [None]:
# Leading three-character code such as "_64" or "123": one digit or
# underscore followed by two digits. Raw string so that `\d` is not treated
# as an (invalid) string escape sequence.
numeric_start = re.compile(r"^[0-9_]\d\d")

In [None]:
numeric_start.findall(test_string)

In [None]:
test_string = '_64 Manufacture of rubber and plastic products (25)'

In [None]:
# Trailing two-digit code in parentheses, e.g. "(25)" at the very end of the
# name. Raw string so that `\(`, `\d` and `\)` are not treated as (invalid)
# string escape sequences.
numeric_end = re.compile(r"\(\d\d\)$")

In [None]:
numeric_end.findall(test_string)

In [None]:
def clean_name(name):
    """Return `name` without boilerplate markers and without its leading code.

    Removes the literal "(product market, hybrid units)" and "Link to: "
    wherever they occur, trims surrounding whitespace, and then strips a
    leading code matched by the module-level `numeric_start` pattern
    (e.g. "_64"). A trailing "(25)"-style suffix is left in place.
    """
    name = name.replace("(product market, hybrid units)", "").replace("Link to: ", "").strip()
    # Remove only the anchored leading code. The earlier str.replace() on the
    # matched text also deleted identical text occurring later in the name.
    return numeric_start.sub("", name, count=1).strip()

def clean_names(data):
    """Run `clean_name` over every dataset name and every non-empty exchange name.

    The dicts in `data` are modified in place; `data` itself is returned so
    the function can be used as an importer strategy.
    """
    for dataset in data:
        dataset['name'] = clean_name(dataset['name'])
        # Exchanges with a missing or empty name are left untouched.
        named_exchanges = (exchange for exchange in dataset['exchanges'] if exchange.get('name'))
        for exchange in named_exchanges:
            exchange['name'] = clean_name(exchange['name'])
    return data

In [None]:
ei.apply_strategy(clean_names)

Normalize "Electricity  Market" and "Electricity  market" (note the double space) to "Electricity mix". Because 3.3.18 does not contain mixes, these are replaced with "Production of electricity by gas" further below.

In [None]:
# Register and apply a name migration: both spellings below (note the double
# space between the words) are renamed to 'Electricity mix'.
Migration("what-is-electricity").write({
    'fields': ['name'],
    'data': [
        (('Electricity  Market',), {'name': 'Electricity mix',}),
        (('Electricity  market',), {'name': 'Electricity mix',}),
    ]
}, "Let's try being consistent, just to see how it feels")

ei.migrate("what-is-electricity")

In [None]:
ei.match_database(fields=['name', 'location', 'unit'])

In [None]:
ei.statistics()

In [None]:
[ds for ds in ei.data if "Electricity Mix" in ds["name"]]

## Linking against EXIOBASE

First try, don't expect great success.

In [None]:
ei.match_database("EXIOBASE 3.3.18 hybrid", fields=['name', 'location'])

In [None]:
ei.statistics()

In [None]:
ei.write_excel(only_unlinked=True)

## Specific issues

The flow `Arable land, as ha*year-eq.` and the activity `Link to: Market for arable land {GLO}` are removed: we don't know where they occur, and they are only included for use with an indirect land use model.

In [None]:
def remove_iluc_land(data):
    """Drop the indirect-land-use datasets and every exchange that refers to them.

    Returns a new list holding the datasets whose name is not excluded; the
    `exchanges` list of each kept dataset is replaced by a filtered copy.
    """
    EXCLUDED = ('Arable land, as ha*year-eq. (linked)', 'Market for arable land')

    def is_kept(obj_name):
        return obj_name not in EXCLUDED

    # First decide which datasets survive, then prune their exchanges.
    remaining = list(filter(lambda dataset: is_kept(dataset['name']), data))
    for dataset in remaining:
        pruned = []
        for exchange in dataset['exchanges']:
            if is_kept(exchange.get('name')):
                pruned.append(exchange)
        dataset['exchanges'] = pruned
    return remaining

In [None]:
ei.apply_strategy(remove_iluc_land)

Just for laughs (!?), there are some activities where some exchanges are linked in kilograms, and others in tons. So let's fix exchanges in kilograms and switch them to tons when that will produce a link.

In [None]:
def switch_exchange_units_when_helpful(data):
    """Re-express kilogram exchanges in tons when a same-named dataset exists in tons.

    Exchanges that already carry a truthy `input` are skipped. For the rest,
    a 'kilogram' exchange whose name also appears as a dataset with unit
    'ton' gets its unit set to 'ton' and its amount divided by 1000.
    Exchanges are modified in place and `data` is returned.
    """
    known = set()
    for dataset in data:
        known.add((dataset['name'], dataset['unit']))

    for dataset in data:
        unlinked = (exchange for exchange in dataset['exchanges'] if not exchange.get('input'))
        for exchange in unlinked:
            if exchange['unit'] != 'kilogram':
                continue
            if (exchange['name'], 'ton') not in known:
                continue
            exchange['unit'] = 'ton'
            exchange['amount'] /= 1000
    return data

In [None]:
ei.apply_strategy(switch_exchange_units_when_helpful)

In [None]:
ei.match_database(fields=['name', 'location', 'unit'])

In [None]:
ei.statistics()

Finally, we will have another problem with the electricity mixes - we have both production exchanges and technosphere exchanges which are the same thing. As their signs are flipped, they will cancel each other out, making our technosphere matrix singular:

In [None]:
import pprint

# Pretty-print the first 'Electricity mix' dataset that has more than one
# exchange, then stop; nothing is printed if there is none.
for ds in ei.data:
    if ds['name'] == 'Electricity mix' and len(ds['exchanges']) > 1:
        pprint.pprint(ds)
        break

We can just drop this duplicative technosphere exchange.

In [None]:
def drop_duplicate_production_exchange(data):
    """Keep only production exchanges on multi-exchange 'Electricity mix' datasets.

    Datasets with any other name, or with at most one exchange, are left
    untouched. Matching datasets are modified in place; `data` is returned.
    """
    for dataset in data:
        if dataset['name'] != 'Electricity mix':
            continue
        if len(dataset['exchanges']) <= 1:
            continue
        production_only = []
        for exchange in dataset['exchanges']:
            if exchange['type'] == 'production':
                production_only.append(exchange)
        dataset['exchanges'] = production_only
    return data

In [None]:
ei.apply_strategy(drop_duplicate_production_exchange)

Finally, there are no electricity markets, so we use natural gas for all of them as a proxy for now.

In [None]:
# Register and apply a name migration that renames 'Electricity mix' to
# 'Production of electricity by gas', then retry linking against EXIOBASE on
# name and location.
Migration("electricity-mix-to-gas").write({
    'fields': ['name'],
    'data': [
        (('Electricity mix',), {'name': 'Production of electricity by gas',}),
    ]
}, "Let's try being consistent, just to see how it feels")

ei.migrate("electricity-mix-to-gas")
ei.match_database("EXIOBASE 3.3.18 hybrid", fields=['name', 'location'])

In [None]:
ei.statistics()

In [None]:
list(ei.unlinked)[0]

Drop "Treatment of shells" because it is not used anywhere.

In [None]:
# Discard whatever is still unlinked at this point.
# NOTE(review): `i_am_reckless=True` is presumably an explicit opt-in required
# by bw2io; inspect `ei.unlinked` first, because this step is destructive.
ei.drop_unlinked(i_am_reckless=True)

In [None]:
ei.write_database()