# 06: Postprocess data

In this script, we postprocess the raw LCIA impact data:
- split off production volumes
- filter out failed computations
- convert to equal product units (MJ)
- add corresponding IAM regions
- average using production volumes
- combine production and combustion impacts in the case of liquid fuels and hydrogen + gases
- calculate indirect impacts

In [1]:
%run common_definitions.py

In [2]:
import pandas as pd
import numpy as np
import xarray as xr

from premise.geomap import Geomap

In [3]:
# Base path of the CSV files this notebook reads and writes.
# NOTE(review): BW_PROJECTNAME is not defined in this notebook; presumably it is
# provided by common_definitions.py (run in the first cell) — confirm.
output_fp = "../output/" + BW_PROJECTNAME

### Load impact file

In [4]:
# Raw LCIA results: one table with the full impacts, one with the direct impacts.
full_impacts_csv = output_fp + "/impacts_energy_provision.csv"
direct_impacts_csv = output_fp + "/direct_impacts_energy_provision.csv"

impacts = pd.read_csv(full_impacts_csv)
direct_impacts = pd.read_csv(direct_impacts_csv)

### Split off production volumes, fill missing values

In [5]:
# Extract the production volumes as a stand-alone Series, keyed by
# technology, sector, scenario, year and location. `impacts` itself is untouched.
volume_keys = ["short name", "sector", "scenario", "year", "location"]
indexed_impacts = impacts.copy().set_index(volume_keys)
prod_volumes = indexed_impacts["production volume"]

To allow weighting by production volumes, we fill missing values:

- In the case of the overall average, we fill with ones in cases where values are missing or zero for all locations.
- In the case of the IAM region averages, we fill with ones in cases where values are missing or zero for all locations within an IAM region.

In [6]:
# Geomap translates ecoinvent locations into the regions of the chosen IAM
# (here configured with model="remind").
geo = Geomap(model="remind")

# Turn the Series into a frame, attach the IAM region of every location and
# restore the original five-level index.
prod_volumes = prod_volumes.reset_index()
iam_regions = prod_volumes["location"].apply(geo.ecoinvent_to_iam_location)
prod_volumes["IAM region"] = iam_regions
prod_volumes.set_index(
    ["short name", "sector", "scenario", "year", "location"],
    inplace=True,
)

In [7]:
def count_valid(x):
    """Return how many entries of ``x`` are neither NaN nor zero.

    NaN entries are replaced by zero via ``np.nan_to_num`` first, so they are
    not counted as non-zero values.
    """
    without_nan = np.nan_to_num(x)
    return np.count_nonzero(without_nan)

In [8]:
# Weights for the overall average: if a (technology, sector, scenario, year)
# group has no usable production volume at all (every value NaN or zero), all of
# its rows get a weight of one, i.e. the group is averaged unweighted.
group_keys = ["short name", "sector", "scenario", "year"]
flat_volumes = prod_volumes.reset_index()

# Number of usable (non-NaN, non-zero) production volumes per group.
nan_counts = flat_volumes.groupby(group_keys).agg({
    "production volume": count_valid
})
compare_idx = nan_counts[nan_counts["production volume"] == 0].index

# Vectorised membership test instead of a Python loop over every row. The
# resulting positions follow the row order of `prod_volumes`, which
# reset_index()/set_index() leave unchanged.
row_keys = flat_volumes.set_index(group_keys).index
fill_idx = np.flatnonzero(row_keys.isin(compare_idx))

s1 = prod_volumes["production volume"].copy()
s1.iloc[fill_idx] = 1.0

In [9]:
# Weights for the IAM region averages: same rule as for the overall average,
# but evaluated separately within every IAM region. Rows of a
# (technology, sector, scenario, year, IAM region) group without any usable
# production volume (every value NaN or zero) get a weight of one.
region_group_keys = ["short name", "sector", "scenario", "year", "IAM region"]
flat_volumes = prod_volumes.reset_index()

# Number of usable (non-NaN, non-zero) production volumes per group.
nan_counts = flat_volumes.groupby(region_group_keys).agg({
    "production volume": count_valid
})
compare_idx = nan_counts[nan_counts["production volume"] == 0].index

# Vectorised membership test instead of a Python loop over every row. The
# resulting positions follow the row order of `prod_volumes`, which
# reset_index()/set_index() leave unchanged.
row_keys = flat_volumes.set_index(region_group_keys).index
fill_idx = np.flatnonzero(row_keys.isin(compare_idx))

s2 = prod_volumes["production volume"].copy()
s2.iloc[fill_idx] = 1.0

In [10]:
# Store both filled variants next to the raw production volumes.
weight_columns = (
    ("production volume (for overall average)", s1),
    ("production volume (for IAM region averages)", s2),
)
for column_label, filled_volumes in weight_columns:
    prod_volumes[column_label] = filled_volumes

Fill the rest with zeros.

In [11]:
# Any value that is still missing becomes zero; the result is written to disk.
prod_volumes = prod_volumes.fillna(0)
volumes_csv = output_fp + "/production_volumes_energy_provision.csv"
prod_volumes.to_csv(volumes_csv)

### Filter out failed computations (only for full impacts)

In [12]:
# Remove the descriptive columns that are not impact values.
direct_impacts.drop(columns=["amount", "unit", "ecoinvent name"], inplace=True)
impacts.drop(
    columns=["amount", "unit", "ecoinvent name", "production volume"],
    inplace=True,
)

entry_keys = ["short name", "product", "sector", "scenario", "year", "location"]
impacts.set_index(entry_keys, inplace=True)

# A row without a single truthy value (only zeros and/or NaN) is treated as a
# failed computation and removed from the full impacts.
has_any_value = impacts.any(axis=1)
failed = impacts.loc[~has_any_value, :]
impacts.drop(index=failed.index, inplace=True)
impacts.reset_index(inplace=True)

### Convert to impacts per MJ

In [13]:
# Conversion factor per product: 1/NCV where an NCV is given, otherwise 1/GCV,
# otherwise 1 (values are left as they are).
NCVs = pd.read_csv("../data/NCVs_v2.csv")
ncv = NCVs["NCV in MJ/product"]
gcv = NCVs["GCV in MJ/product"]
NCVs["conversion factor"] = np.where(
    np.isnan(ncv),
    np.where(np.isnan(gcv), 1, 1 / gcv),
    1 / ncv,
)
conv_factors = {
    product: factor
    for product, factor in zip(NCVs["product name"], NCVs["conversion factor"])
}

# Look up the factor of every row while "product" is still a regular column;
# a product missing from the NCV table raises a KeyError here.
factor_values = impacts["product"].apply(lambda product: conv_factors[product]).to_numpy()
factor_values_direct = direct_impacts["product"].apply(lambda product: conv_factors[product]).to_numpy()

entry_keys = ["short name", "product", "sector", "scenario", "year", "location"]
impacts.set_index(entry_keys, inplace=True)
direct_impacts.set_index(entry_keys, inplace=True)
conversion_factors = pd.Series(factor_values, impacts.index)
conversion_factors_direct = pd.Series(factor_values_direct, direct_impacts.index)

# Scale every impact row by its factor; the "product" level is no longer needed afterwards.
rescaled_impacts = impacts.mul(conversion_factors, axis=0).droplevel("product", axis=0)
rescaled_direct_impacts = (
    direct_impacts.mul(conversion_factors_direct, axis=0).droplevel("product", axis=0)
)

### Filter out outliers

In [14]:
# Remove the entries of "hydro, reservoir, alpine region" at location "PE"
# from both the full and the direct impacts.
for frame in (rescaled_impacts, rescaled_direct_impacts):
    idx2remove = [
        key for key in frame.index
        if key[0] == "hydro, reservoir, alpine region" and key[-1] == "PE"
    ]
    frame.drop(idx2remove, axis=0, inplace=True)

### Combine with combustion impacts, save

In [15]:
# Lookup table from the short technology name to the ecoinvent name.
mapping = pd.read_csv("../../mappings/technology_selection_energy_provision_v4.csv")
short2long = {
    short: long_name
    for short, long_name in zip(mapping["short name"], mapping["ecoinvent name"])
}

In [16]:
liquids_csv = output_fp + "/impacts_liquids_combustion.csv"
combustion_impacts1 = pd.read_csv(liquids_csv).set_index(["ecoinvent name"])

# For every "liquids" entry of the rescaled impacts, fetch the combustion
# impacts of the corresponding ecoinvent activity.
sel = rescaled_impacts.copy().reset_index()
sel = sel[sel["sector"] == "liquids"]
data = [
    combustion_impacts1.loc[short2long[tech]].to_numpy()
    for tech in sel["short name"]
]

sel.set_index(["short name", "scenario", "year", "location"], inplace=True)

# Same rows as the selection, combustion impacts as columns, sector re-attached.
rescaled_combustion_impacts1 = pd.DataFrame(
    np.array(data),
    index=sel.index,
    columns=combustion_impacts1.columns,
).reset_index()
rescaled_combustion_impacts1["sector"] = "liquids"

In [17]:
gases_csv = output_fp + "/impacts_hydrogen_and_gases_combustion.csv"
combustion_impacts2 = pd.read_csv(gases_csv).set_index(["ecoinvent name"])

# For every "hydrogen and gases" entry of the rescaled impacts, fetch the
# combustion impacts of the corresponding ecoinvent activity.
sel = rescaled_impacts.copy().reset_index()
sel = sel[sel["sector"] == "hydrogen and gases"]
data = [
    combustion_impacts2.loc[short2long[tech]].to_numpy()
    for tech in sel["short name"]
]

sel.set_index(["short name", "scenario", "year", "location"], inplace=True)

# Same rows as the selection, combustion impacts as columns, sector re-attached.
rescaled_combustion_impacts2 = pd.DataFrame(
    np.array(data),
    index=sel.index,
    columns=combustion_impacts2.columns,
).reset_index()
rescaled_combustion_impacts2["sector"] = "hydrogen and gases"

In [18]:
# Give both combustion tables the same index layout as the rescaled impacts.
combustion_keys = ["short name", "sector", "scenario", "year", "location"]
for combustion_frame in (rescaled_combustion_impacts1, rescaled_combustion_impacts2):
    combustion_frame.set_index(combustion_keys, inplace=True)

rescaled_combustion_impacts = pd.concat(
    [rescaled_combustion_impacts1, rescaled_combustion_impacts2]
)

# Production plus combustion impacts; where only one of the two tables has a
# value, the missing side counts as zero (fill_value=0).
combined_impacts = rescaled_impacts.add(rescaled_combustion_impacts, fill_value=0)
combined_csv = output_fp + "/impacts_combined.csv"
combined_impacts.to_csv(combined_csv)

### Collect all direct impacts

In [19]:
# Direct impacts = direct impacts of energy provision stacked with the combustion impacts.
# NOTE(review): the tables are concatenated, not added; if the direct impacts
# already contain liquids / hydrogen-and-gases entries, the index will hold
# duplicate keys — confirm this is intended.
direct_parts = [rescaled_direct_impacts, rescaled_combustion_impacts]
all_rescaled_direct_impacts = pd.concat(direct_parts)
direct_csv = output_fp + "/direct_impacts.csv"
all_rescaled_direct_impacts.to_csv(direct_csv)

### Calculate indirect impacts

In [20]:
# Display the rescaled full impacts for visual inspection.
rescaled_impacts

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,"CML v4.8 2016, acidification, acidification (incl. fate, average Europe total, A&B)","CML v4.8 2016, energy resources: non-renewable, abiotic depletion potential (ADP): fossil fuels","CML v4.8 2016, eutrophication, eutrophication (fate not incl.)","CML v4.8 2016, material resources: metals/minerals, abiotic depletion potential (ADP): elements (ultimate reserves)","CML v4.8 2016, ozone depletion, ozone layer depletion (ODP steady state)","CML v4.8 2016, photochemical oxidant formation, photochemical oxidation (high NOx)","EDIP 2003, acidification, acidification","EDIP 2003, eutrophication, combined potential","EDIP 2003, eutrophication, terrestrial eutrophication","EDIP 2003, photochemical ozone formation, impacts on human health",...,"ReCiPe Midpoint (H) V1.13, freshwater eutrophication, FEP","ReCiPe Midpoint (H) V1.13, natural land transformation, NLTP","ReCiPe Midpoint (H) V1.13, marine eutrophication, MEP","ReCiPe Midpoint (H) V1.13, terrestrial ecotoxicity, TETPinf","ReCiPe Midpoint (H) V1.13, fossil depletion, FDP","ReCiPe Midpoint (H) V1.13, ionising radiation, IRP_HE","ReCiPe Midpoint (H) V1.13, marine ecotoxicity, METPinf","IPCC 2021, climate change: biogenic, GWP 100a","IPCC 2021, climate change: land use, GWP 100a","IPCC 2021, climate change: fossil, GWP 100a"
short name,sector,scenario,year,location,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
BIGCC,electricity production,SSP2-NPi,2020,CAZ,0.000082,0.123604,0.000022,9.670381e-08,1.288430e-10,0.000005,0.001383,0.000210,0.003318,0.000017,...,1.174916e-06,-1.683832e-06,0.000006,0.000006,0.003250,0.000146,0.000799,0.027251,0.000005,0.009109
BIGCC,electricity production,SSP2-NPi,2020,CHA,0.000088,0.153080,0.000024,1.291742e-07,2.112767e-10,0.000006,0.001486,0.000226,0.003439,0.000018,...,1.493817e-06,-2.384227e-06,0.000006,0.000007,0.003998,0.000196,0.000868,0.027368,0.000007,0.011280
BIGCC,electricity production,SSP2-NPi,2020,MEA,0.000086,0.140215,0.000023,1.124183e-07,1.892625e-10,0.000006,0.001446,0.000218,0.003390,0.000018,...,1.317372e-06,-2.016068e-06,0.000006,0.000007,0.003658,0.000165,0.000835,0.027355,0.000006,0.010288
BIGCC,electricity production,SSP2-NPi,2020,IND,0.000090,0.149496,0.000024,1.158459e-07,2.209305e-10,0.000006,0.001531,0.000235,0.003473,0.000018,...,1.744286e-06,-2.049031e-06,0.000006,0.000007,0.003904,0.000229,0.000850,0.025892,0.000008,0.011168
BIGCC,electricity production,SSP2-NPi,2020,LAM,0.000083,0.126498,0.000024,9.155718e-08,2.601316e-10,0.000006,0.001407,0.000229,0.003392,0.000018,...,1.716482e-06,-1.705773e-06,0.000006,0.000006,0.003363,0.000413,0.000797,0.019208,0.000016,0.009151
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
diffusion adsorption heat pump,residential heating,SSP2-PkBudg500,2050,LAM,0.000028,0.793124,0.000007,1.216552e-07,1.777680e-09,0.000004,0.000451,0.000060,0.000700,0.000010,...,8.216877e-07,-3.081645e-07,0.000001,0.000001,0.024345,0.000092,0.000334,0.004215,0.000038,0.047557
diffusion adsorption heat pump,residential heating,SSP2-PkBudg500,2050,CAZ,0.000024,0.737421,0.000008,1.236524e-07,2.164374e-10,0.000004,0.000393,0.000061,0.000709,0.000012,...,8.871347e-07,-1.251459e-06,0.000001,0.000001,0.022649,0.000213,0.000279,0.002251,0.000032,0.047020
diffusion adsorption heat pump,residential heating,SSP2-PkBudg500,2050,CHA,0.000029,0.714821,0.000008,1.549300e-07,1.065885e-09,0.000004,0.000466,0.000067,0.000723,0.000010,...,1.037424e-06,-3.680880e-07,0.000002,0.000001,0.021943,0.000538,0.000398,0.000988,0.000005,0.046198
diffusion adsorption heat pump,residential heating,SSP2-PkBudg500,2050,IND,0.000029,0.715060,0.000008,1.596781e-07,1.069586e-09,0.000004,0.000472,0.000068,0.000725,0.000010,...,1.061556e-06,-4.249465e-07,0.000002,0.000001,0.021950,0.000870,0.000409,0.001017,0.000008,0.046222


In [21]:
# Indirect impacts = combined (full) impacts minus direct impacts. Entries that
# exist in only one of the two tables become NaN, since no fill value is used.
rescaled_indirect_impacts = combined_impacts - all_rescaled_direct_impacts
indirect_csv = output_fp + "/indirect_impacts.csv"
rescaled_indirect_impacts.to_csv(indirect_csv)

### Add IAM locations, calculate several regional averages

In [22]:
def grouped_weighted_avg(values, weights, weights_col, by):
    """Return the weighted average of every column of ``values`` per group.

    Parameters
    ----------
    values : pandas.DataFrame
        Numeric data to average. Its index must be alignable with the index
        of ``weights``, and the names in ``by`` must be index levels.
    weights : pandas.DataFrame
        Table holding the weights, indexed like ``values``.
    weights_col : str
        Name of the column of ``weights`` that is used as weight.
    by : list of str
        Index level names that define the groups.

    Returns
    -------
    pandas.DataFrame
        One row per group, one column per numeric column of ``values``.

    Notes
    -----
    The weights are normalised per column over the entries that actually
    contribute a value. Rows that exist only in ``weights`` and NaN entries
    of ``values`` are therefore excluded from the denominator; previously
    their weights were still counted, which pulled the averages towards
    zero. A group without any contributing weight yields NaN (0/0) instead
    of a misleading 0.
    """
    w = weights[weights_col]

    # Numerator: sum of value * weight per group (NaN products are skipped).
    weighted_sum = (
        values.mul(w, axis=0).reset_index().groupby(by).sum(numeric_only=True)
    )

    # Denominator: sum of the weights of exactly those entries that have a
    # value. The float cast keeps the columns numeric when the outer
    # alignment with `w` introduces rows that are missing in `values`.
    effective_weights = values.notna().astype(float).mul(w, axis=0)
    weight_sum = effective_weights.reset_index().groupby(by).sum(numeric_only=True)

    return weighted_sum.div(weight_sum)

In [23]:
# Geomap translates ecoinvent locations into the regions of the chosen IAM
# (here configured with model="remind").
geo = Geomap(model="remind")
region_index = ["short name", "sector", "scenario", "year", "IAM region", "location"]

# Flatten all four tables so that "location" is available as a column.
prod_volumes = prod_volumes.reset_index()
for frame in (combined_impacts, all_rescaled_direct_impacts, rescaled_indirect_impacts):
    frame.reset_index(inplace=True)

# Attach the IAM region of every location and make it part of the index.
for frame in (
    combined_impacts,
    all_rescaled_direct_impacts,
    rescaled_indirect_impacts,
    prod_volumes,
):
    frame["IAM region"] = frame["location"].apply(geo.ecoinvent_to_iam_location)
    frame.set_index(region_index, inplace=True)

In [24]:
# Production-volume-weighted average over all locations, for the full, the
# direct and the indirect impacts.
overall_by = ["short name", "sector", "scenario", "year"]
overall_weights = "production volume (for overall average)"

overall_average = grouped_weighted_avg(
    combined_impacts, prod_volumes, overall_weights, by=overall_by
)
overall_average_direct = grouped_weighted_avg(
    all_rescaled_direct_impacts, prod_volumes, overall_weights, by=overall_by
)
overall_average_indirect = grouped_weighted_avg(
    rescaled_indirect_impacts, prod_volumes, overall_weights, by=overall_by
)

# Label the rows so they can be told apart after stacking with the other averages.
for averaged in (overall_average, overall_average_direct, overall_average_indirect):
    averaged["regional average"] = "overall"

For the averages by IAM region, we use the production volumes that were filled with ones wherever all values within an IAM region are missing (NaN) or zero.

In [25]:
def _iam_and_world_average(frame):
    """Return ``(per-region averages, world average)`` for one impact table.

    The per-region table holds the production-volume-weighted average of
    ``frame`` for every IAM region listed in ``REMIND_REGIONS``, indexed by
    short name, sector, scenario, year and "regional average". The world
    table is the unweighted mean of the averages of all IAM regions (taken
    before the restriction to ``REMIND_REGIONS``), labelled "World".
    """
    keys = ["short name", "sector", "scenario", "year"]

    per_region = grouped_weighted_avg(
        frame,
        prod_volumes,
        "production volume (for IAM region averages)",
        by=keys + ["IAM region"],
    ).reset_index()

    world = per_region.groupby(keys).mean(numeric_only=True)
    world["regional average"] = "World"

    # Chained, non-inplace calls avoid mutating a filtered slice in place.
    per_region = (
        per_region[per_region["IAM region"].isin(REMIND_REGIONS)]
        .rename(columns={"IAM region": "regional average"})
        .set_index(keys + ["regional average"])
    )
    return per_region, world


# full impacts
iam_average, world_average = _iam_and_world_average(combined_impacts)

# direct impacts
iam_average_direct, world_average_direct = _iam_and_world_average(all_rescaled_direct_impacts)

# indirect impacts
iam_average_indirect, world_average_indirect = _iam_and_world_average(rescaled_indirect_impacts)

In [26]:
# full impacts
pd.concat(
    [
        overall_average.reset_index(),
        iam_average.reset_index(),
        world_average.reset_index(),
    ],
    axis=0
).set_index(
    ["short name", "sector", "scenario", "year", "regional average"]
).sort_index().to_csv(output_fp + "/impacts_regional_averages.csv")

# direct impacts
pd.concat(
    [
        overall_average_direct.reset_index(),
        iam_average_direct.reset_index(),
        world_average_direct.reset_index(),
    ],
    axis=0
).set_index(
    ["short name", "sector", "scenario", "year", "regional average"]
).sort_index().to_csv(output_fp + "/direct_impacts_regional_averages.csv")

# indirect impacts
pd.concat(
    [
        overall_average_indirect.reset_index(),
        iam_average_indirect.reset_index(),
        world_average_indirect.reset_index(),
    ],
    axis=0
).set_index(
    ["short name", "sector", "scenario", "year", "regional average"]
).sort_index().to_csv(output_fp + "/indirect_impacts_regional_averages.csv")