# NGFS Clean & Interp

### 01/29/24, Erica Simon
## Purpose: prepare IAM projections of future emissions under different policy scenarios
- Appropriate for harmonization
- Interpolated to annual intervals
- Containing all necessary forcing agents required by FaIR


Data Credit: 
- Richters, O. *et al.* (2023). *NGFS Climate Scenarios Data Set* (4.1). Zenodo. https://doi.org/10.5281/ZENODO.10079020





In [1]:
import numpy as np
import pandas as pd

## Import historical emissions dataset 

In [2]:
# historical emissions table; later cells compare the projections' species and units against it
hist_emis_path = '~/outputs/hist_emis_ALL.csv'
hist_emis = pd.read_csv(hist_emis_path)
hist_emis.head()

Unnamed: 0,Model,Scenario,Region,Variable,Unit,1750,1751,1752,1753,1754,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,Historical,GCP+CEDS+PRIMAP+GFED,World,Emissions|BC,Mt BC/yr,2.096766,2.071972,2.067178,2.070382,2.098586,...,7.842259,7.934828,7.926371,7.56806,7.562319,7.459095,7.781842,6.525021,6.871947,6.701702
1,Historical,GCP+CEDS+PRIMAP+GFED,World,Emissions|C2F6,kt C2F6/yr,0.0,0.0,0.0,0.0,0.0,...,1.003458,1.035565,0.933936,0.979945,0.99698,0.996764,1.062577,1.10655,1.160588,1.167145
2,Historical,GCP+CEDS+PRIMAP+GFED,World,Emissions|C3F8,kt C3F8/yr,0.0,0.0,0.0,0.0,0.0,...,0.314138,0.323186,0.304124,0.319107,0.324656,0.324586,0.346019,0.36034,0.377938,0.380074
3,Historical,GCP+CEDS+PRIMAP+GFED,World,Emissions|C4F10,kt C4F10/yr,0.0,0.0,0.0,0.0,0.0,...,0.070672,0.070641,0.067024,0.070327,0.07155,0.071536,0.07626,0.079417,0.083292,0.083759
4,Historical,GCP+CEDS+PRIMAP+GFED,World,Emissions|C5F12,kt C5F12/yr,0.0,0.0,0.0,0.0,0.0,...,0.035588,0.031141,0.03003,0.031513,0.032065,0.032062,0.034184,0.035603,0.037338,0.037545


## Import & clean future emissions datasets
MESSAGE, GCAM, and REMIND outputs from NGFS

In [3]:
def NGFS_clean(df):
    """Clean a raw NGFS emissions table so its names line up with the historical data.

    Steps, in order:
      1. drop the final row (it does not contain emission data),
      2. rename NGFS variable names to the species names used by FaIR,
      3. drop variables that are not included in the historical dataset,
      4. strip a trailing ' (version: N)' tag from scenario names,
      5. shorten two scenario names for clarity.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw NGFS table with (at least) 'Variable' and 'Scenario' columns.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame. The input frame is not modified.
    """
    # nothing to clean (and no final row to drop) in an empty table
    if df.empty:
        return df.copy()

    # rename to match FaIR species
    fair_names = {
        'Emissions|CO2|AFOLU': 'Emissions|CO2 AFOLU',
        'Emissions|CO2|Energy and Industrial Processes': 'Emissions|CO2 FFI',
        'Emissions|HFC|HFC125': 'Emissions|HFC-125',
        'Emissions|HFC|HFC134a': 'Emissions|HFC-134a',
        'Emissions|HFC|HFC227ea': 'Emissions|HFC-227ea',
        'Emissions|HFC|HFC143a': 'Emissions|HFC-143a',
        'Emissions|HFC|HFC23': 'Emissions|HFC-23',
        'Emissions|HFC|HFC245fa': 'Emissions|HFC-245fa',
        'Emissions|HFC|HFC32': 'Emissions|HFC-32',
    }

    # species not included in hist dataset (F-Gases, HFC, PFC, total CO2)
    not_in_hist = ['Emissions|F-Gases', 'Emissions|HFC', 'Emissions|PFC', 'Emissions|CO2']

    # rename scenarios for clarity
    short_scenarios = {
        'Below 2?C': 'Below 2 C',
        'Nationally Determined Contributions (NDCs)': 'NDCs',
    }

    # remove final row- does not contain emission data
    cleaned = df.drop(index=df.index[-1])

    cleaned = cleaned.assign(Variable=cleaned['Variable'].replace(fair_names))
    cleaned = cleaned[~cleaned['Variable'].isin(not_in_hist)]

    # Remove the ' (version: N)' suffix only where it is actually present.
    # A blind [:-13] slice would truncate names without the tag, and writing
    # through `.values` relies on it being a writable view of the frame.
    scenario = cleaned['Scenario'].str.replace(r'\s*\(version: \d+\)$', '', regex=True)
    cleaned = cleaned.assign(Scenario=scenario.replace(short_scenarios))

    return cleaned

In [4]:
# load each model's raw NGFS export and pass it straight through the shared cleaner
MSG_proj_emis = NGFS_clean(pd.read_csv('~/inputs/NGFS_MESSAGE.csv'))
GCAM_proj_emis = NGFS_clean(pd.read_csv('~/inputs/NGFS_GCAM.csv'))
REM_proj_emis = NGFS_clean(pd.read_csv('~/inputs/NGFS_REMIND.csv'))

# MESSAGE only: rename HFC43-10 to the FaIR species name, then relabel the unit of those rows
old_name_rows = MSG_proj_emis.Variable == 'Emissions|HFC|HFC43-10'
MSG_proj_emis.loc[old_name_rows, 'Variable'] = 'Emissions|HFC-4310mee'

new_name_rows = MSG_proj_emis.Variable == 'Emissions|HFC-4310mee'
MSG_proj_emis.loc[new_name_rows, 'Unit'] = 'kt HFC4310mee/yr'

## Include missing species

Ensure that variable names in the projected datasets are consistent with those of the historical dataset. Also identify species that are missing from the projected datasets.

In [5]:
# species present in the historical table (a) and in each projection (b, c, d)
a = hist_emis['Variable'].unique()
b = GCAM_proj_emis['Variable'].unique()
c = MSG_proj_emis['Variable'].unique()
d = REM_proj_emis['Variable'].unique()

# historical species that each projection does not provide
GCAM_missing = np.setdiff1d(a, b)
MSG_missing = np.setdiff1d(a, c)
REM_missing = np.setdiff1d(a, d)

print('missing species:')
for label, missing in [('\nGCAM', GCAM_missing),
                       ('\nMESSAGE', MSG_missing),
                       ('\nREMIND', REM_missing)]:
    print(label)
    print(missing)

missing species:

GCAM
['Emissions|C3F8' 'Emissions|C4F10' 'Emissions|C5F12' 'Emissions|C6F14'
 'Emissions|C7F16' 'Emissions|C8F18' 'Emissions|CCl4' 'Emissions|CFC-11'
 'Emissions|CFC-113' 'Emissions|CFC-114' 'Emissions|CFC-115'
 'Emissions|CFC-12' 'Emissions|CH2Cl2' 'Emissions|CH3Br'
 'Emissions|CH3CCl3' 'Emissions|CH3Cl' 'Emissions|CHCl3'
 'Emissions|HCFC-141b' 'Emissions|HCFC-142b' 'Emissions|HCFC-22'
 'Emissions|HFC-152a' 'Emissions|HFC-236fa' 'Emissions|HFC-365mfc'
 'Emissions|HFC-4310mee' 'Emissions|Halon-1211' 'Emissions|Halon-1301'
 'Emissions|Halon-2402' 'Emissions|NF3' 'Emissions|SO2F2'
 'Emissions|c-C4F8']

MESSAGE
['Emissions|C2F6' 'Emissions|C3F8' 'Emissions|C4F10' 'Emissions|C5F12'
 'Emissions|C6F14' 'Emissions|C7F16' 'Emissions|C8F18' 'Emissions|CCl4'
 'Emissions|CF4' 'Emissions|CFC-11' 'Emissions|CFC-113'
 'Emissions|CFC-114' 'Emissions|CFC-115' 'Emissions|CFC-12'
 'Emissions|CH2Cl2' 'Emissions|CH3Br' 'Emissions|CH3CCl3'
 'Emissions|CH3Cl' 'Emissions|CHCl3' 'Emissions|H

In [7]:
def add_missing_species(proj_df, missing_list):
    """Append constant rows for species that a projection table lacks.

    For every species in `missing_list` and every scenario in `proj_df`, one
    row is added whose value columns are all set to that species' 2022 value
    from the module-level `hist_emis` table. The unit is also taken from
    `hist_emis`; model and region are copied from the first row of `proj_df`.

    Parameters
    ----------
    proj_df : pandas.DataFrame
        Projection table whose first five columns are, in order, Model,
        Scenario, Region, Variable, Unit; every remaining column is treated
        as a value column.
    missing_list : sequence of str
        Variable names to add. Each must be present in `hist_emis.Variable`.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index: the original rows followed by
        the added rows (grouped by species, then by scenario). The input
        frame is not modified.

    Raises
    ------
    IndexError
        If a requested species is not present in `hist_emis`.
    """
    base = proj_df.reset_index(drop=True)

    # nothing to add, or no scenarios/model/region to copy from
    if len(missing_list) == 0 or base.empty:
        return base

    mdl = base.Model.values[0]
    reg = base.Region.values[0]
    scenarios = base.Scenario.unique()
    n_values = len(base.columns) - 5

    new_rows = []
    for var in missing_list:
        # unit and last historical value depend only on the species, not on the scenario
        hist_row = hist_emis.loc[hist_emis.Variable == var]
        unit = hist_row['Unit'].values[0]
        hist_val = hist_row['2022'].values[0]

        for scen in scenarios:
            new_rows.append([mdl, scen, reg, var, unit] + [hist_val] * n_values)

    # build all new rows at once instead of enlarging the frame row by row
    extra = pd.DataFrame(new_rows, columns=base.columns)
    return pd.concat([base, extra], ignore_index=True)

In [8]:
# GCAM_proj_emis = add_missing_species(GCAM_proj_emis, GCAM_missing)

# MSG_proj_emis = add_missing_species(MSG_proj_emis, MSG_missing)

# REM_proj_emis = add_missing_species(REM_proj_emis, REM_missing)

### Fix Units

In [9]:
# units used in the historical table (a) and in each projection (b, c, d)
a = hist_emis['Unit'].unique()
b = GCAM_proj_emis['Unit'].unique()
c = MSG_proj_emis['Unit'].unique()
# REMIND units must come from the REMIND table; reading MSG_proj_emis here
# would make the REMIND report below a copy of the MESSAGE one
d = REM_proj_emis['Unit'].unique()

# for each model, list the units that the historical table does not use
print('\nunits in proj but not hist:')
print('\nGCAM')
print(np.setdiff1d(b, a))
print('\nMESSAGE\n')
print(np.setdiff1d(c, a))
print('\nREMIND')
print(np.setdiff1d(d, a))


units in proj but not hist:

GCAM
['Mt CO2/yr' 'kt N2O/yr']

MESSAGE

['Mt CO2/yr' 'kt N2O/yr']

REMIND
['Mt CO2/yr' 'kt N2O/yr']


Need to fix units for the following variables: 
- CO2 FFI: Mt &rarr; Gt
- CO2 AFOLU: Mt &rarr; Gt
- N2O: kt &rarr; Mt

In [10]:
def adjust_units(proj_df, hist_df, var, factor):
    """Rescale one variable of `proj_df` in place and relabel it with the historical unit.

    Rows of `proj_df` whose Variable equals `var` have every value column
    (all columns from the sixth onward) multiplied by `factor`, and their
    Unit set to the unit that `hist_df` lists for `var`.

    Rows whose Unit already equals the historical unit are left untouched, so
    calling the function again (e.g. re-running the notebook cell) does not
    rescale values that were already converted.

    Parameters
    ----------
    proj_df : pandas.DataFrame
        Projection table, modified in place. The first five columns are
        treated as metadata; it must contain 'Variable' and 'Unit'.
    hist_df : pandas.DataFrame
        Historical table with 'Variable' and 'Unit' columns.
    var : str
        Variable name to convert.
    factor : float
        Multiplier that converts the projection values to the historical unit.

    Returns
    -------
    None

    Raises
    ------
    IndexError
        If `var` is not present in `hist_df`. Nothing is modified in that case.
    """
    # Look the unit up by column name (not by position) and before touching any
    # values, so a missing variable cannot leave proj_df rescaled but unlabelled.
    target_unit = hist_df.loc[hist_df.Variable == var, 'Unit'].values[0]

    # rows of this variable that are still expressed in a different unit
    rows = (proj_df.Variable == var) & (proj_df.Unit != target_unit)
    if not rows.any():
        return

    # select relevant (value) columns
    cols = proj_df.columns[5:]

    # multiply emissions values by scaling factor and write them back
    proj_df.loc[rows, cols] = proj_df.loc[rows, cols] * factor

    # change unit name
    proj_df.loc[rows, 'Unit'] = target_unit

In [11]:
# rescale N2O and both CO2 species by 1/1000 in all three projection tables
# so their values and unit labels match the historical dataset
gases_to_rescale = ['Emissions|N2O', 'Emissions|CO2 FFI', 'Emissions|CO2 AFOLU']

for df in [MSG_proj_emis, GCAM_proj_emis, REM_proj_emis]:
    for gas in gases_to_rescale:
        adjust_units(df, hist_emis, gas, 0.001)

## Save cleaned dfs

In [12]:
# write each cleaned (not yet interpolated) table to its own CSV, without the index
cleaned_outputs = {
    '~/outputs/NGFS_MSG_cleaned.csv': MSG_proj_emis,
    '~/outputs/NGFS_GCAM_cleaned.csv': GCAM_proj_emis,
    '~/outputs/NGFS_REM_cleaned.csv': REM_proj_emis,
}
for out_path, cleaned in cleaned_outputs.items():
    cleaned.to_csv(out_path, index=False)

## Interpolate projections
- create a new df with annual timesteps as column names
- for each row in the old df
    - interpolate values
    - add new values to the new df

In [13]:
def NGFS_interp(df, yrs, all_yrs):
    """Linearly interpolate every (scenario, variable) time series onto `all_yrs`.

    Parameters
    ----------
    df : pandas.DataFrame
        Projection table whose first five columns are Model, Scenario, Region,
        Variable, Unit and whose remaining columns hold one value per entry of
        `yrs`, in the same order.
    yrs : sequence of numbers
        Years that the value columns of `df` correspond to (increasing, as
        required by `numpy.interp`).
    all_yrs : sequence of numbers
        Years to interpolate onto; they become the value-column labels of the
        result.

    Returns
    -------
    pandas.DataFrame
        One row per (scenario, variable) pair present in `df`, ordered by
        scenario and then variable (each in order of first appearance), with
        columns Model, Scenario, Region, Variable, Unit followed by `all_yrs`.
        Model and Region are copied from the first row of `df`. If a pair
        occurs in several rows, only the first is used. Pairs that do not
        occur in `df` are skipped. An empty `df` gives an empty frame with the
        same columns.

    Raises
    ------
    ValueError
        If the number of value columns in `df` differs from `len(yrs)`.

    Notes
    -----
    `numpy.interp` does not extrapolate: years in `all_yrs` outside the range
    of `yrs` receive the first/last available value.
    """
    meta_cols = ['Model', 'Scenario', 'Region', 'Variable', 'Unit']
    out_cols = meta_cols + list(all_yrs)

    if df.empty:
        return pd.DataFrame(columns=out_cols)

    n_value_cols = df.shape[1] - len(meta_cols)
    if len(yrs) != n_value_cols:
        raise ValueError(
            f'df has {n_value_cols} value columns but {len(yrs)} years were given'
        )

    mdl = df.Model.values[0]
    reg = df.Region.values[0]
    variables = df.Variable.unique()

    # collect rows in a list and build the frame once, rather than growing it row by row
    records = []
    for scen in df.Scenario.unique():
        scen_rows = df.loc[df.Scenario == scen]
        for var in variables:
            match = scen_rows.loc[scen_rows.Variable == var]
            if match.empty:
                # this scenario does not report this variable
                continue

            series = match.iloc[0]
            proj = series.iloc[5:].to_numpy(dtype=float)
            proj_interp = np.interp(all_yrs, yrs, proj)

            records.append([mdl, scen, reg, var, series['Unit']] + proj_interp.tolist())

    return pd.DataFrame(records, columns=out_cols)

In [14]:
# annual grid (2020 through 2100 inclusive) that every projection is interpolated onto
all_yrs = np.arange(2020, 2100 + 1)

# years reported in each non-interpolated table
GCAM_yrs = np.arange(2020, 2100 + 1, 5)                             # every 5 years
MSG_yrs = list(range(2020, 2060, 5)) + list(range(2060, 2101, 10))  # 5-yearly to 2055, then 10-yearly
REM_yrs = MSG_yrs                                                   # same years (and same list object) as MESSAGE

Run interpolations & save as .csv

In [15]:
# interpolate each model's cleaned projections from its own reporting years onto the annual grid
MSG_interp = NGFS_interp(MSG_proj_emis, MSG_yrs, all_yrs)
GCAM_interp = NGFS_interp(GCAM_proj_emis, GCAM_yrs, all_yrs)
REM_interp = NGFS_interp(REM_proj_emis, REM_yrs, all_yrs)

# write the annual tables to CSV, without the index
interp_outputs = {
    '~/outputs/NGFS_GCAM_interp.csv': GCAM_interp,
    '~/outputs/NGFS_MSG_interp.csv': MSG_interp,
    '~/outputs/NGFS_REM_interp.csv': REM_interp,
}
for out_path, interp in interp_outputs.items():
    interp.to_csv(out_path, index=False)