# Pre process and Import Libraries

In [1]:
from kdm.models import KDMSequentialJointClassModel
import kdm
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons
import matplotlib.pyplot as plt
import numpy as np
from keras import metrics
from keras import losses
from keras import optimizers
from keras.models import Model
from keras.layers import Input, Dense
import keras
from pandas import read_csv, DataFrame
import os
os.environ["KERAS_BACKEND"] = "tensorflow"


In [180]:
def annual_growth(row, years):
    min_year = years["min"]
    max_year = years["max"]
    row["Indicator Name"] = row["Indicator Name"] + " - [annual growth %]"
    for year in range(max_year, min_year, -1):
        if not np.isnan(row[str(year)]) and not np.isnan(row[str(year - 1)]):
            row[str(year)] = 100 * (float(row[str(year)]) -
                                    float(row[str(year - 1)])) / abs(float(row[str(year - 1)]))
        else:
            row[str(year)] = np.nan
    row[str(min_year)] = np.nan
    return row


In [181]:

def boundary_str(start, end, tier):
    return f'{tier}: {start:+0,.2f} to {end:+0,.2f}'


def relabel(v, boundaries):
    if v >= boundaries[0][0] and v <= boundaries[0][1]:
        return boundary_str(boundaries[0][0], boundaries[0][1], tier='A')
    elif v >= boundaries[1][0] and v <= boundaries[1][1]:
        return boundary_str(boundaries[1][0], boundaries[1][1], tier='B')
    elif v >= boundaries[2][0] and v <= boundaries[2][1]:
        return boundary_str(boundaries[2][0], boundaries[2][1], tier='C')
    else:
        return np.nan


def relabel_array(v, boundaries):
    if v >= boundaries[0][0] and v <= boundaries[0][1]:
        return [1, 0, 0]
    elif v >= boundaries[1][0] and v <= boundaries[1][1]:
        return [0, 1, 0]
    elif v >= boundaries[2][0] and v <= boundaries[2][1]:
        return [0, 0, 1]
    else:
        return [np.nan]


def get_boundaries(tiers):
    prev_tier = tiers[0]
    boundaries = [(prev_tier[0], prev_tier[prev_tier.shape[0] - 1])]
    for index, tier in enumerate(tiers):
        if index is not 0:
            boundaries.append(
                (prev_tier[prev_tier.shape[0] - 1], tier[tier.shape[0] - 1]))
            prev_tier = tier
    return boundaries


  if index is not 0:


In [182]:


years = {"min": 1960, "max": 2019}

df_raw = read_csv("./italy-raw-data.csv")
df_raw_growth = DataFrame(data=[row if "growth" in row["Indicator Name"] else annual_growth(
    row, years) for index, row in df_raw.iterrows()])

nodes = ['Pop', 'Urb', 'GDP', 'EC', 'FFEC', 'REC', 'EI', 'CO2', 'CH4', 'N2O']
df_growth = df_raw_growth.transpose().iloc[4:]
df_growth.columns = nodes
TIERS_NUM = 3

In [183]:

new_columns = {}
boundaries_map = {}
for i, content in enumerate(df_growth.items()):
    (label, series) = content
    values = np.sort(
        np.array([x for x in series.tolist() if not np.isnan(x)], dtype=float))
    if values.shape[0] < TIERS_NUM:
        print(f'Error: there are not enough data for label {label}')
        break
    boundaries = get_boundaries(tiers=np.array_split(values, TIERS_NUM))
    new_columns[label] = [relabel(value, boundaries)
                          for value in series.tolist()]
    boundaries_map[label] = boundaries

df = DataFrame(data=new_columns)
df.columns = nodes
df.index = range(years["min"], years["max"] + 1)

In [184]:

def get_joint_distribution(states):
    """
    Compute the joint distribution from a list of one-hot encoded state arrays.

    :param states: A list of arrays, each array is a one-hot encoding of a variable state.
    :return: A numpy array representing the joint distribution.
    """
    # Calculate the joint distribution by outer product and then flatten the result
    joint_dist = np.array(states[0])
    for state in states[1:]:
        joint_dist = np.outer(joint_dist, np.array(state))
    joint_dist = joint_dist.flatten()

    return joint_dist


def process_per_node_parsed(df, inputs, outputs, parsed=False):
    data_input = []
    data_output = []
    index = 0
    for row in df.iterrows():
        dt = row[1]
        inpt = []
        if parsed:
            parsed_inpts = []
            for i in inputs:
                value = relabel_array(dt[i], boundaries_map[i])
                parsed_inpts.append(value)
            inpt = get_joint_distribution(parsed_inpts)
        else:
            for i in inputs:
                # get from df_growth the row index and column i
                inpt.append(df_growth.loc[str(index + 1960), i])

        outpt = []
        parsed_outpts = []
        for i in outputs:
            value = relabel_array(dt[i], boundaries_map[i])
            parsed_outpts.append(value)
        outpt = get_joint_distribution(parsed_outpts)

        # if full input and full output
        if not any(np.isnan(inpt)) and not any(np.isnan(outpt)):
            data_input.append(inpt)
            data_output.append(outpt)
        index += 1
    return data_input, data_output

# Inference CO2

## Process and Output Layers

In [292]:

encoded_size = 3
dim_y = 3
encoder = keras.layers.Identity()
n_comp = 55
sequences = [
    {
        'type': 'merge'
    },
    [{
        #'kernel': kdm.layers.RBFKernelLayer(sigma=0.1, dim=3, trainable=True),
        'kernel': kdm.layers.CosineKernelLayer(),
        'dim_x': 3,
        'dim_y': 3,
        'n_comp': 25
    }],
    [{
        'kernel': kdm.layers.CosineKernelLayer(),
        'dim_x': 3,
        'dim_y': 3,
        'n_comp': 55
    }],
    {
        'type': 'merge'
    },
    [{
        'kernel': kdm.layers.CosineKernelLayer(),
        'dim_x': 9,
        'dim_y': 3,
        'n_comp': 24
    }],
    {
        'type': 'merge'
    },]

## Model

In [293]:
kdm_model = KDMSequentialJointClassModel(encoded_size=encoded_size,
                                         dim_y=dim_y,
                                         encoder=encoder,
                                         n_comp=n_comp,
                                         sequences=sequences,
                                         sigma=0.5)

In [294]:
kdm_model.compile(optimizer=optimizers.Adam(learning_rate=5e-5),
                  loss=losses.sparse_categorical_crossentropy,
                  metrics=[metrics.sparse_categorical_accuracy])

## Process Data

In [295]:
inputs = [[['Pop', 'Urb', 'GDP']], [['EC'], ['EC']], [
    ['REC', 'FFEC']]]
outputs = [[['EC']], [['REC'], ['FFEC']], [['CO2']]]

In [296]:

full_data = []
for i in range(len(inputs)):
    arr = []
    for j in range(len(inputs[i])):
        data_input, data_output = process_per_node_parsed(
            df_growth, inputs[i][j], outputs[i][j], parsed=(i != 0))
        arr.append((data_input, data_output))
    full_data.append(arr)

In [297]:

idx = 0
for i in range(len(full_data)):

    for j in range(len(full_data[i])):

        data_input, data_output = full_data[i][j]
        data_input = np.array(data_input)
        data_output = np.array(data_output)
        kdm_model.init_components(data_input, data_output,
                                  init_sigma=idx == 0, sigma_mult=0.5, super_index=idx+j, index=0)
    idx += len(full_data[i]) + 1


In [298]:

X = []
y = []
y_int = []
index = 0
for row in df_growth.iterrows():
    dt = row[1]
    inpt = [df_growth.loc[str(index + 1960), 'Pop'], df_growth.loc[str(
        index + 1960), 'Urb'], df_growth.loc[str(index + 1960), 'GDP']]
    value_1 = relabel_array(dt['CO2'], boundaries_map['CO2'])
    if not any(np.isnan(inpt)) and not any(np.isnan(value_1)):
        X.append(inpt)
        value = get_joint_distribution([value_1])
        y.append(value)
        y_int.append(list(value).index(1))
    index += 1


In [299]:
check_1 = kdm_model.evaluate(np.array(X), np.array(y_int))


0
1
2
3
4
5
6
Tensor("einsum_3/Einsum:0", shape=(None, 3), dtype=float32)
0
1
2
3
4
5
6
Tensor("kdm_sequential_joint_class_model_13_1/einsum_3/Einsum:0", shape=(None, 3), dtype=float32)
0
1
2
3
4
5
6
Tensor("kdm_sequential_joint_class_model_13_1/einsum_3/Einsum:0", shape=(None, 3), dtype=float32)
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 1.1375 - sparse_categorical_accuracy: 0.4213  


In [300]:

print('Full Dataset loss:', check_1[0])
print('Full Dataset accuracy:', check_1[1])

Full Dataset loss: 1.0656013488769531
Full Dataset accuracy: 0.4444444477558136


In [301]:

X_train, X_test, y_train, y_test = train_test_split(
    X, y_int, test_size=0.1)

print(len(X_train))
print(len(X_test))

X_train = np.array(X_train)
y_train = np.array(y_train)
X_test = np.array(X_test)
y_test = np.array(y_test)

48
6


## Train

In [305]:
history = kdm_model.fit(
    np.array(X_train),  # Your training data
    np.array(y_train),  # Your training labels
    batch_size=4,
    epochs=300,  # Start with 100 epochs and use early stopping
    verbose=1,  # Detailed logging
    validation_split=0.1,  # Explicit validation data
    shuffle=True  # Shuffle the data
)

Epoch 1/300
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.4363 - sparse_categorical_accuracy: 0.7615 - val_loss: 0.4297 - val_sparse_categorical_accuracy: 1.0000
Epoch 2/300
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.3782 - sparse_categorical_accuracy: 0.8442 - val_loss: 0.4290 - val_sparse_categorical_accuracy: 1.0000
Epoch 3/300
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.4144 - sparse_categorical_accuracy: 0.7901 - val_loss: 0.4299 - val_sparse_categorical_accuracy: 1.0000
Epoch 4/300
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.4316 - sparse_categorical_accuracy: 0.7950 - val_loss: 0.4338 - val_sparse_categorical_accuracy: 0.8000
Epoch 5/300
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.5349 - sparse_categorical_accuracy: 0.7627 - val_loss: 0.4338 - val_sparse_categorical_accuracy: 0.8000
Epoch

In [306]:

check_2 = kdm_model.evaluate(np.array(X), np.array(y_int))
check_3 = kdm_model.evaluate(np.array(X_test), np.array(y_test))


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.5028 - sparse_categorical_accuracy: 0.8140 
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 1.1815 - sparse_categorical_accuracy: 0.5000


## Result

In [308]:

print('Full Dataset loss:', check_2[0])
print('Full Dataset accuracy:', check_2[1])


print('Test loss:', check_3[0])
print('Test accuracy:', check_3[1])


Full Dataset loss: 0.4896577298641205
Full Dataset accuracy: 0.8148148059844971
Test loss: 1.1815389394760132
Test accuracy: 0.5


# Inference CH4

In [309]:

encoded_size = 3
dim_y = 3
encoder = keras.layers.Identity()
n_comp = 55
sequences = [
    {
        'type': 'merge'
    },
    [{
        #'kernel': kdm.layers.RBFKernelLayer(sigma=0.1, dim=3, trainable=True),
        'kernel': kdm.layers.CosineKernelLayer(),
        'dim_x': 3,
        'dim_y': 3,
        'n_comp': 25
    }],
    [{
        'kernel': kdm.layers.CosineKernelLayer(),
        'dim_x': 3,
        'dim_y': 3,
        'n_comp': 55
    }],
    {
        'type': 'merge'
    },
    [{
        'kernel': kdm.layers.CosineKernelLayer(),
        'dim_x': 9,
        'dim_y': 3,
        'n_comp': 18
    }],
    {
        'type': 'merge'
    },]

In [310]:
kdm_model = KDMSequentialJointClassModel(encoded_size=encoded_size,
                                         dim_y=dim_y,
                                         encoder=encoder,
                                         n_comp=n_comp,
                                         sequences=sequences,
                                         sigma=0.5)

kdm_model.compile(optimizer=optimizers.Adam(learning_rate=5e-5),
                  loss=losses.sparse_categorical_crossentropy,
                  metrics=[metrics.sparse_categorical_accuracy])

In [311]:
inputs = [[['Pop', 'Urb', 'GDP']], [['EC'], ['EC']], [
    ['REC', 'FFEC']]]
outputs = [[['EC']], [['REC'], ['FFEC']], [['CH4']]]

In [312]:

full_data = []
for i in range(len(inputs)):
    arr = []
    for j in range(len(inputs[i])):
        data_input, data_output = process_per_node_parsed(
            df_growth, inputs[i][j], outputs[i][j], parsed=(i != 0))
        arr.append((data_input, data_output))
    full_data.append(arr)

In [313]:

idx = 0
for i in range(len(full_data)):

    for j in range(len(full_data[i])):

        data_input, data_output = full_data[i][j]
        data_input = np.array(data_input)
        data_output = np.array(data_output)
        kdm_model.init_components(data_input, data_output,
                                  init_sigma=idx == 0, sigma_mult=0.5, super_index=idx+j, index=0)
    idx += len(full_data[i]) + 1


In [314]:

X = []
y = []
y_int = []
index = 0
for row in df_growth.iterrows():
    dt = row[1]
    inpt = [df_growth.loc[str(index + 1960), 'Pop'], df_growth.loc[str(
        index + 1960), 'Urb'], df_growth.loc[str(index + 1960), 'GDP']]
    value_1 = relabel_array(dt['CH4'], boundaries_map['CH4'])
    if not any(np.isnan(inpt)) and not any(np.isnan(value_1)):
        X.append(inpt)
        value = get_joint_distribution([value_1])
        y.append(value)
        y_int.append(list(value).index(1))
    index += 1


In [315]:
check_1 = kdm_model.evaluate(np.array(X), np.array(y_int))

print('Full Dataset loss:', check_1[0])
print('Full Dataset accuracy:', check_1[1])

0
1
2
3
4
5
6
Tensor("einsum_3/Einsum:0", shape=(None, 3), dtype=float32)
0
1
2
3
4
5
6
Tensor("kdm_sequential_joint_class_model_14_1/einsum_3/Einsum:0", shape=(None, 3), dtype=float32)
0
1
2
3
4
5
6
Tensor("kdm_sequential_joint_class_model_14_1/einsum_3/Einsum:0", shape=(None, 3), dtype=float32)
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 5.9225 - sparse_categorical_accuracy: 0.2763  
Full Dataset loss: 5.616894721984863
Full Dataset accuracy: 0.28947368264198303


In [319]:

X_train, X_test, y_train, y_test = train_test_split(
    X, y_int, test_size=0.1)

print(len(X_train))
print(len(X_test))

X_train = np.array(X_train)
y_train = np.array(y_train)
X_test = np.array(X_test)
y_test = np.array(y_test)

history = kdm_model.fit(
    np.array(X_train), 
    np.array(y_train),
    batch_size=4,
    epochs=1000,  
    verbose=1,
    validation_split=0.1, 
    shuffle=True
)

check_2 = kdm_model.evaluate(np.array(X), np.array(y_int))
check_3 = kdm_model.evaluate(np.array(X_test), np.array(y_test))


print('Full Dataset loss:', check_2[0])
print('Full Dataset accuracy:', check_2[1])


print('Test loss:', check_3[0])
print('Test accuracy:', check_3[1])


34
4
Epoch 1/1000
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 5.2802 - sparse_categorical_accuracy: 0.6258 - val_loss: 4.0337 - val_sparse_categorical_accuracy: 0.7500
Epoch 2/1000
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 5.3489 - sparse_categorical_accuracy: 0.5992 - val_loss: 4.0326 - val_sparse_categorical_accuracy: 0.7500
Epoch 3/1000
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 5.2818 - sparse_categorical_accuracy: 0.6552 - val_loss: 4.0325 - val_sparse_categorical_accuracy: 0.7500
Epoch 4/1000
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 7.3391 - sparse_categorical_accuracy: 0.5300 - val_loss: 4.0322 - val_sparse_categorical_accuracy: 0.7500
Epoch 5/1000
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 6.3413 - sparse_categorical_accuracy: 0.5631 - val_loss: 4.0343 - val_sparse_categorical_accuracy: 0.7500
Epoch

# Inference N20

In [328]:

encoded_size = 3
dim_y = 3
encoder = keras.layers.Identity()
n_comp = 55
sequences = [
    {
        'type': 'merge'
    },
    [{
        #'kernel': kdm.layers.RBFKernelLayer(sigma=0.1, dim=3, trainable=True),
        'kernel': kdm.layers.CosineKernelLayer(),
        'dim_x': 3,
        'dim_y': 3,
        'n_comp': 25
    }],
    [{
        'kernel': kdm.layers.CosineKernelLayer(),
        'dim_x': 3,
        'dim_y': 3,
        'n_comp': 55
    }],
    {
        'type': 'merge'
    },
    [{
        'kernel': kdm.layers.CosineKernelLayer(),
        'dim_x': 9,
        'dim_y': 3,
        'n_comp': 18
    }],
    {
        'type': 'merge'
    },]

In [329]:
kdm_model = KDMSequentialJointClassModel(encoded_size=encoded_size,
                                         dim_y=dim_y,
                                         encoder=encoder,
                                         n_comp=n_comp,
                                         sequences=sequences,
                                         sigma=0.5)

kdm_model.compile(optimizer=optimizers.Adam(learning_rate=5e-5),
                  loss=losses.sparse_categorical_crossentropy,
                  metrics=[metrics.sparse_categorical_accuracy])

In [330]:
inputs = [[['Pop', 'Urb', 'GDP']], [['EC'], ['EC']], [
    ['REC', 'FFEC']]]
outputs = [[['EC']], [['REC'], ['FFEC']], [['N2O']]]

In [331]:

full_data = []
for i in range(len(inputs)):
    arr = []
    for j in range(len(inputs[i])):
        data_input, data_output = process_per_node_parsed(
            df_growth, inputs[i][j], outputs[i][j], parsed=(i != 0))
        arr.append((data_input, data_output))
    full_data.append(arr)

In [332]:

idx = 0
for i in range(len(full_data)):

    for j in range(len(full_data[i])):

        data_input, data_output = full_data[i][j]
        data_input = np.array(data_input)
        data_output = np.array(data_output)
        kdm_model.init_components(data_input, data_output,
                                  init_sigma=idx == 0, sigma_mult=0.5, super_index=idx+j, index=0)
    idx += len(full_data[i]) + 1


In [333]:

X = []
y = []
y_int = []
index = 0
for row in df_growth.iterrows():
    dt = row[1]
    inpt = [df_growth.loc[str(index + 1960), 'Pop'], df_growth.loc[str(
        index + 1960), 'Urb'], df_growth.loc[str(index + 1960), 'GDP']]
    value_1 = relabel_array(dt['N2O'], boundaries_map['N2O'])
    if not any(np.isnan(inpt)) and not any(np.isnan(value_1)):
        X.append(inpt)
        value = get_joint_distribution([value_1])
        y.append(value)
        y_int.append(list(value).index(1))
    index += 1


In [334]:
check_1 = kdm_model.evaluate(np.array(X), np.array(y_int))

print('Test loss:', check_1[0])
print('Test accuracy:', check_1[1])

0
1
2
3
4
5
6
Tensor("einsum_3/Einsum:0", shape=(None, 3), dtype=float32)
0
1
2
3
4
5
6
Tensor("kdm_sequential_joint_class_model_16_1/einsum_3/Einsum:0", shape=(None, 3), dtype=float32)
0
1
2
3
4
5
6
Tensor("kdm_sequential_joint_class_model_16_1/einsum_3/Einsum:0", shape=(None, 3), dtype=float32)
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 1.0859 - sparse_categorical_accuracy: 0.4441  
Test loss: 1.084470510482788
Test accuracy: 0.44736841320991516


In [336]:

X_train, X_test, y_train, y_test = train_test_split(
    X, y_int, test_size=0.1)

print(len(X_train))
print(len(X_test))

X_train = np.array(X_train)
y_train = np.array(y_train)
X_test = np.array(X_test)
y_test = np.array(y_test)

history = kdm_model.fit(
    np.array(X_train),  # Your training data
    np.array(y_train),  # Your training labels
    batch_size=4,
    epochs=1000,  # Start with 100 epochs and use early stopping
    verbose=1,  # Detailed logging
    validation_split=0.1,  # Explicit validation data
    shuffle=True  # Shuffle the data
)

check_2 = kdm_model.evaluate(np.array(X), np.array(y_int))
check_3 = kdm_model.evaluate(np.array(X_test), np.array(y_test))


print('Full Dataset loss:', check_2[0])
print('Full Dataset accuracy:', check_2[1])


print('Test loss:', check_3[0])
print('Test accuracy:', check_3[1])


34
4
Epoch 1/1000
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.7527 - sparse_categorical_accuracy: 0.6877 - val_loss: 0.9143 - val_sparse_categorical_accuracy: 0.5000
Epoch 2/1000
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.7349 - sparse_categorical_accuracy: 0.7360 - val_loss: 0.9165 - val_sparse_categorical_accuracy: 0.5000
Epoch 3/1000
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.8833 - sparse_categorical_accuracy: 0.7162 - val_loss: 0.9174 - val_sparse_categorical_accuracy: 0.5000
Epoch 4/1000
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 1.0671 - sparse_categorical_accuracy: 0.6602 - val_loss: 0.9207 - val_sparse_categorical_accuracy: 0.5000
Epoch 5/1000
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.6461 - sparse_categorical_accuracy: 0.8176 - val_loss: 0.9262 - val_sparse_categorical_accuracy: 0.5000
Epoch