En faisant le produit avec la matrice adaptative si jamais un pays se retrouve seul dans une classe, toute la colonne de cette classe se retrouve à zéro.

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import numpy as np
from sklearn.cluster import KMeans
import pandas as pd
from sklearn.decomposition import PCA

from adjency_matrix import AdaptativeMatrix, DTW,AgeReduction
from gt_a_model import GCNLayer, GCNMultiLayer, Encoder, EncoderLayer

In [2]:
AM = AdaptativeMatrix()
dtw = DTW()
AR = AgeReduction()

In [3]:
def adjency_matrix(X: torch.Tensor, coord_countries: torch.Tensor):
    """
    X is 3D tensor of mortality rates
    coord_countries is 2D matrix contains longitude and latitude of each country. 1st column represents long and 2nd represents lat
    """
    n_countries  = X.shape[0]
    clusters = AM.adaptative_matrix(X)
    dtw_matrix = dtw.DTW_Matrix(X)
    long_lat_matrix = np.empty((n_countries, n_countries))
    for i in range(n_countries):
        for j in range(n_countries):
            long_lat_matrix[i,j] = torch.sqrt(torch.mean((coord_countries[i]-coord_countries[j])**2))
    
    # 
    adjency_mat = dtw_matrix * long_lat_matrix
    #print("adj_brut_matrix \n",adjency_mat)
    for i in range(n_countries):
        for j in range(n_countries):
            if clusters[i] != clusters[j]:
                adjency_mat[i,j] = 0.005
    
    #print("long_lat \n",long_lat_matrix)
    print("clusters \n",clusters)
    return adjency_mat


### 2) Importation des données de mortalité


In [4]:
### import mortality data for the different countries
import os
 
directory = '/Users/gojelastat/Desktop/Thèse/Projet 2/Données/Données GT-A'
data={}
for filename in os.listdir(directory):
    f = os.path.join(directory, filename)
    if os.path.isfile(f) and filename.endswith('.txt'):
        data[filename.split('.')[0]]=pd.read_csv(f,header=1,delimiter="\s+") ### filename.split('.')[0] split the name by . and recover the code of the country
        ### in this line, we fill the dictionary "data" with the different data of country and their code

In [5]:
len(data.keys())

22

### 3) Mettre les données au format adéquat 
C'est-à-dire sous forme de tenseur de taille M x T x A où M est le nombre de pays, T est la dimension du temps et A est la dimension de l'âge. Rappelons que ceci est fait pour s'adapter au cadre d'implémentation du modèle GT-A et ici on travaille avec les données des **hommes**

In [6]:
#fonction permettant de transformer les données dans la bonne numérisation
def __data_load(data):
    columns=data.columns
    data['Age']=np.where(data['Age']!='110+',data['Age'],111)
    for col in columns:
        data[col]=np.where(data[col]!='.',data[col],9999)
        data[col]=pd.to_numeric(data[col])

    data=data[data['Age']<100] 
    #data=data[data['Age']>=50]
    data=data[data['Year']>=1950]
    data=data[data['Year']<=2010]

    #df_min=np.min(data['Male'])
    #df_max=np.max(data['Male'])

    #data['Male']=(data['Male']-df_min)/(df_max-df_min)
    data.index=np.arange(data.shape[0])  ### renommer les index de 0 jusqu'à la taille de data
    return data#,df_min,df_max


def __test_load(data):
    columns=data.columns
    data['Age']=np.where(data['Age']!='110+',data['Age'],111)
    for col in columns:
        data[col]=np.where(data[col]!='.',data[col],9999)
        data[col]=pd.to_numeric(data[col])

    data=data[data['Age']<100]
    #data=data[data['Age']>=50]
    data=data[data['Year']>=2011]
    data=data[data['Year']<=2019]

    #df_min=np.min(data['Male'])
    #df_max=np.max(data['Male'])

    #data['Male']=(data['Male']-df_min)/(df_max-df_min)
    data.index=np.arange(data.shape[0])  ### renommer les index de 0 jusqu'à la taille de data
    return data#,df_min,df_max


### Transform the based dataset to matrix of mortality rates of male people in this case. 
#The matrix has age on row and year on columns
def __data_reshaping(data,Gender="Male"):
    data=__data_load(data)
    data[Gender] = np.where(data[Gender] != 0.0, data[Gender], 0.00000001)
    mat=pd.DataFrame(index=np.unique(data['Age']),columns=np.unique(data['Year']))
    n=0
    for j in range(mat.shape[1]):
        for i in range(mat.shape[0]):
            mat.iloc[i,j]=data.loc[n+i,Gender]
        n=n+mat.shape[0]
    years=np.arange(1950,(1950+mat.shape[1]))
    ages=np.arange(0,(mat.shape[0]))

    #for col in years:
    #    mat[col]=pd.to_numeric(mat[col])
    return np.array(mat.T, dtype=np.float32) # ici on essaie de convertir en un tableau floattant, la transposée c'est parce que le code
    # était fait pour sortir une matrice de taille A x T.


def __test_reshaping(data,Gender="Male"):
    data=__test_load(data)
    data[Gender] = np.where(data[Gender] != 0.0, data[Gender], 0.00000001)
    mat=pd.DataFrame(index=np.unique(data['Age']),columns=np.unique(data['Year']))
    n=0
    for j in range(mat.shape[1]):
        for i in range(mat.shape[0]):
            mat.iloc[i,j]=data.loc[n+i,Gender]
        n=n+mat.shape[0]
    years=np.arange(1950,(1950+mat.shape[1]))
    ages=np.arange(0,(mat.shape[0]))

    #for col in years:
    #    mat[col]=pd.to_numeric(mat[col])
    return np.array(mat.T, dtype=np.float32) # ici on essaie de convertir en un tableau floattant, la transposée c'est parce que le code
    # était fait pour sortir une matrice de taille A x T.


In [7]:
### Search for outliers (<=0 or >=1) in male data
#for code, country in data.items():
#    print(data[code][data[code].Male <= 0])

In [8]:
### Search for outliers (<=0 or >=1) in male data
for code, country in data.items():
    print("{} minimum is {} and maximum is {}".format(code,np.min(data[code].Male), np.max(data[code].Male)))

PRT minimum is . and maximum is 6.000000
GBR_NP minimum is . and maximum is 4.340426
BEL minimum is . and maximum is 6.000000
AUS minimum is . and maximum is 6.000000
ESP minimum is . and maximum is 6.000000
ISL minimum is . and maximum is 7.234751
AUT minimum is . and maximum is 6.000000
NZL_NP minimum is . and maximum is 6.000000
HUN minimum is . and maximum is 6.000000
FRACNP minimum is . and maximum is 6.000000
NLD minimum is . and maximum is 6.036331
DNK minimum is . and maximum is 6.000000
USA minimum is 0.000103 and maximum is 1.150146
JPN minimum is . and maximum is 6.000000
SWE minimum is . and maximum is 6.000000
FIN minimum is . and maximum is 6.883500
CZE minimum is . and maximum is 6.000000
ITA minimum is . and maximum is 6.018847
BGR minimum is . and maximum is 6.000000
SVK minimum is . and maximum is 6.000000
IRL minimum is . and maximum is 6.000000
CAN minimum is . and maximum is 6.000000


In [9]:
from sklearn.preprocessing import MinMaxScaler

In [10]:
T = 2010 - 1950 + 1
A = 99 - 0 + 1
T2 = 2019-2011 + 1
X = torch.FloatTensor(len(data.keys()), T, A)
X_norm = torch.FloatTensor(len(data.keys()), T, A)
test = torch.FloatTensor(len(data.keys()), T2, A)
test_norm = torch.FloatTensor(len(data.keys()), T2, A)
countries = data.keys()
sc = []
for i,country in enumerate(countries):
    m = MinMaxScaler()
    X[i]= torch.tensor(__data_reshaping(data[country]))
    X[i] = torch.log(X[i])
    #Version normalisée de X pour garantir la stablité numérique et une mise à l'échelle des caractéristiques)
    X_norm[i] = torch.FloatTensor(m.fit_transform(X[i]))
    #sc.append(m.fit[X])
    test[i] = torch.log(torch.tensor(__test_reshaping(data[country])))


Comment faire la normalisation pour chaque pays et pouvoir récupérer la normalisation pour l'inversion par la suite

#### Coordonnées géographique des pays 
Ici on oriente le globe en un repère orthonormé avec l'axe des abscisses orienté vers l'Est et l'axe des ordonnées orienté vers le Nord. Ces coordonnées ont été pris sur wikipédia. Dans la base de données HMD, il y a la base de données par régions du UK et pour toute la population UK. On a choisi d'utiliser la base du UK total.

In [11]:
country = ['AUT','AUS','GBR_NP','ISL','NZL_NP', 'HUN', 'DNK', 'USA', 'JPN',
           'SWE', 'FIN', 'CZE', 'BGR', 'SVK', 'IRL', 'CAN', 'PRT', 'BEL', 'ESP',
           'FRACNP', 'NLD', 'ITA']
coord = np.array([[47.20, 13.20],[-27.00, 133.00], [54.00, -2.30], [65.00, -18.00],
                 [-41.00, 174.00], [47.00, 20.00], [56.00, 10.00], [38.00, -97.00],
                 [36.00, 138.00], [62.00, 15.00], [64.00, 26.00], [49.00, 15.00],
                 [43.00, 25.00], [48.40, 19.30], [53.00, -8.00], [60.00, -95.00],
                 [39.30, -8.00], [50.50, 4.00], [40.00, -4.00], [47.00, 2.00], 
                 [51.55, 5.34], [42.5001, 12.5001]])

geo_coord = pd.DataFrame(coord, index=country, columns=['Latitude', 'Longitude'])
geo_coord = geo_coord.loc[list(data.keys())]
geo_coord

Unnamed: 0,Latitude,Longitude
PRT,39.3,-8.0
GBR_NP,54.0,-2.3
BEL,50.5,4.0
AUS,-27.0,133.0
ESP,40.0,-4.0
ISL,65.0,-18.0
AUT,47.2,13.2
NZL_NP,-41.0,174.0
HUN,47.0,20.0
FRACNP,47.0,2.0


### 4) Calcul de la matrice $A_{lat-long}$

In [12]:
n_countries = len(data.keys())
A_lat_long = torch.FloatTensor(n_countries, n_countries)
for i, country_1 in enumerate(data.keys()):
    for j, country_2 in enumerate(data.keys()):
        A_lat_long[i,j] = np.sqrt(np.mean((geo_coord.loc[country_1]-geo_coord.loc[country_2])**2))

A_lat_long[1:4,1:4]

tensor([[  0.0000,   5.0961, 111.5058],
        [  5.0961,   0.0000, 106.4125],
        [111.5058, 106.4125,   0.0000]])

In [13]:
# Cette partie permet de convertir le dataframe en numpy floattant
geo_coord = torch.FloatTensor(np.array(geo_coord, dtype=np.float32))

In [14]:
A = adjency_matrix(X, geo_coord)

The variance explained by principal components:  [0.86285386]
The variance explained by principal components:  [0.92847528]
The variance explained by principal components:  [0.82585624]
The variance explained by principal components:  [0.9088234]
The variance explained by principal components:  [0.90072472]
The variance explained by principal components:  [0.29604584]
The variance explained by principal components:  [0.63695517]
The variance explained by principal components:  [0.65376633]
The variance explained by principal components:  [0.68731448]
The variance explained by principal components:  [0.92324138]
The variance explained by principal components:  [0.86803872]
The variance explained by principal components:  [0.59313692]
The variance explained by principal components:  [0.94291138]
The variance explained by principal components:  [0.96681486]
The variance explained by principal components:  [0.7960341]
The variance explained by principal components:  [0.455287]
The variance

## 6) Modélisation 

In [15]:
from gt_a_model import GTA_Model
from torch import optim
from sklearn.preprocessing import MinMaxScaler

In [16]:
X.shape

torch.Size([22, 61, 100])

In [18]:
model = GTA_Model(n_predictions=9)
#model = Encoder()
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

torch.autograd.set_detect_anomaly(True)
# Boucle d'entraînement simple
for epoch in range(200):
# Zéro le gradient
    optimizer.zero_grad()

    # Forward pass
    output = model(X,A)

    # Calcul de la perte
    loss = criterion(output, test)

    # Backward pass et optimisation
    loss.backward()
    optimizer.step()

    # Affichage des informations
    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch+1}/100], Loss: {loss.item():.4f}')



Epoch [10/100], Loss: 141.5123
Epoch [20/100], Loss: 108.2157
Epoch [30/100], Loss: 40605757440.0000
Epoch [40/100], Loss: 2156601595133952.0000
Epoch [50/100], Loss: 111482699776.0000
Epoch [60/100], Loss: 2618463354880.0000
Epoch [70/100], Loss: 12029113925632.0000
Epoch [80/100], Loss: 40227787767808.0000
Epoch [90/100], Loss: 13654289285120.0000
Epoch [100/100], Loss: 5138072731648.0000
Epoch [110/100], Loss: 414133911552.0000
Epoch [120/100], Loss: 335823831040.0000
Epoch [130/100], Loss: 3063375616.0000
Epoch [140/100], Loss: 25169936384.0000
Epoch [150/100], Loss: 10710392832.0000
Epoch [160/100], Loss: 1755404928.0000
Epoch [170/100], Loss: 380515552.0000
Epoch [180/100], Loss: 98205328.0000
Epoch [190/100], Loss: 109387096.0000
Epoch [200/100], Loss: 59663180.0000


In [18]:
output.shape

torch.Size([22, 61, 100])

In [None]:
x.isnan().sum()

tensor(105936)

In [None]:
A

tensor([[0.0000e+00, 3.1061e+02, 3.1741e+02, 1.5355e+03, 3.8052e+01, 5.0000e-03,
         2.2646e+02, 4.0601e+03, 8.5495e+02, 1.5768e+02, 4.5506e+02, 8.2135e+02,
         4.8723e+03, 4.6645e+03, 4.3682e+02, 5.0000e-03, 4.7738e+02, 5.6629e+01,
         2.6496e+03, 5.0000e-03, 1.2371e+03, 9.9809e+02],
        [3.1061e+02, 0.0000e+00, 9.1326e+00, 3.1563e+02, 4.7811e+02, 5.0000e-03,
         2.8446e+02, 1.8577e+03, 9.3818e+01, 1.4747e+01, 1.8038e+01, 2.2178e+02,
         3.5972e+02, 1.3345e+04, 6.1400e+01, 5.0000e-03, 3.6593e+01, 1.8500e+02,
         5.4540e+02, 5.0000e-03, 6.0894e+02, 1.2030e+02],
        [3.1741e+02, 9.1326e+00, 0.0000e+00, 4.2388e+02, 3.9960e+02, 5.0000e-03,
         1.6635e+02, 1.8868e+03, 6.8559e+01, 6.1401e+00, 4.7618e+00, 1.5687e+02,
         5.1416e+02, 1.1549e+04, 5.2890e+01, 5.0000e-03, 2.5685e+01, 1.1005e+02,
         5.0560e+02, 5.0000e-03, 1.2148e+03, 1.5343e+02],
        [1.5355e+03, 3.1563e+02, 4.2388e+02, 0.0000e+00, 4.0160e+03, 5.0000e-03,
         1.4092e

In [None]:
A = A + torch.eye(A.shape[0])

x=A/ A.sum(dim=-1,keepdim=True)
x.sum(dim=-1)

tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000], dtype=torch.float64)

In [None]:
x

tensor([[4.1373e-05, 1.2851e-02, 1.3132e-02, 6.3527e-02, 1.5743e-03, 2.0686e-07,
         9.3692e-03, 1.6797e-01, 3.5371e-02, 6.5237e-03, 1.8827e-02, 3.3981e-02,
         2.0158e-01, 1.9298e-01, 1.8072e-02, 2.0686e-07, 1.9750e-02, 2.3429e-03,
         1.0962e-01, 2.0686e-07, 5.1184e-02, 4.1294e-02],
        [1.6463e-02, 5.3001e-05, 4.8404e-04, 1.6729e-02, 2.5341e-02, 2.6501e-07,
         1.5077e-02, 9.8459e-02, 4.9724e-03, 7.8160e-04, 9.5603e-04, 1.1754e-02,
         1.9065e-02, 7.0731e-01, 3.2543e-03, 2.6501e-07, 1.9395e-03, 9.8055e-03,
         2.8907e-02, 2.6501e-07, 3.2275e-02, 6.3759e-03],
        [1.8069e-02, 5.1989e-04, 5.6927e-05, 2.4130e-02, 2.2748e-02, 2.8464e-07,
         9.4696e-03, 1.0741e-01, 3.9029e-03, 3.4954e-04, 2.7108e-04, 8.9299e-03,
         2.9270e-02, 6.5746e-01, 3.0109e-03, 2.8464e-07, 1.4622e-03, 6.2646e-03,
         2.8783e-02, 2.8464e-07, 6.9154e-02, 8.7343e-03],
        [3.4673e-02, 7.1272e-03, 9.5717e-03, 2.2581e-05, 9.0686e-02, 1.1291e-07,
         3.1821e