En faisant le produit avec la matrice adaptative si jamais un pays se retrouve seul dans une classe, toute la colonne de cette classe se retrouve à zéro.

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import numpy as np
from sklearn.cluster import KMeans
import pandas as pd
from sklearn.decomposition import PCA

from adjency_matrix import AdaptativeMatrix, DTW,AgeReduction
from gt_a_model import GCNLayer, GCNMultiLayer, Encoder

In [2]:
AM = AdaptativeMatrix()
dtw = DTW()
AR = AgeReduction()

In [3]:
def adjency_matrix(X: torch.Tensor, coord_countries: torch.Tensor):
    """
    X is 3D tensor of mortality rates
    coord_countries is 2D matrix contains longitude and latitude of each country. 1st column represents long and 2nd represents lat
    """
    n_countries  = X.shape[0]
    clusters = AM.adaptative_matrix(X)
    dtw_matrix = dtw.DTW_Matrix(X)
    long_lat_matrix = np.empty((n_countries, n_countries))
    for i in range(n_countries):
        for j in range(n_countries):
            long_lat_matrix[i,j] = torch.sqrt(torch.mean((coord_countries[i]-coord_countries[j])**2))
    
    # 
    adjency_mat = dtw_matrix * long_lat_matrix
    #print("adj_brut_matrix \n",adjency_mat)
    for i in range(n_countries):
        for j in range(n_countries):
            if clusters[i] != clusters[j]:
                adjency_mat[i,j] = 0.005
    
    #print("long_lat \n",long_lat_matrix)
    print("clusters \n",clusters)
    return adjency_mat


In [19]:
def tensor_2D_3D_product( A, B):
        a_size = A.shape
        b_size = B.shape
        C = torch.Tensor(b_size)
        for i in range(a_size[0]):
            for j in range(a_size[1]):
                C[i] += (A[i,j] * B[j])
    
        return C.float()

### 2) Importation des données de mortalité


In [4]:
### import mortality data for the different countries
import os
 
directory = '/Users/gojelastat/Desktop/Thèse/Projet 2/Données/Données GT-A'
data={}
for filename in os.listdir(directory):
    f = os.path.join(directory, filename)
    if os.path.isfile(f) and filename.endswith('.txt'):
        data[filename.split('.')[0]]=pd.read_csv(f,header=1,delimiter="\s+") ### filename.split('.')[0] split the name by . and recover the code of the country
        ### in this line, we fill the dictionary "data" with the different data of country and their code

In [5]:
len(data.keys())

22

### 3) Mettre les données au format adéquat 
C'est-à-dire sous forme de tenseur de taille M x T x A où M est le nombre de pays, T est la dimension du temps et A est la dimension de l'âge. Rappelons que ceci est fait pour s'adapter au cadre d'implémentation du modèle GT-A et ici on travaille avec les données des **hommes**

In [6]:
#fonction permettant de transformer les données dans la bonne numérisation
def __data_load(data):
    columns=data.columns
    data['Age']=np.where(data['Age']!='110+',data['Age'],111)
    for col in columns:
        data[col]=np.where(data[col]!='.',data[col],9999)
        data[col]=pd.to_numeric(data[col])

    data=data[data['Age']<100]
    data=data[data['Year']>=1950]
    data=data[data['Year']<=2010]

    #df_min=np.min(data['Male'])
    #df_max=np.max(data['Male'])

    #data['Male']=(data['Male']-df_min)/(df_max-df_min)
    data.index=np.arange(data.shape[0])  ### renommer les index de 0 jusqu'à la taille de data
    return data#,df_min,df_max



### Transform the based dataset to matrix of mortality rates of male people in this case. 
#The matrix has age on row and year on columns
def __data_reshaping(data,Gender="Male"):
    data=__data_load(data)
    mat=pd.DataFrame(index=np.unique(data['Age']),columns=np.unique(data['Year']))
    n=0
    for j in range(mat.shape[1]):
        for i in range(mat.shape[0]):
            mat.iloc[mat.index[i],mat.index[j]]=data.loc[n+i,Gender]
        n=n+mat.shape[0]
    years=np.arange(1950,(1950+mat.shape[1]))
    ages=np.arange(0,(mat.shape[0]))

    #for col in years:
    #    mat[col]=pd.to_numeric(mat[col])
    return np.array(mat.T, dtype=np.float32) # ici on essaie de convertir en un tableau floattant, la transposée c'est parce que le code
    # était fait pour sortir une matrice de taille A x T.



In [7]:
T = 2010 - 1950 + 1
A = 99 - 0 + 1
X = torch.FloatTensor(len(data.keys()), T, A)

countries = data.keys()
for i,country in enumerate(countries):
    X[i]= torch.tensor(__data_reshaping(data[country]))

In [8]:
X.shape

torch.Size([22, 61, 100])

#### Coordonnées géographique des pays 
Ici on oriente le globe en un repère orthonormé avec l'axe des abscisses orienté vers l'Est et l'axe des ordonnées orienté vers le Nord. Ces coordonnées ont été pris sur wikipédia. Dans la base de données HMD, il y a la base de données par régions du UK et pour toute la population UK. On a choisi d'utiliser la base du UK total.

In [9]:
country = ['AUT','AUS','GBR_NP','ISL','NZL_NP', 'HUN', 'DNK', 'USA', 'JPN',
           'SWE', 'FIN', 'CZE', 'BGR', 'SVK', 'IRL', 'CAN', 'PRT', 'BEL', 'ESP',
           'FRACNP', 'NLD', 'ITA']
coord = np.array([[47.20, 13.20],[-27.00, 133.00], [54.00, -2.30], [65.00, -18.00],
                 [-41.00, 174.00], [47.00, 20.00], [56.00, 10.00], [38.00, -97.00],
                 [36.00, 138.00], [62.00, 15.00], [64.00, 26.00], [49.00, 15.00],
                 [43.00, 25.00], [48.40, 19.30], [53.00, -8.00], [60.00, -95.00],
                 [39.30, -8.00], [50.50, 4.00], [40.00, -4.00], [47.00, 2.00], 
                 [51.55, 5.34], [42.5001, 12.5001]])

geo_coord = pd.DataFrame(coord, index=country, columns=['Latitude', 'Longitude'])
geo_coord = geo_coord.loc[list(data.keys())]
geo_coord

Unnamed: 0,Latitude,Longitude
PRT,39.3,-8.0
GBR_NP,54.0,-2.3
BEL,50.5,4.0
AUS,-27.0,133.0
ESP,40.0,-4.0
ISL,65.0,-18.0
AUT,47.2,13.2
NZL_NP,-41.0,174.0
HUN,47.0,20.0
FRACNP,47.0,2.0


### 4) Calcul de la matrice $A_{lat-long}$

In [10]:
n_countries = len(data.keys())
A_lat_long = torch.FloatTensor(n_countries, n_countries)
for i, country_1 in enumerate(data.keys()):
    for j, country_2 in enumerate(data.keys()):
        A_lat_long[i,j] = np.sqrt(np.mean((geo_coord.loc[country_1]-geo_coord.loc[country_2])**2))

A_lat_long

tensor([[  0.0000,  11.1485,  11.6069, 110.1742,   2.8714,  19.4999,  15.9977,
         140.6629,  20.5340,   8.9244,  12.8066,  17.3622,  62.9392, 103.2640,
          22.8505,  29.7161,  17.6506,  14.6713,  23.4807,  20.3482,   9.6874,
          63.2356],
        [ 11.1485,   0.0000,   5.0961, 111.5058,   9.9722,  13.5553,  11.9685,
         141.6098,  16.5271,   5.8090,   5.6733,   8.8116,  67.9120, 100.0202,
          13.4776,  21.2237,  12.7336,  13.2531,  20.8121,  15.7785,   4.0921,
          65.6860],
        [ 11.6069,   5.0961,   0.0000, 106.4125,   9.3341,  18.6313,   6.9112,
         136.5142,  11.5812,   2.8504,   1.2038,   5.7554,  71.9627,  95.3054,
          11.2528,  18.2517,   7.8502,   8.2538,  15.7678,  10.9202,   8.6675,
          70.3251],
        [110.1742, 111.5058, 106.4125,   0.0000, 107.8378, 125.0300,  99.6436,
          30.6349,  95.5118, 106.3884, 105.9886, 104.9238, 169.0044,  44.6878,
         104.5108,  99.3227,  99.2472,  98.3628,  91.0055,  96.4698, 11

In [11]:
# Cette partie permet de convertir le dataframe en numpy floattant
geo_coord = torch.FloatTensor(np.array(geo_coord, dtype=np.float32))

In [12]:
A = adjency_matrix(X, geo_coord)

The variance explained by principal components:  [0.51355214]
The variance explained by principal components:  [0.77983756]
The variance explained by principal components:  [0.38844029]
The variance explained by principal components:  [0.54289204]
The variance explained by principal components:  [0.6868258]
The variance explained by principal components:  [0.99999979]
The variance explained by principal components:  [0.41280576]
The variance explained by principal components:  [0.31332616]
The variance explained by principal components:  [0.62181391]
The variance explained by principal components:  [0.76414144]
The variance explained by principal components:  [0.59014299]
The variance explained by principal components:  [0.39350316]
The variance explained by principal components:  [0.76954034]
The variance explained by principal components:  [0.7843422]
The variance explained by principal components:  [0.44021979]
The variance explained by principal components:  [0.51414641]
The varian

## 6) Modélisation 

In [13]:
from gt_a_model import GTA_Model
from torch import optim

In [14]:
m1 = GCNLayer(in_features=100, out_features=100)
m1(X,A)

RuntimeError: self must be a matrix

In [23]:
import torch

# Tenseur 3D : par exemple, un batch de matrices 2D
tensor_3d = torch.randn(5, 4, 6)  # (batch_size, m, n)

# Tenseur 2D : par exemple, une matrice (n, p)
tensor_2d = torch.randn(6, 3)  # (n, p)

# Produit matriciel entre chaque "matrice 2D" du tenseur 3D et le tenseur 2D
result = torch.matmul(tensor_3d, tensor_2d)

print(result.shape)  # Résultat : (batch_size, m, p) -> (5, 4, 3)


torch.Size([5, 4, 3])


In [29]:
X = torch.rand(5,7,5)
A = torch.rand(1,7,7)
torch.bmm(A,X).shape

RuntimeError: Expected size for first two dimensions of batch2 tensor to be: [1, 7] but got: [5, 7].

In [19]:
model = GTA_Model()
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

model(X,A)

# Boucle d'entraînement simple
for epoch in range(200):
    # Zéro le gradient
    optimizer.zero_grad()
    
    # Forward pass
    output = model(X, A)
    
    # Calcul de la perte
    loss = criterion(output, Y)
    
    # Backward pass et optimisation
    loss.backward()
    optimizer.step()
    
    # Affichage des informations
    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch+1}/100], Loss: {loss.item():.4f}')



RuntimeError: self must be a matrix