Dans ce notebook, nous réalisons l'entraînement en utilisant les données de plusieurs stations. Cependant, la machine actuelle manque de puissance pour exécuter les calculs nécessaires. Il serait préférable de tester ce code sur une machine plus performante, idéalement un supercalculateur, afin d'assurer une meilleure efficacité et des temps de traitement réduits.

In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from keras.models import Sequential, Model
from keras.layers import Dense, LSTM, Input, Bidirectional, Concatenate
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from skimpy import *
import seaborn as sns
import matplotlib.pyplot as plt
import random
from sklearn.metrics import mean_absolute_error, mean_squared_error
from scipy.interpolate import CubicSpline
from tensorflow.keras.callbacks import EarlyStopping
from math import radians, sin, cos, sqrt, atan2

num_stations=3
start_column = 7
tab_hole=[1,2,3,4,5,6]

data = pd.read_csv("meteofrance.csv", sep = ";")
ss = np.unique(data['NUM_POSTE'].values)[:3]
colonnes_a_garder = ['AAAAMMJJHH',  ' T', 'NUM_POSTE', 'LAT', 'LON']
df = data.loc[:,colonnes_a_garder].copy()  # Création d'une copie
df.loc[:, 'AAAAMMJJHH'] = pd.to_datetime(df['AAAAMMJJHH'], format='%Y%m%d%H')
df.rename(columns={
    'AAAAMMJJHH' : 'date',
    ' T': 'temperature'
}, inplace=True)
scaler = MinMaxScaler(feature_range=(0, 1))
#scaler = StandardScaler()
scaler.fit(df[['temperature']])
df['temperature_scaled'] = scaler.transform(df[['temperature']])
#print(df)

def haversine(lat1, lon1, lat2, lon2):
    R = 6371.0  # Rayon de la Terre en kilomètres
    dlat = radians(lat2 - lat1)
    dlon = radians(lon2 - lon1)
    a = sin(dlat / 2)**2 + cos(radians(lat1)) * cos(radians(lat2)) * sin(dlon / 2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    distance = R * c
    return distance
    
my_df=[]
my_dfs=[]
max_hole=6
N=10
column = "temperature"
for s in ss:
    df_temp = df.loc[df['NUM_POSTE'] == s]
    my_df.append(df_temp)
unique_stations = df.drop_duplicates(subset=['NUM_POSTE'])
for dfi in my_df:
    s= dfi.iloc[0, 2]
    #main_station = df.loc[df['NUM_POSTE'] == s].copy()
    main_station = dfi.copy()
    lat1, lon1 = main_station['LAT'].values[0], main_station['LON'].values[0]
    unique_stations['distance'] = unique_stations.apply(lambda row: haversine(lat1, lon1, row['LAT'], row['LON']), axis=1)
    neighbor_stations = unique_stations[unique_stations['NUM_POSTE'] != s].sort_values('distance').head(num_stations)
    neighbor_stations = np.unique(neighbor_stations['NUM_POSTE'].values)
    i=1
    for station in neighbor_stations:
        station_data = df.loc[df['NUM_POSTE'] == station, ['date', 'temperature', 'temperature_scaled']].rename(columns={'temperature': f'temperature_station{i}', 'temperature_scaled': f'temperature_scaled{i}'})
        #station_data = df.loc[df['NUM_POSTE'] == station, ['date', 'temperature', 'temperature_scaled']].rename(columns={'temperature': f'temperature_station{i}'})
        i=i+1
        dfi = dfi.merge(station_data, on='date', how='left')
    my_dfs.append(dfi)

2024-09-19 01:42:41.924112: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
#interpolation lineaire
linear_rmse=[]

def create_gap(df, start_gap, gap_size, column):
    end_gap = start_gap + gap_size
    for i in range(start_gap, end_gap):
        df.at[i, column] = np.nan
    return df
    

for i in range(1, max_hole + 1):
    gap_size= i
    sum=0
    l=0
    for dfi in my_dfs :
        rows_with_nan = dfi[dfi.isna().any(axis=1)].index.tolist()
        rows_with_nan.insert(0, -1)
        rows_with_nan.append(len(dfi))
        for j in range(len(rows_with_nan) - 1):
            start_index= rows_with_nan[j] + 1
            end_index= rows_with_nan[j + 1] - 1
            df_sampled=dfi[start_index:end_index + 1]
            df_sampled = df_sampled.reset_index(drop=True)
            for k in range(N, len(df_sampled) + 1 - gap_size - N):
                start_gap = k
                df_gapped = create_gap(df_sampled.copy(), start_gap, gap_size, column)
                df_gapped=df_gapped.iloc[k - N:k + gap_size + N ]
                df_interpolated = df_gapped.interpolate(method='linear')
                diff = (df_sampled.iloc[k - N: k+ gap_size + N]['temperature'] - df_interpolated['temperature']) **2
                sum+= diff.sum()
                l+=gap_size
    linear_rmse.append(np.sqrt(sum/l))
print(linear_rmse)

done


In [4]:
#spline cubique
spline_rmse=[]



def create_gap(df, start_gap, gap_size, column):
    end_gap = start_gap + gap_size
    for i in range(start_gap, end_gap):
        df.at[i, column] = np.nan
    return df
    
for i in range(1, max_hole + 1):
    gap_size= i
    sum=0
    l=0
    for dfi in my_dfs :
        rows_with_nan = dfi[dfi.isna().any(axis=1)].index.tolist()
        rows_with_nan.insert(0, -1)
        rows_with_nan.append(len(dfi))
        for j in range(len(rows_with_nan) - 1):
            start_index= rows_with_nan[j] + 1
            end_index= rows_with_nan[j + 1] - 1
            df_sampled=dfi[start_index:end_index + 1]
            df_sampled['timestamp'] = dfi['date'].astype(np.int64) // 10**9
            df_sampled = df_sampled.reset_index(drop=True)
            for k in range(N, len(df_sampled) +1 - gap_size - N):
                start_gap = k
                df_gapped = create_gap(df_sampled.copy(), start_gap, gap_size, column)
                df_gapped=df_gapped.iloc[k - N:k + gap_size + N ]
                known_data = df_gapped.dropna(subset=['temperature'])
                missing_data = df_gapped[df_gapped['temperature'].isna()]
                x_known = known_data['timestamp'].values
                y_known = known_data['temperature'].values
                cs = CubicSpline(x_known, y_known, bc_type='natural')
                x_missing = missing_data['timestamp'].values
                y_missing_interpolated = cs(x_missing)
                df_gapped.loc[df_gapped['temperature'].isna(), 'temperature'] = y_missing_interpolated
                diff = (df_sampled.iloc[k - N: k+ gap_size + N]['temperature'] - df_gapped['temperature']) **2
                sum+= diff.sum()
                l+=gap_size
    spline_rmse.append(np.sqrt(sum/l))
print(spline_rmse)

done


In [None]:
#lstm
lstm_rmse=[]
models=[]
X_tests=[]
y_tests=[]
X=np.array([])
y=np.array([])
predictions=[]


def create_sequences(data, input_steps, output_steps, future_steps, rows_with_nan):
    X, y = [], []
    for i in range(len(data) - input_steps - output_steps - future_steps + 1):
        if any(idx in rows_with_nan for idx in range(i, i + input_steps + output_steps + future_steps)):
            continue
        past_seq = data[i:i + input_steps].values
        future_seq = data[i + input_steps + output_steps:i + input_steps + output_steps + future_steps].values
        target_seq = data[i + input_steps:i + input_steps + output_steps].values
        X.append(np.concatenate((past_seq, future_seq)))
        y.append(target_seq)
    return np.array(X), np.array(y)




def create_model(dense_size):
    model = Sequential()
    model.add(Bidirectional(LSTM(50, activation='relu'), input_shape=(N * 2, 1)))
    model.add(Dense(dense_size))
    model.compile(optimizer='adam', loss='mse')
    return model


for i in range(6):
    model=create_model(i + 1)
    models.append(model)       
d=1
for model in models:
    dfi= my_dfs[0]
    rows_with_nan = dfi[dfi.isna().any(axis=1)].index.tolist()
    l=0
    a = dfi.index[0]
    b = dfi.index[-1]
    filtered = [x for x in rows_with_nan if a <= x <= b]
    filtered.insert(a, -1)
    filtered.append(b)
    for j in range(len(rows_with_nan) - 1):
        start_index= rows_with_nan[j] + 1
        end_index= rows_with_nan[j + 1] - 1 
        df_sampled=dfi[start_index:end_index]
        df_sampled = df_sampled.reset_index(drop=True)
        Xi, yi = create_sequences(df_sampled['temperature_scaled'], N, d, N, rows_with_nan)
        if l==0:
            l=1
            X= Xi
            y=yi
        else:
            X= np.concatenate((X,Xi))
            y=np.concatenate((y,yi))
    X = X.reshape((X.shape[0], X.shape[1], 1))
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
    history = model.fit(X_train, y_train, epochs=100, verbose=1)
    X_tests.append(X_test)
    y_tests.append(y_test)
    d+=1

for i in range(6):
    prediction_scaled = models[i].predict(X_tests[i])
    prediction=scaler.inverse_transform(prediction_scaled.reshape(-1,1))
    attendu=scaler.inverse_transform(y_tests[i])
    rmse = np.sqrt(mean_squared_error(prediction.flatten(), attendu.flatten()))
    lstm_rmse.append(rmse)

Epoch 1/100
[1m264/264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - loss: 0.0383
Epoch 2/100
[1m264/264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - loss: 0.0023
Epoch 3/100
[1m264/264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - loss: 5.6544e-04
Epoch 4/100
[1m264/264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - loss: 3.7610e-04
Epoch 5/100
[1m264/264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - loss: 3.0834e-04
Epoch 6/100
[1m264/264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - loss: 2.8323e-04
Epoch 7/100
[1m264/264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - loss: 2.6811e-04
Epoch 8/100
[1m264/264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - loss: 2.5069e-04
Epoch 9/100
[1m264/264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - loss: 2.6872e-04
Epoch 10/100
[1m264/264[0m [32m━━━━━━━━━━━━━━━━━━━━

In [None]:
plt.figure(figsize=(10, 6))
plt.plot(tab_hole, linear_rmse, marker='o', linestyle='-', color='r', label="linear")
plt.plot(tab_hole, spline_rmse, marker='o', linestyle='-', color='g', label="spline")
plt.plot(tab_hole, lstm_rmse, marker='o', linestyle='-', color='b', label="lsmt")
plt.title("Erreur en fonction de la taille des trous (Bi-Lstm)")
plt.xlabel("Taille des trous")
plt.ylabel("Erreur")
plt.grid(True)
plt.legend()
plt.show()

In [None]:
#lstm
lstm1s_rmse=[]
models=[]
X_tests=[]
y_tests=[]
X_lstm=np.array([])
X_temporel=np.array([])
X_nearby=np.array([])
y=np.array([])
predictions=[]



def create_sequences(df, input_steps, output_steps, future_steps, rows_with_nan):
    X_lstm, X_temporal, X_nearby, y = [], [], [], []
    for i in range(len(df) - input_steps - output_steps - future_steps + 1):
        if any(idx in rows_with_nan for idx in range(i, i + input_steps + output_steps + future_steps)):
            continue
        # Séquences de température pour LSTM
        past_seq = df['temperature_scaled'][i:i + input_steps].values
        future_seq = df['temperature_scaled'][i + input_steps + output_steps:i + input_steps + output_steps + future_steps].values
        target_seq = df['temperature_scaled'][i + input_steps:i + input_steps + output_steps].values

        # Données temporelles pour la première date de la séquence
        start_date = df['date'].iloc[i]
        day_cos = np.cos(start_date.dayofyear / 365.25 * 2 * np.pi)
        day_sin = np.sin(start_date.dayofyear / 365.25 * 2 * np.pi)
        hour_cos = np.cos(start_date.hour / 24.0 * 2 * np.pi)
        hour_sin = np.sin(start_date.hour / 24.0 * 2 * np.pi)
        temporal_data = np.array([day_cos, day_sin, hour_cos, hour_sin])

        # Données des températures des stations voisines
        station1_data = df['temperature_scaled1'][i + input_steps:i + input_steps + output_steps].values
        nearby_data= station1_data

        X_lstm.append(np.concatenate((past_seq, future_seq)))
        X_temporal.append(temporal_data)
        X_nearby.append(nearby_data)
        y.append(target_seq)

    return np.array(X_lstm), np.array(X_temporal), np.array(X_nearby), np.array(y)




def create_model(lstm_units, dense_size, additional_dense_size1, additional_dense_size2):
    # Input pour les données séquentielles de température
    lstm_input = Input(shape=(N * 2, 1))
    lstm_out = Bidirectional(LSTM(lstm_units, activation='relu'))(lstm_input)
    
    # Input pour les données temporelles encodées
    temporal_input = Input(shape=(4,))  # Assumant 4 valeurs pour cos/sin jour et heure
    dense_temporal = Dense(additional_dense_size1, activation='relu')(temporal_input)
    
    # Input pour les températures des stations à proximité
    #nearby_temp_input = Input(shape=(3 * dense_size,))
    nearby_temp_input = Input(shape=(dense_size,))
    dense_nearby_temp = Dense(additional_dense_size2, activation='relu')(nearby_temp_input)
    
    # Concaténation des sorties
    concatenated = Concatenate()([lstm_out, dense_temporal, dense_nearby_temp])
    # Couche de sortie
    main_output = Dense(dense_size, activation='relu')(concatenated)
    # Création et compilation du modèle
    model = Model(inputs=[lstm_input, temporal_input, nearby_temp_input], outputs=main_output)
    model.compile(optimizer='adam', loss='mse')
    return model



for i in range(6):
    model=create_model(32,i + 1, 4, 2  * (i+1) )
    models.append(model)       
d=1
for model in models:
    l=0
    dfi = my_dfs[0]:
    rows_with_nan = dfi[dfi.isna().any(axis=1)].index.tolist()
    a = dfi.index[0]
    b = dfi.index[-1]
    filtered = [x for x in rows_with_nan if a <= x <= b]
    filtered.insert(a, -1)
    filtered.append(b)
    for j in range(len(filtered) - 1):
        start_index= filtered[j] + 1
        end_index= filtered[j + 1] - 1 
        df_sampled=dfi.loc[start_index:end_index]
        df_sampled = df_sampled.reset_index(drop=True)
        Xi_lstm, Xi_temporal, Xi_nearby, yi = create_sequences(df_sampled, N, d, N, rows_with_nan)
        if l==0:
            l=1
            X_lstm= Xi_lstm
            X_temporal=Xi_temporal
            X_nearby=Xi_nearby
            y=yi
        else:
            X_lstm= np.concatenate((X_lstm,Xi_lstm))
            X_temporal= np.concatenate((X_temporal,Xi_temporal))
            X_nearby= np.concatenate((X_nearby,Xi_nearby))
            y=np.concatenate((y,yi))
    X_lstm = X_lstm.reshape((X_lstm.shape[0], X_lstm.shape[1], 1))
    X_lstm_train, X_lstm_test, X_temporal_train, X_temporal_test, X_nearby_train, X_nearby_test, y_train, y_test = train_test_split(
    X_lstm, X_temporal, X_nearby, y, test_size=0.2, random_state=123)
    model.fit([X_lstm_train, X_temporal_train, X_nearby_train], y_train, epochs=50, verbose=1)
    X_tests.append([X_lstm_test, X_temporal_test, X_nearby_test])
    y_tests.append(y_test)
    d+=1

for i in range(6):
    prediction_scaled = models[i].predict(X_tests[i])
    prediction=scaler.inverse_transform(prediction_scaled.reshape(-1,1))
    attendu=scaler.inverse_transform(y_tests[i])
    rmse = np.sqrt(mean_squared_error(prediction.flatten(), attendu.flatten()))
    lstm1s_rmse.append(rmse)
print("done")

In [None]:
plt.figure(figsize=(10, 6))
plt.plot(tab_hole, linear_rmse, marker='o', linestyle='-', color='r', label="linear")
plt.plot(tab_hole, spline_rmse, marker='o', linestyle='-', color='g', label="spline")
plt.plot(tab_hole, lstm_rmse, marker='o', linestyle='-', color='b', label="lsmt")
plt.plot(tab_hole, lstm1s_rmse, marker='o', linestyle='-', color='y', label="lsmt_1s")
plt.title("Erreur en fonction de la taille des trous (Bi-Lstm)")
plt.xlabel("Taille des trous")
plt.ylabel("Erreur")
plt.grid(True)
plt.legend()
plt.show()

In [None]:
#lstm
lstm3s_rmse=[]
models=[]
X_tests=[]
y_tests=[]
X_lstm=np.array([])
X_temporel=np.array([])
X_nearby=np.array([])
y=np.array([])
predictions=[]



def create_sequences(df, input_steps, output_steps, future_steps, rows_with_nan):
    X_lstm, X_temporal, X_nearby, y = [], [], [], []
    for i in range(len(df) - input_steps - output_steps - future_steps + 1):
        if any(idx in rows_with_nan for idx in range(i, i + input_steps + output_steps + future_steps)):
            continue
        # Séquences de température pour LSTM
        past_seq = df['temperature_scaled'][i:i + input_steps].values
        future_seq = df['temperature_scaled'][i + input_steps + output_steps:i + input_steps + output_steps + future_steps].values
        target_seq = df['temperature_scaled'][i + input_steps:i + input_steps + output_steps].values

        # Données temporelles pour la première date de la séquence
        start_date = df['date'].iloc[i]
        day_cos = np.cos(start_date.dayofyear / 365.25 * 2 * np.pi)
        day_sin = np.sin(start_date.dayofyear / 365.25 * 2 * np.pi)
        hour_cos = np.cos(start_date.hour / 24.0 * 2 * np.pi)
        hour_sin = np.sin(start_date.hour / 24.0 * 2 * np.pi)
        temporal_data = np.array([day_cos, day_sin, hour_cos, hour_sin])

        # Données des températures des stations voisines
        station1_data = df['temperature_scaled1'][i + input_steps:i + input_steps + output_steps].values
        station2_data = df['temperature_scaled2'][i + input_steps:i + input_steps + output_steps].values
        station3_data = df['temperature_scaled3'][i+ input_steps:i + input_steps + output_steps].values
        nearby_data = np.column_stack((station1_data, station2_data, station3_data)).flatten()

        X_lstm.append(np.concatenate((past_seq, future_seq)))
        X_temporal.append(temporal_data)
        X_nearby.append(nearby_data)
        y.append(target_seq)

    return np.array(X_lstm), np.array(X_temporal), np.array(X_nearby), np.array(y)


def create_model(lstm_units, dense_size, additional_dense_size1, additional_dense_size2):
    # Input pour les données séquentielles de température
    lstm_input = Input(shape=(N * 2, 1))
    lstm_out = Bidirectional(LSTM(lstm_units, activation='relu'))(lstm_input)
    
    # Input pour les données temporelles encodées
    temporal_input = Input(shape=(4,))  # Assumant 4 valeurs pour cos/sin jour et heure
    dense_temporal = Dense(additional_dense_size1, activation='relu')(temporal_input)
    
    # Input pour les températures des stations à proximité
    nearby_temp_input = Input(shape=(3 * dense_size,))
    #nearby_temp_input = Input(shape=(dense_size,))
    dense_nearby_temp = Dense(additional_dense_size2, activation='relu')(nearby_temp_input)
    
    # Concaténation des sorties
    concatenated = Concatenate()([lstm_out, dense_temporal, dense_nearby_temp])
    # Couche de sortie
    main_output = Dense(dense_size, activation='relu')(concatenated)
    # Création et compilation du modèle
    model = Model(inputs=[lstm_input, temporal_input, nearby_temp_input], outputs=main_output)
    model.compile(optimizer='adam', loss='mse')
    return model



for i in range(6):
    model=create_model(32,i + 1, 4, 2  * (i+1) )
    models.append(model)       
d=1
for model in models:
    l=0
    dfi = my_dfs[0]:
    rows_with_nan = dfi[dfi.isna().any(axis=1)].index.tolist()
    a = dfi.index[0]
    b = dfi.index[-1]
    filtered = [x for x in rows_with_nan if a <= x <= b]
    filtered.insert(a, -1)
    filtered.append(b)
    for j in range(len(filtered) - 1):
        start_index= filtered[j] + 1
        end_index= filtered[j + 1] - 1 
        df_sampled=dfi.loc[start_index:end_index]
        df_sampled = df_sampled.reset_index(drop=True)
        Xi_lstm, Xi_temporal, Xi_nearby, yi = create_sequences(df_sampled, N, d, N, rows_with_nan)
        if l==0:
            l=1
            X_lstm= Xi_lstm
            X_temporal=Xi_temporal
            X_nearby=Xi_nearby
            y=yi
        else:
            X_lstm= np.concatenate((X_lstm,Xi_lstm))
            X_temporal= np.concatenate((X_temporal,Xi_temporal))
            X_nearby= np.concatenate((X_nearby,Xi_nearby))
            y=np.concatenate((y,yi))
    X_lstm = X_lstm.reshape((X_lstm.shape[0], X_lstm.shape[1], 1))
    X_lstm_train, X_lstm_test, X_temporal_train, X_temporal_test, X_nearby_train, X_nearby_test, y_train, y_test = train_test_split(
    X_lstm, X_temporal, X_nearby, y, test_size=0.2, random_state=123)
    model.fit([X_lstm_train, X_temporal_train, X_nearby_train], y_train, epochs=100, verbose=1)
    X_tests.append([X_lstm_test, X_temporal_test, X_nearby_test])
    y_tests.append(y_test)
    d+=1

for i in range(6):
    prediction_scaled = models[i].predict(X_tests[i])
    prediction=scaler.inverse_transform(prediction_scaled.reshape(-1,1))
    attendu=scaler.inverse_transform(y_tests[i])
    rmse = np.sqrt(mean_squared_error(prediction.flatten(), attendu.flatten()))
    lstm3s_rmse.append(rmse)
print("done")

In [None]:
plt.figure(figsize=(10, 6))
plt.plot(tab_hole, linear_rmse, marker='o', linestyle='-', color='r', label="linear")
plt.plot(tab_hole, spline_rmse, marker='o', linestyle='-', color='g', label="spline")
plt.plot(tab_hole, lstm_rmse, marker='o', linestyle='-', color='b', label="lsmt")
plt.plot(tab_hole, lstm_rmse1, marker='o', linestyle='-', color='y', label="lsmt_1s")
plt.plot(tab_hole, lstm_rmse3, marker='o', linestyle='-', color='m', label="lsmt_3s")
plt.title("Erreur en fonction de la taille des trous (Bi-Lstm)")
plt.xlabel("Taille des trous")
plt.ylabel("Erreur")
plt.grid(True)
plt.legend()
plt.show()