# Projet Météo.

#### Copyright(C) 2024, Charles Theetten, <chalimede@proton.me>

In [1]:
################################################################################

from    imblearn.metrics            import classification_report_imbalanced

from    sklearn.ensemble            import StackingClassifier

from    sklearn.model_selection     import cross_validate
from    sklearn.model_selection     import train_test_split

from    sklearn.preprocessing       import StandardScaler

################################################################################

import  matplotlib.pyplot           as plt

from    plotly                      import graph_objs as go
from    plotly.subplots             import make_subplots

################################################################################

import  joblib
import  os
import  numpy                       as np
import  pandas                      as pd

################################################################################

from    nan_values                  import fillna_knn
from    nan_values                  import fillna_rain_today

################################################################################

from    params_dl                   import IxHyperModel
from    params_dl                   import IdxHyperModel
from    params_dl                   import IddxHyperModel
from    params_dl                   import IdOdxHyperModel
from    params_dl                   import SrnnHyperModel
from    params_dl                   import LSTMHyperModel
from    params_dl                   import TestHyperModel

################################################################################

from    data                        import DataMeteo
from    dl_models                   import DLModels
from    ml_models                   import MlModels
from    params_ml                   import space_lgb
from    params_ml                   import space_lrc
from    params_ml                   import space_knn
from    params_ml                   import space_rfc
from    results                     import Results

from    transformers                import TrCleanCloud
from    transformers                import TrCleanNaNRainTomorrow
from    transformers                import TrCleanRowDate
from    transformers                import TrCleanNaNRow
from    transformers                import TrCleanRainTomorrow
from    transformers                import TrClimaticClusters
from    transformers                import TrDaysOfMonth
from    transformers                import TrDaysOfYear
from    transformers                import TrDiscretizeCloud
from    transformers                import TrDiscretizeRain
from    transformers                import TrDiscretizeWindDirection
from    transformers                import TrGPS
from    transformers                import TrSubsetNaN
from    transformers                import TrZonesRain

################################################################################

pd.set_option("display.max_columns", None)
pd.set_option("display.width" , 1000)

pd.set_option("mode.chained_assignment", None)
pd.set_option("future.no_silent_downcasting", True)

################################################################################

FILE = "../csv/meteo_australia_2007_2017.csv"

################################################################################




## Construction des jeux de données

In [None]:
# Builds the KNN-imputed dataset: instantiate the transformers, run the
# cleaning pipeline on the raw CSV, fill the missing values, run a final
# cleaning pass, display sanity checks and export the result to CSV.
# NOTE(review): several transformers created here (climatic clusters, days of
# month/year, GPS, rain zones) are not used in this cell — presumably kept for
# other cells; confirm before removing.
tr_clean_cloud          = TrCleanCloud()                                            # create the clean_cloud transformer
tr_clean_nan_raintom    = TrCleanNaNRainTomorrow()                                  # create the clean_nan_rain_tomorrow transformer
tr_clean_nan_row        = TrCleanNaNRow()                                           # create the clean_nan_row transformer
tr_clean_raintom        = TrCleanRainTomorrow()                                     # create the clean_raintom transformer
tr_clean_row_date       = TrCleanRowDate(2009)                                      # create the clean_row_date transformer
tr_climatic_clusters    = TrClimaticClusters()                                      # create the climatic_clusters transformer
tr_days_month           = TrDaysOfMonth()                                           # create the days_month transformer
tr_days_year            = TrDaysOfYear()                                            # create the days_year transformer
tr_discretize_cloud     = TrDiscretizeCloud()                                       # create the discretize_cloud transformer
tr_discretize_rain      = TrDiscretizeRain()                                        # create the discretize_rain transformer
tr_discretize_wind_dir  = TrDiscretizeWindDirection()                               # create the discretize_wind_direction transformer
tr_gps                  = TrGPS()                                                   # create the gps transformer
tr_subset_nan           = TrSubsetNaN(49)                                           # create the subset_nan transformer (49: presumably the number of cities kept — confirm)
tr_zones_rain           = TrZonesRain()                                             # create the zones_rain transformer

################################################################################

data_meteo = DataMeteo(FILE)                                                        # instantiate the DataMeteo class on the raw CSV

################################################################################

data_meteo.convert_to_datetime("Date")                                              # convert the Date column to datetime

################################################################################

transformers    = [ ("tr_clean_nan_row",        tr_clean_nan_row),                  # drop empty rows
                    ("tr_clean_row_date",       tr_clean_row_date),                 # keep rows from 2009 onwards
                    ("tr_clean_nan_raintom",    tr_clean_nan_raintom),              # drop rows whose target is NaN
                    ("tr_subset_nan",           tr_subset_nan),                     # extract the subset
                    ("tr_clean_cloud",          tr_clean_cloud),                    # clean the cloud variables
                    ("tr_discretize_rain",      tr_discretize_rain),                # discretize the rain variables
                    ("tr_discretize_wind_dir",  tr_discretize_wind_dir) ]           # discretize the wind variables

data_meteo.build_dataset(transformers = transformers)                               # build the new dataset

################################################################################

data_meteo.change_type_columns(["RainTomorrow", "RainToday"], np.float64)           # type conversion needed before replacing the NaN values

################################################################################

data_meteo.data = fillna_rain_today(data_meteo.data)
data_meteo.data = fillna_knn(data_meteo.data, 5, "distance", "nan_euclidean")       # fill the missing values (KNN helper; argument meaning defined in nan_values)

################################################################################

transformers = [("tr_discretize_cloud", tr_discretize_cloud),
                ("tr_clean_raintom",    tr_clean_raintom) ]

data_meteo.build_dataset(transformers = transformers)                               # final cleaning pass

################################################################################

data_meteo.display_info_data(10)                                                    # display basic information about the new dataset
data_meteo.display_percentage_nan()                                                 # make sure no missing values remain

################################################################################

# NOTE(review): the export goes to ../csv/tests/ while the other cells read
# ../csv/subsets/df_49_knn.csv — presumably the file is copied manually; confirm.
data_meteo.data.to_csv("../csv/tests/df_49_knn.csv")

In [None]:
# Box plots of the numeric variables of the KNN-imputed dataset, one subplot
# per variable on a single row.
df_49_knn = pd.read_csv("../csv/subsets/df_49_knn.csv")

cols = ["MinTemp", "MaxTemp", "Rainfall", "Evaporation", "Sunshine", "WindGustSpeed", "WindSpeed9am", "WindSpeed3pm",
        "Humidity9am", "Humidity3pm", "Pressure9am", "Pressure3pm", "Cloud9am", "Cloud3pm", "Temp9am", "Temp3pm"]

fig = make_subplots(rows = 1, cols = len(cols))

# Subplot columns are 1-based, hence the enumeration starting at 1.
for position, var in enumerate(cols, start = 1):
    box = go.Box(y = df_49_knn[var], name = var)
    fig.add_trace(box, row = 1, col = position)

layout_options = dict(title_text        = "Distribution des variables sur l'ensemble KNN",
                      title_font_size   = 45,
                      height            = 600,
                      width             = 2192,
                      template          = "plotly_dark")

fig.update_layout(**layout_options)
# Kept as the last expression of the cell so the notebook renders the figure.
fig.update_traces(jitter = .05)

### Imputation KMeans

In [9]:
from sklearn.cluster        import KMeans
from scipy.spatial.distance import cdist

def extract_clean_data(data, city):
    """
    Return the feature columns of the rows recorded for one city.

    Parameters:
        data: DataFrame holding at least the "Location", "Date",
              "RainTomorrow" and "RainToday" columns.
        city: value of "Location" to keep.

    Returns:
        A new DataFrame restricted to the rows of `city`, without the "Date",
        "Location" and "RainTomorrow" columns, and with "RainToday" cast to
        float64. `data` itself is left untouched.
    """
    city_rows   = data.loc[data["Location"] == city]
    features    = city_rows.drop(columns = ["Date", "Location", "RainTomorrow"])

    return features.assign(RainToday = features["RainToday"].astype(np.float64))

def get_distorsions(data, nb_clusters, nb_centroids = 6):
    """
    Compute the KMeans distortion for every cluster count in 1 .. nb_clusters - 1.

    The distortion of a fitted model is the mean Euclidean distance between
    each sample and its closest centroid.

    Parameters:
        data:         samples to cluster (one row per sample).
        nb_clusters:  exclusive upper bound of the cluster counts tried;
                      must be at least 2 so that one model is fitted.
        nb_centroids: number of leading centroids of the LAST fitted model
                      (the one with nb_clusters - 1 clusters) to return.

    Returns:
        (distorsions, centroids): the list of distortions, one per cluster
        count, and the first `nb_centroids` centroids of the last model.

    Raises:
        ValueError: if nb_clusters < 2 (no model would be fitted).
    """
    # Without this guard the loop body never runs and `cluster` is unbound
    # at the return statement.
    if nb_clusters < 2:
        raise ValueError("nb_clusters must be at least 2")

    distorsions = []

    for n in range(1, nb_clusters):
        cluster = KMeans(n_clusters = n, random_state = 123)
        cluster.fit(data)

        # Distance of every sample to every centroid; keep the closest one.
        distances = cdist(data, cluster.cluster_centers_, metric = "euclidean")
        distorsions.append(distances.min(axis = 1).mean())

    # NOTE(review): the centroids come from the model with the most clusters,
    # not from an "optimal" one — confirm this is what the plots expect.
    return distorsions, cluster.cluster_centers_[0:nb_centroids]

In [17]:
# Prepares the data used for the KMeans study: rebuild the cleaned dataset
# from the raw CSV, keep complete rows only, then compute the distortion
# curves for the whole dataset and for seven individual cities.
# NOTE(review): FILE is reassigned by later cells; this cell expects it to
# point at the raw CSV.

# Transformer instances (a fresh set for this pipeline).
tr_clean_cloud          = TrCleanCloud()
tr_clean_nan_raintom    = TrCleanNaNRainTomorrow()
tr_clean_nan_row        = TrCleanNaNRow()
tr_clean_raintom        = TrCleanRainTomorrow()
tr_clean_row_date       = TrCleanRowDate(2009)
tr_climatic_clusters    = TrClimaticClusters()
tr_days_month           = TrDaysOfMonth()
tr_days_year            = TrDaysOfYear()
tr_discretize_cloud     = TrDiscretizeCloud()
tr_discretize_rain      = TrDiscretizeRain()
tr_discretize_wind_dir  = TrDiscretizeWindDirection()
tr_gps                  = TrGPS()
tr_subset_nan           = TrSubsetNaN(49)
tr_zones_rain           = TrZonesRain()

kmeans_data = DataMeteo(FILE)
kmeans_data.convert_to_datetime("Date")

# Cleaning pipeline, applied in this order.
transformers = [
    ("tr_clean_nan_row",        tr_clean_nan_row),          # drop empty rows
    ("tr_clean_row_date",       tr_clean_row_date),         # keep rows from 2009 onwards
    ("tr_clean_nan_raintom",    tr_clean_nan_raintom),      # drop rows whose target is NaN
    ("tr_subset_nan",           tr_subset_nan),             # extract the subset
    ("tr_clean_cloud",          tr_clean_cloud),            # clean the cloud variables
    ("tr_discretize_rain",      tr_discretize_rain),        # discretize the rain variables
    ("tr_discretize_wind_dir",  tr_discretize_wind_dir),    # discretize the wind variables
]
kmeans_data.build_dataset(transformers = transformers)

# Complete rows only.
clean_data = kmeans_data.data.dropna()

# Per-city feature tables, extracted while "Location" is still available.
(clean_data_sydney, clean_data_darwin, clean_data_alice, clean_data_perth,
 clean_data_towns,  clean_data_brisb,  clean_data_mildu) = [
    extract_clean_data(clean_data, city)
    for city in ("Sydney", "Darwin", "AliceSprings", "Perth", "Townsville", "Brisbane", "Mildura")
]

# Same column treatment for the whole-country table.
clean_data = clean_data.drop(columns = ["Date", "Location", "RainTomorrow"])
clean_data = clean_data.astype({"RainToday": np.float64})

# Distortion curves (cluster counts 1 to 20) and centroids for every table.
((distorsions, centroids),
 (d_sydney, c_sydney), (d_darwin, c_darwin), (d_alice, c_alice), (d_perth, c_perth),
 (d_towns, c_towns),   (d_brisb, c_brisb),   (d_mildu, c_mildu)) = [
    get_distorsions(table, 21)
    for table in (clean_data,
                  clean_data_sydney, clean_data_darwin, clean_data_alice, clean_data_perth,
                  clean_data_towns,  clean_data_brisb,  clean_data_mildu)
]

In [None]:
# Elbow curves: distortion as a function of the number of clusters, for the
# whole dataset and for seven cities, laid out on a 2 x 4 grid.

# The x axis follows the number of distortions actually computed instead of a
# hard-coded range.
nb_clusters = list(range(1, len(distorsions) + 1))

# Curves in reading order (left to right, top to bottom).
elbow_curves = [ distorsions, d_sydney, d_darwin, d_alice,
                 d_perth,     d_towns,  d_brisb,  d_mildu ]

fig = make_subplots(rows = 2, cols = 4, shared_yaxes = True)

# The subplot is selected with row / col only: add_trace assigns the matching
# axes itself, so no explicit (and, on row 2, contradictory) xaxis is passed.
for position, curve in enumerate(elbow_curves):
    grid_row, grid_col = divmod(position, 4)
    fig.add_trace(go.Scatter(x = nb_clusters, y = curve, mode = "lines + markers"),
                  row = grid_row + 1, col = grid_col + 1)

fig.update_layout(width = 1920, height  = 1080,
                  title_font_size       = 30,
                  title_text            = "Nombre de clusters optimal sur le jeu de données complet et par villes",
                  xaxis1                = dict(title = "Australie"),
                  xaxis2                = dict(title = "Sydney"),
                  xaxis3                = dict(title = "Darwin"),
                  xaxis4                = dict(title = "AliceSprings"),
                  xaxis5                = dict(title = "Perth"),
                  xaxis6                = dict(title = "Townsville"),
                  xaxis7                = dict(title = "Brisbane"),
                  xaxis8                = dict(title = "Mildura"),
                  )

fig.show()

In [None]:
# Scatter plots of evaporation against 9am humidity, with the KMeans
# centroids overlaid, for the whole dataset and three cities.
fig         = make_subplots(rows = 1, cols = 4)

# Centroid coordinates follow the column order of the fitted table, so the
# positions are looked up by name rather than hard-coded. The per-city tables
# drop the same columns as clean_data and therefore share its column order.
feature_names   = list(clean_data.columns)
humidity_idx    = feature_names.index("Humidity9am")
evaporation_idx = feature_names.index("Evaporation")

# (samples, centroids, marker of the samples, colour of the centroids)
panels = [ (clean_data,         centroids,  dict(size = 2),                     "RED"),
           (clean_data_sydney,  c_sydney,   dict(size = 3),                     "SILVER"),
           (clean_data_darwin,  c_darwin,   dict(size = 3),                     "Aqua"),
           (clean_data_alice,   c_alice,    dict(size = 3, color = "#5DADE2"),  "LIME") ]

for panel_col, (samples, centers, samples_marker, centers_color) in enumerate(panels, start = 1):
    # Samples first, centroids second so that they are drawn on top.
    fig.add_trace(go.Scatter(x = samples["Humidity9am"].values, y = samples["Evaporation"].values,
                             mode = "markers", marker = samples_marker), row = 1, col = panel_col)
    fig.add_trace(go.Scatter(x = centers[:, humidity_idx], y = centers[:, evaporation_idx],
                             mode = "markers", marker = dict(size = 12, color = centers_color)), row = 1, col = panel_col)

fig.update_layout(width = 2192, height  = 720,
                  title_font_size       = 40,
                  title_text            = "Clusters sur l'évaporation en fonction de l'humidité sur l'Australie et différentes villes",
                  template              = "plotly_dark")

fig.show()

## Évaluation des différents modèles

### Machine Learning

In [None]:
# Searches, evaluates and persists one machine-learning model (selected by
# MODEL) on the KNN-imputed subset.
# NOTE(review): FILE is rebound here from the raw CSV to the subset; re-run the
# setup cell before re-running the dataset-construction cells.
FILE                = "../csv/subsets/df_49_knn.csv"
MODEL               = "LGB"

ml_meteo            = MlModels()
results             = Results()

ml_meteo.init_data(FILE, 2009, 2017)
ml_meteo.split_data("RainTomorrow", 0.20)
ml_meteo.scale_data()

print(ml_meteo.data.shape)

clf, params         = ml_meteo.search_model(space_lgb, MODEL)
# Named `evaluation` so the builtin eval() is not shadowed in the notebook.
evaluation          = ml_meteo.eval_model(clf, params)

results.init_models(MODEL)
results.register_ml(MODEL, clf, evaluation)
results.write_ml_results(MODEL, f"../results/results_{MODEL}_49.txt")
results.persist_ml_model(MODEL, "../models/tests/", "_knn_49")

#### Métriques sur le jeu de données complet

In [None]:
# Grouped bar chart of the metrics of the four classifiers on the complete
# dataset. The values are entered by hand.
algorithms  = ["LRC", "KNN", "RFC", "LGB"]

# One entry per metric; every list follows the order of `algorithms`.
metrics     = { "Train score"   : [0.868, 1.0, 0.936 , 0.961],
                "Test score"    : [0.888, 0.895, 0.938 , 0.944],
                "Recall"        : [0.76, 0.68, 0.81, 0.84],
                "F1"            : [0.75, 0.74, 0.85, 0.87] }

bars        = [ go.Bar(name = metric, x = algorithms, y = values) for metric, values in metrics.items() ]
fig         = go.Figure(data = bars)

fig.update_layout(width                 = 1920,
                  height                = 1080,
                  title_font_size       = 30,
                  title_text            = "Métriques des différents modèles de prédiction sur le jeu de données complet",
                  barmode               = "group",
                  template              = "plotly_dark")

fig.update_xaxes(tickfont = dict(size = 40))
fig.update_yaxes(tickfont = dict(size = 20), tickmode = "linear", range = [0, 1], dtick = 0.05)
fig.show()

#### Sous-ensembles

In [None]:
# Trains and evaluates the same model on every subset file and collects the
# rounded test score of each run in `scores`.
MODEL       = "RFC"
SUBSETS_DIR = "../csv/subsets/"
scores      = []

# os.listdir returns entries in arbitrary order: sort them so that the order
# of `scores` is reproducible, and ignore anything that is not a CSV file.
subset_files = sorted(name for name in os.listdir(SUBSETS_DIR) if name.endswith(".csv"))

for file in subset_files:
    ml_meteo    = MlModels()
    results     = Results()

    ml_meteo.init_data(os.path.join(SUBSETS_DIR, file), 2009, 2017)
    ml_meteo.split_data("RainTomorrow", 0.20)
    ml_meteo.scale_data()
    clf, params = ml_meteo.search_model(space_rfc, MODEL)
    # Named `evaluation` so the builtin eval() is not shadowed in the notebook.
    evaluation  = ml_meteo.eval_model(clf, params)

    results.init_models(MODEL)
    results.register_ml(MODEL, clf, evaluation)
    # "tes" is the key read from the registered results (presumably the test score — confirm in Results).
    scores.append(np.round(results.models[MODEL]["tes"], decimals = 3))

In [None]:
# Display the scores collected by the subsets loop, one value per subset file.
print(scores)

In [None]:
# Two side-by-side plots: test scores of three algorithms against the number
# of cities, and the running total of missing values against the number of
# cities. All values are entered by hand.
nb_cities   = [ 1, 5, 10, 15, 20, 25, 30, 35, 40, 45, 49 ]
scores      = [ [ 0.841, 0.854, 0.851, 0.848, 0.855, 0.894, 0.913, 0.922, 0.930, 0.938, 0.944 ],
                [ 0.821, 0.852, 0.842, 0.838, 0.841, 0.850, 0.858, 0.867, 0.878, 0.887, 0.894 ],
                [ 0.829, 0.852, 0.851, 0.845, 0.849, 0.877, 0.899, 0.916, 0.924, 0.933, 0.939 ], ]
algorithms  = [ "LightGBM", "KNN", "Random Forests" ]

# Missing-value counts, in increasing order (49 values).
sum_nan     = [ 106,   153,   185,   196,   216,   374,   432,   617,  1096,
                1162,  1261,  1370,  1529,  1575,  1812,  2125,  2505,  2740,
                3490,  3518,  3600,  4038,  4392,  5787,  6083,  6467,  6563,
                7075,  7184,  7991,  8361,  8368,  8847,  9997,  10105, 10255,
                10859, 11218, 12791, 12840, 13195, 13314, 13524, 13537, 13648,
                18854, 19564, 21801, 26528 ]

# Running total of the missing values.
ecdf        = np.cumsum(sum_nan).tolist()

titles      = [ "Scores des tests en fonction du nombre de villes",
                "Somme cumulée des valeurs manquantes en fonction du nombre de villes" ]
fig         = make_subplots(rows = 1, cols = 2, subplot_titles = titles)

# Left panel: one curve per algorithm.
for algo, algo_scores in zip(algorithms, scores):
    curve = go.Scatter(x    = nb_cities,
                       y    = algo_scores,
                       mode = "lines + markers",
                       line = dict(width = 3),
                       name = algo)
    fig.add_trace(curve, row = 1, col = 1)

# Right panel: cumulative missing values.
cumulative = go.Scatter(x    = list(range(1, 50)),
                        y    = ecdf,
                        mode = "lines + markers",
                        line = dict(width = 1, color = "orange"),
                        name = "ecdf")
fig.add_trace(cumulative, row = 1, col = 2)

tick_every_5 = dict(tickmode = "linear", tick0 = 0, dtick = 5)

fig.update_layout(width                 = 1920,
                  height                = 720,
                  title_font_size       = 20,
                  title_text            = "Croissances comparées des scores de tests et des valeurs manquantes en fonction du nombre de villes triées par valeurs manquantes - KNN Imputation",
                  xaxis                 = dict(tick_every_5),
                  xaxis2                = dict(tick_every_5),
                  template              = "plotly_dark")

fig.update_xaxes(title_font_size = 20, tickfont = dict(size = 20))
fig.update_yaxes(tickfont = dict(size = 20))

fig.show()

#### Sous-échantillonnage et sur-échantillonnage du modèle LightGBM

In [None]:
# Searches and evaluates the LightGBM model after under-sampling the training
# data, then prints the results.
FILE                = "../csv/subsets/df_49_knn.csv"
MODEL               = "LGB"

ml_meteo            = MlModels()
results             = Results()

ml_meteo.init_data(FILE, 2009, 2017)
ml_meteo.split_data("RainTomorrow", 0.2)
ml_meteo.scale_data()
ml_meteo.under_sample_data(strategy = "majority", voting = "hard")

print(ml_meteo.data.shape)

clf, params = ml_meteo.search_model(space_lgb, MODEL)
# Named `evaluation` so the builtin eval() is not shadowed in the notebook.
evaluation  = ml_meteo.eval_model(clf, params)

results.init_models(MODEL)
results.register_ml(MODEL, clf, evaluation)
results.print_ml_results(MODEL)

In [None]:
# Bar chart comparing the LightGBM test score without resampling and with the
# two under-sampling variants. The values are entered by hand.
algorithms  = [ "LGB" ]

# (legend name, score, bar colour)
bar_specs   = [ ("Test score", 0.944, "lime"),
                ("Hard score", 0.921, "#5DADE2"),
                ("Soft score", 0.753, "orange") ]

bars        = [ go.Bar(name = label, x = algorithms, y = [ score ], marker_color = color)
                for label, score, color in bar_specs ]
fig         = go.Figure(data = bars)

fig.update_layout(width                 = 1080,
                  height                = 720,
                  title_font_size       = 20,
                  title_text            = "Sous-échantillonage de la classe majoritaire sur le jeu de données complet avec ClusterCentroids",
                  barmode               = "group",
                  template              = "plotly_dark",
                  bargroupgap           = 0.1)

fig.update_xaxes(tickfont = dict(size = 40))
fig.update_yaxes(tickfont = dict(size = 20), tickmode = "linear", range = [0.5, 1], dtick = 0.05)
fig.show()

In [None]:
# Searches and evaluates the LightGBM model after over-sampling the training
# data, then prints the results.
FILE                = "../csv/subsets/df_49_knn.csv"
MODEL               = "LGB"

ml_meteo            = MlModels()
results             = Results()

ml_meteo.init_data(FILE, 2009, 2017)
# NOTE(review): this cell uses a 0.25 split whereas the other ML cells use
# 0.20 — confirm the scores are meant to be compared.
ml_meteo.split_data("RainTomorrow", 0.25)
ml_meteo.scale_data()
ml_meteo.over_sample_data(strategy = "minority")

print(ml_meteo.data.shape)

clf, params = ml_meteo.search_model(space_lgb, MODEL)
# Named `evaluation` so the builtin eval() is not shadowed in the notebook.
evaluation  = ml_meteo.eval_model(clf, params)

results.init_models(MODEL)
results.register_ml(MODEL, clf, evaluation)
results.print_ml_results(MODEL)

In [None]:
# Bar chart comparing the LightGBM test score without resampling and with the
# two over-sampling strategies. The values are entered by hand.
algorithms  = [ "LGB" ]

fig         = go.Figure(data = [ go.Bar(name = "Test score", x = algorithms, y = [ 0.944 ], marker_color = "lime"),
                                 go.Bar(name = "Minority score", x = algorithms, y = [ 0.938 ], marker_color = "#5DADE2"),
                                 go.Bar(name = "All score", x = algorithms, y = [ 0.939 ], marker_color = "orange") ])

# Title fixed: over-sampling targets the minority class (the corresponding
# training cell uses strategy = "minority"), not the majority class.
fig.update_layout(width = 1080, height  = 720,
                  title_font_size       = 20,
                  title_text            = "Sur-échantillonage de la classe minoritaire sur le jeu de données complet avec SMOTE",
                  barmode               = "group",
                  template              = "plotly_dark",
                  bargroupgap = 0.1)

fig.update_xaxes(tickfont = dict(size = 40))
fig.update_yaxes(tickfont = dict(size = 20), tickmode = "linear", range = [0.5, 1], dtick = 0.05)
fig.show()

#### Stacking Classifier

In [None]:
# Stacks the four persisted classifiers, cross-validates the stack on the
# training part, then fits it and reports its scores on the chronological
# hold-out part.
FILE                    = "../csv/subsets/df_49_knn.csv"

data                    = pd.read_csv(FILE)
data["Date"]            = pd.to_datetime(data["Date"])
data["RainTomorrow"]    = data["RainTomorrow"].astype(np.int8)
data                    = data.sort_values(by = ["Date"])                   # chronological order before the unshuffled split
data                    = data.drop(["Date", "Location"], axis = 1)

x                       = data.drop("RainTomorrow", axis = 1)
y                       = data["RainTomorrow"]
# shuffle = False keeps the last 20 % of the rows as the test set
# (random_state has no effect without shuffling).
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, shuffle = False, random_state = 123)
scaler                  = StandardScaler()
x_train                 = scaler.fit_transform(x_train)                     # scaler fitted on the training part only
x_test                  = scaler.transform(x_test)

lrcc                    = joblib.load("../models/bests/LRC_knn_49.joblib")
knnc                    = joblib.load("../models/bests/KNN_knn_49.joblib")
rfcc                    = joblib.load("../models/bests/RFC_knn_49.joblib")
lgbc                    = joblib.load("../models/bests/LGB_knn_49.joblib")

sclf                    = StackingClassifier(estimators = [ ("lrrc", lrcc), ("knnc", knnc), ("rfcc", rfcc), ("lgbc", lgbc) ], final_estimator = lgbc)
scores                  = cross_validate(sclf, x_train, y_train, scoring = [ "accuracy" ])
cv_accuracy             = scores["test_accuracy"]                           # one accuracy per fold

sclf.fit(x_train, y_train)

y_pred                  = sclf.predict(x_test)
train_score             = sclf.score(x_train, y_train)
test_score              = sclf.score(x_test, y_test)
crosstab                = pd.crosstab(y_test, y_pred)
report                  = classification_report_imbalanced(y_test, y_pred)

# The cross-validation result is now reported instead of being discarded.
print(f"CV Accuracy             : {cv_accuracy.mean():.3f} (+/- {cv_accuracy.std():.3f})")
print(f"Train Score             : {train_score:.3f}")
print(f"Test Score              : {test_score:.3f}")
print(f"Confusion matrix        :\n\n{crosstab}\n\n")
print(f"Classification report   :\n{report}\n")

#### Importance des variables explicatives sur le modèle LightGBM

In [None]:
from lightgbm import plot_importance

# Evaluates the persisted LightGBM model on the chronological hold-out part
# and plots the gain-based importance of the explanatory variables.
FILE                    = "../csv/subsets/df_49_knn.csv"

data                    = pd.read_csv(FILE)
data["Date"]            = pd.to_datetime(data["Date"])
data["RainTomorrow"]    = data["RainTomorrow"].astype(np.int8)
data                    = data.sort_values(by = ["Date"])                   # chronological order before the unshuffled split
data                    = data.drop(["Date", "Location"], axis = 1)

x                       = data.drop("RainTomorrow", axis = 1)
y                       = data["RainTomorrow"]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.20, shuffle = False, random_state = 123)
scaler                  = StandardScaler()
x_train                 = scaler.fit_transform(x_train)                     # scaler fitted on the training part only
x_test                  = scaler.transform(x_test)

lgbc                    = joblib.load("../models/bests/LGB_knn_49.joblib")

y_pred                  = lgbc.predict(x_test)
train_score             = lgbc.score(x_train, y_train)
test_score              = lgbc.score(x_test, y_test)
crosstab                = pd.crosstab(y_test, y_pred)
report                  = classification_report_imbalanced(y_test, y_pred)

print(f"Train Score             : {train_score:.3f}")
print(f"Test Score              : {test_score:.3f}")
print(f"Confusion matrix        :\n\n{crosstab}\n\n")
print(f"Classification report   :\n{report}\n")

################################################################################

ax                      = plot_importance(lgbc,
                                          importance_type = "gain",
                                          figsize=(16, 16),
                                          title = "LightGBM - Importance des variables explicatives [ Gain ]")

# plot_importance orders the bars by importance (and can leave out features
# with zero importance), so the labels cannot be assigned in dataset column
# order. Each existing tick label is translated individually instead: the
# default "Column_<i>" names are mapped to the i-th dataset column, and any
# other label (a model trained with real feature names) is kept as is.
feature_names           = list(x.columns)
tick_labels             = []

for tick in ax.get_yticklabels():
    label               = tick.get_text()
    prefix, _, position = label.partition("_")

    if prefix == "Column" and position.isdigit() and int(position) < len(feature_names):
        label           = feature_names[int(position)]
    tick_labels.append(label)

ax.set_yticklabels(tick_labels)
plt.show()

#### Arbre d'exécution LightGBM

In [None]:
from lightgbm import plot_tree

# Draw the tree at index 0 of the `lgbc` model loaded in the feature-importance
# cell, on a narrow, tall, high-resolution figure.
ax = plot_tree(
    lgbc,
    tree_index = 0,
    figsize    = (4, 16),
    dpi        = 1200,
)
plt.show()

### Réseaux de neurones

In [None]:
# Neural_Net_01: hyper-parameter search with IxHyperModel, evaluation through
# the dense-model path, then registration, report writing and persistence.
FILE                = "../csv/subsets/df_49_knn.csv"
PROJECT             = "../models/deep_learning/trials/meteo_01"
MODEL               = "Neural_Net_01"

dl_meteo            = DLModels()
results             = Results()

# Data preparation. 2009 / 2017 are presumably the first and last years kept -
# confirm against DLModels.init_data.
dl_meteo.init_data(FILE, 2009, 2017)
dl_meteo.split_data("RainTomorrow")
dl_meteo.scale_data()

model               = IxHyperModel()

# The search hands back the selected model together with its hyper-parameters.
clf, best_params    = dl_meteo.search_model(model   = model,
                                            epochs  = 10,
                                            project = PROJECT)

# Named `evaluation` rather than `eval` so the Python builtin is not shadowed.
evaluation          = dl_meteo.eval_dense_model(model         = clf,
                                                best_params   = best_params,
                                                epochs        = 100,
                                                split         = 0.2)

results.init_models(MODEL)
results.register_dl(MODEL, clf, evaluation)
results.write_dl_results(MODEL, f"../results/deep_learning/results_{MODEL}_49.txt")
results.persist_dl_model(clf, MODEL, "../models/deep_learning/dense_neural_networks/", "_knn_49")

In [None]:
# Neural_Net_02: hyper-parameter search with IdxHyperModel, evaluation through
# the dense-model path, then registration, report writing and persistence.
FILE                = "../csv/subsets/df_49_knn.csv"
PROJECT             = "../models/deep_learning/trials/meteo_02"
MODEL               = "Neural_Net_02"

dl_meteo            = DLModels()
results             = Results()

# Data preparation. 2009 / 2017 are presumably the first and last years kept -
# confirm against DLModels.init_data.
dl_meteo.init_data(FILE, 2009, 2017)
dl_meteo.split_data("RainTomorrow")
dl_meteo.scale_data()

print(dl_meteo.data.shape)

model               = IdxHyperModel()

# The search hands back the selected model together with its hyper-parameters.
clf, best_params    = dl_meteo.search_model(model   = model,
                                            epochs  = 10,
                                            project = PROJECT)

# Named `evaluation` rather than `eval` so the Python builtin is not shadowed.
evaluation          = dl_meteo.eval_dense_model(model         = clf,
                                                best_params   = best_params,
                                                epochs        = 100,
                                                split         = 0.2)

results.init_models(MODEL)
results.register_dl(MODEL, clf, evaluation)
results.write_dl_results(MODEL, f"../results/deep_learning/results_{MODEL}_49.txt")
results.persist_dl_model(clf, MODEL, "../models/deep_learning/dense_neural_networks/", "_knn_49")

In [None]:
# Neural_Net_03: hyper-parameter search with IddxHyperModel, evaluation through
# the dense-model path, then registration, report writing and persistence.
FILE                = "../csv/subsets/df_49_knn.csv"
PROJECT             = "../models/deep_learning/trials/meteo_03"
MODEL               = "Neural_Net_03"

dl_meteo            = DLModels()
results             = Results()

# Data preparation. 2009 / 2017 are presumably the first and last years kept -
# confirm against DLModels.init_data.
dl_meteo.init_data(FILE, 2009, 2017)
dl_meteo.split_data("RainTomorrow")
dl_meteo.scale_data()

print(dl_meteo.data.shape)

model               = IddxHyperModel()

# The search hands back the selected model together with its hyper-parameters.
clf, best_params    = dl_meteo.search_model(model   = model,
                                            epochs  = 10,
                                            project = PROJECT)

# Named `evaluation` rather than `eval` so the Python builtin is not shadowed.
evaluation          = dl_meteo.eval_dense_model(model         = clf,
                                                best_params   = best_params,
                                                epochs        = 100,
                                                split         = 0.2)

results.init_models(MODEL)
results.register_dl(MODEL, clf, evaluation)
results.write_dl_results(MODEL, f"../results/deep_learning/results_{MODEL}_49.txt")
results.persist_dl_model(clf, MODEL, "../models/deep_learning/dense_neural_networks/", "_knn_49")

In [None]:
# Neural_Net_04: hyper-parameter search with IdOdxHyperModel (50 search epochs),
# evaluation through the dense-model path, then registration, report writing
# and persistence.
FILE                = "../csv/subsets/df_49_knn.csv"
PROJECT             = "../models/deep_learning/trials/meteo_04"
MODEL               = "Neural_Net_04"

dl_meteo            = DLModels()
results             = Results()

# Data preparation. 2009 / 2017 are presumably the first and last years kept -
# confirm against DLModels.init_data.
dl_meteo.init_data(FILE, 2009, 2017)
dl_meteo.split_data("RainTomorrow")
dl_meteo.scale_data()

print(dl_meteo.data.shape)

model               = IdOdxHyperModel()

# The search hands back the selected model together with its hyper-parameters.
clf, best_params    = dl_meteo.search_model(model   = model,
                                            epochs  = 50,
                                            project = PROJECT)

# Named `evaluation` rather than `eval` so the Python builtin is not shadowed.
evaluation          = dl_meteo.eval_dense_model(model         = clf,
                                                best_params   = best_params,
                                                epochs        = 100,
                                                split         = 0.2)

results.init_models(MODEL)
results.register_dl(MODEL, clf, evaluation)
results.write_dl_results(MODEL, f"../results/deep_learning/results_{MODEL}_49.txt")
results.persist_dl_model(clf, MODEL, "../models/deep_learning/dense_neural_networks/", "_knn_49")

In [None]:
# Neural_Net_05: hyper-parameter search with SrnnHyperModel on 3-D inputs,
# evaluation through the recurrent-model path, then registration, report
# writing and persistence.
FILE                = "../csv/subsets/df_49_knn.csv"
PROJECT             = "../models/deep_learning/trials/meteo_05"
MODEL               = "Neural_Net_05"

dl_meteo            = DLModels()
results             = Results()

# Data preparation. 2009 / 2017 are presumably the first and last years kept -
# confirm against DLModels.init_data.
dl_meteo.init_data(FILE, 2009, 2017)
dl_meteo.split_data("RainTomorrow")
dl_meteo.scale_data()

print(dl_meteo.data.shape)

# Append a trailing axis of length 1: (rows, columns) becomes (rows, columns, 1).
dl_meteo.x_train    = np.reshape(dl_meteo.x_train, (dl_meteo.x_train.shape[0], dl_meteo.x_train.shape[1], 1))
dl_meteo.x_test     = np.reshape(dl_meteo.x_test, (dl_meteo.x_test.shape[0], dl_meteo.x_test.shape[1], 1))

model               = SrnnHyperModel()

# The search hands back the selected model together with its hyper-parameters.
clf, best_params    = dl_meteo.search_model(model   = model,
                                            epochs  = 50,
                                            project = PROJECT)

# Named `evaluation` rather than `eval` so the Python builtin is not shadowed.
evaluation          = dl_meteo.eval_recurrent_model(model         = clf,
                                                    best_params   = best_params,
                                                    epochs        = 100,
                                                    split         = 0.2)

results.init_models(MODEL)
results.register_dl(MODEL, clf, evaluation)
results.write_dl_results(MODEL, f"../results/deep_learning/results_{MODEL}_49.txt")
# NOTE(review): this recurrent model is stored under dense_neural_networks/,
# like the dense ones - confirm that this target directory is intended.
results.persist_dl_model(clf, MODEL, "../models/deep_learning/dense_neural_networks/", "_knn_49")

In [None]:
# Neural_Net_Test: hyper-parameter search with LSTMHyperModel on 3-D inputs,
# evaluation through the recurrent-model path, then registration, report
# writing and persistence.
FILE                = "../csv/subsets/df_49_knn.csv"
PROJECT             = "../models/deep_learning/trials/test"
MODEL               = "Neural_Net_Test"

dl_meteo            = DLModels()
results             = Results()

# Data preparation. 2009 / 2017 are presumably the first and last years kept -
# confirm against DLModels.init_data.
dl_meteo.init_data(FILE, 2009, 2017)
dl_meteo.split_data("RainTomorrow")
dl_meteo.scale_data()

print(dl_meteo.data.shape)

# Append a trailing axis of length 1: (rows, columns) becomes (rows, columns, 1).
dl_meteo.x_train    = np.reshape(dl_meteo.x_train, (dl_meteo.x_train.shape[0], dl_meteo.x_train.shape[1], 1))
dl_meteo.x_test     = np.reshape(dl_meteo.x_test, (dl_meteo.x_test.shape[0], dl_meteo.x_test.shape[1], 1))

model               = LSTMHyperModel()

# The search hands back the selected model together with its hyper-parameters.
clf, best_params    = dl_meteo.search_model(model   = model,
                                            epochs  = 50,
                                            project = PROJECT)

# Uses eval_recurrent_model, as the SimpleRNN cell (Neural_Net_05) does after the
# same 3-D reshape. NOTE(review): confirm against DLModels that this is the
# intended evaluation method for the LSTM model.
# Named `evaluation` rather than `eval` so the Python builtin is not shadowed.
evaluation          = dl_meteo.eval_recurrent_model(model         = clf,
                                                    best_params   = best_params,
                                                    epochs        = 100,
                                                    split         = 0.2)

results.init_models(MODEL)
results.register_dl(MODEL, clf, evaluation)
results.write_dl_results(MODEL, f"../results/deep_learning/results_{MODEL}_49.txt")
# NOTE(review): this recurrent model is stored under dense_neural_networks/,
# like the dense ones - confirm that this target directory is intended.
results.persist_dl_model(clf, MODEL, "../models/deep_learning/dense_neural_networks/", "_knn_49")