In [1]:
import os
import pandas as pd
from pathlib import Path
import numpy as np
import seaborn as sns
import plotly.graph_objects as go

In [None]:
# Location of the 2010-2024 measurement export, relative to this notebook.
dataPath = Path("../..").joinpath(
    "validation-meteo-data",
    "donneesmeteo_2010-2024",
    "donneesmeteo_2010-2024.csv",
)
# The export uses ';' as field separator.
meteodf = pd.read_csv(filepath_or_buffer=dataPath, sep=";")
meteodf.head()

In [None]:
# List the distinct parameter labels, then show the frame's dimensions.
print("Parameters:", meteodf["libellecourt"].unique())
meteodf.shape

In [None]:
# A NaN in valeurorigine means the original value was never corrected, so the
# original value is taken to be the row's own valeur.
meteodf["valeurorigine"] = meteodf["valeurorigine"].fillna(meteodf["valeur"])
meteodf

In [None]:
# -999 in valeurorigine is a placeholder and can be considered an actual NaN.
meteodf["valeurorigine"] = meteodf["valeurorigine"].replace(-999, np.nan)
meteodf

In [6]:
# For the classification model we only need the first 5 columns.
# Take an explicit copy: this frame is cleaned in place and receives new
# columns later on, and doing that on a slice of meteodf triggers
# SettingWithCopyWarning.
meteodfToUse = meteodf[meteodf.columns[:5]].copy()

In [None]:
# Remove the rows holding a NaN (the -999 values in valeurorigine) because they
# are easily classified as to be corrected. Note that dropna() discards a row
# when ANY of the kept columns is NaN, not only valeurorigine.
# The result is reassigned instead of using inplace=True, which would mutate a
# slice of meteodf and raise SettingWithCopyWarning.
meteodfToUse = meteodfToUse.dropna()


In [8]:
# Renumber the rows 0..n-1 in place, discarding the old index left by dropna.
meteodfToUse.index = pd.RangeIndex(len(meteodfToUse))

In [None]:
# All measurements are dated at 00:00:00, so only the part before the first
# space (the date) is kept and parsed.
# - astype(str) makes the cell re-runnable: on a second run the column already
#   holds timestamps, which have no .split().
# - .str.split replaces the per-row Python lambda with the vectorized accessor.
# - Bracket assignment is used because attribute assignment only works for
#   columns that already exist.
meteodfToUse["datemesure"] = pd.to_datetime(
    meteodfToUse["datemesure"].astype(str).str.split(" ").str[0]
)

In [10]:
# The INST parameter is used only in 2010 and has been corrected only once, so
# it is excluded. The boolean-mask selection is copied explicitly because new
# columns are added to this frame afterwards; assigning into the bare slice
# raises SettingWithCopyWarning.
meteodfToUse = meteodfToUse[meteodfToUse.libellecourt != "INST"].copy()

In [None]:
# Flag (1/0) the rows whose published value differs from the original one.
# A tolerance could be introduced here to ignore very small corrections.
meteodfToUse["correction"] = (
    meteodfToUse["valeur"].ne(meteodfToUse["valeurorigine"]).astype(int)
)
meteodfToUse

In [None]:

# Number of corrected measurements per (year, parameter).
correctiondf = (
    meteodfToUse
    .groupby([meteodfToUse["datemesure"].dt.year, "libellecourt"])["correction"]
    .sum()
    .reset_index()
)
correctiondf

In [None]:
def plotVarDateParameter(df, varToPlot, dateUnit, varToPlotName=None, normalize=False):
    """Plot ``varToPlot`` against ``datemesure`` with one trace per parameter.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``libellecourt`` (one trace per distinct
        value), ``datemesure`` (x axis) and ``varToPlot`` (y axis).
    varToPlot : str
        Name of the column drawn on the y axis.
    dateUnit : str
        Label of the x axis; also appended to the figure title.
    varToPlotName : str, optional
        Label of the y axis and first part of the title. Defaults to
        ``varToPlot``.
    normalize : bool, default False
        When True, each trace is divided by the largest absolute value of
        that trace. A trace whose largest absolute value is zero or NaN is
        left unscaled instead of being turned into NaN/inf.

    Returns
    -------
    None
        The figure is displayed through ``fig.show()``.
    """
    if varToPlotName is None:
        varToPlotName = varToPlot

    fig = go.Figure()
    for parameter in df.libellecourt.unique():
        data = df[df.libellecourt == parameter]

        normalization = 1
        if normalize:
            peak = data[varToPlot].abs().max()
            # `peak > 0` is False both for 0 and for NaN, so the trace is only
            # rescaled when a usable, non-zero divisor exists.
            if peak > 0:
                normalization = peak

        fig.add_trace(
            go.Scatter(x=data.datemesure, y=data[varToPlot] / normalization, name=parameter)
        )

    fig.update_layout(
        title=dict(text=varToPlotName + " per " + dateUnit),
        xaxis=dict(title=dict(text=dateUnit)),
        yaxis=dict(title=dict(text=varToPlotName)),
    )
    fig.show()
    
# Yearly number of corrections, one trace per parameter.
plotVarDateParameter(
    df=correctiondf,
    varToPlot="correction",
    dateUnit="year",
    varToPlotName="Corrections",
)

In [None]:
# Number of corrected measurements per (calendar month, parameter), all years
# pooled together.
correctiondfM = (
    meteodfToUse
    .groupby([meteodfToUse["datemesure"].dt.month, "libellecourt"])["correction"]
    .sum()
    .reset_index()
)
correctiondfM

In [None]:
# Monthly number of corrections, one trace per parameter.
plotVarDateParameter(
    df=correctiondfM,
    varToPlot="correction",
    dateUnit="month",
    varToPlotName="Corrections",
)

In [None]:
# Mean published value per (calendar month, parameter).
valeurdfM = (
    meteodfToUse
    .groupby([meteodfToUse["datemesure"].dt.month, "libellecourt"])["valeur"]
    .mean()
    .reset_index()
)
valeurdfM

In [None]:
# Monthly mean values: first rescaled per parameter, then in raw units.
plotVarDateParameter(
    df=valeurdfM,
    varToPlot="valeur",
    dateUnit="month",
    varToPlotName="valeur normalized",
    normalize=True,
)
plotVarDateParameter(
    df=valeurdfM,
    varToPlot="valeur",
    dateUnit="month",
    varToPlotName="valeur",
)

In [None]:
# Signed size of the correction: published value minus original value.
meteodfToUse["difference"] = meteodfToUse["valeur"].sub(meteodfToUse["valeurorigine"])

meteodfToUse

In [None]:
# Keep only the measurements flagged as corrected.
correctedDF = meteodfToUse.loc[meteodfToUse["correction"].eq(1)]
correctedDF

In [None]:
# Per (year, parameter), over corrected rows only: mean signed correction ...
differencedfY = (
    correctedDF
    .groupby([correctedDF["datemesure"].dt.year, "libellecourt"])["difference"]
    .mean()
    .reset_index()
)
# ... and mean original value.
valOrigCorrectedDfY = (
    correctedDF
    .groupby([correctedDF["datemesure"].dt.year, "libellecourt"])["valeurorigine"]
    .mean()
    .reset_index()
)
# Join the two aggregates on their shared key columns.
differencedfY = differencedfY.merge(valOrigCorrectedDfY, on=["datemesure", "libellecourt"])
# Ratio of the two group means (not the mean of the per-row ratios).
differencedfY["relative_corr"] = differencedfY["difference"].div(differencedfY["valeurorigine"])
differencedfY

In [None]:
# Yearly mean correction (rescaled per parameter), then the relative correction.
plotVarDateParameter(
    df=differencedfY,
    varToPlot="difference",
    dateUnit="year",
    varToPlotName="valeur - valeurorigine",
    normalize=True,
)
plotVarDateParameter(
    df=differencedfY,
    varToPlot="relative_corr",
    dateUnit="year",
    varToPlotName="(valeur - valeurorigine)/valeurorigine",
)

In [None]:
# Per (calendar month, parameter), over corrected rows only: mean signed
# correction ...
differencedfM = (
    correctedDF
    .groupby([correctedDF["datemesure"].dt.month, "libellecourt"])["difference"]
    .mean()
    .reset_index()
)
# ... and mean original value.
valOrigCorrectedDfM = (
    correctedDF
    .groupby([correctedDF["datemesure"].dt.month, "libellecourt"])["valeurorigine"]
    .mean()
    .reset_index()
)
# Join the two aggregates on their shared key columns.
differencedfM = differencedfM.merge(valOrigCorrectedDfM, on=["datemesure", "libellecourt"])
# Ratio of the two group means (not the mean of the per-row ratios).
differencedfM["relative_corr"] = differencedfM["difference"].div(differencedfM["valeurorigine"])
differencedfM

In [None]:
# Monthly mean correction (rescaled per parameter), then the relative correction.
plotVarDateParameter(
    df=differencedfM,
    varToPlot="difference",
    dateUnit="month",
    varToPlotName="valeur - valeurorigine",
    normalize=True,
)
plotVarDateParameter(
    df=differencedfM,
    varToPlot="relative_corr",
    dateUnit="month",
    varToPlotName="(valeur - valeurorigine)/valeurorigine",
)