# Passage en donnée catégorielle pour la détection de fréquences récurrentes

### Import libraries

In [21]:
import pandas as pd
import numpy as np

### Import datasets

In [22]:
data =  pd.read_csv("../data_pretraitee/data_pretraite.csv", sep=';', index_col = 0, parse_dates=['Date'])
meteo = pd.read_excel("../data/meteo_2017_2022.xlsx")

In [23]:
meteo_18_21=meteo[(meteo["Année"]!=2017) & (meteo["Année"]!=2022)]

In [24]:
data.columns

Index(['N2O', 'Date', 'Bande', 'soilH_h1', 'soilH_h2', 'NO3_h1', 'NO3_h2',
       'NH4_h1', 'NH4_h2', 'meanTP', 'air_humidity', 'pluie',
       'Travail_sol_J60', 'Semis_J90', 'CP', 'Peuplement', 'fertilisation',
       'nbJours_ferti', 'Ajout_phyto_30J', 'Ajout_phyto_45J',
       'Ajout_phyto_60J', 'Ajout_ferti_30J', 'Ajout_ferti_45J',
       'Ajout_ferti_60J', 'Recolte', 'mean_soilH_h1_J5', 'mean_soilH_h2_J5',
       'mean_NH4_h1_J5', 'mean_NH4_h2_J5', 'mean_NO3_h1_J5', 'mean_NO3_h2_J5',
       'N2O_J182', 'Humidite_max_5J', 'Humidite_min_5J', 'Humidite_moyenne_5J',
       'Temp_max_5J', 'Temp_min_5J', 'Temp_moyenne_5J', 'Sum_ETP_5J',
       'Sum_pluie_5J', 'acs', 'pn', 'bi', 'bio', 'elev', 'ref', 'week',
       'month', 'year', '2018', '2019', '2020', '2021', 'Pic_J182'],
      dtype='object')

In [25]:

data2 = data[["N2O", "Date", "Bande", "CP", "Peuplement"
            , "Travail_sol_J60", 'soilH_h1', 'soilH_h2'
            , "Ajout_phyto_30J", 'Ajout_phyto_45J', 'Ajout_phyto_60J'
            , "Ajout_ferti_30J", 'Ajout_ferti_45J' , 'Ajout_ferti_60J'
            , "N2O_J182", "Temp_moyenne_5J"
            , "Sum_pluie_5J", "year", "month"]]

### Define Class Categorizer

In [26]:
class categorizer :
    """provide functions to transform a quantitative column to a qualitative column"""

    def __init__(self, df_cible: object) -> None:
        self.df = df_cible

    def by_quantile(self, df_quantile: object, col_quantile : str, col_cible : str, name_col : str, factor : int = 1) -> object: 
        """
        df_quantile : Dataframe we will used for quantile
        col_quantile : The column that will be used to estimate the quantiles [0.25, 0.75]
        factor : Possibility to multiply the quantiles by a factor (usefull if we compare a daily value to a weekly value)
        col_cible : The column that wil be categorized according to the quantiles found
        Name_col : Name of the new column
        """
        quantiles = df_quantile[col_quantile].quantile([0.25, 0.75])
        self.df[name_col] = np.select(
                            [self.df[col_cible] >= (quantiles[0.75] * factor)
                            , self.df[col_cible] <= (quantiles[0.25] * factor)
                            , (self.df[col_cible] > (quantiles[0.25] * factor)) & (self.df[col_cible] < (quantiles[0.75] * factor))]
                            , ["elevee","basse","moyenne"])
        return (self.df)

    def by_values(self, col_cible : str, values_int : list, values_str : list, name_col : str) -> object :
        """
        col_cible : The column that wil be categorized according to the values_int
        values_int : List of int used to separate the col_cible
        values_str : Names of the categories
        Name_col : Name of the new column
        """
        for i in range(len(values_int)):
            # print(values_int[i - 1], values_int[i], values_str[i])
            self.df.loc[(self.df[col_cible] > values_int[i - 1])
                        & (self.df[col_cible] <= values_int[i])
                        , name_col] = values_str[i]
        return self.df

    def compare_cols(self, df_source : object, col_cible : str, col_source : str, col_name : str) -> object :
        """
        df_source : Dataframe use for comparing
        col_cible : Name of the column on df_source
        col_source : Name of the column we are looking to categorized
        col_name : Name of the new column
        """
        self.df[col_name] = self.df[col_cible] > df_source[col_source]
        return self.df

### Data Transformation

In [27]:
cat = categorizer(data2.copy())

data2 = cat.by_quantile(meteo_18_21, "Temp moyenne (°C)", "Temp_moyenne_5J", "Temp_quali")
data2 = cat.by_quantile(meteo_18_21, "Pluie (mm)", "Sum_pluie_5J", "Pluie_quali", 5)
data2 = cat.by_values("Sum_pluie_5J", [1, data2.Sum_pluie_5J.max(),0], ['pas_pluie', 'pluie', 'pas_pluie'], 'Pluie_ON')
data2["day"] = data2.Date.dt.day_of_year
data2 = cat.by_values("day", [80, 172, 264, 355, 0], ['hiver', 'printemps', 'ete', 'automne','hiver'], "Saison")
data2.pop("day")
data2 = cat.compare_cols(data2, "N2O", "N2O_J182", "Pic_J182")

data2 = cat.by_quantile(data2, "soilH_h1", "soilH_h1", "soil_h1_quali", 1)
data2 = cat.by_quantile(data2, "soilH_h2", "soilH_h2", "soil_h2_quali", 1)


In [28]:
# Categorisation des pics selon N20. Si pas pics = not_pic
data3 = data2[['N2O']][data2['Pic_J182'] != False]
cat2 = categorizer(data3.copy())
data3 = cat2.by_quantile(data3, "N2O", "N2O", "N2O_quali", 1)
data2 = data2.join(data3['N2O_quali'])
data2[['N2O_quali']] = data2[['N2O_quali']].fillna('not_pic')

In [29]:
data2

Unnamed: 0,N2O,Date,Bande,CP,Peuplement,Travail_sol_J60,soilH_h1,soilH_h2,Ajout_phyto_30J,Ajout_phyto_45J,...,year,month,Temp_quali,Pluie_quali,Pluie_ON,Saison,Pic_J182,soil_h1_quali,soil_h2_quali,N2O_quali
0,18.537129,2018-03-14,acs,ble,ble,False,0.214182,0.197413,0.0,0.0,...,2018,3,moyenne,elevee,pluie,hiver,True,elevee,moyenne,elevee
1,2.088998,2018-03-14,pn,ble,ble,False,0.214570,0.224213,0.0,0.0,...,2018,3,moyenne,elevee,pluie,hiver,False,elevee,elevee,not_pic
2,1.488817,2018-03-14,bi,ble,ble,False,0.221340,0.222435,0.0,0.0,...,2018,3,moyenne,elevee,pluie,hiver,False,elevee,elevee,not_pic
3,1.390654,2018-03-14,bio,ble,ble,False,0.212602,0.214897,0.0,0.0,...,2018,3,moyenne,elevee,pluie,hiver,False,elevee,elevee,not_pic
4,3.297613,2018-03-14,elev,ble,ble,False,0.210646,0.202263,0.0,0.0,...,2018,3,moyenne,elevee,pluie,hiver,False,elevee,moyenne,not_pic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
368,2.389670,2021-07-07,ref,ble,ble,False,0.124132,0.156738,0.0,1.0,...,2021,7,elevee,elevee,pluie,ete,True,basse,moyenne,basse
369,0.163105,2019-08-12,elev,colza,sol_nu,True,0.134634,0.119821,0.0,0.0,...,2019,8,elevee,elevee,pluie,ete,False,moyenne,moyenne,not_pic
370,0.538784,2019-08-12,ref,colza,sol_nu,True,0.157191,0.130231,0.0,0.0,...,2019,8,elevee,elevee,pluie,ete,False,moyenne,moyenne,not_pic
371,10.192294,2019-09-05,elev,colza,colza,True,0.089014,0.144502,1.0,1.0,...,2019,9,moyenne,basse,,ete,True,basse,moyenne,moyenne


### Export data

In [30]:
data2.to_csv("../data_pretraitee/data_categorielle.csv", sep=";")