In [2]:
#Librairies
import pandas as pd
import numpy as np
from datetime import datetime

In [3]:
#Configuration
srcFolder = "../data/raw/tempo_202510201126/"
srcFolderMeteo = "../data/raw/data.gouv/"
dstFolder = "../data/processed/tempo_202510201126/"

sources = [
    "agroclim_pheno.csv",
    "epiphyt.csv",
    "foret.csv",
    "ods_recherche.csv",
    "ods_tela_botanica.csv",
    "phenoclim_agroclim_inrae.csv",
    "phenoclim_crea_mont_blanc.csv",
    "phetec_inrae.csv",
    "variables_communes.csv"
]
colonnes = [ 
    'annee', 
    'jour_de_l_annee', 
    'latitude_du_site', 
    'longitude_du_site',
    'regne', 
    'genre',
    'espece',
    'code_du_stade_phenologique',
    'date'
]

In [4]:
#Chargement des données

#Récupération et combinaison des sources
dataFrames = []

for source in sources:
    print(f"Chargement source {source}...")
    
    data = pd.read_csv(
        srcFolder+source,
        encoding='ISO-8859-1', 
        sep=';',
        low_memory=False,
        usecols=colonnes
    )
    print(f"{data.shape[0]} lignes chargées !")
    dataFrames.append(data)
    print("----------------------------------------------")

allData = pd.concat(dataFrames, ignore_index=True)
print(f"Nombre total de résultats : {allData.shape[0]}")
allData['date'] = pd.to_datetime(allData['date'])
allData = allData[allData['date'] >= '1950-01-01']
print(f"Nombre de résultats > 1950 : {allData.shape[0]}")
allData.head(10)

Chargement source agroclim_pheno.csv...
7635 lignes chargées !
----------------------------------------------
Chargement source epiphyt.csv...
255818 lignes chargées !
----------------------------------------------
Chargement source foret.csv...
48070 lignes chargées !
----------------------------------------------
Chargement source ods_recherche.csv...
16090 lignes chargées !
----------------------------------------------
Chargement source ods_tela_botanica.csv...
11613 lignes chargées !
----------------------------------------------
Chargement source phenoclim_agroclim_inrae.csv...
8026 lignes chargées !
----------------------------------------------
Chargement source phenoclim_crea_mont_blanc.csv...
13075 lignes chargées !
----------------------------------------------
Chargement source phetec_inrae.csv...
126 lignes chargées !
----------------------------------------------
Chargement source variables_communes.csv...
360453 lignes chargées !
-----------------------------------------

Unnamed: 0,date,annee,jour_de_l_annee,latitude_du_site,longitude_du_site,regne,genre,espece,code_du_stade_phenologique
13,1950-03-23,1950,82,48.8,2.08,Plantae,Acer,platanoides,61
14,1950-04-14,1950,104,48.8,2.08,Plantae,Acer,platanoides,65
15,1951-04-03,1951,93,48.8,2.08,Plantae,Acer,platanoides,61
1801,1965-04-13,1965,103,48.8,2.08,Plantae,Acer,platanoides,65
1802,1966-04-06,1966,96,48.8,2.08,Plantae,Acer,platanoides,61
1875,1952-04-10,1952,101,48.8,2.08,Plantae,Acer,platanoides,61
1876,1953-03-30,1953,89,48.8,2.08,Plantae,Acer,platanoides,61
1877,1955-04-12,1955,102,48.8,2.08,Plantae,Acer,platanoides,61
1878,1956-04-12,1956,103,48.8,2.08,Plantae,Acer,platanoides,61
1879,1957-03-23,1957,82,48.8,2.08,Plantae,Acer,platanoides,61


In [5]:
#On garde uniquement les lignes qui nous intéresse
allData = allData[(allData["date"] > '1950-01-01') & (allData["regne"] == "Plantae") & (allData["genre"] == "Prunus") & (allData["espece"] == "avium")]
allData = allData.reset_index()

print(f"Corpus : {allData.shape[0]}")


Corpus : 5624


In [6]:
#On conserve uniquement les colonnes qui nous intéressent
allData = allData.drop(["genre", "regne", "espece", "code_du_stade_phenologique"], axis=1)
allData

Unnamed: 0,index,date,annee,jour_de_l_annee,latitude_du_site,longitude_du_site
0,11808,2016-04-13,2016,104,43.857086,4.634030
1,11810,2016-04-20,2016,111,43.857086,4.634030
2,11871,2015-04-14,2015,104,43.666380,4.399529
3,11872,2015-04-20,2015,110,43.902252,4.562464
4,11873,2015-04-20,2015,110,43.902252,4.562464
...,...,...,...,...,...,...
5619,715638,2021-04-19,2021,109,44.188300,5.172060
5620,715639,2021-04-19,2021,109,44.188300,5.172060
5621,715640,2021-04-19,2021,109,44.188300,5.172060
5622,715641,2021-04-19,2021,109,44.188300,5.172060


In [7]:
#On ajoute la colonne date int
allData['date_int'] = allData['date'].astype(str).str.replace('-', '').astype(int)

In [8]:
allData

Unnamed: 0,index,date,annee,jour_de_l_annee,latitude_du_site,longitude_du_site,date_int
0,11808,2016-04-13,2016,104,43.857086,4.634030,20160413
1,11810,2016-04-20,2016,111,43.857086,4.634030,20160420
2,11871,2015-04-14,2015,104,43.666380,4.399529,20150414
3,11872,2015-04-20,2015,110,43.902252,4.562464,20150420
4,11873,2015-04-20,2015,110,43.902252,4.562464,20150420
...,...,...,...,...,...,...,...
5619,715638,2021-04-19,2021,109,44.188300,5.172060,20210419
5620,715639,2021-04-19,2021,109,44.188300,5.172060,20210419
5621,715640,2021-04-19,2021,109,44.188300,5.172060,20210419
5622,715641,2021-04-19,2021,109,44.188300,5.172060,20210419


In [9]:
#Chargement des données météo

dfMeteo = pd.read_parquet(f"{srcFolderMeteo}meteo.parquet", engine="fastparquet")

In [10]:
def haversine_distance(lat1, lon1, lat2, lon2):
    """
    Calcule la distance en kilomètres entre deux points géographiques
    en utilisant la formule de Haversine
    """
    # Rayon de la Terre en km
    R = 6371.0
    
    # Conversion en radians
    lat1_rad = np.radians(lat1)
    lon1_rad = np.radians(lon1)
    lat2_rad = np.radians(lat2)
    lon2_rad = np.radians(lon2)
    
    # Différences
    dlat = lat2_rad - lat1_rad
    dlon = lon2_rad - lon1_rad
    
    # Formule de Haversine
    a = np.sin(dlat/2)**2 + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(dlon/2)**2
    c = 2 * np.arcsin(np.sqrt(a))
    distance = R * c
    
    return distance

In [11]:

def jour_de_annee(date_str):
    """
    Calcule le numéro du jour dans l'année
    date_str: chaîne au format "AAAAMMJJ"
    retourne: numéro du jour (1-366)
    """
    date = datetime.strptime(date_str, "%Y%m%d")
    return date.timetuple().tm_yday

In [21]:


def create_lines(row):
    dateStart = int(row["annee"])*10000+101
    dateEnd = int(row["date_int"])
    latA = row["latitude_du_site"]
    longA = row["longitude_du_site"]
    test = dfMeteo[(dfMeteo["AAAAMMJJ"] <= dateEnd) & (dfMeteo["AAAAMMJJ"] >= dateStart)].sort_values("AAAAMMJJ").copy()

    #On calcule la distance
    test['distance'] = haversine_distance(latA, longA, test['LAT'], test['LON'])

    #On recherche le point le plus proche
    points_plus_proches = dfMeteo.loc[test.groupby('AAAAMMJJ')['distance'].idxmin()]

    #On calcule la T moye
    points_plus_proches['T_moy'] = (points_plus_proches['TX'] + points_plus_proches['TN']) / 2

    T_base = 5
    points_plus_proches['GDD5'] = points_plus_proches['T_moy'].apply(lambda x: max(0, x - T_base))
    points_plus_proches['GDD5_cumul'] = points_plus_proches['GDD5'].cumsum()

    T_base = 10
    points_plus_proches['GDD10'] = points_plus_proches['T_moy'].apply(lambda x: max(0, x - T_base))
    points_plus_proches['GDD10_cumul'] = points_plus_proches['GDD10'].cumsum()

    points_plus_proches['jour_n'] = range(1, len(points_plus_proches) + 1)
    points_plus_proches

    points_plus_proches = points_plus_proches.reset_index(drop=True)

    col_latitude = []
    col_longitude = []
    col_n_avant_floraison = []
    col_jour_n = []
    col_temps_thermique5 = []
    col_temps_thermique10 = []
    col_date = []

    jour_floraison = len(points_plus_proches)+1

    jour_n_theorique = jour_de_annee(str(dateEnd))
    print(jour_n_theorique)

    for index, row2 in points_plus_proches.iterrows():
        col_latitude.append(latA)
        col_longitude.append(longA)
        col_n_avant_floraison.append(jour_floraison-index-2)
        col_jour_n.append(index+1)
        col_temps_thermique5.append(row2['GDD5_cumul']),
        col_temps_thermique10.append(row2['GDD10_cumul'])
        col_date.append(row2["AAAAMMJJ"])

    dfAdd = pd.DataFrame({
        'latitude' : col_latitude,
        'longitude' : col_longitude,
        'n_avant_floraison' : col_n_avant_floraison,
        'jour_n' : col_jour_n,
        'temps_thermique5' : col_temps_thermique5,
        'temps_thermique10' : col_temps_thermique10,
        'date' : col_date
    })

    return dfAdd


In [27]:
allDf = []

for index, row in allData.iterrows():
    print(index)
    allDf.append(create_lines(row))


0
104
1
111
2
104
3
110
4
110
5
104
6
104
7
111
8
118
9
118
10
97
11
105
12
105
13
112
14
111
15
111
16
105
17
105
18
112
19
97
20
97
21
81
22
87
23
87
24
103
25
117
26
117
27
110
28
103
29
117
30
123
31
130
32
100
33
86
34
116
35
103
36
86
37
86
38
118
39
109
40
116
41
93
42
97
43
95
44
84
45
109
46
109
47
109
48
110
49
95
50
102
51
102
52
101
53
88
54
88
55
95
56
95
57
95
58
102
59
102
60
93
61
93
62
100
63
93
64
99
65
107
66
117
67
101
68
114
69
107
70
85
71
85
72
93
73
107
74
107
75
85
76
100
77
114
78
114
79
104
80
110
81
111
82
99
83
107
84
107
85
107
86
117
87
101
88
107
89
103
90
86
91
102
92
102
93
116
94
105
95
102
96
93
97
97
98
103
99
86
100
86
101
100
102
102
103
95
104
95
105
95
106
102
107
81
108
81
109
110
110
85
111
107
112
99
113
121
114
108
115
107
116
86
117
100
118
86
119
124
120
102
121
102
122
109
123
116
124
116
125
86
126
103
127
95
128
102
129
102
130
88
131
95
132
95
133
95
134
102
135
71
136
113
137
101
138
102
139
86
140
103
141
102
142
116
143
95
144
102
1

In [28]:
dfFinal = pd.concat(allDf, ignore_index=True)

In [29]:
dfFinal

Unnamed: 0,latitude,longitude,n_avant_floraison,jour_n,temps_thermique5,temps_thermique10,date
0,43.857086,4.63403,103,1,5.70,0.7,20160101.0
1,43.857086,4.63403,102,2,12.20,2.2,20160102.0
2,43.857086,4.63403,101,3,13.70,2.2,20160103.0
3,43.857086,4.63403,100,4,17.30,2.2,20160104.0
4,43.857086,4.63403,99,5,21.20,2.2,20160105.0
...,...,...,...,...,...,...,...
553131,44.188300,5.17206,4,105,347.95,55.4,20210415.0
553132,44.188300,5.17206,3,106,350.05,55.4,20210416.0
553133,44.188300,5.17206,2,107,353.45,55.4,20210417.0
553134,44.188300,5.17206,1,108,356.25,55.4,20210418.0


In [31]:
#Sauvegarde finale 
dfFinal.to_parquet(f"{dstFolder}cerisiers_processed.parquet", engine='fastparquet')