In [1]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV

In [3]:
# Read the 2022-07-10 data export into a DataFrame.
df1 = pd.read_csv(filepath_or_buffer='/content/drive/MyDrive/data_file_2022-07-10.csv')

# Show (at most) the first 10,000 rows as this cell's output.
df1.iloc[:10000]

Unnamed: 0,time,temperature,relative_humidity,chocho_conc,co_conc,dust,ecres_conc,ectot_conc,hcho_conc,nh3_conc,...,pm10_conc,pm2p5_conc,pmwf_conc,sia_conc,so2_conc,average_pollen_concentration,average_pollen_c_,average_pollen_c_1,longitude,latitude
0,2022-07-10 12:00:00,273.42755,82.519790,0.000140,124.72867,0.000227,6.219987e-21,0.000001,0.142590,0.000014,...,2.968837,1.426107,4.103644e-21,0.071435,0.785839,0.000165,0.000165,0.000165,-0.037491,76.462498
1,2022-07-10 12:00:00,273.49930,82.249300,0.000140,124.72867,0.000227,6.219987e-21,0.000001,0.142590,0.000014,...,2.968837,1.426107,4.103644e-21,0.071435,0.785839,0.000165,0.000165,0.000165,-0.037491,76.399998
2,2022-07-10 12:00:00,273.48758,83.577900,0.000140,124.72867,0.000227,6.219987e-21,0.000001,0.142590,0.000014,...,2.968837,1.426107,4.103644e-21,0.071435,0.785839,0.000165,0.000165,0.000165,-0.037491,76.337498
3,2022-07-10 12:00:00,273.45100,86.068040,0.000140,124.72867,0.000227,6.219987e-21,0.000001,0.142590,0.000014,...,2.968837,1.426107,4.103644e-21,0.071435,0.785839,0.000165,0.000165,0.000165,-0.037491,76.274998
4,2022-07-10 12:00:00,273.44513,87.661830,0.000140,124.72867,0.000227,6.219987e-21,0.000001,0.142590,0.000014,...,2.968837,1.426107,4.103644e-21,0.071435,0.785839,0.000165,0.000165,0.000165,-0.037491,76.212498
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,2022-07-10 12:00:00,315.58582,7.972221,0.000853,93.46439,18.237415,5.403456e-04,0.011862,0.300023,0.000000,...,19.668943,9.058665,1.110223e-06,1.944046,0.769887,0.086477,0.086500,0.086500,2.250000,27.724999
9996,2022-07-10 12:00:00,315.39987,8.290443,0.000853,93.46439,18.237415,5.403456e-04,0.011862,0.300023,0.000000,...,19.668943,9.058665,1.110223e-06,1.944046,0.769887,0.086477,0.086500,0.086500,2.250000,27.662499
9997,2022-07-10 12:00:00,314.95038,8.332878,0.000853,93.46439,18.237415,5.403456e-04,0.011862,0.300023,0.000000,...,19.668943,9.058665,1.110223e-06,1.944046,0.769887,0.086477,0.086500,0.086500,2.250000,27.599999
9998,2022-07-10 12:00:00,314.30470,8.887123,0.000853,93.46439,18.237415,5.403456e-04,0.011862,0.300023,0.000000,...,19.668943,9.058665,1.110223e-06,1.944046,0.769887,0.086477,0.086500,0.086500,2.250000,27.537499


In [None]:
# Data preparation: select the predictor columns and the target, then split
# them into training and test sets.
feature_columns = [
    'temperature', 'relative_humidity',
    'chocho_conc', 'co_conc', 'dust', 'ecres_conc', 'ectot_conc',
    'hcho_conc', 'nh3_conc', 'nmvoc_conc', 'no2_conc', 'no_conc',
    'o3_conc', 'pans_conc', 'pm10_conc', 'pm2p5_conc', 'pmwf_conc',
    'sia_conc', 'so2_conc',
]
X = df1[feature_columns]
y = df1['average_pollen_concentration']

# Hold out 15% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
)
# Hyper-parameter grid explored by the search (3 x 4 = 12 combinations).
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
}

# 5-fold cross-validated grid search over a seeded random forest.
# refit=True (the scikit-learn default, spelled out here) makes the search
# retrain the best configuration on the whole training set once it finishes.
grid_search = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42, n_jobs=-1),
    param_grid=param_grid,
    cv=5,
    refit=True,
)

# Run the search on the training data.
grid_search.fit(X_train, y_train)

# Report the best parameter combination.
best_params = grid_search.best_params_
print(f"Meilleurs paramètres : {best_params}")

# Reuse the estimator the search already refitted on X_train / y_train with
# the best parameters. Building and fitting a second forest with the same
# parameters and the same seed would only repeat that training.
model = grid_search.best_estimator_

# Predict on the held-out test set.
y_pred = model.predict(X_test)

# Evaluate the model.
# RMSE is computed as the square root of the MSE rather than with
# `squared=False`: that keyword was deprecated in scikit-learn 1.4 and
# removed in later releases, where passing it raises a TypeError.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'RMSE: {rmse}')
print(f'MAE: {mae}')
print(f'R²: {r2}')
print(f'average_pollen_concentration: {y_pred}')