In [None]:
# Import libraries for data processing, model training, and evaluation
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_selector
import numpy as np
from sklearn.compose import make_column_transformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Read the raw weather table from disk (adjust the path to where the CSV actually lives)
df = pd.read_csv(filepath_or_buffer="/weather.csv")

# Data cleaning functions
def data_preparation_0(df):
    """Rename ``<town>_<variable>`` columns to ``<variable>_<town>`` in place.

    Each column label is split on ``"_"``. Labels without an underscore are
    left untouched. Otherwise the first piece becomes the suffix, and the
    second piece (concatenated with the third, when there is one) becomes the
    prefix, e.g. ``"BASEL_temp_mean"`` -> ``"tempmean_BASEL"`` and
    ``"BASEL_pressure"`` -> ``"pressure_BASEL"``. Pieces beyond the third are
    ignored.

    All renames are collected first and applied with a single ``rename``
    call. Renaming one column at a time would rename a freshly produced label
    a second time whenever it equals a later original label (for example the
    pair ``"a_b"`` / ``"b_a"``).

    Args:
        df: DataFrame whose column labels are strings.

    Returns:
        The same DataFrame object, with its columns renamed.
    """
    renames = {}
    for column in df.columns:
        parts = column.split("_")
        if len(parts) == 1:
            continue
        # Only the second and third pieces contribute to the variable name.
        variable = "".join(parts[1:3])
        renames[column] = f"{variable}_{parts[0]}"
    df.rename(columns=renames, inplace=True)
    return df

def data_preparation_1(df):
    """Reshape the wide per-town table into one row per (DATE, town).

    Columns named ``<variable>_<town>`` for the six variables listed in
    ``stubs`` are melted with ``pandas.wide_to_long``; the text after the
    underscore becomes the ``town`` value. Every other column of ``df``
    (apart from ``DATE``) is dropped from the result.

    Args:
        df: Wide DataFrame with a ``DATE`` column and ``<variable>_<town>``
            columns. ``wide_to_long`` requires ``DATE`` to identify each row
            uniquely.

    Returns:
        A new DataFrame with a fresh integer index and the columns ``DATE``,
        ``town``, ``precipitation``, ``tempmean``, ``tempmin``, ``tempmax``,
        ``windspeed`` and ``pressure``. The input frame is not modified.
    """
    stubs = ['precipitation', 'tempmean', 'tempmin',
             'tempmax', 'windspeed', 'pressure']
    # reset_index() returns a new frame, so the caller's frame stays untouched;
    # the extra column it creates is discarded by the selection below.
    df_long = pd.wide_to_long(df.reset_index(), stubnames=stubs,
                              i=['DATE'], j='town', sep='_', suffix='.+')
    # Keep only the measured variables, then turn the (DATE, town) index
    # levels back into ordinary columns.
    return df_long[stubs].reset_index()

def extract_date_features(df):
    """Replace the ``DATE`` column with numeric ``year``/``month``/``day`` columns.

    ``DATE`` is parsed with the ``%Y%m%d`` format. The parsed dates and the
    three derived columns are written onto ``df`` itself; the returned frame
    is a new one without the ``DATE`` column.

    Args:
        df: DataFrame with a ``DATE`` column in ``YYYYMMDD`` form.

    Returns:
        A DataFrame holding every column of ``df`` except ``DATE``.
    """
    parsed = pd.to_datetime(df['DATE'], format='%Y%m%d')
    df['DATE'] = parsed
    # Pull each calendar component off the parsed dates, in a fixed order.
    for component in ('year', 'month', 'day'):
        df[component] = getattr(parsed.dt, component)
    return df.drop(columns='DATE')

def day_in_Life(df, number):
    """Add per-town lagged copies of ``tempmean`` and ``tempmax`` to ``df``.

    For every lag ``k`` from 1 to ``number`` the columns ``tempmean<k>`` and
    ``tempmax<k>`` are added, holding the value found ``k`` rows earlier
    within the same ``town`` group (missing where no such row exists).

    Args:
        df: DataFrame with ``town``, ``tempmean`` and ``tempmax`` columns.
        number: Largest lag to generate; nothing is added when it is below 1.

    Returns:
        The same DataFrame object, with the lag columns added.
    """
    source_cols = ["tempmean", "tempmax"]
    for lag in range(1, number + 1):
        # Shift inside each town so values never leak from one town to another.
        lagged = df.groupby(['town'])[source_cols].shift(lag)
        target_cols = [f"{name}{lag}" for name in source_cols]
        df[target_cols] = lagged
    return df

# Run the cleaning steps in order; each step receives the frame produced by the previous one
df = (
    df.pipe(data_preparation_0)
    .pipe(data_preparation_1)
    .pipe(extract_date_features)
    .pipe(day_in_Life, 2)
)

# Features and target for the regression, split 80/20 with a fixed seed
# NOTE(review): the lag columns created by day_in_Life are not part of x -- confirm this is intended
x = df[['precipitation', 'windspeed', 'pressure', 'year', 'month', 'day']]
y = df['tempmean']
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Numeric preprocessing: mean imputation (plus missing-value indicator columns),
# standardization, then PCA
num_selector = make_column_selector(dtype_include=np.number)
num_tree_processor = make_pipeline(
    SimpleImputer(strategy="mean", add_indicator=True),
    StandardScaler(),
    PCA(),
)
tree_preprocessor = make_column_transformer((num_tree_processor, num_selector))

# Full model: the preprocessing above followed by a seeded random forest
pipe = Pipeline(steps=[
    ('preprocessor', tree_preprocessor),
    ('reg', RandomForestRegressor(random_state=42)),
])

# Hyper-parameter grid: 3 * 3 * 4 * 3 * 3 = 324 combinations
# NOTE(review): x has six columns, so n_components=8 presumably only fits when the
# imputer adds at least two indicator columns -- confirm against the data
param_grid = {
    # Number of principal components kept by the PCA step
    'preprocessor__pipeline__pca__n_components': [4, 6, 8],
    # Number of trees in the forest
    'reg__n_estimators': [50, 100, 150],
    # Maximum depth of each tree (None leaves the depth unbounded)
    'reg__max_depth': [None, 10, 20, 30],
    # Minimum number of samples required to split a node
    'reg__min_samples_split': [2, 5, 10],
    # Minimum number of samples required at a leaf
    'reg__min_samples_leaf': [1, 2, 4],
}

# Exhaustive search over the grid, scored by R2 with 3-fold cross-validation on all cores
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='r2',
    cv=3,
    n_jobs=-1,
    verbose=2,
)

# Run the search on the training split only
grid.fit(x_train, y_train)

# Report the best cross-validated score and the parameters that produced it
print("Meilleur score R2:", grid.best_score_)
print("Meilleurs paramètres:", grid.best_params_)

# Score the search object on the held-out test split
print("Score sur le jeu de test:", grid.score(x_test, y_test))


Fitting 3 folds for each of 324 candidates, totalling 972 fits
