In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# Input CSV locations: `train_file_path` and `test_file_path` must be defined
# before the reads below. Example values (replace by your own paths):
#train_file_path = '/Users/come/Downloads/X_train_Hi5.csv'
#test_file_path = '/Users/come/Downloads/X_test_Hi5.csv'

# Load the training table first, then the test table.
train_data = pd.read_csv(filepath_or_buffer=train_file_path)
test_data = pd.read_csv(filepath_or_buffer=test_file_path)

# Feature columns used by the model, assembled group by group. The order of
# this list is the column order of the feature tables built from it.
selected_columns = (
    # Hydro
    [
        'piezo_station_investigation_depth',
        'piezo_station_altitude',
        'hydro_observation_result_elab',
        'piezo_obtention_mode',
    ]
    # Meteo
    + [
        'meteo_rain_height',
        'meteo_temperature_avg',
        'meteo_temperature_min',
        'meteo_temperature_max',
        'meteo_humidity_avg',
        'meteo_sunshine_%',
        'meteo_pressure_avg',
    ]
    # Geo
    + [
        'piezo_station_longitude',
        'piezo_station_latitude',
        'distance_piezo_meteo',
    ]
    # Social and economics
    + [
        'insee_%_agri',
        'insee_pop_commune',
    ]
    # Withdrawals ("prelev" columns)
    + [
        'prelev_volume_0',
        'prelev_volume_1',
        'prelev_volume_2',
        'prelev_usage_label_0',
        'prelev_usage_label_1',
        'prelev_usage_label_2',
    ]
)

# Name of the column the model is trained to predict
target_column = 'piezo_groundwater_level_category'

# Step 2: prepare data
# The feature tables are modified column by column further down (categorical
# encoding), so take explicit copies: the writes then target independent
# frames rather than selections of train_data / test_data, which avoids
# pandas' SettingWithCopyWarning.
X_train = train_data[selected_columns].copy()
y_train = train_data[target_column]

X_test = test_data[selected_columns].copy()

# Keep the test row identifiers for the submission file
test_index = test_data['row_index']

# Categorical data encoding
# Every object-dtype feature column is cast to str and mapped to integer codes
# with its own LabelEncoder; the fitted encoders are kept in `encoders`,
# keyed by column name.
cat_cols = X_train.select_dtypes(include=['object']).columns
encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    train_values = X_train[col].astype(str)
    test_values = X_test[col].astype(str)
    X_train[col] = le.fit_transform(train_values)

    # Categories that only occur in the test set get codes appended after the
    # training classes. They are looked up on the str-cast values (the same
    # representation that is passed to transform below), so a value cannot be
    # registered in one form and transformed in another. Sorting makes the
    # assigned codes reproducible from run to run.
    unknown_values = sorted(set(test_values.unique()) - set(le.classes_))
    if unknown_values:
        # Keep classes_ a NumPy array so inverse_transform keeps working.
        le.classes_ = pd.Series(
            list(le.classes_) + unknown_values, dtype=object
        ).to_numpy()

    X_test[col] = le.transform(test_values)
    encoders[col] = le

# Imputation of missing values: the per-column means are learned on the
# training features only, then applied to both sets.
imputer = SimpleImputer(strategy='mean').fit(X_train)
X_train_imputed = imputer.transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Standardise data: the scaler is likewise fitted on the training set only
# and reused unchanged for the test set.
scaler = StandardScaler().fit(X_train_imputed)
X_train_scaled = scaler.transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

# Encode the target labels as integer class codes
target_encoder = LabelEncoder()
y_train_encoded = target_encoder.fit_transform(y_train)

# Step 3: train the model (fit returns the fitted estimator itself)
clf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train_scaled, y_train_encoded)

# Step 4: predict on the test set, then map the codes back to the original labels
y_test_pred_encoded = clf.predict(X_test_scaled)
y_test_pred = target_encoder.inverse_transform(y_test_pred_encoded)

# Build the submission table: the test row identifiers plus the predicted category
submission = pd.DataFrame({'row_index': test_index}).assign(
    piezo_groundwater_level_category=y_test_pred
)

# Write the predictions to CSV. `output_file_path` must be defined beforehand;
# example value (replace by your own path):
#output_file_path = '/Users/come/Downloads/predictions_corrected.csv'
submission.to_csv(path_or_buf=output_file_path, index=False)

print(f"Les prédictions ont été sauvegardées dans le fichier : {output_file_path}")