In [None]:
import shutil

import numpy as np
import pandas as pd
import xarray as xr

import src.baseline_model02 as bm

# Second model computation

Setting of the data paths. *nb_train_minicube*, *nb_test_minicube* and *nb_val_minicube* limit the number of "mini data cubes" used for train/test/val, and therefore reduce the computational cost of training and hyper-parameter exploration. When *compute_full_test_set* is activated, the full test set is created.

In [None]:
# Build the Model 2 baseline wrapper. The first positional argument and the *_path keywords
# are files / path prefixes under localdata/; how each one is consumed is defined in
# src.baseline_model02 (not visible in this notebook).
baseline_model_generator_test = bm.BaseLineModel(
    "localdata/smallbox/label/label_",
    dynamic_features_path = "localdata/Model1_score_ERA5_Rez_v2.nc",
    static_features_root_path = "localdata/smallbox/static/static_",
    dynamic_features_FR_path = "localdata/Model1_Score_Full_Rez_v2.nc",
    inf_dynamic_features_FR_path = "localdata/Model1_Score_Full_Rez_inf.nc",
    static_features_FR_path = "localdata/static_Full_Rez.nc",
    labels_ERA5_path = "localdata/final_label_Full_ERA5.nc",
    labels_FR_path = "localdata/final_label_Full_Rez.nc",
    nb_train_minicube = 80, # Very small value: for good performance you will need more datacubes
    nb_test_minicube = 80, # Very small value: for an appropriate test you will need more datacubes
    nb_val_minicube = 20,
    min_score_model1 = 0.2, # presumably a minimum Model 1 score threshold — TODO confirm in BaseLineModel
    name="Baseline_Model_2_Small_20_02", # the same name is passed to load_from_disk() further down
    seed=1
    )

### Preparation of the train / test / val dataset

This process is quite long. The vectorised train / test / val sets can be saved to gain time when training several models on the same data.

In [None]:
# Build the vectorised train / test / val sets (slow; they can be saved afterwards with save_to_disk()).
baseline_model_generator_test.prepare_data()

Vectorising the full data set (all France data on the time slices defined for test/train) might take quite long.
Furthermore, the *Full Test Set* is fixed by nature, so we process the vectorised *Full Test Set* independently.

### Training of a model 

Training a Random Forest with all features, 150 trees and depth 8.

In [None]:
# One boolean per feature (True = feature enabled), followed by the two Random Forest settings.
# Keep the order of the entries unchanged: the list is positional.
baseline_model_generator_test.load_indiv([True, #soilgrid_bdod
                                          True, #soilgrid_cfvo
                                          True, #soilgrid_silt
                                          True, #soilgrid_clay
                                          True, #soilgrid_sand
                                          True, #depth_to_bedrock
                                          True, #altitude
                                          True, #aspect
                                          True, #slope
                                          True, #water_density
                                          True, #watershed
                                          True, #topological_catchment_areas
                                          True, #dist_sea
                                          True, #dist_riv
                                          True, #M1_score
                                          150, #number of trees (per the description above)
                                          8], #tree depth (per the description above)
                                     False) # NOTE(review): meaning of this flag is not visible here — check BaseLineModel.load_indiv

### Saving and loading model

The vectorised test/train/validation datasets and the trained models are saved (the Full Test Set is saved independently).

In [None]:
# Persist the vectorised train/test/val sets and the trained models (the full test set is saved separately).
baseline_model_generator_test.save_to_disk()

### Loading of previously saved models / vectorised dataset 

In [None]:

# Reload a previously saved run by name (same name as given to the constructor above).
# NOTE(review): load_from_disk is called on the existing instance, so in a fresh kernel the
# constructor cell must be run before this one.
baseline_model_generator_test = baseline_model_generator_test.load_from_disk("Baseline_Model_2_Small_20_02")

### Hyper parameters search

Using Genetic Algorithms for hyper parameters optimisation.

In [None]:
# Genetic-algorithm hyper-parameter search, left disabled; uncomment the call below to run it.
# (ngen / pop are presumably the number of generations and the population size — TODO confirm.)
# baseline_model_generator_test.GA_optimisation(ngen = 40, pop = 60)


# Model Analysis

### Feature importance

In [None]:
# Show the feature importances (presumably of the model trained / loaded above — TODO confirm).
baseline_model_generator_test.print_feature_importance()

### Pre-processing of the full test data

Loading of the Full Test Dataset from disk.
This process is quite slow. Once you have done it, you don't need to do it again as long as you don't change your first model's outputs.

In [None]:
# Three steps: compute the full test set, save it, then load it back.
# Only the first step is slow, and it only needs to be done once.
baseline_model_generator_test.prepare_data(compute_full_test_set=True) # Slow: this will take a while, only do it once
baseline_model_generator_test.save_full_test_to_disk(name="Full") # Save the results to disk
baseline_model_generator_test.load_full_test_from_disk(name="Full") # Load the results from disk; start from here if the full test set was already computed

### Geographical results

##### Prediction score Map

##### False Positive, True Positive, False Negative Mapping

In [None]:
# Load the full-resolution data before drawing the maps below.
# NOTE(review): which files are read is defined in BaseLineModel.load_FullRez (not visible here).
baseline_model_generator_test.load_FullRez()

In [None]:
# Draw the classification-outcome maps announced in the heading above; output goes under save_path.
# thresholdM1 / thresholdM2 are presumably the decision thresholds on the Model 1 / Model 2 scores — TODO confirm.
baseline_model_generator_test.print_TNTPFN(save_path="graph/Model2/TNTPFN/", thresholdM1=0.5, thresholdM2=0.5)

In [None]:
# Draw the prediction score maps; output goes under save_path.
# thresholdM1 / thresholdM2 are presumably the same decision thresholds as in the previous cell — TODO confirm.
baseline_model_generator_test.print_proba(save_path="graph/Model2/Proba/", thresholdM1=0.5, thresholdM2=0.5)

### AUC Graphs

In [None]:
# AUC graphs for the full test set.
# NOTE(review): the empty second argument and the list of values (presumably thresholds) are
# interpreted by BaseLineModel.auc_graph — confirm their meaning there.
baseline_model_generator_test.auc_graph("Full_Test", "", [0.01,0.05,0.1,0.15, 0.2,0.3, 0.5, 0.9])

In [None]:
# Compute the AUC metrics with filtering disabled (what `filter` filters is defined in BaseLineModel — TODO confirm).
baseline_model_generator_test.process_AUC_metrics(filter=False)

In [None]:
# Compute the prediction metrics with filtering disabled (same `filter` option as the AUC metrics cell).
baseline_model_generator_test.process_prediction_metrics(filter=False)

# Computation of predictions for codabench

#### Data loading

In [None]:
# Load the inference data used for the codabench predictions.
# NOTE(review): presumably reads inf_dynamic_features_FR_path given to the constructor — confirm in BaseLineModel.
baseline_model_generator_test.load_InfRez()


#### Printing of the prediction map

In [None]:
# Draw the prediction map for the inference data; output goes under save_path.
baseline_model_generator_test.print_proba_inf(save_path="graph/Model2/inference/")

In [None]:
# Save the full predictions to disk.
# NOTE(review): presumably writes the NetCDF file that is re-opened in the next code cell — confirm the output path.
baseline_model_generator_test.save_full_pred()

Loading of the previously computed predictions

In [None]:
# Open the NetCDF file holding the Model 2 inference scores (its M2_score variable is read below).
predictions = xr.open_dataset("localdata/Model2_Score_Full_Rez_inf.nc")

Conversion to vector

In [None]:
def from_xarray_to_vector(data: xr.Dataset,
                          labels_path: str = "localdata/final_label_Full_Rez.nc",
                          mask_date: str = "2002-08-04") -> np.ndarray:
    """Flatten the ``M2_score`` variable of ``data`` and keep only the unmasked entries.

    Args:
        data: Dataset exposing a 3-D ``M2_score`` variable. The first axis is
            presumably time and the other two the spatial grid — TODO confirm.
        labels_path: NetCDF label file used to build the validity mask.
        mask_date: Time coordinate of the label slice used as the mask.

    Returns:
        1-D ``float16`` array with the selected scores.

    A label cell is treated as invalid when it is NaN or equal to -1.
    """
    scores = data.M2_score.values
    n_steps = scores.shape[0]
    # Row-major flattening: all cells of step 0, then all cells of step 1, ...
    flat_scores = scores.reshape(n_steps, scores.shape[1] * scores.shape[2]).flatten()

    # Context manager so the NetCDF file handle is released once the slice is in memory
    # (the original left the dataset open).
    with xr.open_dataset(labels_path) as labels:
        ws = labels.sel(time=mask_date)["__xarray_dataarray_variable__"].values

    valid = ~(np.isnan(ws) | (ws == -1))
    # NOTE(review): np.repeat flattens the mask and repeats EACH element n_steps times
    # consecutively (cell-major order), while flat_scores is step-major. The number of
    # selected values is right, but the two orderings only line up if that is what the
    # consumer of this vector expects; np.tile(valid.ravel(), n_steps) would apply the same
    # spatial mask to every step. Kept as-is so the submission format does not change —
    # confirm against the codabench scoring code before switching.
    mask = np.repeat(valid, n_steps)
    return flat_scores[mask].astype('float16')

In [None]:
# 1-D float16 vector of the masked M2_score values (see from_xarray_to_vector).
out = from_xarray_to_vector(predictions)

In [None]:
# Export the flattened predictions for the codabench submission:
# 1) write them to localdata/pred.csv (pandas adds its default index column and header row);
# 2) zip that single file. make_archive appends ".zip" to the base name, so the archive is
#    localdata/pred.csv.zip and contains pred.csv at its root.
# shutil comes from the notebook's import cell.
pd.DataFrame(out).to_csv("localdata/pred.csv")
shutil.make_archive("localdata/pred.csv", 'zip', root_dir="localdata", base_dir="pred.csv")