# EU crop map - Benchmarking of the indices for the crop classes
## 1) Split by polygons - accuracy per pixel and per polygon
## 2) Split by pixels - accuracy per pixel
### Set up

In [1]:
# ---- Locations on the JEODPP platform ----

# Classification project root: the legend and RF-parameter tables are read from here.
project_path = '/eos/jeodpp/data/projects/REFOCUS/classification/'
# Same location as project_path, kept under its own name.
results = '/eos/jeodpp/data/projects/REFOCUS/classification/'

# Sentinel-1 10-day values extracted per pixel (input of the classification).
data_path = '/eos/jeodpp/data/projects/REFOCUS/data/S1_GS/all-10days/Map_v7/'
# Attribute table of the LUCAS polygons.
path_pol = '/eos/jeodpp/data/projects/REFOCUS/data/polygons/v7'

# User area under which the benchmark outputs are written.
local = '/eos/jeodpp/home/users/verheas/data/LUCAS/v7/'

# Working directory (only a name; nothing here changes the process cwd).
pwd = project_path

# !pip install matplotlib --user
# !pip install sklearn --user
#import 
import pandas as pd
from pandas import Series,DataFrame
import csv
import numpy as np
import time
import sklearn
import scipy
import matplotlib.pyplot as plt
import os
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import glob
import os

### Load the data

In [2]:
## Load the data

# 1) Sentinel-1 10-day values extracted in GEE for all polygons.
#    Both legend columns are forced to int while reading.
s1_csv = os.path.join(data_path, 'S1_point_allV7_10days_10m_1Jan-31Dec_EU_ratio-db.csv')
pd_lucas = pd.read_csv(s1_csv, dtype={'level_1': int, 'level_2': int})
print('pd_lucas', pd_lucas.shape)

# Number of pixels per class, for legend level 1 and then level 2.
for level_column in ('level_1', 'level_2'):
    print(pd_lucas[level_column].value_counts())

# 2) Attribute table of the LUCAS polygons, used further down to split the
#    polygons into training and test sets for the accuracy assessment.
polygons_csv = os.path.join(path_pol, 'LUCAS_2018_Copernicus_attributes_cropmap_level1-2.csv')
lucas_polygons = pd.read_csv(polygons_csv)

# Last expression of the cell: preview of the polygon table.
lucas_polygons.head()

pd_lucas (2956889, 116)
300    1216530
200    1000318
500     732964
600       3856
100       3221
Name: level_1, dtype: int64
300    1216530
500     732964
211     290116
213     142886
216     125644
232      64452
290      63609
250      59053
214      35215
231      34577
240      34067
215      34005
212      28825
222      23174
218      19274
221      15815
230      12000
233       7094
223       4920
219       4724
600       3856
100       3221
217        868
Name: level_2, dtype: int64


  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0.1,Unnamed: 0,POINT_ID,NUTS0,NUTS1,NUTS2,NUTS3,TH_LAT,TH_LONG,OFFICE_PI,EX_ANTE,...,LU2_LABEL,LU1_TYPE_LABEL,LU2_TYPE_LABEL,CPRN_LC_LABEL,CPRN_LC_SAME_LC1,LUCAS_CORE_INTERSECT,COPERNICUS_CLEANED,stratum,level_2,level_1
0,0,34562080,ES,ES2,ES24,ES243,41.288386,-0.319428,0,0,...,Not relevant,,,Other bare soil,True,True,True,2,290,200
1,1,31243520,IE,IE0,IE04,IE042,53.422977,-8.226052,0,0,...,Not relevant,,,Spontaneously re-vegetated surfaces,True,True,True,1,500,500
2,2,33661774,ES,ES5,ES52,ES521,38.434388,-0.905705,0,0,...,Not relevant,,,Permanent crops: fruit trees,True,True,True,2,300,300
3,3,28922250,ES,ES1,ES11,ES113,41.867145,-7.320304,0,0,...,Not relevant,,,Shrubland with sparse tree cover,True,True,True,2,300,300
4,4,35082906,FR,FRD,FRD1,FRD12,48.71519,-1.09219,0,0,...,Not relevant,,,Grassland without tree/shrub cover,True,True,True,1,500,500


### The parameters

In [3]:
##################################Parameters##################################################
# Class legend, stored in the table 'legend-lucas-all'.
# NOTE(review): the v7 legend is read here and then superseded by the v2 legend a few
# lines below, so every class list used downstream comes from the v2 file. Confirm
# which version is intended and drop the other read.
table_class=pd.read_csv(os.path.join(project_path,'table/legend-lucas-all-v7.csv'),dtype=pd.Int64Dtype())

classes_L1=list(table_class['classes_L1'].dropna())
classes_L2=list(table_class['classes_L2'].dropna())

# Names of the label columns for the two legend levels
level_1='level_1'
level_2='level_2'

##################################Parameters##################################################
# Second read of the legend: this is the table all lists below are taken from.
table_class=pd.read_csv(os.path.join(project_path,'table/legend-lucas-all-v2.csv'),dtype=pd.Int64Dtype())

classes_L1=list(table_class['classes_L1'].dropna())
classes_L2=list(table_class['classes_L2'].dropna())

#remap classes and selection of classes to map Level 1
classes_in_L1 = list(table_class['classes_all'].dropna())

# Fixed: a trailing comma turned this into a 1-tuple wrapping the list.
classes_in_L2 = list(table_class['classes_all'].dropna())

#classes affected by biome selection
classes_L1_B= list(table_class['classes_L1_B'].dropna())
# NOTE(review): this reads 'classes_L2' whereas the level-1 line reads 'classes_L1_B';
# confirm whether a dedicated 'classes_L2_B' column was intended.
classes_L2_B= list(table_class['classes_L2'].dropna())

#Classes non affected by biome selection
#Classes from (A) Artificial, (F) Bare lands and (H) Wetlands can be considered in each models - no biome dependent
classes_L1_NB=list(table_class['classes_L1_NB'].dropna())
classes_L2_NB=[]

#summary of the classes used in the classification
classes_classif_L1= list(table_class['L1_BIOME'].dropna())
classes_classif_L1_simplify=list(table_class['L1_B_harmon'].dropna())

#[100,200,520,300,400,600,800]
classes_classif_L2=list(table_class['L2_BIOME'].dropna())
classes_classif_L2_simplify=list(table_class['L2_B_harmon'].dropna())

###################################Choose parameters for this run #############################################
#classes for the classification and biome/no biome differentiation if needed
classes_B=classes_L2_B
print ('classes_B',classes_B)

classes_NB=classes_L2_NB
print ('classes_NB',classes_NB)

#level: name of the label column copied into 'Classif' for this run
level=level_2
print('level',level)

#crop - level 2, from the table we load only the crop type classes
classes=classes_L2
# Fixed: this line used to be labelled 'level' although it prints the class list.
print('classes',classes)

###################################Labels of the classes #############################################
labels_csv = pd.read_csv(os.path.join(project_path,'table/legend-lucas2.csv'))
labels=labels_csv[labels_csv['class'].isin(classes)] # select only the used labels
# NOTE(review): labels_s is built with exactly the same filter as labels.
labels_s=labels_csv[labels_csv['class'].isin(classes)] # select only the used labels
print(classes)
print(labels)
print(labels_s)

classes_B [211, 212, 213, 214, 215, 216, 217, 218, 219, 221, 222, 223, 230, 231, 232, 233, 240, 250, 290]
classes_NB []
level level_2
level [211, 212, 213, 214, 215, 216, 217, 218, 219, 221, 222, 223, 230, 231, 232, 233, 240, 250, 290]
[211, 212, 213, 214, 215, 216, 217, 218, 219, 221, 222, 223, 230, 231, 232, 233, 240, 250, 290]
    class                                 label
36    230  other non permanent industrial crops
37    240     dry pulse, vegetables and flowers
38    250                    other fodder crops
39    290                      bare arable land
40    211                          common wheat
41    212                           durum wheat
42    213                                barley
43    214                                   rye
44    215                                  oats
45    216                                 maize
46    217                                  rice
47    218                             triticale
48    219                         other cere

### Prepare the data

In [4]:
##############################################################
#### 2) Prepare the data for the classification ##############
##############################################################

#############2.1 Select level of work and classes
# Copy the labels of the selected level into 'Classif', the target column used in
# the rest of the notebook.
pd_lucas['Classif']=pd_lucas[level]
print(pd_lucas.shape)

# 'ClassifB' = class code followed by the stratum for the biome-dependent classes,
# or followed by '0' for the classes that do not depend on the biome.
# .assign() returns a new frame, so nothing is written into a slice of pd_lucas
# (this is what raised the SettingWithCopyWarning before).
pd_lucas_biome=pd_lucas[pd_lucas.Classif.isin(classes_B)].assign(
    ClassifB=lambda d: d['Classif'].astype(str) + d['stratum'].astype(str))
pd_lucas_nobiome=pd_lucas[pd_lucas.Classif.isin(classes_NB)].assign(
    ClassifB=lambda d: d['Classif'].astype(str) + '0')

# pd.concat replaces DataFrame.append, which no longer exists in pandas >= 2.0.
pd_lucas_b=pd.concat([pd_lucas_biome, pd_lucas_nobiome])

print(pd_lucas_b.ClassifB.value_counts())

#############2.2 Prepare the dataframe with the polygons

# Use the same level as for the pixels so both tables always share one legend.
lucas_polygons['Classif']=lucas_polygons[level]

# Same biome / no-biome separation as for the pixels.
lucas_polygons_biome=lucas_polygons[lucas_polygons.Classif.isin(classes_B)]
lucas_polygons_nobiome=lucas_polygons[lucas_polygons.Classif.isin(classes_NB)]

print(lucas_polygons_biome.shape)
print(lucas_polygons_biome.Classif.value_counts())
print(lucas_polygons_nobiome.shape)
print(lucas_polygons_nobiome.Classif.value_counts())

# ClassifB is added after the printouts above, so the printed shapes still show
# the tables without the extra column.
lucas_polygons_biome=lucas_polygons_biome.assign(
    ClassifB=lambda d: d['Classif'].astype(str) + d['stratum'].astype(str))
lucas_polygons_nobiome=lucas_polygons_nobiome.assign(
    ClassifB=lambda d: d['Classif'].astype(str) + '0')
print(lucas_polygons_biome.ClassifB.value_counts())


(2956889, 117)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


2111    261290
2161    119697
2131     98670
2321     63899
2132     44216
2501     40453
2901     35167
2141     31956
2112     28826
2902     28442
2151     25021
2311     24369
2221     22706
2401     22440
2502     18600
2122     17962
2181     17686
2211     14965
2402     11627
2121     10863
2312     10208
2152      8984
2301      7600
2331      7020
2162      5947
2191      4625
2302      4400
2231      4036
2142      3259
2182      1588
2232       884
2212       850
2171       661
2322       553
2222       468
2172       207
2192        99
2332        74
Name: ClassifB, dtype: int64
(18376, 114)
211    4829
213    2545
216    2383
250    1332
290    1312
232    1111
240     709
231     684
215     642
214     603
212     586
222     403
218     329
221     308
230     247
233     155
223      94
219      90
217      14
Name: Classif, dtype: int64
(0, 114)
Series([], Name: Classif, dtype: int64)
2111    4210
2161    2242
2131    1615
2321    1096
2132     930
2501     860
2902 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


### Parameters of the benchmarking on the indices

In [5]:
# Benchmark table: one row per combination of indices to test.
params_csv = os.path.join(project_path, 'table/RF-parameters-table-INDICE_v2.csv')
parameters = pd.read_csv(params_csv)

# 'name' is used to build the output file names of a run;
# 'regex' selects the feature columns of that run.
parameters = parameters.assign(
    name=parameters['NAME_indice'] + parameters['NAME_date'],
    regex=parameters['REGEX_indice'] + parameters['REGEX_time'],
)
print(parameters)

  NAME_indice   NAME_date                                       REGEX_indice  \
0          VV  MONTH[1-7]                                   ((?<![\w\d])VV_)   
1          VH  MONTH[1-7]                                   ((?<![\w\d])VH_)   
2        VHVV  MONTH[1-7]                                 ((?<![\w\d])VHVV_)   
3       VV-VH  MONTH[1-7]                (((?<![\w\d])VH_)|((?<![\w\d])VV_))   
4  VV-VH-VHVV  MONTH[1-7]  (((?<![\w\d])VV_)|((?<![\w\d])VH_)|((?<![\w\d]...   
5     VV-VHVV  MONTH[1-7]              (((?<![\w\d])VV_)|((?<![\w\d])VHVV_))   
6     VH-VHVV  MONTH[1-7]              (((?<![\w\d])VH_)|((?<![\w\d])VHVV_))   

     REGEX_time     TEST  month                  name  \
0  (20180[1-7])  indices    NaN          VVMONTH[1-7]   
1  (20180[1-7])  indices    NaN          VHMONTH[1-7]   
2  (20180[1-7])  indices    NaN        VHVVMONTH[1-7]   
3  (20180[1-7])  indices    NaN       VV-VHMONTH[1-7]   
4  (20180[1-7])  indices    NaN  VV-VH-VHVVMONTH[1-7]   
5  (20180[1-7])  

### A) Split on polygons

In [None]:
manip='INDICES-BIOME-STRATIFY-CROP_pol'
# All outputs of this experiment are written under <local>/result/<manip>, so that is
# the directory that has to exist (the previous check used a path relative to the
# current working directory). makedirs also creates the intermediate 'result' folder,
# and exist_ok=True makes the cell safe to re-run.
os.makedirs(os.path.join(local,'result',manip), exist_ok=True)

In [None]:
#Option when the biomes are separated and put back together:
#one random forest is trained per biome on the pixels of the training polygons, and
#the per-biome test results are pooled afterwards to compute the overall accuracy.

RANDOM_STATE = 5  # seed shared by the polygon split and the forest, so a re-run reproduces the outputs
out_dir = os.path.join(local, 'result', manip)
# Every output below is written under `local`, so this is the directory that must exist.
os.makedirs(out_dir, exist_ok=True)


def _write_log(path, text, mode="a"):
    """Write ``text`` to the log file at ``path`` and close it immediately."""
    with open(path, mode) as log_file:
        log_file.write(text)


def _save_evaluation(y_true, y_pred, stem, suffix, out_dir, labels):
    """Score predictions and save accuracy, classification report and confusion matrix.

    Three files are written in ``out_dir``:
    ``<stem>_1_1_Accuracy<suffix>.txt``,
    ``<stem>_3_classification_report<suffix>.csv`` and
    ``<stem>_4_confusion_matrix_class<suffix>.csv``.

    Parameters
    ----------
    y_true, y_pred : array-like
        Reference and predicted class codes, in the same order.
    stem : str
        Start of the output file names.
    suffix : str
        Text inserted before the file extension ('' or '_pol' in this cell).
    out_dir : str
        Directory the three files are written to.
    labels : list
        Class codes used as rows/columns of the confusion matrix.

    Returns
    -------
    float
        Overall accuracy in percent.
    """
    # Compare by position: the reference keeps the pixel index, the prediction does not.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    accuracy = 100.0*(y_true == y_pred).sum()/y_true.shape[0]
    print('Accuracy is :' + str(round(accuracy,2)))

    with open(os.path.join(out_dir, stem+'_1_1_Accuracy'+suffix+'.txt'),"w") as accuracy_file:
        accuracy_file.write(str(accuracy)+"\n")

    # Classification report
    report = classification_report(y_true, y_pred, output_dict=True)
    pd.DataFrame(report).transpose().to_csv(
        os.path.join(out_dir, stem+'_3_classification_report'+suffix+'.csv'))

    # Confusion matrix. TODO (from the original notebook): to fix - labels not correct
    confusion_mat = confusion_matrix(y_true, y_pred, labels=labels)
    pd.DataFrame(confusion_mat, index=labels, columns=labels).to_csv(
        os.path.join(out_dir, stem+'_4_confusion_matrix_class'+suffix+'.csv'))
    return accuracy


for i_test in range(0,len(parameters['name'])):
    run_name = parameters['name'][i_test]
    run_regex = parameters['regex'][i_test]
    print('processing : '+manip+'  ' +run_name)

    # Per-biome test references/predictions, pooled after the biome loop.
    # Lists + one pd.concat replace the repeated Series.append (removed in pandas 2.0).
    pix_ref_parts, pix_pred_parts = [], []
    pol_ref_parts, pol_pred_parts = [], []

    # strata 1 and 2
    for biome in range(1,3):
        print(biome)
        # 1 / create a text file for log recording
        log_path = os.path.join(out_dir, run_name+'_'+str(biome)+'_remap_1_1_Processing_Log.txt')
        _write_log(log_path,
                   'Processing summary \n'
                   + "Date and time start: "+ datetime.now().strftime("%d/%m/%Y %H:%M:%S")+"\n"
                   + "Classes : "+ str(classes)+"\n"
                   + "Regex : "+ str(run_regex)+"\n"
                   + "Name : "+ str(run_name)+"\n",
                   mode="w")

        #select biome on the polygons
        lucas_polygons_biome_b=lucas_polygons_biome[lucas_polygons_biome.stratum.isin([biome])]
        lucas_polygons_b=pd.concat([lucas_polygons_biome_b, lucas_polygons_nobiome])
        print(lucas_polygons_b.Classif.value_counts())

        # Subset the polygons
        X_featuresP=lucas_polygons_b.filter(items=['POINT_ID','Classif'])
        y_classP=lucas_polygons_b['Classif']
        _write_log(log_path,
                   "Input DB polygons shape  : "+ str(X_featuresP.shape)+"\n"
                   + "Input DB polygons columns  : "+ str(list(X_featuresP.columns))+"\n")

        # 1/ Split between test and train, done on the LUCAS polygons so that all
        # pixels of one polygon end up on the same side of the split
        X_trainP,X_testP,y_trainP,y_testP  = train_test_split(
            X_featuresP,y_classP, test_size=0.2,random_state=RANDOM_STATE,stratify=y_classP)
        _write_log(log_path,
                   "X_trainP.shape  : "+ str(X_trainP.shape)+"\n"
                   + "X_testP.shape  : "+ str(X_testP.shape)+"\n"
                   + "y_trainP.shape  : "+ str(y_trainP.shape)+"\n"
                   + "y_testP.shape  : "+ str(y_testP.shape)+"\n")

        # 2/ select the pixels of the train/test polygons (subset taken once per side),
        # then keep only the feature columns matching the regex of this run
        y_train_pol=pd_lucas_b[pd_lucas_b['POINT_ID'].isin(X_trainP['POINT_ID'])]
        y_test_pol=pd_lucas_b[pd_lucas_b['POINT_ID'].isin(X_testP['POINT_ID'])]

        X_train=y_train_pol.filter(regex=run_regex)
        y_train=y_train_pol['Classif']
        X_test=y_test_pol.filter(regex=run_regex)
        y_test=y_test_pol['Classif']

        _write_log(log_path,
                   "Input DB X_train pixels shape  : "+ str(X_train.shape)+"\n"
                   + "Input DB X_train pixels columns  : "+ str(list(X_train.columns))+"\n")

        # 4/ Save the class distribution for training and testing as CSV
        # (count_pol: one class per polygon, taken as the max of its pixels' codes)
        train_counts = pd.DataFrame({"count_pol": y_train_pol.groupby('POINT_ID')['Classif'].max().value_counts(),
                                     "count_pixel": y_train.value_counts()}).rename_axis('class')
        train_counts.to_csv(os.path.join(out_dir, run_name+'_'+str(biome)+'_remap_1_2_Training_class_count_polygons.csv'))
        test_counts = pd.DataFrame({"count_pol": y_test_pol.groupby('POINT_ID')['Classif'].max().value_counts(),
                                    "count_pixel": y_test.value_counts()}).rename_axis('class')
        test_counts.to_csv(os.path.join(out_dir, run_name+'_'+str(biome)+'_remap_1_3_Testing_class_count_polygons.csv'))

        # 5/ Fit the random forest
        # max_features='sqrt' is what 'auto' meant for classifiers; 'auto' is rejected
        # by recent scikit-learn versions.
        t = time.time()
        clf = RandomForestClassifier(bootstrap=False, criterion='gini', max_depth=None, max_features='sqrt',
                                     min_samples_leaf=12, min_samples_split=3, n_estimators=800, n_jobs=40,
                                     random_state=RANDOM_STATE)
        clf.fit(X_train, y_train)
        training_time=time.time() - t
        _write_log(log_path, "Elapsed time for training  : "+ str(round(training_time))+" sec \n")

        # 6/ Feature importances as  CSV
        importances = pd.DataFrame(list(zip(clf.feature_importances_,X_train.columns)),
                                   columns=["Importance","Feature_Name"])
        importances.to_csv(os.path.join(out_dir, run_name+'_'+str(biome)+'_remap_2_Feature_importance.csv'))

        # 7/ Prediction and per-pixel evaluation for this biome
        y_test_pred=clf.predict(X_test)
        y_test_s=pd.Series(y_test, dtype='float')
        y_test_pred_s=pd.Series(y_test_pred, dtype='float')
        _save_evaluation(y_test_s, y_test_pred_s, run_name+'biome'+str(biome), '', out_dir, classes)

        # 8/ Per-polygon evaluation: majority rule (mode) over the pixels of each polygon
        y_test_results=pd.DataFrame({'POINT_ID':y_test_pol['POINT_ID'],'ref':y_test,'predict':y_test_pred})
        # Columns are selected with a list; the tuple form is not accepted by pandas >= 2.0.
        y_test_results=y_test_results.groupby(['POINT_ID'])[['predict','ref']].agg(lambda x: x.mode().iloc[0])
        _save_evaluation(y_test_results['ref'], y_test_results['predict'],
                         run_name+'biome'+str(biome), '_pol', out_dir, classes)

        pix_ref_parts.append(y_test_s)
        pix_pred_parts.append(y_test_pred_s)
        pol_ref_parts.append(y_test_results['ref'])
        pol_pred_parts.append(y_test_results['predict'])

    # Overall scores on the pooled biomes: per pixel, then per polygon
    _save_evaluation(pd.concat(pix_ref_parts), pd.concat(pix_pred_parts),
                     run_name+'_regroup_remap', '', out_dir, classes)
    _save_evaluation(pd.concat(pol_ref_parts), pd.concat(pol_pred_parts),
                     run_name+'_regroup_remap', '_pol', out_dir, classes)

    _write_log(os.path.join(out_dir, run_name+'_regroup_remap_1_1_Processing_Log.txt'),
               "Date and time end: "+ datetime.now().strftime("%d/%m/%Y %H:%M:%S")+"\n")

    # release the fitted forest before the next run
    del clf

### B) Split on pixels

In [None]:
manip='INDICES-BIOME-STRATIFY-CROP_pix'
# All outputs of this experiment are written under <local>/result/<manip>, so that is
# the directory that has to exist (the previous check used a path relative to the
# current working directory). makedirs also creates the intermediate 'result' folder,
# and exist_ok=True makes the cell safe to re-run.
os.makedirs(os.path.join(local,'result',manip), exist_ok=True)

In [None]:
#Option when the biomes are separated and put back together:
#one random forest is trained per biome on a random split of the pixels of that biome,
#and the per-biome test results are pooled afterwards to compute the overall accuracy.

RANDOM_STATE = 5  # seed shared by the pixel split and the forest, so a re-run reproduces the outputs
out_dir = os.path.join(local, 'result', manip)
# Every output below is written under `local`, so this is the directory that must exist.
os.makedirs(out_dir, exist_ok=True)


def _write_log(path, text, mode="a"):
    """Write ``text`` to the log file at ``path`` and close it immediately."""
    with open(path, mode) as log_file:
        log_file.write(text)


def _save_evaluation(y_true, y_pred, stem, suffix, out_dir, labels):
    """Score predictions and save accuracy, classification report and confusion matrix.

    Three files are written in ``out_dir``:
    ``<stem>_1_1_Accuracy<suffix>.txt``,
    ``<stem>_3_classification_report<suffix>.csv`` and
    ``<stem>_4_confusion_matrix_class<suffix>.csv``.

    Parameters
    ----------
    y_true, y_pred : array-like
        Reference and predicted class codes, in the same order.
    stem : str
        Start of the output file names.
    suffix : str
        Text inserted before the file extension ('' in this cell).
    out_dir : str
        Directory the three files are written to.
    labels : list
        Class codes used as rows/columns of the confusion matrix.

    Returns
    -------
    float
        Overall accuracy in percent.
    """
    # Compare by position: the reference keeps the pixel index, the prediction does not.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    accuracy = 100.0*(y_true == y_pred).sum()/y_true.shape[0]
    print('Accuracy is :' + str(round(accuracy,2)))

    with open(os.path.join(out_dir, stem+'_1_1_Accuracy'+suffix+'.txt'),"w") as accuracy_file:
        accuracy_file.write(str(accuracy)+"\n")

    # Classification report
    report = classification_report(y_true, y_pred, output_dict=True)
    pd.DataFrame(report).transpose().to_csv(
        os.path.join(out_dir, stem+'_3_classification_report'+suffix+'.csv'))

    # Confusion matrix. TODO (from the original notebook): to fix - labels not correct
    confusion_mat = confusion_matrix(y_true, y_pred, labels=labels)
    pd.DataFrame(confusion_mat, index=labels, columns=labels).to_csv(
        os.path.join(out_dir, stem+'_4_confusion_matrix_class'+suffix+'.csv'))
    return accuracy


# Polygons with ClassifB '2332' are left out. This was repeated on every pass of the
# loop below; once is enough. In this cell the polygons only feed a printout.
lucas_polygons_biome=lucas_polygons_biome[lucas_polygons_biome.ClassifB!='2332']

for i_test in range(0,len(parameters['name'])):
    run_name = parameters['name'][i_test]
    run_regex = parameters['regex'][i_test]
    print('processing : '+manip+'  ' +run_name)

    # Per-biome test references/predictions, pooled after the biome loop.
    # Lists + one pd.concat replace the repeated Series.append (removed in pandas 2.0).
    pix_ref_parts, pix_pred_parts = [], []

    # strata 1 and 2
    for biome in range(1,3):
        print(biome)
        # 1 / create a text file for log recording
        log_path = os.path.join(out_dir, run_name+'_'+str(biome)+'_remap_1_1_Processing_Log.txt')
        _write_log(log_path,
                   'Processing summary \n'
                   + "Date and time start: "+ datetime.now().strftime("%d/%m/%Y %H:%M:%S")+"\n"
                   + "Classes : "+ str(classes)+"\n"
                   + "Regex : "+ str(run_regex)+"\n"
                   + "Name : "+ str(run_name)+"\n",
                   mode="w")

        #select biome on the polygons (class distribution printout only)
        lucas_polygons_biome_b=lucas_polygons_biome[lucas_polygons_biome.stratum.isin([biome])]
        lucas_polygons_b=pd.concat([lucas_polygons_biome_b, lucas_polygons_nobiome])
        print(lucas_polygons_b.Classif.value_counts())

        # Subset the pixels of this biome, plus the pixels of the classes that do not
        # depend on the biome. Fixed: the whole pixel table used to be taken here, so
        # both biome passes trained and tested on exactly the same data and the pooled
        # accuracy counted every test pixel twice.
        pixels_b=pd_lucas_b[pd_lucas_b['stratum'].isin([biome]) | pd_lucas_b['Classif'].isin(classes_NB)]
        X_features=pixels_b.filter(regex=run_regex)
        y_class=pixels_b['Classif']
        _write_log(log_path,
                   "Input DB pixel shape  : "+ str(X_features.shape)+"\n"
                   + "Input DB pixel columns  : "+ str(list(X_features.columns))+"\n")

        # 1/ Split between test and train, directly on the pixels
        X_train,X_test,y_train,y_test  = train_test_split(
            X_features,y_class, test_size=0.2,random_state=RANDOM_STATE,stratify=y_class)
        _write_log(log_path,
                   "X_train.shape  : "+ str(X_train.shape)+"\n"
                   + "X_test.shape  : "+ str(X_test.shape)+"\n"
                   + "y_train.shape  : "+ str(y_train.shape)+"\n"
                   + "y_test.shape  : "+ str(y_test.shape)+"\n"
                   + "Input DB X_train pixels shape  : "+ str(X_train.shape)+"\n"
                   + "Input DB X_train pixels columns  : "+ str(list(X_train.columns))+"\n")

        # 4/ Save the class distribution for training and testing as CSV
        train_counts = pd.DataFrame({"count_pixel": y_train.value_counts()}).rename_axis('class')
        train_counts.to_csv(os.path.join(out_dir, run_name+'_'+str(biome)+'_remap_1_2_Training_class_count_pixels.csv'))
        test_counts = pd.DataFrame({"count_pixel": y_test.value_counts()}).rename_axis('class')
        test_counts.to_csv(os.path.join(out_dir, run_name+'_'+str(biome)+'_remap_1_3_Testing_class_count_pixels.csv'))

        # 5/ Fit the random forest
        # max_features='sqrt' is what 'auto' meant for classifiers; 'auto' is rejected
        # by recent scikit-learn versions.
        t = time.time()
        clf = RandomForestClassifier(bootstrap=False, criterion='gini', max_depth=None, max_features='sqrt',
                                     min_samples_leaf=12, min_samples_split=3, n_estimators=800, n_jobs=40,
                                     random_state=RANDOM_STATE)
        clf.fit(X_train, y_train)
        training_time=time.time() - t
        _write_log(log_path, "Elapsed time for training  : "+ str(round(training_time))+" sec \n")

        # 6/ Feature importances as  CSV
        importances = pd.DataFrame(list(zip(clf.feature_importances_,X_train.columns)),
                                   columns=["Importance","Feature_Name"])
        importances.to_csv(os.path.join(out_dir, run_name+'_'+str(biome)+'_remap_2_Feature_importance.csv'))

        # 7/ Prediction and per-pixel evaluation for this biome
        y_test_pred=clf.predict(X_test)
        y_test_s=pd.Series(y_test, dtype='float')
        y_test_pred_s=pd.Series(y_test_pred, dtype='float')
        _save_evaluation(y_test_s, y_test_pred_s, run_name+'biome'+str(biome), '', out_dir, classes)

        pix_ref_parts.append(y_test_s)
        pix_pred_parts.append(y_test_pred_s)

    # Overall per-pixel scores on the pooled biomes
    _save_evaluation(pd.concat(pix_ref_parts), pd.concat(pix_pred_parts),
                     run_name+'_regroup_remap', '', out_dir, classes)

    _write_log(os.path.join(out_dir, run_name+'_regroup_remap_1_1_Processing_Log.txt'),
               "Date and time end: "+ datetime.now().strftime("%d/%m/%Y %H:%M:%S")+"\n")

    # release the fitted forest before the next run
    del clf