# LUCAS COPERNICUS
# Creation of the random forest models for the Eurocropmap
## crop types

In [1]:
# JEODPP
data_path='/eos/jeodpp/data/projects/REFOCUS/data/S1_GS/all-10days/Map_v7/'
project_path='/eos/jeodpp/data/projects/REFOCUS/classification/'
path_pol = '/eos/jeodpp/data/projects/REFOCUS/data/polygons/v7'
results='/eos/jeodpp/data/projects/REFOCUS/classification/'

local='/eos/jeodpp/home/users/verheas/data/LUCAS/v7/'

# RAF LOCAL
# data_path='/data/LUCAS-cop-single-pixel'
# project_path='/data/Dropbox/JRC/LANDSENSE/CASE-STUDY-8-LUCAS-COPERNICUS-CLASSIF/'

In [2]:
#import 
import pandas as pd
from pandas import Series,DataFrame
#import geopandas as gdp
import csv
import numpy as np
import time
import sklearn
import scipy
import matplotlib.pyplot as plt
import glob
import os
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
#pd.show_versions()

## The parameters

In [3]:
##################################Parameters##################################################
#classes - stored in a table 'legend-lucas-all'
table_class=pd.read_csv(os.path.join(project_path,'table/legend-lucas-all-v7.csv'),dtype=pd.Int64Dtype())

classes_L1=list(table_class['classes_L1'].dropna()) 
classes_L2=list(table_class['classes_L2'].dropna())

#Biome selection

biome_1=[1]
biome_2=[2]
biome_3=[3]
biome_4=[4]

#level
level_1='level_1'
level_2='level_2'

## Load the data

In [4]:
## Load the data
#1) load the S1 10 days extracted values in GEE for all polygons

pd_lucas= pd.read_csv(os.path.join(data_path,'S1_point_allV7_10days_10m_1Jan-31Dec_EU_ratio-db.csv'),dtype={'level_1':int,'level_2':int})
print('pd_lucas',pd_lucas.shape)

#concatenate all the data in one dataframe
#group cropland, grassland and bareland 
#number of pixels per class
print(pd_lucas.level_1.value_counts())
print(pd_lucas.level_2.value_counts())
pd_lucas.head()

#number of pixels per class
#pd_lucas.LC1_COD.value_counts()
#pd_lucas.head()
pd_lucas.columns

##############1.2 Load the shapefile with the polygons - useful to split the polygons in training and test dataset for the accuracy ######################
# load csv with of the polygons
#2)load csv with the polygons for the split test/validation
lucas_polygons = pd.read_csv(os.path.join(path_pol,'LUCAS_2018_Copernicus_attributes_cropmap_level1-2.csv'))
lucas_polygons.head()

pd_lucas (2956889, 116)
300    1216530
200    1000318
500     732964
600       3856
100       3221
Name: level_1, dtype: int64
300    1216530
500     732964
211     290116
213     142886
216     125644
232      64452
290      63609
250      59053
214      35215
231      34577
240      34067
215      34005
212      28825
222      23174
218      19274
221      15815
230      12000
233       7094
223       4920
219       4724
600       3856
100       3221
217        868
Name: level_2, dtype: int64


  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0.1,Unnamed: 0,POINT_ID,NUTS0,NUTS1,NUTS2,NUTS3,TH_LAT,TH_LONG,OFFICE_PI,EX_ANTE,...,LU2_LABEL,LU1_TYPE_LABEL,LU2_TYPE_LABEL,CPRN_LC_LABEL,CPRN_LC_SAME_LC1,LUCAS_CORE_INTERSECT,COPERNICUS_CLEANED,stratum,level_2,level_1
0,0,34562080,ES,ES2,ES24,ES243,41.288386,-0.319428,0,0,...,Not relevant,,,Other bare soil,True,True,True,2,290,200
1,1,31243520,IE,IE0,IE04,IE042,53.422977,-8.226052,0,0,...,Not relevant,,,Spontaneously re-vegetated surfaces,True,True,True,1,500,500
2,2,33661774,ES,ES5,ES52,ES521,38.434388,-0.905705,0,0,...,Not relevant,,,Permanent crops: fruit trees,True,True,True,2,300,300
3,3,28922250,ES,ES1,ES11,ES113,41.867145,-7.320304,0,0,...,Not relevant,,,Shrubland with sparse tree cover,True,True,True,2,300,300
4,4,35082906,FR,FRD,FRD1,FRD12,48.71519,-1.09219,0,0,...,Not relevant,,,Grassland without tree/shrub cover,True,True,True,1,500,500


In [5]:
#print('biome',pd_lucas.stratum.value_counts())
#print(pd_lucas.groupby('POINT_ID').apply(min).shape)
#print(pd_lucas.groupby('POINT_ID').apply(min).stratum.value_counts())

# Biome 1 - parameters

In [6]:
###################################Choose parameters for this run #############################################
#Biome
biome=biome_1
print('biome',biome)

#level
level=level_2
print('level',level)
#crop - level 2, from the table we load only the crop type classes
classes=classes_L2
print('level',classes)

#Split for the train/test dataset - we run it with all the polygons
#split_test = 0

biome [1]
level level_2
level [211, 212, 213, 214, 215, 216, 217, 218, 219, 221, 222, 223, 230, 231, 232, 233, 240, 250, 290]


## Biome 1 - Prepare the data

In [7]:
##############################################################
#### 2) Prepare the data for the classification ##############
##############################################################

#############2.1 Select level of work and classes
#copy values in a new column 'Classif' that we will use in the rest of the script
pd_lucas['Classif']=pd_lucas[level]
print(pd_lucas.shape)

pd_lucas_i=pd_lucas[pd_lucas.Classif.isin(classes_L2)]

#############2.2 Select the biome
#select biome
pd_lucas_b=pd_lucas_i[pd_lucas_i.stratum.isin(biome)]
print('pd_lucas_b',pd_lucas_b.Classif.value_counts())
print('pd_lucas_b',pd_lucas.Classif.value_counts())
print(pd_lucas_b.groupby('POINT_ID').apply(min).stratum.value_counts())

#############2.3 Select the data inputs for the classification
## we use all the polygons, therefore there are no test dataset

X_train=pd_lucas_b.filter(regex='(((?<![\w\d])VH_)|((?<![\w\d])VV_))(20180[1-7])')
y_train=pd_lucas_b['Classif']

print('X_train head',X_train.head())
print('X_train shape',X_train.shape)

print('y_train shape',y_train.shape)
print('y_train count',y_train.value_counts())

(2956889, 117)
pd_lucas_b 211    261290
216    119697
213     98670
232     63899
250     40453
290     35167
214     31956
215     25021
231     24369
222     22706
240     22440
218     17686
221     14965
212     10863
230      7600
233      7020
219      4625
223      4036
217       661
Name: Classif, dtype: int64
pd_lucas_b 300    1216530
500     732964
211     290116
213     142886
216     125644
232      64452
290      63609
250      59053
214      35215
231      34577
240      34067
215      34005
212      28825
222      23174
218      19274
221      15815
230      12000
233       7094
223       4920
219       4724
600       3856
100       3221
217        868
Name: Classif, dtype: int64
1    14138
Name: stratum, dtype: int64
X_train head       VH_20180101  VH_20180111  VH_20180121  VH_20180131  VH_20180210  \
3221   -19.922432   -19.665386   -21.799988   -21.300838   -21.750020   
3222   -19.120277   -18.656569   -21.332876   -20.977938   -21.134995   
3223   -19.106567   -18.0

In [8]:
print('biome',pd_lucas_b.stratum.value_counts())
#print(pd_lucas.groupby('POINT_ID').apply(sum).shape)
print(pd_lucas_b.groupby('POINT_ID').apply(min).stratum.value_counts())

biome 1    813124
Name: stratum, dtype: int64
1    14138
Name: stratum, dtype: int64


In [12]:
rfc = RandomForestClassifier(bootstrap=True,criterion='gini', max_depth=None, max_features='sqrt', 
                             n_estimators=1200, n_jobs=40,min_samples_split=2,min_samples_leaf=1)
rfc.fit(X_train.values, y_train.values)

pickle.dump(rfc, open(os.path.join('RFmodel_LUCAS_'+str(biome)+'_'+str(level)+'_all-polygons_janv-jul2018_15122020bdesk'), 'wb'))

## Biome 1 - Run a RandomizedSearchCV

In [10]:
#to many samples

In [None]:
#Grid Search for a set of parameters - without scaler - no need of pipeline?
#there is also the randomized search CV that will not run all the possibilities - we can use to get a first idea
t = time.time()
Method = 'Random Forest'
print('Method:', Method)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
from sklearn.pipeline import Pipeline

param_grid = { 
    'RFclf__n_estimators': [300,500,800,900,1000,1100,1200], #The number of trees in the forest.
    'RFclf__max_features': ['sqrt', 'log2'], #The number of features to consider when looking for the best split:
    'RFclf__max_depth': [5,10,15,25,50,None],#The maximum depth of the tree
    'RFclf__min_samples_leaf': [1,2,3,4,8,10,12],#The minimum number of samples in newly created leaves.
    #'RFclf__bootstrap': [0, 1],#Whether bootstrap samples are used when building trees.
    'RFclf__min_samples_split': [1,10,25,50,70], #The minimum number of samples required to split an internal node
    'RFclf__criterion': ['gini', 'entropy']#The function to measure the quality of a split
}
#Scaler = preprocessing.StandardScaler()
#Scaler.fit(TrF)
Pipeline = Pipeline([('RFclf', RandomForestClassifier())])
#rfc = GridSearchCV(estimator=Pipeline, param_grid=param_grid,cv=3, n_jobs=100, verbose=1)
rfc = RandomizedSearchCV(estimator=Pipeline, param_distributions =param_grid, n_iter=42,cv=3, n_jobs=42, verbose=1)
#RandomizedSearchCV
#rfc.fit(Scaler.transform(TrF), TrC)
rfc.fit(X_train, y_train)
print('Elapsed time for training: %.02f sec' % (time.time() - t))


pickle.dump(rfc, open(os.path.join(project_path,'RFmodel_LUCAS_'+str(biome)+'_'+str(level)+'_all-polygons_janv-jul2018_01012020best2'), 'wb'))

print(rfc.best_estimator_.named_steps['RFclf'])

#best model
#print('best estimator',best_model)

#pickle.dump(best_model, open('RFmodel_LUCAS_'+str(biome)+'_level1-mask_all-polygons_janv-jul2018_160920-best', 'wb'))

In [None]:
#Grid Search for a set of parameters - without scaler - no need of pipeline?
#there is also the randomized search CV that will not run all the possibilities - we can use to get a first idea
t = time.time()
Method = 'Random Forest'
print('Method:', Method)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
from sklearn.pipeline import Pipeline

param_grid = { 
    'RFclf__n_estimators': [300,500,800,900,1000,1100,1200], #The number of trees in the forest.
    'RFclf__max_features': ['sqrt', 'log2'], #The number of features to consider when looking for the best split:
    #'RFclf__max_depth': [5,10,15,25,50,None],#The maximum depth of the tree
    #'RFclf__min_samples_leaf': [1,2,3,4,8,10,12],#The minimum number of samples in newly created leaves.
    #'RFclf__bootstrap': [0, 1],#Whether bootstrap samples are used when building trees.
    #'RFclf__min_samples_split': [1,10,25,50,70], #The minimum number of samples required to split an internal node
    'RFclf__criterion': ['gini', 'entropy']#The function to measure the quality of a split
}
#Scaler = preprocessing.StandardScaler()
#Scaler.fit(TrF)
Pipeline = Pipeline([('RFclf', RandomForestClassifier())])
#rfc = GridSearchCV(estimator=Pipeline, param_grid=param_grid,cv=3, n_jobs=100, verbose=1)
rfc = RandomizedSearchCV(estimator=Pipeline, param_distributions =param_grid, n_iter=42,cv=3, verbose=1)
#RandomizedSearchCV
#rfc.fit(Scaler.transform(TrF), TrC)
rfc.fit(X_train, y_train)
print('Elapsed time for training: %.02f sec' % (time.time() - t))


pickle.dump(rfc, open(os.path.join(project_path,'RFmodel_LUCAS_'+str(biome)+'_'+str(level)+'_all-polygons_janv-jul2018_15122020best-smallgrid'), 'wb'))

print(rfc.best_estimator_.named_steps['RFclf'])

#best model
#print('best estimator',best_model)

#pickle.dump(best_model, open('RFmodel_LUCAS_'+str(biome)+'_level1-mask_all-polygons_janv-jul2018_160920-best', 'wb'))

Method: Random Forest
Fitting 3 folds for each of 28 candidates, totalling 84 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


In [12]:
pickle.dump(rfc, open(os.path.join('RFmodel_LUCAS_'+str(biome)+'_'+str(level)+'_all-polygons_janv-jul2018_15122020best-smallgridter'), 'wb'))


# Biome 2 - choose parameters

In [7]:
###################################Choose parameters for this run #############################################
#Biome
biome=biome_2
print('biome',biome)

#level
level=level_2
print('level',level)
#crop - level 2, from the table we load only the crop type classes
classes=classes_L2
print('level',classes)

#Split for the train/test dataset - we run it with all the polygons
#split_test = 0

biome [2]
level level_2
level [211, 212, 213, 214, 215, 216, 217, 218, 219, 221, 222, 223, 230, 231, 232, 233, 240, 250, 290]


## Biome 2 - prepare the data

In [8]:
##############################################################
#### 2) Prepare the data for the classification ##############
##############################################################

#############2.1 Select level of work and classes
#copy values in a new column 'Classif' that we will use in the rest of the script
pd_lucas['Classif']=pd_lucas[level]
print(pd_lucas.shape)

pd_lucas_i=pd_lucas[pd_lucas.Classif.isin(classes_L2)]

#############2.2 Select the biome
#select biome
pd_lucas_b=pd_lucas_i[pd_lucas_i.stratum.isin(biome)]
print('pd_lucas_b',pd_lucas_b.Classif.value_counts())
print('pd_lucas_b',pd_lucas.Classif.value_counts())
print(pd_lucas_b.groupby('POINT_ID').apply(min).stratum.value_counts())

#############2.3 Select the data inputs for the classification
## we use all the polygons, therefore there are no test dataset

X_train=pd_lucas_b.filter(regex='(((?<![\w\d])VH_)|((?<![\w\d])VV_))(20180[1-7])')
y_train=pd_lucas_b['Classif']

print('X_train head',X_train.head())
print('X_train shape',X_train.shape)

print('y_train shape',y_train.shape)
print('y_train count',y_train.value_counts())

(2956889, 117)
pd_lucas_b 213    44216
211    28826
290    28442
250    18600
212    17962
240    11627
231    10208
215     8984
216     5947
230     4400
214     3259
218     1588
223      884
221      850
232      553
222      468
217      207
219       99
233       74
Name: Classif, dtype: int64
pd_lucas_b 300    1216530
500     732964
211     290116
213     142886
216     125644
232      64452
290      63609
250      59053
214      35215
231      34577
240      34067
215      34005
212      28825
222      23174
218      19274
221      15815
230      12000
233       7094
223       4920
219       4724
600       3856
100       3221
217        868
Name: Classif, dtype: int64
2    4233
Name: stratum, dtype: int64
X_train head        VH_20180101  VH_20180111  VH_20180121  VH_20180131  VH_20180210  \
89408   -17.420542   -17.807598   -18.702505   -18.363735   -17.056997   
89409   -18.150043   -17.900179   -19.867800   -18.586760   -17.160088   
89410   -15.249046   -16.180910   -17.6726

## Run a RandomizedSearchCV for Biome 2

In [None]:
#Grid Search for a set of parameters - without scaler - no need of pipeline?
#there is also the randomized search CV that will not run all the possibilities - we can use to get a first idea
t = time.time()
Method = 'Random Forest'
print('Method:', Method)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
from sklearn.pipeline import Pipeline

param_grid = { 
    'RFclf__n_estimators': [150,180,210,240,270,300,500,800,900,1000,1100,1200], #The number of trees in the forest.
    'RFclf__max_features': ['auto', 'sqrt', 'log2'], #The number of features to consider when looking for the best split:
    'RFclf__max_depth': [4,5,6,8,12,None],#The maximum depth of the tree
    'RFclf__min_samples_leaf': [1,2,3,4,8,10,12],#The minimum number of samples in newly created leaves.
    'RFclf__bootstrap': [0, 1],#Whether bootstrap samples are used when building trees.
    'RFclf__min_samples_split': [3, 5, 7], #The minimum number of samples required to split an internal node
    'RFclf__criterion': ['gini', 'entropy']#The function to measure the quality of a split
}

#Scaler = preprocessing.StandardScaler()
#Scaler.fit(TrF)
Pipeline = Pipeline([('RFclf', RandomForestClassifier())])
rfc = GridSearchCV(estimator=Pipeline, param_grid=param_grid,cv=3, n_jobs=100, verbose=1)
#rfc = BayesSearchCV(estimator=Pipeline, param_distributions =param_grid, n_iter=100,cv=3, n_jobs=50, verbose=1)
#rfc = RandomizedSearchCV(estimator=Pipeline, param_distributions =param_grid, n_iter=100,cv=3, n_jobs=38, verbose=1)

#RandomizedSearchCV
#rfc.fit(Scaler.transform(TrF), TrC)
rfc.fit(X_train, y_train)
print('Elapsed time for training: %.02f sec' % (time.time() - t))


pickle.dump(rfc, open(os.path.join(project_path,'RFmodel_LUCAS_'+str(biome)+'_'+str(level)+'_all-polygons_janv-jul2018_15122020best-smallgrid'), 'wb'))

print(rfc.best_estimator_.named_steps['RFclf'])

#best model
#print('best estimator',best_model)

#pickle.dump(best_model, open('RFmodel_LUCAS_'+str(biome)+'_level1-mask_all-polygons_janv-jul2018_160920-best', 'wb'))

Method: Random Forest
Fitting 3 folds for each of 18144 candidates, totalling 54432 fits


[Parallel(n_jobs=100)]: Using backend LokyBackend with 100 concurrent workers.
[Parallel(n_jobs=100)]: Done 250 tasks      | elapsed: 131.2min
[Parallel(n_jobs=100)]: Done 600 tasks      | elapsed: 299.6min
[Parallel(n_jobs=100)]: Done 1050 tasks      | elapsed: 522.6min


In [23]:
#Grid Search for a set of parameters - takes time 
#there is also the randomized search CV that will not run all the possibilities - we can use to get a first idea
t = time.time()
Method = 'Random Forest'
print('Method:', Method)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
from sklearn.pipeline import Pipeline

param_grid = { 
    'RFclf__n_estimators': [25,50,75,100,130,150],
    'RFclf__max_features': ['auto', 'sqrt', 'log2'],
    'RFclf__max_depth': [4,5,6,8,12,None],
    'RFclf__min_samples_leaf': [1,2,3,4,8,10,12],
    'RFclf__bootstrap': [0, 1],
    'RFclf__min_samples_split': [3, 5, 7],
    'RFclf__criterion': ['gini', 'entropy']
}
#Scaler = preprocessing.StandardScaler()
#Scaler.fit(TrF)
Pipeline = Pipeline([('Scaler', preprocessing.StandardScaler()), ('RFclf', RandomForestClassifier())])
#rfc = GridSearchCV(estimator=Pipeline, param_grid=param_grid,cv=3, n_jobs=100, verbose=1)
rfc = RandomizedSearchCV(estimator=Pipeline, param_distributions =param_grid, n_iter=100,cv=3, n_jobs=38, verbose=1)
#RandomizedSearchCV
#rfc.fit(Scaler.transform(TrF), TrC)
rfc.fit(X_train, y_train)
print('Elapsed time for training: %.02f sec' % (time.time() - t))

#best model
#print('best estimator',rfc.best_estimator_.steps[1][1])
#pickle.dump(rfc.best_estimator_.steps[1][1], open('RFmodel_LUCAS_'+str(biome)+'_level2-crop_all-polygons_janv-jul2018_24042020-best', 'wb'))
#not correct?
pickle.dump(rfc, open('RFmodel_LUCAS_'+str(biome)+'_level2-crop_all-polygons_janv-jul2018_160920-allmodels', 'wb'))

best_model=rfc.best_estimator_.named_steps['RFclf']

#best model
print('best estimator',best_model)

pickle.dump(best_model, open('RFmodel_LUCAS_'+str(biome)+'_level2-crop_all-polygons_janv-jul2018_160920-best', 'wb'))

Method: Random Forest
Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=38)]: Using backend LokyBackend with 38 concurrent workers.
[Parallel(n_jobs=38)]: Done 124 tasks      | elapsed: 10.4min
[Parallel(n_jobs=38)]: Done 300 out of 300 | elapsed: 29.4min finished


Elapsed time for training: 2263.73 sec
best estimator RandomForestClassifier(bootstrap=1, class_weight=None, criterion='entropy',
            max_depth=12, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=3, min_samples_split=7,
            min_weight_fraction_leaf=0.0, n_estimators=150, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)


In [None]:
best_model=rfc.best_estimator_.steps[1][1]
print('best estimator',best_model)

pickle.dump(best_model, open('RFmodel_LUCAS_'+str(biome)+'_level2-crop_all-polygons_jan-jun2018_15092020-best', 'wb'))

In [21]:
#best model
best_model=rfc.best_estimator_.steps[1][1]
print('best estimator',best_model)

pickle.dump(best_model, open('RFmodel_LUCAS_'+str(biome)+'_level2-crop_all-polygons_jan-jun2018_29062020-best', 'wb'))

best estimator RandomForestClassifier(bootstrap=0, class_weight=None, criterion='gini',
            max_depth=None, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=4, min_samples_split=3,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)


In [16]:
print(rfc)

RandomForestClassifier(bootstrap=0, class_weight=None, criterion='entropy',
            max_depth=12, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=4, min_samples_split=3,
            min_weight_fraction_leaf=0.0, n_estimators=150, n_jobs=40,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)


In [9]:
pickle.dump(rfc.best_estimator_.steps[1][1], open('RFmodel_LUCAS_'+str(biome)+'_level2-crop_all-polygons_janv-jul2018_24042020-best', 'wb'))

In [None]:
# Features importances for the best RF model from the Grid CV
Model=rfc

importances = Model.best_estimator_.steps[1][1].feature_importances_

std = np.std([tree.feature_importances_ for tree in Model.best_estimator_.steps[1][1]], axis=0)

indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

for f in range(44):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]),X_train.columns[indices[f]])


In [None]:
rfc.grid_scores_

In [11]:
#TEST WITH RANDOM PARAMETERS TO TEST FEATURES IMPORTANCE
#July
#split test and validation
X_trainP,X_testP,y_trainP,y_testP  = train_test_split(X_train,y_train, test_size=0.25,random_state=5)#,stratify=y_classP)

t = time.time()
Method = 'Random Forest'
print('Method:', Method)
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(bootstrap=0, criterion='gini', max_depth=None, max_features='sqrt', 
                             min_samples_leaf=4, min_samples_split=3, n_estimators=150, n_jobs=40)

rfc.fit(X_trainP.values, y_trainP.values)

print('Elapsed time for training: %.02f sec' % (time.time() - t))
#pickle.dump(clf, open('RFmodel_1x1_scenario2_test', 'wb'))


#accuracy
y_test_pred=rfc.predict(X_testP)      
y_test_pred_s=pd.Series(y_test_pred, dtype='float')
        
#to calculate accuracy, go back to array    
accuracy = 100.0*(y_testP.array == y_test_pred_s.array).sum()/X_testP.shape[0]
print('Accuracy is :' + str(round(accuracy,2)))

confusion_mat=confusion_matrix(y_testP,y_test_pred_s)
print (confusion_mat)

Method: Random Forest
Elapsed time for training: 51.28 sec
Accuracy is :86.86
[[29965     0   197    61     6   495     0     0     0     1     6     0
      0    34   106     0     6    34     4]
 [  665   312    34    30     0    20     0     0     0     0     0     0
      0     1     1     0     1     0     1]
 [ 1472     0  9447    64     2   216     0     1     0     1     6     0
      0     3    28     0     5    40     2]
 [  585     0   108  2733     8   167     0     0     0     1     1     0
      0     6    18     0     1    17     0]
 [  558     0    94    53  1730    98     0     2     0     0     0     0
      0     0     6     0     5    18     0]
 [  365     0    62    25     2 13174     0     0     0     0     6     0
      0    61    36     2     0    17     0]
 [    6     0     0     0     0    16    54     0     0     0     0     0
      0     4     0     0     0     6     0]
 [  703     0    66    97     2   102     0  1116     0     0     1     0
      0     1  

In [14]:
#load model
model = pickle.load(open('RFmodel_LUCAS_[2]_level2-crop_all-polygons_janv-jul2018','rb'))
print(model)

RandomForestClassifier(bootstrap=0, class_weight=None, criterion='entropy',
            max_depth=12, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=4, min_samples_split=3,
            min_weight_fraction_leaf=0.0, n_estimators=150, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)


In [15]:
#July
t = time.time()
Method = 'Random Forest'
print('Method:', Method)
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(bootstrap=0, criterion='entropy', max_depth=12, max_features='sqrt', 
                             min_samples_leaf=4, min_samples_split=3, n_estimators=150, n_jobs=40)

rfc.fit(X_train.values, y_train.values)

print('Elapsed time for training: %.02f sec' % (time.time() - t))
#pickle.dump(clf, open('RFmodel_1x1_scenario2_test', 'wb'))

import pickle
pickle.dump(rfc, open('RFmodel_LUCAS_'+str(biome)+'_level2-crop_all-polygons_janv-jul2018bis', 'wb'))

Method: Random Forest
Elapsed time for training: 556.89 sec


In [None]:
sgd_randomized_pipe.named_steps['clf'].coef_

In [17]:
#save all the models tried


In [93]:
#to import in GEE
#index_column_VHVV=lucas.columns[76:148]
#index_column_class=lucas.columns[1:4]
#print(index_column_class)
#print(index_column_VHVV)
#create a csv with all the data
#lucas_gee=lucas.loc[:,index_column_class.union(index_column_VHVV)]
#lucas_gee.to_csv('lucas_crop_grass.csv')

In [30]:
#transform in byte
#from sklearn.preprocessing import MinMaxScaler
#scaler = MinMaxScaler
#data=X_features

#scaler.data_max_=0
#scaler.data_min_=-25

#print(scaler.data_max_)
#scaler.fit(lucas_test)
#scaler.fit_transform(lucas_test)

In [59]:
#some test to avoid loading polygons csv
#goupby by class and polygon - give count of pixels per polygons
#test=pd_level2.groupby(['ClassL2','POINT_I'])['ClassL2','POINT_I'].count()
#print(test.head())
#how to count the number of polygon per class? level 0?
#lucasgroup=pd_level2.groupby(['ClassL2','POINT_I'])['POINT_I'].size()
#print(lucasgroup.head())

In [None]:
#implement with the biome class for the main crops we are interested
#+the 