### Intro
This demo shows how to preprocess the required satellite-derived features, including rescaling and ordering.
A demo of how to load the trained model and run the prediction for each PFT is attached at the end.

Please make sure that:
1. the testing features have the same format, scale and column order as the FeatureDemo
2. the feature coordinates and IDs are consistent across sources, because all features need to be concatenated together for prediction
3. the coordinate info is stored, since it is needed to generate the raster after the prediction
4. the predictions can be named after the input raster, with '_fCover_specificPFT' appended at the end
5. as the regression is performed per PFT, it is suggested to finish the regression on all small areas for one PFT first, then move on to the next PFT


In [1]:
import numpy as np
import pandas as pd

import os
# Root folder of the Test_05 training data. It is kept as a string with a
# trailing slash because later cells build paths as `cwd + 'relative/path'`.
cwd = '/mnt/poseidon/remotesensing/arctic/data/training/Test_05/'
os.chdir(cwd)
# Actually call getcwd(): the bare attribute `os.getcwd` did nothing, so the
# switch of working directory was never confirmed.
print(os.getcwd())

## make path to save the validation and remaining plots
result_path = cwd + 'results/' + 'map'
# exist_ok=True replaces the exists()/makedirs() pair and keeps the cell re-runnable
os.makedirs(result_path, exist_ok=True)

#### Load an example multispectral file
The 'source' column is not used: the steps below only operate on the spectral bands, and the site code is used for joining the different feature tables.

In [3]:
# Read the Sentinel-2 table (one row per site, one column per band median)
spectra = pd.read_csv(cwd + 'features/child/child_S2_summer_2019_med_30m.csv')
# Turn the site code into a string key prefixed with 'id_'; the same key is
# built for the other feature tables so they can be joined on it.
spectra = spectra.assign(**{'Site Code': 'id_' + spectra['Site Code'].astype(str)})
spectra.columns

Index(['Site Code', 'B11_median', 'B12_median', 'B2_median', 'B3_median',
       'B4_median', 'B5_median', 'B6_median', 'B7_median', 'B8_median',
       'B8A_median', 'source'],
      dtype='object')

#### Rescale and rename the spectral bands of S2

In [4]:
# Sentinel-2 band columns, in the order the renamed feature columns must follow
band = ['B2_median', 'B3_median', 'B4_median', 'B5_median', 'B6_median', 'B7_median', 'B8A_median', 'B8_median',
        'B11_median', 'B12_median']

# remove any entry with no band info, skip this if you have filled every empty entry.
spectra = spectra.dropna(subset=band, how='any')

# rescaled to 0-1, check the original data to ensure it has not been scaled yet.
# The scaled values go into a NEW frame. Scaling `spectra[band]` in place made
# the cell non-idempotent: every re-run multiplied the bands by 0.0001 again.
# `spectra` therefore keeps the unscaled band values; only its 'Site Code'
# column is read by the later cells shown in this notebook.
spectra_all = spectra[band] * 0.0001

# rename the band2-12 columns to specific band names (position-matched to `band`)
new_cols = ['blue','green','red','redEdge1','redEdge2', 'redEdge3', 'redEdge4', 'nir', 'swir1', 'swir2']
spectra_all = spectra_all.rename(columns=dict(zip(band, new_cols)))

display(spectra_all.head(5))

Unnamed: 0,blue,green,red,redEdge1,redEdge2,redEdge3,redEdge4,nir,swir1,swir2
0,0.03585,0.0617,0.06755,0.1329,0.24025,0.2721,0.3031,0.2941,0.2326,0.12425
1,0.044,0.0633,0.0718,0.1148,0.1781,0.2075,0.2404,0.2264,0.272,0.161
2,0.0457,0.05435,0.0597,0.0648,0.0676,0.0696,0.07425,0.07525,0.2162,0.1794
3,0.03725,0.0671,0.07335,0.14105,0.2697,0.3024,0.3375,0.3294,0.2417,0.12085
4,0.0448,0.05825,0.0604,0.09495,0.1315,0.1511,0.18205,0.1782,0.12495,0.08105


#### vegetation indices used for adding feature info

In [5]:
def vegIDXcal(vegIDX):
    """Add spectral-index columns, computed from the renamed S2 bands, to ``vegIDX``.

    Parameters
    ----------
    vegIDX : column-indexable table (a pandas DataFrame in this notebook)
        Must already hold the columns 'nir', 'red', 'green', 'blue',
        'swir1', 'swir2' and 'redEdge1'.

    Returns
    -------
    The object that was passed in. The 19 new columns are written onto it in
    place, in the order of the assignments below; that order is what the
    notebook later compares column-by-column against FeatureDemo.
    """
    # Pull the input bands once, in the same order as the original lookups.
    nir, red, green, blue, swir1, swir2, red_edge = (
        vegIDX[name]
        for name in ('nir', 'red', 'green', 'blue', 'swir1', 'swir2', 'redEdge1')
    )

    # --- water / soil-adjusted / visible-band indices
    # NOTE(review): ndwi1 divides by (nir + swir2), not (nir + swir1). Left as-is
    # because the trained models presumably consumed this exact feature -
    # confirm against the training code before changing it.
    vegIDX['ndwi1'] = (nir - swir1)/(nir + swir2)
    vegIDX['ndwi2'] = (nir - swir2)/(nir + swir2)

    nir_term = 2*nir + 1  # shared by msavi and mtvi2
    vegIDX['msavi'] = (nir_term - (nir_term**2 - 8*(nir - red))**0.5) * 0.5
    vegIDX['vari'] = (green - red)/(green + red - blue)
    vegIDX['rvi'] = nir/red
    vegIDX['osavi'] = 1.16 * (nir - red)/(nir + red + 0.16)
    vegIDX['tgi'] = (120 * (red - blue) - 190 * (red - green))*0.5
    vegIDX['gli'] = (2 * green - red - blue)/(2 * green + red + blue)
    vegIDX['ngrdi'] = (green - red)/(green + red)
    vegIDX['ci_g'] = nir/green - 1
    vegIDX['gNDVI'] = (nir - green)/(nir + green)
    vegIDX['cvi'] = (nir * red)/(green ** 2)

    mtvi2_top = 1.5*(1.2*(nir - green) - 2.5*(red - green))
    mtvi2_bottom = (nir_term**2 - (6*nir - 5*(red**0.5))-0.5)**0.5
    vegIDX['mtvi2'] = mtvi2_top/mtvi2_bottom

    # --- brightness / greenness / wetness: fixed linear combinations of six bands
    vegIDX['brightness'] = 0.3037 * blue +0.2793 * green +0.4743 * red +0.5585 * nir +0.5082 * swir1 + 0.1863 * swir2
    vegIDX['greenness'] = 0.7243 * nir +0.0840 * swir1 - 0.2848 * blue - 0.2435 * green - 0.5436 * red - 0.1800 * swir2
    vegIDX['wetness'] = 0.1509 * blue+0.1973* green+0.3279*red+0.3406*nir-0.7112*swir1 - 0.4572*swir2

    # --- red-edge based indices
    edge_over_red = red_edge/red  # shared by tcari and tci
    vegIDX['tcari'] = 3 * ((red_edge -red)-0.2 * (red_edge- green)*edge_over_red)
    vegIDX['tci'] = 1.2 * (red_edge - green)- 1.5 * (red - green)*(edge_over_red**0.5)
    vegIDX['nari'] = (1/green - 1/red_edge)/(1/green + 1/red_edge)
    return vegIDX

#vegIDX should contain both the original spectral band and the calculated indices
# vegIDXcal writes the new columns onto the frame it receives, so hand it a copy:
# otherwise vegIDX and spectra_all are one and the same object and the
# band-only table shown earlier is silently widened.
vegIDX = vegIDXcal(spectra_all.copy())

In [6]:
#--- keep the site code next to the features so the other feature sources can
#--- be matched on it, even if some entries were filtered out earlier.
# Rows are matched on the index labels, not on position; the reindex spells out
# the alignment that the column assignment would otherwise perform implicitly.
vegIDX['Site Code'] = spectra['Site Code'].reindex(vegIDX.index)
display(vegIDX.head())
print(vegIDX.columns)

Unnamed: 0,blue,green,red,redEdge1,redEdge2,redEdge3,redEdge4,nir,swir1,swir2,...,gNDVI,cvi,mtvi2,brightness,greenness,wetness,tcari,tci,nari,Site Code
0,0.03585,0.0617,0.06755,0.1329,0.24025,0.2721,0.3031,0.2941,0.2326,0.12425,...,0.653176,5.218552,0.317635,0.365769,0.148236,-0.082329,0.112001,0.073132,0.365879,id_FMTT-5B
1,0.044,0.0633,0.0718,0.1148,0.1781,0.2075,0.2404,0.2264,0.272,0.161,...,0.562996,4.056892,0.207415,0.359766,0.090874,-0.147272,0.079594,0.045678,0.289163,id_FWMM-5B
2,0.0457,0.05435,0.0597,0.0648,0.0676,0.0696,0.07425,0.07525,0.2162,0.1794,...,0.161265,1.520834,0.013907,0.242697,-0.01833,-0.172958,0.008494,0.004179,0.087705,id_AB-1B
3,0.03725,0.0671,0.07335,0.14105,0.2697,0.3024,0.3375,0.3294,0.2417,0.12085,...,0.661538,5.366346,0.351518,0.39416,0.170314,-0.072045,0.117778,0.07574,0.355273,id_FMTT-1
4,0.0448,0.05825,0.0604,0.09495,0.1315,0.1511,0.18205,0.1782,0.12495,0.08105,...,0.507295,3.172143,0.169738,0.236647,0.065201,-0.027167,0.069034,0.039996,0.239556,id_AFMM-5


Index(['blue', 'green', 'red', 'redEdge1', 'redEdge2', 'redEdge3', 'redEdge4',
       'nir', 'swir1', 'swir2', 'ndwi1', 'ndwi2', 'msavi', 'vari', 'rvi',
       'osavi', 'tgi', 'gli', 'ngrdi', 'ci_g', 'gNDVI', 'cvi', 'mtvi2',
       'brightness', 'greenness', 'wetness', 'tcari', 'tci', 'nari',
       'Site Code'],
      dtype='object')


In [15]:
# Extend the S2-derived table with the Sentinel-1 and topographic features.

### Sentinel-1 features: shorten the band names, then build the 'id_' join key
plot_S1 = (
    pd.read_csv(cwd + 'features/child/child_S1_summer_2019_med_30m.csv')
    .rename(columns={"VV_median": "VV", "VH_median": "VH"})
)
plot_S1['Site Code'] = 'id_' + plot_S1['Site Code'].astype(str)

### Topo features, keyed the same way
plot_topo = pd.read_csv(cwd + 'features/child/child_topo_summer_2019_med_30m.csv')
plot_topo['Site Code'] = 'id_' + plot_topo['Site Code'].astype(str)

### -- merge features
# Inner joins: only site codes present in all three tables are kept.
# The column order S2 -> (VH, VV) -> topo is what the model input relies on.
plot_fea = (
    vegIDX
    .merge(plot_S1[['Site Code','VH', 'VV']], how='inner', on='Site Code')
    .merge(plot_topo[['Site Code','aspect', 'elevation', 'hillshade', 'slope']], how='inner', on='Site Code')
)

In [16]:
## drop 'site code' for features
# errors='ignore' keeps the cell re-runnable: plot_fea is re-bound to its own
# result, so without it a second run raises KeyError ('Site Code' already gone).
plot_fea = plot_fea.drop(columns='Site Code', errors='ignore')
print(plot_fea.columns)
display(plot_fea.head(5))

Index(['blue', 'green', 'red', 'redEdge1', 'redEdge2', 'redEdge3', 'redEdge4',
       'nir', 'swir1', 'swir2', 'ndwi1', 'ndwi2', 'msavi', 'vari', 'rvi',
       'osavi', 'tgi', 'gli', 'ngrdi', 'ci_g', 'gNDVI', 'cvi', 'mtvi2',
       'brightness', 'greenness', 'wetness', 'tcari', 'tci', 'nari', 'VH',
       'VV', 'aspect', 'elevation', 'hillshade', 'slope'],
      dtype='object')


Unnamed: 0,blue,green,red,redEdge1,redEdge2,redEdge3,redEdge4,nir,swir1,swir2,...,wetness,tcari,tci,nari,VH,VV,aspect,elevation,hillshade,slope
0,0.03585,0.0617,0.06755,0.1329,0.24025,0.2721,0.3031,0.2941,0.2326,0.12425,...,-0.082329,0.112001,0.073132,0.365879,-19.742621,-12.209516,244.309968,306.661907,181.0,1.0
1,0.044,0.0633,0.0718,0.1148,0.1781,0.2075,0.2404,0.2264,0.272,0.161,...,-0.147272,0.079594,0.045678,0.289163,-18.216437,-10.824382,288.143577,304.142685,181.0,1.0
2,0.0457,0.05435,0.0597,0.0648,0.0676,0.0696,0.07425,0.07525,0.2162,0.1794,...,-0.172958,0.008494,0.004179,0.087705,-6.713143,1.9467,139.0,995.368373,123.0,29.0
3,0.03725,0.0671,0.07335,0.14105,0.2697,0.3024,0.3375,0.3294,0.2417,0.12085,...,-0.072045,0.117778,0.07574,0.355273,-17.224668,-11.150553,164.0,268.963409,178.0,2.0
4,0.0448,0.05825,0.0604,0.09495,0.1315,0.1511,0.18205,0.1782,0.12495,0.08105,...,-0.027167,0.069034,0.039996,0.239556,-21.697856,-10.79554,222.630564,36.311274,181.103903,0.0


#### Check that the generated features are consistent with the FeatureDemo

In [13]:
# Reference feature table stored next to the tuned models.
FeaDemo = pd.read_csv(
    cwd +'results/ModelTuning_FeatureImportance/' + 'FeatureDemo_non-vascular.csv'
)
# 'Unnamed: 0' is not a feature (presumably a saved row index) - remove it
FeaDemo = FeaDemo.drop(columns= 'Unnamed: 0')
print(FeaDemo.columns)
display(FeaDemo.head(5))

Index(['blue', 'green', 'red', 'redEdge1', 'redEdge2', 'redEdge3', 'redEdge4',
       'nir', 'swir1', 'swir2', 'ndwi1', 'ndwi2', 'msavi', 'vari', 'rvi',
       'osavi', 'tgi', 'gli', 'ngrdi', 'ci_g', 'gNDVI', 'cvi', 'mtvi2',
       'brightness', 'greenness', 'wetness', 'tcari', 'tci', 'nari', 'VH',
       'VV', 'aspect', 'elevation', 'hillshade', 'slope'],
      dtype='object')


Unnamed: 0,blue,green,red,redEdge1,redEdge2,redEdge3,redEdge4,nir,swir1,swir2,...,wetness,tcari,tci,nari,VH,VV,aspect,elevation,hillshade,slope
0,0.03585,0.0617,0.06755,0.1329,0.24025,0.2721,0.3031,0.2941,0.2326,0.12425,...,-0.082329,0.112001,0.073132,0.365879,-19.742621,-12.209516,244.309968,306.661907,181.0,1.0
1,0.044,0.0633,0.0718,0.1148,0.1781,0.2075,0.2404,0.2264,0.272,0.161,...,-0.147272,0.079594,0.045678,0.289163,-18.216437,-10.824382,288.143577,304.142685,181.0,1.0
2,0.03725,0.0671,0.07335,0.14105,0.2697,0.3024,0.3375,0.3294,0.2417,0.12085,...,-0.072045,0.117778,0.07574,0.355273,-17.224668,-11.150553,164.0,268.963409,178.0,2.0
3,0.0363,0.0481,0.0472,0.0748,0.1052,0.1214,0.1336,0.1236,0.1059,0.0628,...,-0.031485,0.057412,0.033739,0.21725,-21.31111,-13.646969,254.555361,44.055236,181.0,1.0
4,0.0338,0.04665,0.05115,0.0915,0.1372,0.15335,0.16445,0.1623,0.14215,0.08425,...,-0.05326,0.072912,0.044792,0.324647,-25.869451,-13.666498,234.670013,-1.905279,181.0,1.0


In [20]:
# Compare the widths first: an elementwise `!=` between two Index objects of
# different lengths raises ValueError, which would hide the very mismatch this
# cell is meant to report. zip() pairs the names position by position instead.
same_width = FeaDemo.shape[1] == plot_fea.shape[1]
mismatched = [
    (expected, got)
    for expected, got in zip(FeaDemo.columns, plot_fea.columns)
    if expected != got
]
print(len(mismatched))  # number of positions whose column name differs (0 expected)
print(same_width)       # True when both tables have the same number of columns

# Stop here rather than predicting on a wrongly ordered or sized feature table.
assert same_width and not mismatched, (
    "plot_fea columns differ from FeatureDemo: "
    f"same width={same_width}, mismatched (expected, got)={mismatched}"
)

0
True


#### The cells below show how to load the pickled model for each PFT and run the prediction

In [21]:
# pickle is part of Python's standard library, so no installation step is needed
import pickle

# Plant functional type whose tuned model is loaded
PFT = 'non-vascular'

# NOTE(review): this re-binds `result_path`, which the first cell pointed at the
# 'results/map' output folder - confirm nothing later expects the old value.
result_path = cwd +'results/ModelTuning_FeatureImportance/'
model_file_path = os.path.join(result_path, 'tunedModel_{}.pkl'.format(PFT))

# NOTE(review): unpickling runs code stored in the file - only load model
# files that come from a trusted source.
with open(model_file_path, 'rb') as pickled:
    model = pickle.load(pickled)

# --- predict straight from the loaded model; plot_fea must have the same
# --- columns, in the same order, as FeatureDemo.
# presumably one prediction per row of plot_fea - the shape is shown below
fCover = model.predict(plot_fea)
fCover.shape