In [2]:
import ee
import numpy as np
import pandas as pd
import geopandas as gpd
from earthshot import mon_stats
from earthshot import water_viz as vis
from statistics import mean

In [3]:
ee.Initialize()

In [4]:
#Function to convert FeatureCollection into DataFrame
def fc2df(fc):
    # Convert a FeatureCollection into a pandas DataFrame
    # Features is a list of dict with the output
    features = fc.getInfo()['features']

    dictarr = []

    for f in features:
        # Store all attributes in a dict
        attr = f['properties']
        # and treat geometry separately
        attr['geometry'] = f['geometry']  # GeoJSON Feature!
        # attr['geometrytype'] = f['geometry']['type']
        dictarr.append(attr)

    df = gpd.GeoDataFrame(dictarr)
    # Convert GeoJSON features to shape
    df['geometry'] = map(lambda s: np.shape(s), df.geometry)    
    return df

In [5]:
#Function to convert an image into featurecollection into Dataframe for ML processing
def extract_point_values(img_name, pts):
    image = img_name

    fc_image_red = image.reduceRegions(collection=pts,
                                  reducer=ee.Reducer.mean(),
                                  scale=30)

    # Convert to Pandas Dataframe
    df_image_red = fc2df(fc_image_red)


    return df_image_red

In [6]:
#Import Global MAR Inventory
MAR_swales = ee.FeatureCollection('users/amgadellaboudy/Global_MAR_Inventory')

In [7]:
#Import variables in (Slope, Porosity, Runoff, Soil Types), restrict global MAR locations
slope_img = ee.Image('users/jamesmcc/merit_slope/merit_terrain_slope').clip(MAR_swales)

smap_usda_clim = ee.ImageCollection('users/jamesmcc/smap_usda_climatology')
avail_porosity = (smap_usda_clim
                  .filter(ee.Filter.eq('band', 'avail_porosity_mm')))
avail_porosity_img = avail_porosity.sum()
avail_porosity_img= avail_porosity_img.clip(MAR_swales)

runoff_clim = ee.ImageCollection("ECMWF/ERA5_LAND/MONTHLY").select('surface_runoff')
runoff_clim_m = mon_stats.bands_avgs(['surface_runoff'], runoff_clim)
runoff_img = ee.ImageCollection(runoff_clim_m['avgs'].get('surface_runoff')).sum().multiply(720)
runoff_img = runoff_img.clip(MAR_swales)

soil_types = ee.Image("OpenLandMap/SOL/SOL_TEXTURE-CLASS_USDA-TT_M/v02")

top_soils = soil_types.expression('soil_0 + soil_10 + soil_30',
                             {'soil_0': soil_types.select('b0'),
                             'soil_10': soil_types.select('b10'),
                             'soil_30': soil_types.select('b30')})

bottom_soils = soil_types.expression('soil_60 + soil_100 + soil_200',
                                 {'soil_60': soil_types.select('b60'),
                                  'soil_100': soil_types.select('b100'),
                                  'soil_200': soil_types.select('b200')})

top_soil_img = top_soils.clip(MAR_swales)
bottom_soil_img = bottom_soils.clip(MAR_swales)

In [8]:
#Create dataframes out of each variable, concatenate them to create dataframe for ML

df_slope = extract_point_values(slope_img, MAR_swales)
df_slope.rename(columns = {'mean' : 'Slope'}, inplace = True)

porosity = extract_point_values(avail_porosity_img, MAR_swales)
porosity.rename(columns = {'mean' : 'Porosity'}, inplace = True)
porosity = porosity['Porosity']

runoff = extract_point_values(runoff_img, MAR_swales)
runoff.rename(columns = {'mean' : 'Runoff'}, inplace = True)
runoff = runoff['Runoff']

top_soil = extract_point_values(top_soil_img, MAR_swales)
top_soil.rename(columns = {'mean' : 'Top Soils'}, inplace = True)
top_soil = top_soil['Top Soils']

bottom_soil = extract_point_values(bottom_soil_img, MAR_swales)
bottom_soil.rename(columns = {'mean' : 'Bottom Soils'}, inplace = True)
bottom_soil = bottom_soil['Bottom Soils']

df_all = pd.concat([df_slope, porosity, runoff, top_soil, bottom_soil], axis = 1)

df_all.head()



Unnamed: 0,continent,country,fid,final_use,id,influent_s,latitude,link_to_do,longitude,main_mar_t,...,reference_,site_name,specific_m,year_opera,year_shut_,geometry,Porosity,Runoff,Top Soils,Bottom Soils
0,Africa,Algeria,1125,Agricultural,1341,River water,36.538316,no data,3.061936,Spreading Methods,...,ANRH 2012,La Mitidja Big Bag,no data,2002,-9999,<map object at 0x7fcceca641d0>,,0.447128,21.0,15.0
1,North America,USA,1028,no data,1098,no data,35.372068,Reverse Drainage,-106.663641,Spreading Methods,...,"<a href=""http://www.nwri-usa.org/pdfs/MoorePre...",Mariposa Water Reclamation Facility Surface In...,Reverse Drainage,-9999,-9999,<map object at 0x7fcceca641d0>,890.502527,0.045984,18.0,18.0
2,North America,USA,1064,Agricultural,1134,River water,45.945574,Reverse Drainage,-118.410467,Spreading Methods,...,"<a href=""http://wwbwc.org/aquifer-recharge-rep...",Anspach AR site,Reverse Drainage,2012,-9999,<map object at 0x7fcceca641d0>,1184.378527,0.246399,21.0,18.0
3,Oceania,Australia,660,Domestic,718,Storm water,-33.918488,Reverse Drainage,151.227858,Spreading Methods,...,"<a href=""http://www.connectedwaters.unsw.edu.a...",Village Green/ Kensington University,Reverse Drainage,2006,-9999,<map object at 0x7fcceca641d0>,1088.37479,0.135354,24.0,18.0
4,Oceania,Australia,654,Ecological,712,Reclaimed wastewater,-31.94378,Reverse Drainage,115.780462,Spreading Methods,...,"<a href=""http://www.clw.csiro.au/publications/...",Perry Lakes,Reverse Drainage,2008,-9999,<map object at 0x7fcceca641d0>,,0.143023,24.0,18.0


In [9]:
#Setup Random Forest Classifier model
from sklearn.model_selection import train_test_split
df_new = df_all[['latitude','longitude','Slope','Porosity','Runoff','Top Soils','Bottom Soils','main_mar_t']].dropna()
X = df_new.drop('main_mar_t', axis =1)
y = df_new['main_mar_t']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [21]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=1000)
rfc.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [22]:
#Test model for accuracy
rfc_pred = rfc.predict(X_test)

In [23]:
#Report precision, recall, f1-score results
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test,rfc_pred))

                                   precision    recall  f1-score   support

          In-Channel Modification       0.73      0.82      0.77        33
          Induced Bank Filtration       0.63      0.85      0.72        26
 Rainwater and Run-off Harvesting       0.67      0.80      0.73         5
                Spreading Methods       0.64      0.58      0.61        50
Well, Shaft and Borehole Recharge       0.75      0.65      0.69        65

                         accuracy                           0.69       179
                        macro avg       0.68      0.74      0.70       179
                     weighted avg       0.70      0.69      0.69       179

