# Generate Site Risk Scores

In [None]:
import geopandas as gpd
import json
import matplotlib.pyplot as plt
import numpy as np
import requests
from tqdm.notebook import tqdm

In [None]:
# Download the site collection from the API and load it into a GeoDataFrame.
endpoint = 'https://api.dev.plastic.watch.earthrise.media/sites'
site_limit = 1500
# A timeout keeps the notebook from hanging forever on an unresponsive API.
response = requests.get(endpoint, params={'limit': site_limit}, timeout=120)
# Fail here with a clear HTTP error instead of a confusing KeyError or
# JSON decode error further down when the API returns an error response.
response.raise_for_status()
data = response.json()
gdf = gpd.GeoDataFrame.from_features(data['features'])
if len(gdf) >= site_limit:
    # NOTE(review): presumably the API caps the result at `limit`, in which
    # case some sites may be missing from the analysis - confirm and raise
    # site_limit if so.
    print(f"Warning: received {len(gdf)} sites, which reaches the requested limit of {site_limit}")

In [None]:
# Compute the mean and standard deviation of each log-transformed risk
# variable, plotting the transformed distribution along the way.
variable_classes = ['Population - 1 km', 'Soil Clay Fraction', 'Distance to Waterway (m)', 'area']
# Additive offset applied before taking the log. Variables that are not
# listed here fall back to 0.1.
log_offsets = {'Distance to Waterway (m)': 10}
stats = {}
for var in variable_classes:
    column = np.array(gdf[var].astype('float'))
    # Write the float-cast values back so later cells see numeric columns.
    gdf[var] = column
    # Values equal to -1 are left out of the statistics
    # (presumably a missing-data sentinel - TODO confirm).
    observed = column[column != -1]
    print("min", observed.min(), "max", observed.max())
    logged = np.log(observed + log_offsets.get(var, 0.1))
    plt.hist(logged, bins=40)
    plt.title(var)
    plt.show()
    stats[var] = {'mean': np.mean(logged), 'std': np.std(logged)}
stats

In [None]:
# Frozen stats. We'd use these if we want to run risk calculations on site ingest in the API.
# Each row is (variable name, mean, std).
frozen_rows = (
    ('Population - 1 km', 7.417273534217326, 1.7791091219232005),
    ('Soil Clay Fraction', -0.885915538147242, 0.15838979230699146),
    ('Distance to Waterway (m)', 6.390775048206517, 1.179343111730756),
    ('area', 7.154989549277127, 1.9026796861924273),
)
stats = {name: {'mean': mean, 'std': std} for name, mean, std in frozen_rows}

In [None]:
# Compute a raw risk score per site as a weighted combination of z-scores of
# the log-transformed variables. Sites that cannot be scored get NaN.
variable_classes = ['Population - 1 km', 'Soil Clay Fraction', 'Distance to Waterway (m)', 'area']
import matplotlib.pyplot as plt

# Additive offsets applied before the log transform. These must be the same
# offsets that were used when `stats` was computed, otherwise the z-scores
# are taken against the wrong distribution. The stats cell uses 10 for the
# waterway distance and 0.1 for everything else (this cell previously used
# 0.01, which shifts small values such as the clay fraction noticeably).
log_offsets = {'Distance to Waterway (m)': 10}
default_log_offset = 0.1

# Cast whole columns once; this also works when the columns still hold
# plain Python values because the stats cell was skipped.
area = gdf['area'].astype('float').to_numpy()
waterway_distance = gdf['Distance to Waterway (m)'].astype('float').to_numpy()
# Only sites with a non-zero area and a waterway distance other than -1
# are scored.
scorable = (area != 0.0) & (waterway_distance != -1)

# Z-score of each log-transformed variable, for the scorable sites only.
site_deviation = {}
for var in variable_classes:
    values = gdf.loc[scorable, var].astype('float').to_numpy()
    logged = np.log(values + log_offsets.get(var, default_log_offset))
    site_deviation[var] = (logged - stats[var]['mean']) / stats[var]['std']

# Area and population raise the score; waterway distance and clay fraction
# lower it. The result is divided by the sum of the weights.
risk_score = np.full(len(gdf), np.nan)
risk_score[scorable] = (
    (61 * site_deviation['area'] + 21 * site_deviation['Population - 1 km'])
    - (69 * site_deviation['Distance to Waterway (m)'] + 41 * site_deviation['Soil Clay Fraction'])
) / (69 + 61 + 41 + 21)

# Unscored sites are NaN; leave them out of the histogram explicitly.
plt.hist(risk_score[~np.isnan(risk_score)], bins=30)
plt.title('Raw Risk Scores')
plt.show()

In [None]:
# Min-max scale the raw scores, ignoring NaN entries when finding the range.
risk_min = np.nanmin(risk_score)
risk_max = np.nanmax(risk_score)
normed_risk = (risk_score - risk_min) / (risk_max - risk_min)
# NaN compares False against any number, so this mask drops NaN entries
# before plotting.
plottable = normed_risk > -5
plt.hist(normed_risk[plottable], bins=35)
plt.title('Normed Risk Scores')
plt.show()

In [None]:
# Attach the normalized risk to a copy of the sites (the original frame is
# left untouched) and write the result to disk as GeoJSON.
output_path = '../data/site_metadata/compiled_risk.geojson'
new_gdf = gdf.copy()
new_gdf['risk'] = normed_risk
new_gdf.to_file(output_path, driver='GeoJSON')

In [None]:
import requests
import json

# Push every site, including its new risk value, back to the API.
# Use HTTPS like the fetch cell does for the same host: plain HTTP sends the
# payload unencrypted and, if the server redirects to HTTPS, the PUT may not
# survive the redirect.
endpoint = 'https://api.dev.plastic.watch.earthrise.media/sites/'
failed_updates = []
for site in tqdm(new_gdf.iterfeatures(), total=len(new_gdf)):
    site_id = site['properties']['id']
    site_endpoint = f"{endpoint}{site_id}"
    # Copy the id from the properties onto the feature itself.
    site['id'] = site_id
    try:
        r = requests.put(site_endpoint, json.dumps(site), timeout=30)
        # Error responses used to be ignored silently; record them instead.
        r.raise_for_status()
    except requests.RequestException as err:
        # Keep going so one bad site does not abort the whole upload.
        failed_updates.append((site_id, str(err)))

if failed_updates:
    print(f"{len(failed_updates)} of {len(new_gdf)} site updates failed:")
    for failed_id, reason in failed_updates:
        print(f"  {failed_id}: {reason}")