In [1]:
# To Match Karst to Tile:
# attach to every tile the gridcode of its nearest karst point.
import numpy as np
import pandas as pd
# NOTE: the `from datetime import ...` line below rebinds the name `datetime`
# from the module to the class, so this plain module import is shadowed.
import datetime
from datetime import datetime, timedelta

from shapely.geometry import Point, Polygon
import geopandas as gpd

# my script (project-local helper module)
from w210_attribute_library_scale import tilekey, haversine_distance, withinstates

# Relative directory prefixes. Each ends with "/" because file names are
# appended by plain string concatenation (e.g. datdir+fkarst).
datdir = "../data/"
attrs = "../attrs/"
modeld = "../model/"



### Read Karst Data

In [2]:
# Load the raw karst grid-code table, report its row count and preview one row.
fkarst = 'karst_gridcode_raw_data.csv'
dfk = pd.read_csv(f"{datdir}{fkarst}")
print(dfk.shape[0])
dfk.head(n=1)

10311


Unnamed: 0.1,Unnamed: 0,Id,gridcode,Shape_Leng,Shape_Area,geometry,x_coord,y_coord
0,0,1,1,6000.0,2000000.0,POLYGON ((-122.15934291708528 48.9141451238035...,-122.169494,48.921667


#### Select Relevant Points

In [3]:
# Location of the state-boundary shapefile.
subdir = "../data/shapefile/"
shapedir = 'cb_2018_us_state_500k/'
shapefile500 = "cb_2018_us_state_500k.shp"

us500 = gpd.read_file(f"{subdir}{shapedir}{shapefile500}")

# Boundary geometry of each state of interest (first row matching the name).
flgeometry, gageometry, algeometry = (
    us500.loc[us500["NAME"] == state_name, "geometry"].iloc[0]
    for state_name in ("Florida", "Georgia", "Alabama")
)

# To restrict the selection to Florida only, use `geometries = [flgeometry]`.
geometries = [flgeometry, gageometry, algeometry]

# Flag every karst point using the project helper `withinstates`, which is
# given the state geometries and the point built from (x_coord, y_coord).
dfk["in_relevant_state"] = dfk.apply(
    lambda row: withinstates(geometries, Point(row["x_coord"], row["y_coord"])),
    axis=1,
)

# Keep only the rows the helper flagged with "Yes", then show how many remain.
dfk = dfk.loc[dfk["in_relevant_state"].eq("Yes")]
dfk.shape[0]

2021

### Read Current Tile Data

In [4]:
# Load the current tile attribute table and preview its first row.
ftileslarge = 'scale_up_F_satellite_ws_soil_attr.csv'
dftiles_large = pd.read_csv(f"{attrs}{ftileslarge}")
dftiles_large.head(n=1)

Unnamed: 0.1,Unnamed: 0,Key,Calcium Carbonate,Gypsum,Soil Health Organic Matter,Percent Clay,Percent Sand,Percent Silt,Available Water Storage,date_ws,...,HerbaceousVegetation,Highway,Industrial,Pasture,PermanentCrop,Residential,River,SeaLake,prediction,prediction_name
0,0,033_174,0.0,0,6.177,10.1146,57.0835,8.9811,18.52059,2021-06-01,...,0.00055,3.476364e-08,1.73765e-16,1.376721e-14,3.401483e-10,6.664154e-13,0.054241,0.933093,9,SeaLake


In [5]:
# List the columns of the loaded tile table (used to pick the subset kept below).
dftiles_large.columns

Index(['Unnamed: 0', 'Key', 'Calcium Carbonate', 'Gypsum',
       'Soil Health Organic Matter', 'Percent Clay', 'Percent Sand',
       'Percent Silt', 'Available Water Storage', 'date_ws', 'name_x', 'lon_t',
       'lat_t', 'rolling_7_precip', 'rolling_15_precip', 'rolling_30_precip',
       'rolling_60_precip', 'rolling_90_precip', 'y1_mean_prc', 'y1_max_prc',
       'y1_mean_tmp', 'y1_max_tmp', 'y1_min_tmp', 'y2_mean_prc', 'y2_max_prc',
       'y2_mean_tmp', 'y2_max_tmp', 'y2_min_tmp', 'y3_mean_prc', 'y3_max_prc',
       'y3_mean_tmp', 'y3_max_tmp', 'y3_min_tmp', 'County', 'county_fp',
       'name_y', 'imagenum', 'geometry', 'AnnualCrop', 'Forest',
       'HerbaceousVegetation', 'Highway', 'Industrial', 'Pasture',
       'PermanentCrop', 'Residential', 'River', 'SeaLake', 'prediction',
       'prediction_name'],
      dtype='object')

In [6]:
# Keep an explicit list of attribute columns; any column not named here
# (e.g. leftover "Unnamed: ..." index columns from earlier CSV exports) is dropped.
dftiles_large = dftiles_large.loc[:, [
    'Key',
    # soil attributes
    'Calcium Carbonate', 'Gypsum', 'Soil Health Organic Matter',
    'Percent Clay', 'Percent Sand', 'Percent Silt', 'Available Water Storage',
    # station / location
    'date_ws', 'name_x', 'lon_t', 'lat_t',
    # rolling precipitation windows
    'rolling_7_precip', 'rolling_15_precip', 'rolling_30_precip',
    'rolling_60_precip', 'rolling_90_precip',
    # yearly precipitation / temperature summaries
    'y1_mean_prc', 'y1_max_prc', 'y1_mean_tmp', 'y1_max_tmp', 'y1_min_tmp',
    'y2_mean_prc', 'y2_max_prc', 'y2_mean_tmp', 'y2_max_tmp', 'y2_min_tmp',
    'y3_mean_prc', 'y3_max_prc', 'y3_mean_tmp', 'y3_max_tmp', 'y3_min_tmp',
    # county / image identifiers and geometry
    'County', 'county_fp', 'name_y', 'imagenum', 'geometry',
    # land-cover class scores and predicted class
    'AnnualCrop', 'Forest', 'HerbaceousVegetation', 'Highway', 'Industrial',
    'Pasture', 'PermanentCrop', 'Residential', 'River', 'SeaLake',
    'prediction', 'prediction_name',
]]

In [7]:
# dft = dftiles_large.sample(frac=0.001)
# len(dft)

In [8]:
def findkarst(tile, dfk):
    """Return the ``gridcode`` of the karst point nearest to ``tile``.

    The distance from the tile to each karst point is obtained by calling
    ``haversine_distance(tile['lat_t'], tile['lon_t'], y_coord, x_coord)``.

    Parameters
    ----------
    tile : mapping or pandas.Series
        Must provide the keys ``'lat_t'`` and ``'lon_t'``.
    dfk : pandas.DataFrame
        Karst points; must provide the columns ``'y_coord'``, ``'x_coord'``
        and ``'gridcode'``.

    Returns
    -------
    The ``gridcode`` of the karst row with the smallest distance. When several
    rows share the smallest distance, the first one in ``dfk`` order wins.
    ``None`` is returned when ``dfk`` has no rows or when no row yields a
    comparable (non-NaN) distance.
    """
    tile_lat = tile['lat_t']
    tile_lon = tile['lon_t']

    # (distance, gridcode) of the best candidate seen so far; None until a
    # row with a usable distance has been found.
    best = None

    # Iterate only the three columns that are needed instead of iterrows(),
    # which builds a full Series for every karst row.
    for klat, klon, gridcode in zip(dfk["y_coord"], dfk["x_coord"], dfk["gridcode"]):
        d = haversine_distance(tile_lat, tile_lon, klat, klon)
        if d != d:
            # NaN distance (e.g. missing coordinates): never comparable, so it
            # must not become the running minimum or it would block all
            # later `d < min` comparisons.
            continue
        if best is None or d < best[0]:
            best = (d, gridcode)

    return None if best is None else best[1]

In [9]:
# For every tile row, look up the gridcode of its nearest karst point;
# `dfk` is forwarded to findkarst as its second positional argument.
dftiles_large['gridcode'] = dftiles_large.apply(findkarst, axis=1, args=(dfk,))

In [10]:
# Write the tile table (now with the gridcode column) to the current working directory.
# NOTE(review): the following cell writes the same file under `attrs`; this copy
# looks like an accidental duplicate without the directory prefix - confirm it is wanted.
dftiles_large.to_csv("scale_up_F_satellite_ws_soil_karst_attr.csv", index=False)

In [11]:
# Save the tile table with its gridcode column into the attrs directory,
# without writing the DataFrame index.
dftiles_large.to_csv(
    f"{attrs}scale_up_F_satellite_ws_soil_karst_attr.csv",
    index=False,
)

In [13]:
# Sanity check: distinct gridcode values that were assigned to the tiles.
dftiles_large["gridcode"].unique()

array([1, 2, 0, 3])

In [14]:
# Preview the first tile row after the gridcode column has been added.
dftiles_large.head(1)

Unnamed: 0,Key,Calcium Carbonate,Gypsum,Soil Health Organic Matter,Percent Clay,Percent Sand,Percent Silt,Available Water Storage,date_ws,name_x,...,Highway,Industrial,Pasture,PermanentCrop,Residential,River,SeaLake,prediction,prediction_name,gridcode
0,033_174,0.0,0,6.177,10.1146,57.0835,8.9811,18.52059,2021-06-01,WHITING FIELD NAVAL AIR STATI,...,3.476364e-08,1.73765e-16,1.376721e-14,3.401483e-10,6.664154e-13,0.054241,0.933093,9,SeaLake,1


In [15]:
# Final column list of the tile table, including the new gridcode column.
dftiles_large.columns

Index(['Key', 'Calcium Carbonate', 'Gypsum', 'Soil Health Organic Matter',
       'Percent Clay', 'Percent Sand', 'Percent Silt',
       'Available Water Storage', 'date_ws', 'name_x', 'lon_t', 'lat_t',
       'rolling_7_precip', 'rolling_15_precip', 'rolling_30_precip',
       'rolling_60_precip', 'rolling_90_precip', 'y1_mean_prc', 'y1_max_prc',
       'y1_mean_tmp', 'y1_max_tmp', 'y1_min_tmp', 'y2_mean_prc', 'y2_max_prc',
       'y2_mean_tmp', 'y2_max_tmp', 'y2_min_tmp', 'y3_mean_prc', 'y3_max_prc',
       'y3_mean_tmp', 'y3_max_tmp', 'y3_min_tmp', 'County', 'county_fp',
       'name_y', 'imagenum', 'geometry', 'AnnualCrop', 'Forest',
       'HerbaceousVegetation', 'Highway', 'Industrial', 'Pasture',
       'PermanentCrop', 'Residential', 'River', 'SeaLake', 'prediction',
       'prediction_name', 'gridcode'],
      dtype='object')