In [1]:
# To Match Karst to Tile
import numpy as np
import pandas as pd
import datetime
from datetime import datetime, timedelta

from shapely.geometry import Point, Polygon
import geopandas as gpd

# my script
from w210_attribute_library_scale import tilekey, haversine_distance, withinstates

# Relative project folders used by the cells below.
datdir, attrs, modeld = (
    "../data/",   # raw inputs (the karst CSV is read from here)
    "../attrs/",  # attribute tables: tile inputs are read from and the result is written to this folder
    "../model/",  # not referenced by any cell visible in this notebook
)



## Read Karst Data

In [2]:
# Load the raw karst grid-code table, report its size and preview the first record.
fkarst = 'karst_gridcode_raw_data.csv'
karst_path = datdir + fkarst
dfk = pd.read_csv(karst_path)
print(dfk.shape[0])
dfk.head(1)

10311


Unnamed: 0.1,Unnamed: 0,Id,gridcode,Shape_Leng,Shape_Area,geometry,x_coord,y_coord
0,0,1,1,6000.0,2000000.0,POLYGON ((-122.15934291708528 48.9141451238035...,-122.169494,48.921667


### Select Points within Florida Only

In [3]:
# Location of the US state boundary shapefile.
subdir = "../data/shapefile/"
shapedir = 'cb_2018_us_state_500k/'
shapefile500 = "cb_2018_us_state_500k.shp"

us500 = gpd.read_file(subdir + shapedir + shapefile500)

# Boundary geometry of each candidate state (first matching record per name).
flgeometry = us500.loc[us500["NAME"] == 'Florida', "geometry"].iloc[0]
gageometry = us500.loc[us500["NAME"] == 'Georgia', "geometry"].iloc[0]
algeometry = us500.loc[us500["NAME"] == 'Alabama', "geometry"].iloc[0]

# Only Florida is used for filtering; to widen the selection use
# geometries = [flgeometry, gageometry, algeometry]
geometries = [flgeometry]


def _in_selected_states(row):
    """Return the withinstates() verdict for the karst point stored in this row."""
    karst_point = Point(row["x_coord"], row["y_coord"])
    return withinstates(geometries, karst_point)


dfk["in_relevant_state"] = dfk.apply(_in_selected_states, axis=1)

# Keep only the karst points flagged "Yes" and show how many remain.
is_relevant = dfk["in_relevant_state"].eq("Yes")
dfk = dfk.loc[is_relevant]
len(dfk)

1876

## Read Tile Data

In [4]:
# Load the tile table that already carries the weather-station attributes.
ftileslarge = 'scale_up_F_tiles_ws_attr.csv'
tiles_path = attrs + ftileslarge
dftiles_large = pd.read_csv(tiles_path)
dftiles_large.head(1)

Unnamed: 0,Key,Key_y,date_ws,name,lon_t,lat_t,lon_w,lat_w,Distance,rolling_7_precip,...,y2_mean_prc,y2_max_prc,y2_mean_tmp,y2_max_tmp,y2_min_tmp,y3_mean_prc,y3_max_prc,y3_mean_tmp,y3_max_tmp,y3_min_tmp
0,033_174,720383_53847,2021-06-01,WHITING FIELD NAVAL AIR STATI,-87.624327,30.880381,-87.023,30.704,37.756111,0.63,...,0.093381,2.82,69.399178,85.9,35.5,0.173123,3.03,67.569589,85.3,37.6


In [5]:
# Row count, followed by the column index of the tile/weather table.
print(dftiles_large.shape[0])
dftiles_large.columns

407463


Index(['Key', 'Key_y', 'date_ws', 'name', 'lon_t', 'lat_t', 'lon_w', 'lat_w',
       'Distance', 'rolling_7_precip', 'rolling_15_precip',
       'rolling_30_precip', 'rolling_60_precip', 'rolling_90_precip',
       'y1_mean_prc', 'y1_max_prc', 'y1_mean_tmp', 'y1_max_tmp', 'y1_min_tmp',
       'y2_mean_prc', 'y2_max_prc', 'y2_mean_tmp', 'y2_max_tmp', 'y2_min_tmp',
       'y3_mean_prc', 'y3_max_prc', 'y3_mean_tmp', 'y3_max_tmp', 'y3_min_tmp'],
      dtype='object')

In [6]:
# Keep the tile identifiers plus the precipitation/temperature features.
# 'Key_y', 'lon_w', 'lat_w' and the existing 'Distance' column are left out.
tile_id_cols = ['Key', 'date_ws', 'name', 'lon_t', 'lat_t']
rolling_precip_cols = [
    'rolling_7_precip', 'rolling_15_precip', 'rolling_30_precip',
    'rolling_60_precip', 'rolling_90_precip',
]
yearly_stat_cols = [
    'y1_mean_prc', 'y1_max_prc', 'y1_mean_tmp', 'y1_max_tmp', 'y1_min_tmp',
    'y2_mean_prc', 'y2_max_prc', 'y2_mean_tmp', 'y2_max_tmp', 'y2_min_tmp',
    'y3_mean_prc', 'y3_max_prc', 'y3_mean_tmp', 'y3_max_tmp', 'y3_min_tmp',
]
dftiles_large = dftiles_large[tile_id_cols + rolling_precip_cols + yearly_stat_cols]
dftiles_large.head(1)

Unnamed: 0,Key,date_ws,name,lon_t,lat_t,rolling_7_precip,rolling_15_precip,rolling_30_precip,rolling_60_precip,rolling_90_precip,...,y2_mean_prc,y2_max_prc,y2_mean_tmp,y2_max_tmp,y2_min_tmp,y3_mean_prc,y3_max_prc,y3_mean_tmp,y3_max_tmp,y3_min_tmp
0,033_174,2021-06-01,WHITING FIELD NAVAL AIR STATI,-87.624327,30.880381,0.63,0.63,0.63,0.91,1.58,...,0.093381,2.82,69.399178,85.9,35.5,0.173123,3.03,67.569589,85.3,37.6


In [7]:
# Load the per-tile satellite classification table (includes Key and County).
fb = 'scale_up_F_satellite_key_county.csv'
satellite_path = attrs + fb
dfb = pd.read_csv(satellite_path)
dfb.head(1)

Unnamed: 0.1,Unnamed: 0,name,imagenum,label,lon,lat,start_date,geometry,AnnualCrop,Forest,...,Pasture,PermanentCrop,Residential,River,SeaLake,prediction,prediction_name,county_fp,Key,County
0,174,0-2022-05-06-2022-07-05-200.00-0.jpeg_4_34.jpeg,0,0,-87.624327,30.880381,2022-05-06,POLYGON ((-87.62718590224753 30.87752176645508...,0.003029,0.009086,...,1.376721e-14,3.401483e-10,6.664154e-13,0.054241,0.933093,9,SeaLake,33,033_174,Escambia


In [8]:
# Row count, followed by the column index of the satellite table.
print(dfb.shape[0])
dfb.columns

407463


Index(['Unnamed: 0', 'name', 'imagenum', 'label', 'lon', 'lat', 'start_date',
       'geometry', 'AnnualCrop', 'Forest', 'HerbaceousVegetation', 'Highway',
       'Industrial', 'Pasture', 'PermanentCrop', 'Residential', 'River',
       'SeaLake', 'prediction', 'prediction_name', 'county_fp', 'Key',
       'County'],
      dtype='object')

In [9]:
# Keep the join key, county info, image identifiers, tile polygon and the
# land-cover class scores; 'label', 'lon', 'lat' and 'start_date' are left out.
satellite_id_cols = ['Key', 'County', 'county_fp', 'name', 'imagenum', 'geometry']
landcover_score_cols = [
    'AnnualCrop', 'Forest', 'HerbaceousVegetation', 'Highway',
    'Industrial', 'Pasture', 'PermanentCrop', 'Residential', 'River',
    'SeaLake',
]
prediction_cols = ['prediction', 'prediction_name']
dfb = dfb[satellite_id_cols + landcover_score_cols + prediction_cols]
dfb.head(1)

Unnamed: 0,Key,County,county_fp,name,imagenum,geometry,AnnualCrop,Forest,HerbaceousVegetation,Highway,Industrial,Pasture,PermanentCrop,Residential,River,SeaLake,prediction,prediction_name
0,033_174,Escambia,33,0-2022-05-06-2022-07-05-200.00-0.jpeg_4_34.jpeg,0,POLYGON ((-87.62718590224753 30.87752176645508...,0.003029,0.009086,0.00055,3.476364e-08,1.73765e-16,1.376721e-14,3.401483e-10,6.664154e-13,0.054241,0.933093,9,SeaLake


In [10]:
# Attach the satellite attributes to the tile/weather table via the shared tile Key
# (pandas' default inner join). Both inputs have a 'name' column, so the result
# carries them as 'name_x' (from the tile table) and 'name_y' (from the satellite table).
dftiles_large = pd.merge(dftiles_large, dfb, on="Key")

# A bare `len(...)` in the middle of a cell is evaluated and discarded; print it so
# the merged row count is actually shown next to the preview.
print(len(dftiles_large))
dftiles_large.head()

Unnamed: 0,Key,date_ws,name_x,lon_t,lat_t,rolling_7_precip,rolling_15_precip,rolling_30_precip,rolling_60_precip,rolling_90_precip,...,HerbaceousVegetation,Highway,Industrial,Pasture,PermanentCrop,Residential,River,SeaLake,prediction,prediction_name
0,033_174,2021-06-01,WHITING FIELD NAVAL AIR STATI,-87.624327,30.880381,0.63,0.63,0.63,0.91,1.58,...,0.00055,3.476364e-08,1.73765e-16,1.376721e-14,3.401483e-10,6.664154e-13,0.05424127,0.9330934,9,SeaLake
1,033_206,2021-06-01,WHITING FIELD NAVAL AIR STATI,-87.618608,30.897536,0.63,0.63,0.63,0.91,1.58,...,0.009805,1.29038e-08,2.3333180000000002e-17,3.809984e-13,4.369865e-11,7.21356e-14,0.09441625,0.01678774,1,Forest
2,033_207,2021-06-01,WHITING FIELD NAVAL AIR STATI,-87.618608,30.891818,0.63,0.63,0.63,0.91,1.58,...,6e-06,4.183348e-12,5.709511e-20,7.383366e-18,1.944085e-10,3.115901e-15,4.953628e-08,3.304344e-08,1,Forest
3,033_208,2021-06-01,WHITING FIELD NAVAL AIR STATI,-87.618608,30.886099,0.63,0.63,0.63,0.91,1.58,...,0.064912,1.422953e-07,3.0949230000000006e-17,1.189291e-13,3.624084e-10,5.215894e-14,0.1054994,0.4844987,9,SeaLake
4,033_209,2021-06-01,WHITING FIELD NAVAL AIR STATI,-87.618608,30.880381,0.63,0.63,0.63,0.91,1.58,...,9.3e-05,9.266508e-08,6.735141e-17,2.489165e-16,3.240689e-11,2.448823e-12,0.02693079,0.9702568,9,SeaLake


In [11]:
# Column index of the merged tile table (the displayed output includes the name_x / name_y pair).
dftiles_large.columns

Index(['Key', 'date_ws', 'name_x', 'lon_t', 'lat_t', 'rolling_7_precip',
       'rolling_15_precip', 'rolling_30_precip', 'rolling_60_precip',
       'rolling_90_precip', 'y1_mean_prc', 'y1_max_prc', 'y1_mean_tmp',
       'y1_max_tmp', 'y1_min_tmp', 'y2_mean_prc', 'y2_max_prc', 'y2_mean_tmp',
       'y2_max_tmp', 'y2_min_tmp', 'y3_mean_prc', 'y3_max_prc', 'y3_mean_tmp',
       'y3_max_tmp', 'y3_min_tmp', 'County', 'county_fp', 'name_y', 'imagenum',
       'geometry', 'AnnualCrop', 'Forest', 'HerbaceousVegetation', 'Highway',
       'Industrial', 'Pasture', 'PermanentCrop', 'Residential', 'River',
       'SeaLake', 'prediction', 'prediction_name'],
      dtype='object')

In [12]:
# dftiles_large["Key"] = dftiles_large.apply(lambda row: tilekey(row), axis=1)
# dftiles_large = dftiles_large[["Key", "lon", "lat", "County", "CountyFP"]]
# dftiles_large.columns = ["Key", "lon_t", "lat_t", "County", "CountyFp"]

# print(len(dftiles_large))
# dftiles_large.head(2)

## Merge Tile and Karst for Distance Calculation

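General pattern: `result = pd.merge(left, right, how="left", on=["key1", "key2"])`. The cell below uses `how="cross"` instead, which pairs every tile row with every karst point so that a distance can be computed for each pair.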

**Reference:**
https://pandas.pydata.org/docs/user_guide/merging.html

In [None]:
# Pair every tile row with every karst point (Cartesian product).
# The result has len(dftiles_large) * len(dfk) rows, so it can become very large.
dflarge = dftiles_large.merge(dfk, how="cross")
print(dflarge.shape[0])
dflarge.head(1)

## Calculate The Distance

In [None]:
def _tile_to_karst_distance(row):
    """Distance between the row's tile coordinates and its karst coordinates."""
    # earth_radius=3963.19 is presumably the Earth's radius in miles - TODO confirm
    # against haversine_distance in w210_attribute_library_scale.
    return haversine_distance(
        row['lat_t'], row['lon_t'],
        row['y_coord'], row['x_coord'],
        earth_radius=3963.19,
    )


dflarge['Distance'] = dflarge.apply(_tile_to_karst_distance, axis=1)
dflarge.head(2)

## Find the Minimum Distance between Each Tile and the Karst Points

`df.groupby('Company')['MPG'].agg('min')`  
`df.groupby('Company')[['MPG', 'EngineSize']].agg('min')`

**Reference:**
https://datascienceparichay.com/article/pandas-groupby-minimum/

In [None]:
# Find the rows with the minimum Distance for each Key_x
dfmin1 = dflarge.groupby(['Key'])['Distance'].min().to_frame()
print(len(dfmin1))

# Select only the rows with the minimum
keysL = list(dfmin1.index)
minD = list(dfmin1['Distance'])
dfF1 = dflarge[((dflarge['Key'].isin(keysL)) &  (dflarge['Distance'].isin(minD)))]

**Reference Duplicates:**  
https://sparkbyexamples.com/pandas/pandas-get-list-of-all-duplicate-rows/#:~:text=Pandas%20DataFrame.,multiple%20columns%20or%20all%20columns.

In [None]:
#Checking for Duplicates
df2 = dfF1[dfF1["Key"].duplicated()==True]
dup1 = df2["Key"].unique()
# dup1 = ['1082_0_1','1083_0_1', '2406_0_1', '2459_0_1', '2463_0_1', '2737_0_1', '3294_0_2', '3294_1_0', '556_0_1']
dfF1[(dfF1['Key'].isin(dup1))]

In [None]:
# Keep one row per tile Key (the first occurrence). Reassigning instead of using
# inplace=True avoids SettingWithCopyWarning, because dfF1 was produced by
# filtering another DataFrame.
dfF1 = dfF1.drop_duplicates(subset=['Key'])
dfF1["Distance"].describe()

In [None]:
# Distinct karst gridcode values among the rows kept in dfF1.
dfF1["gridcode"].unique()

In [None]:
# Persist the tile-to-karst table without the DataFrame index.
tile_karst_path = attrs + "scale_up_F_tile_karst.csv"
dfF1.to_csv(tile_karst_path, index=False)