In [1]:
# To Match WS to Tile
import numpy as np
import pandas as pd
import datetime
from datetime import datetime, timedelta

# my script
from w210_attribute_library_scale import tilekey, haversine_distance, date_string

print("Date of reference: ", date_string)

Date of reference:  2022/06/01


In [2]:
datdir = "../data/"
attrs = "../attrs/"
modeld = "../model/"

fweather = 'weather_data.csv'
dfw = pd.read_csv(datdir+fweather)
print(len(dfw))
dfw.head(1)

291511


Unnamed: 0,stn_wban,year,mo,da,min,max,temp,prcp,flag_prcp,rain_drizzle,...,usaf_wban,wban,lon,lat,elev,name,begin,end,state,country
0,998199_99999,2016,8,12,77.9,84.9,81.5,0.0,I,0,...,998199_99999,99999,-85.667,30.15,5.0,PANAMA CITY,20080721,20210920,FL,US


### Check and Select Working/Valid Weather Stations

In [3]:
wskey = dfw["usaf_wban"].unique()

dated = datetime.strptime(date_string,"%Y/%m/%d")

dfw["DateD"] = dfw.apply(lambda row: datetime.strptime(str(row["year"])+"-"+str(row["mo"])+"-"+str(row["da"]),"%Y-%m-%d") ,axis=1)

i = 0
wsgood = []
wsnogood = []
for wsname in wskey:
    td = timedelta(365*3)
    dftemp = dfw[(dfw["usaf_wban"] == wsname)]
    if (dftemp["DateD"].max() > dated) & (dftemp["DateD"].min() < dated - td):
        wsgood.append(wsname)
    else:
        wsnogood.append(wsname)

In [4]:
wsnogood_set = set(wsnogood)
wsnogood_list = list(wsnogood_set)
print(wsnogood_list)

dfw = dfw[~dfw["usaf_wban"].isin(wsnogood_list)]
print(len(wskey), len(dfw["usaf_wban"].unique()))

ws = dfw[['stn_wban', "name", "lon", "lat"]].drop_duplicates()
ws.columns = ["Key", 'name', 'lon_w', 'lat_w']
print(len(ws))

['998305_99999', '999999_63890', '999999_53848', '722118_99999', '722261_99999', '720374_92825', '690524_99999', '749048_99999', '722012_92817', '997733_99999', '747761_99999', '998199_99999', '693254_99999', '722224_53862', '747940_99999']
126 111
111


**For Reference - list of no-good weather stations**  
`
['722261_99999', '999999_53848', '998199_99999', '747940_99999', '722118_99999', '693254_99999', '720374_92825', '998305_99999', '690524_99999', '722012_92817', '722224_53862', '749048_99999', '999999_63890', '997733_99999', '747761_99999']
`

In [5]:
duplicateName = ['WHITING FIELD NAVAL AIR STATI', 'KEYSTONE AIRPARK']
ws[(ws['name'].isin(duplicateName))]

Unnamed: 0,Key,name,lon_w,lat_w
19492,720383_53847,WHITING FIELD NAVAL AIR STATI,-87.023,30.704
166418,722226_93841,WHITING FIELD NAVAL AIR STATI,-87.017,30.717
219704,749048_00415,KEYSTONE AIRPARK,-82.048,29.845


In [6]:
# Replace values with NaN
no_valid = [99.99000, 999.900000, 9999.900000, 99999.000000]
ws = ws.replace(99.99000,np.nan)
ws = ws.replace(999.9000000,np.nan)
ws = ws.replace(9999.900000,np.nan)
ws = ws.replace(99999.000000,np.nan)

## Read Tile Data

In [7]:
ftileslarge = 'scale_up_F_satellite_key_county.csv'
dftiles_large = pd.read_csv(attrs+ftileslarge)
dftiles_large.head(1)

Unnamed: 0.1,Unnamed: 0,name,imagenum,label,lon,lat,start_date,geometry,AnnualCrop,Forest,...,Pasture,PermanentCrop,Residential,River,SeaLake,prediction,prediction_name,county_fp,Key,County
0,174,0-2022-05-06-2022-07-05-200.00-0.jpeg_4_34.jpeg,0,0,-87.624327,30.880381,2022-05-06,POLYGON ((-87.62718590224753 30.87752176645508...,0.003029,0.009086,...,1.376721e-14,3.401483e-10,6.664154e-13,0.054241,0.933093,9,SeaLake,33,033_174,Escambia


## Merge Tile and Weather for Distance Calculation

`result = pd.merge(left, right, how="left", on=["key1", "key2"])`

**Reference:**
https://pandas.pydata.org/docs/user_guide/merging.html

In [8]:
dflarge = pd.merge(dftiles_large,ws, how="cross")
print(len(dflarge))
dflarge.head(1)

45228393


Unnamed: 0.1,Unnamed: 0,name_x,imagenum,label,lon,lat,start_date,geometry,AnnualCrop,Forest,...,SeaLake,prediction,prediction_name,county_fp,Key_x,County,Key_y,name_y,lon_w,lat_w
0,174,0-2022-05-06-2022-07-05-200.00-0.jpeg_4_34.jpeg,0,0,-87.624327,30.880381,2022-05-06,POLYGON ((-87.62718590224753 30.87752176645508...,0.003029,0.009086,...,0.933093,9,SeaLake,33,033_174,Escambia,997351_99999,NAPLES,-81.8,26.12


## Calculate The Distance

In [10]:
dflarge['Distance'] = dflarge.apply(lambda row: 
                                    haversine_distance(row['lat'], row['lon'], 
                                                       row['lat_w'], row['lon_w'], 
                                                       earth_radius=3963.19), axis=1)
dflarge.head(2)

Unnamed: 0.1,Unnamed: 0,name_x,imagenum,label,lon,lat,start_date,geometry,AnnualCrop,Forest,...,prediction,prediction_name,county_fp,Key_x,County,Key_y,name_y,lon_w,lat_w,Distance
0,174,0-2022-05-06-2022-07-05-200.00-0.jpeg_4_34.jpeg,0,0,-87.624327,30.880381,2022-05-06,POLYGON ((-87.62718590224753 30.87752176645508...,0.003029,0.009086,...,9,SeaLake,33,033_174,Escambia,997351_99999,NAPLES,-81.8,26.12,483.338519
1,174,0-2022-05-06-2022-07-05-200.00-0.jpeg_4_34.jpeg,0,0,-87.624327,30.880381,2022-05-06,POLYGON ((-87.62718590224753 30.87752176645508...,0.003029,0.009086,...,9,SeaLake,33,033_174,Escambia,747960_99999,AVON PARK GUNNERY RANGE,-81.33,27.65,440.543205


## Find the Minimum Distances within Tile and Weather Station

`df.groupby('Company')['MPG'].agg('min')`  
`df.groupby('Company')[['MPG', 'EngineSize']].agg('min')`

**Reference:**
https://datascienceparichay.com/article/pandas-groupby-minimum/

In [11]:
# Find the rows with the minimum Distance for each Key_x
dfmin1 = dflarge.groupby(['Key_x'])['Distance'].min().to_frame()
print(len(dfmin1))

# Select only the rows with the minimum
keysL = list(dfmin1.index)
minD = list(dfmin1['Distance'])
dfF1 = dflarge[((dflarge['Key_x'].isin(keysL)) &  (dflarge['Distance'].isin(minD)))]

407463


**Reference Duplicates:**  
https://sparkbyexamples.com/pandas/pandas-get-list-of-all-duplicate-rows/#:~:text=Pandas%20DataFrame.,multiple%20columns%20or%20all%20columns.

In [12]:
#Checking for Duplicates
df2 = dfF1[dfF1["Key_x"].duplicated()==True]
dup1 = df2["Key_x"].unique()
# dup1 = ['1082_0_1','1083_0_1', '2406_0_1', '2459_0_1', '2463_0_1', '2737_0_1', '3294_0_2', '3294_1_0', '556_0_1']
dfF1[(dfF1['Key_x'].isin(dup1))]

Unnamed: 0.1,Unnamed: 0,name_x,imagenum,label,lon,lat,start_date,geometry,AnnualCrop,Forest,...,prediction,prediction_name,county_fp,Key_x,County,Key_y,name_y,lon_w,lat_w,Distance


In [13]:
dfF1.drop_duplicates(subset=['Key_x'], inplace=True)
len(dfF1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfF1.drop_duplicates(subset=['Key_x'], inplace=True)


407463

In [None]:
# dfF1["Distance"].describe()

In [14]:
dfF1.to_csv(datdir+"scale_up_F_tiles_ws_pair.csv", index=False)

In [None]:
dfF1.head()

## Get Pair for Small Tile

### Read Small Tile Data

In [None]:
ftilesmall = 'smalltiles.csv'
dftiles_small = pd.read_csv(datdir+ftilesmall)
dftiles_small["Key"] = dftiles_small.apply(lambda row: tilekey(row), axis=1)
dftiles_small = dftiles_small[["Key", "lon", "lat", "County", "CountyFP"]]
dftiles_small.columns = ["Key", "lon_t", "lat_t", "County", "CountyFp"]

print(len(dftiles_small))
dftiles_small.head(2)

### Merge Tile and Weather for Distance Calculation

In [None]:
dfsmall = pd.merge(dftiles_small,ws, how="cross")
print(len(dfsmall))
dfsmall.head(1)

### Calculate The Distance

In [None]:
dfsmall['Distance'] = dfsmall.apply(lambda row: 
                                    haversine_distance(row['lat_t'], row['lon_t'], 
                                                       row['lat_w'], row['lon_w'], 
                                                       earth_radius=3963.19), axis=1)

### Find the Minimum Distances within Tile and Weather Station

In [None]:
dfminSmall = dfsmall.groupby(['Key_x'])['Distance'].min().to_frame()

In [None]:
keysL = list(dfminSmall.index)
minD = list(dfminSmall['Distance'])
dfF2 = dfsmall[((dfsmall['Key_x'].isin(keysL)) &  (dfsmall['Distance'].isin(minD)))]
print(len(dfF2))
dfF2.head(2)

In [None]:
#Checking for Duplicates
df2 = dfF2[dfF2["Key_x"].duplicated()==True]
dup1 = dfF2["Key_x"].unique()
# dup1 = ['1082_0_1','1083_0_1', '2406_0_1', '2459_0_1', '2463_0_1', '2737_0_1', '3294_0_2', '3294_1_0', '556_0_1']
dfF2[(dfF2['Key_x'].isin(dup1))]

In [None]:
dfF2.drop_duplicates(subset=['Key_x'], inplace=True)
len(dfF2)

In [None]:
dfF2["Distance"].describe()

In [None]:
dfF2.to_csv(datdir+"scale_up_small_tile_ws_pair.csv", index=False)

In [None]:
len(dfF2["Key_y"].unique())