In [37]:
import time
import numpy as np
from scipy import spatial as spatial
import pandas as pd
import random
import math


In [38]:
#load data from raw station data file
datContent = [i.strip().split() for i in open("../nsta24_sac.dat").readlines()]
pre_df=pd.DataFrame(datContent, columns=["station name","lon","lat","?","?","?","agency","CSMT","?","?","?","?","?","?","date","?","?","?","?","?","?","?","?","?","?","?","?","?","?","?","?"])

#find stations that are not CSMT
df=pre_df.drop(columns=['?'])
df=df[df["CSMT"]!="CSMT"]

pd.set_option('display.max_rows', df.shape[0]+1)
non_CSMT=df.sort_values(by=['lon', 'lat'])




In [39]:
station1=non_CSMT[pd.to_numeric(non_CSMT["lon"])==120.9591]
station2=non_CSMT[pd.to_numeric(non_CSMT["lon"])==121.1412]
station2=non_CSMT[pd.to_numeric(non_CSMT["lat"])==23.7537]
station3=non_CSMT[pd.to_numeric(non_CSMT["lon"])==121.2724]

print(station1,station2,station3)


     station name       lon      lat agency CSMT      date
1062          YUS  120.9591  23.4873    CWB  SMT  20110907
1063          YUS  120.9591  23.4873    CWB  SMT  20110907     station name       lon      lat agency  CSMT      date
916         VWDT  121.1412  23.7537    IES  BATS  20090716     station name       lon      lat agency CSMT      date
953          WHF  121.2724  24.1434    CWB  SMT  20081014
954          WHF  121.2724  24.1434    CWB  SMT  20081014


In [40]:
result_station[0][0]

121.03729

In [41]:
#load data of CSMT data
grid_value_and_cords=[i.strip().split() for i in open("../heatmap_real.dat").readlines()]
grid_value_and_cords=pd.DataFrame(grid_value_and_cords,columns=['lon','lat','dist'])
grid_value_and_cords=grid_value_and_cords.transpose().drop('dist').transpose()
grid_value_and_cords=grid_value_and_cords.sort_values(by=['lon', 'lat'])
pd.set_option('display.max_rows', grid_value_and_cords.shape[0]+1)

#load CSMT stations that are in between 120,122,22.5,25
result=grid_value_and_cords[pd.to_numeric(grid_value_and_cords["lon"])>120]
result=result[pd.to_numeric(result["lon"])<122]
result=result[pd.to_numeric(result["lat"])>22.5]
result=result[pd.to_numeric(result["lat"])<25]
result_grid=result.sort_values(by=['lon', 'lat'])
result_grid["lon"]=pd.to_numeric(result_grid["lon"])
result_grid["lat"]=pd.to_numeric(result_grid["lat"])
result_grid=result_grid.to_numpy()
len(result_grid)

7821

In [42]:
#find the non-CSMT station of interest
df=non_CSMT.sort_values(by=['lon', 'lat'])

result=df[pd.to_numeric(df["lon"])>120.95]
result=result[pd.to_numeric(result["lon"])<121.29]
result=result[pd.to_numeric(result["lat"])>23.2]
result=result[pd.to_numeric(result["lat"])<24.3]

result_station=result.sort_values(by=['lon', 'lat'])
result_station=result_station.drop_duplicates(subset=['lon','lat'], keep='last')
result_station=result_station[["lon","lat"]]
result_station.to_csv('../point_of_interest.dat', sep=' ', header=None,index=None)
result_station["lon"]=pd.to_numeric(result_station["lon"])
result_station["lat"]=pd.to_numeric(result_station["lat"])
result_station=result_station.to_numpy()
print(result_station[1])
print(result_station[3])
print(result_station[9])


[120.9591  23.4873]
[121.1412  23.7537]
[121.2724  24.1434]


In [43]:
#find exisitng CSMT stations in the region
# read dat file 
datContent = [i.strip().split() for i in open("../nsta24_sac.dat").readlines()]
# write it as a dataframe
df=pd.DataFrame(datContent,
                columns=["station name","lon","lat","?","?","?","agency","CSMT","?","?","?","?","?","?","date","?","?","?","?","?","?","?","?","?","?","?","?","?","?","?","?"]
               )
# filter CSMT
new_df=df[df["CSMT"]=="CSMT"]
new_df.to_csv('../existing_station.dat', sep=' ', header=None,index=None)
exist_lon=pd.to_numeric(new_df["lon"])
exist_lat=pd.to_numeric(new_df["lat"])
exist_lon_lat=np.array([exist_lon,exist_lat]).transpose()

#merge overlapping stations
#define the area of interest
lon_start = 120
lon_end = 122
lat_start= 22.5
lat_end= 25

step_count=0.025

#merge exisiting stations in each grid
s=pd.DataFrame(exist_lon_lat,columns=["lon","lat"])
s1=pd.cut(s["lon"], np.arange(lon_start, lon_end, step_count))
s2=pd.cut(s["lat"], np.arange(lat_start, lat_end, step_count))
result=s.groupby([s1,s2]).mean()

lon_lat_nan=np.array([result["lon"],result["lat"]]).transpose()

#remove NaN
lon_lat_nan = lon_lat_nan[np.logical_not(np.isnan(lon_lat_nan))]
lon=np.array([])
lat=np.array([])

for i in range(len(lon_lat_nan)):
    if i%2==0:
        lon=np.append(lon,lon_lat_nan[i])
    else:
        lat=np.append(lat,lon_lat_nan[i])
lon_lat= np.concatenate(([lon],[lat])).transpose()
len(lon_lat)


211

In [44]:
#define kd tree function
def do_kdtree(combined_x_y_arrays,points):
    mytree = spatial.cKDTree(combined_x_y_arrays)
    dist, indexes = mytree.query(points)
    return dist, indexes


In [45]:
#define converter: lon lat to x y coords
def converter(lon2,lat2):
    lon1=lon_start
    lat1=lat_start
    dx = (lon2-lon1)*40000*math.cos((lat1+lat2)*math.pi/360)/360
    dy = (lat2-lat1)*40000/360
    return(np.array([dx,dy]))


In [331]:
#compute different distance data while adding different stations of interest along with CSMT data
for j in range(len(result_station)):
    
    start = time.time()
    
    #define the inputs
    points=result_grid.transpose()
    result_station_add=lon_lat
    #transpose and list points
    points_list = list(points.transpose())
    
    #adding each individual station of interest
    result_station_add=np.append(result_station_add,result_station[j])
    
    #initialize storages
    final_storage=np.array([])
    lon=np.array([])
    lat=np.array([])
    
    #dimensionalizing after adding the stations of interest
    for x in range(len(result_station_add)):
        if x%2==0:
            lon=np.append(lon,result_station_add[x])
        else:
            lat=np.append(lat,result_station_add[x])
    result_station_add= np.concatenate(([lon],[lat])).transpose()

    iteration=1 #define how many neighbors of interest
    
    #match the station data with every grid point
    for i in range(len(points_list)):

        #re-initialize the storage and input
        combined_x_y_arrays=result_station_add
        points_list = list(points.transpose())
        storage=np.array([])

        
        #find the distance between the nearest k stations compared with the grid point
        for k in range(iteration):
            #run do_kd_tree
            kd_result = do_kdtree(combined_x_y_arrays,points_list)
            
            #compile the results of the do_kd_tree
            result_compiler=pd.DataFrame(kd_result, index=["dist", "indexes"]).transpose()
            
            #store the nearest point
            #storage=np.append(storage, result_compiler["dist"][i])
            
            #convert lon/lat to the x/y
            
            lon_end,lat_end=combined_x_y_arrays[int(result_compiler["indexes"][i])]
            
            #find the coords of the grid point
            individual_x,individual_y=points_list[i]
            
            #calculate the distance
            dist=np.linalg.norm(converter(lon_end,lat_end)-converter(individual_x,individual_y))
            
            #store the dist
            storage=np.append(storage, dist)
            
            #remove the nearest point
            combined_x_y_arrays=np.delete(combined_x_y_arrays, int(result_compiler["indexes"][i]),0)
        
        #calculate the average_distance (applicable when k>1)
        average_distance=np.array(sum(storage)/iteration)
        #store the average data
        final_storage=np.append(final_storage,average_distance)

    
    #put the relevant data into dataframe
    grid_value=pd.DataFrame(data=final_storage.reshape(1,-1), index=["dist"])
    cords=pd.DataFrame(data=points,index=["lon","lat"])
    grid_value_and_cords=cords.append(grid_value)
    grid_value_and_cords=grid_value_and_cords.transpose()
    
    #define which non_CSMT station of interest
    station_title=j
    
    #output each data
    #generate a unique file name based on the id and record
    file_name="../station_specific"+str(station_title)+".dat"
    #create the CSV
    grid_value_and_cords.to_csv(file_name, sep=' ', header=None,index=None)
    
    end = time.time()
    print('Completed in: ',end-start)


IndentationError: unexpected indent (<ipython-input-331-b77c060dc4e0>, line 5)

In [46]:
result_station=np.array([[121.13584,24.06908]
])

In [35]:
result_station=np.array([[121.03729,23.42483],[121.66986,23.35745],[121.01648,23.71486], [120.45481,21.80950],[121.26462,24.06342]])

In [50]:
#compute different distance data while adding different stations of interest along with CSMT data
j=[0]
start = time.time()

#define the inputs
points=result_grid.transpose()
result_station_add=lon_lat
#transpose and list points
points_list = list(points.transpose())

#adding each individual station of interest
result_station_add=np.append(result_station_add,result_station[j])

#initialize storages
final_storage=np.array([])
lon=np.array([])
lat=np.array([])

#dimensionalizing after adding the stations of interest
for x in range(len(result_station_add)):
    if x%2==0:
        lon=np.append(lon,result_station_add[x])
    else:
        lat=np.append(lat,result_station_add[x])
result_station_add= np.concatenate(([lon],[lat])).transpose()

iteration=1 #define how many neighbors of interest

#match the station data with every grid point
for i in range(len(points_list)):

    #re-initialize the storage and input
    combined_x_y_arrays=result_station_add
    points_list = list(points.transpose())
    storage=np.array([])


    #find the distance between the nearest k stations compared with the grid point
    for k in range(iteration):
        #run do_kd_tree
        kd_result = do_kdtree(combined_x_y_arrays,points_list)

        #compile the results of the do_kd_tree
        result_compiler=pd.DataFrame(kd_result, index=["dist", "indexes"]).transpose()

        #store the nearest point
        #storage=np.append(storage, result_compiler["dist"][i])

        #convert lon/lat to the x/y

        lon_end,lat_end=combined_x_y_arrays[int(result_compiler["indexes"][i])]

        #find the coords of the grid point
        individual_x,individual_y=points_list[i]

        #calculate the distance
        dist=np.linalg.norm(converter(lon_end,lat_end)-converter(individual_x,individual_y))

        #store the dist
        storage=np.append(storage, dist)

        #remove the nearest point
        combined_x_y_arrays=np.delete(combined_x_y_arrays, int(result_compiler["indexes"][i]),0)

    #calculate the average_distance (applicable when k>1)
    average_distance=np.array(sum(storage)/iteration)
    #store the average data
    final_storage=np.append(final_storage,average_distance)


#put the relevant data into dataframe
grid_value=pd.DataFrame(data=final_storage.reshape(1,-1), index=["dist"])
cords=pd.DataFrame(data=points,index=["lon","lat"])
grid_value_and_cords=cords.append(grid_value)
grid_value_and_cords=grid_value_and_cords.transpose()

#define which non_CSMT station of interest
station_title=len(j)

#output each data
#generate a unique file name based on the id and record
file_name="../final"+str(station_title)+".dat"
#create the CSV
grid_value_and_cords.to_csv(file_name, sep=' ', header=None,index=None)

end = time.time()
print('Completed in: ',end-start)


KeyboardInterrupt: 