In [1]:
import os
import pandas as pd
import numpy as np
import hashlib
from tqdm import tqdm

def reproject(latitude, longitude):
    """Project latitude/longitude (degrees) to sinusoidal x/y coordinates in meters.

    Works element-wise, so scalars, NumPy arrays and pandas Series are all
    accepted as long as both arguments broadcast against each other.

    Parameters
    ----------
    latitude : number or array-like
        Latitude in degrees.
    longitude : number or array-like
        Longitude in degrees.

    Returns
    -------
    tuple
        ``(x, y)`` where ``x`` is the east-west coordinate and ``y`` the
        north-south coordinate, both in meters.
    """
    earth_radius = 6371009  # in meters
    meters_per_degree = np.pi * earth_radius / 180.0

    # A degree of longitude shrinks with the cosine of the latitude.
    longitude_shrink = np.cos(np.radians(latitude))

    easting = longitude * meters_per_degree * longitude_shrink
    northing = latitude * meters_per_degree
    return easting, northing

    
def hash_it(s):
    """Return the SHA-224 digest of the string ``s`` as a hexadecimal string.

    The string is encoded as UTF-8 before hashing.
    """
    digest = hashlib.sha224()
    digest.update(s.encode('utf-8'))
    return digest.hexdigest()


def _bin_edges(values, step_size):
    """Return ascending bin edges of width ``step_size`` that cover every value."""
    lowest, highest = values.min(), values.max()
    edges = np.arange(lowest, highest + step_size, step_size)
    # Bins are closed on the left only (right=False in pd.cut), so the final
    # edge must lie strictly above the maximum; otherwise the extreme station
    # would fall outside every bin and be labelled NaN.
    while len(edges) < 2 or edges[-1] <= highest:
        edges = np.append(edges, edges[-1] + step_size)
    return edges


def _bin_labels(edges, step_size):
    """Return one "<left> to <right>" label per bin described by ``edges``."""
    return ["{0} to {1}".format(left, left + step_size) for left in edges[:-1]]


def Bin_and_Split(d):
    """Bin the stations on a square grid and write one CSV of observations per bin.

    The stations listed in ``ghcnd-stations.csv`` are restricted to those in
    the global ``stations_with_data``, projected to meters with ``reproject``
    and binned on a grid whose cell width is the x extent of the stations
    divided by ``d``. Each occupied cell gets a hash (``hash_it`` of its
    "x range, y range" label); the rows of the global ``ghcn_df`` are then
    written to ``BinnedCsvs_d{d}/{hash}.csv`` according to the cell of the
    station in their index. The station table, including the bin and hash
    columns, is written to ``BinSize_d{d}.csv``.

    Compared with binning y on the x edges, y is binned on edges derived from
    the y extent, so its labels (and therefore the hashes) describe the actual
    y range of each cell.

    Parameters
    ----------
    d : int or float
        Number of cell widths the x extent is divided into. Must be positive.

    Raises
    ------
    ValueError
        If ``d`` is not positive, or the stations have no x extent to divide.

    Notes
    -----
    Reads the module-level names ``ghcn_df`` and ``stations_with_data``.
    The output folder is reused if it already exists; files from an earlier
    run with the same name are overwritten, others are left in place.
    """
    if not d > 0:
        raise ValueError("d must be a positive number, got {!r}".format(d))

    stations = pd.read_csv('ghcnd-stations.csv', index_col=0)

    # Reduce the stations to the ones that exist in ghcn_df (from GHCN-D Downloader.ipynb).
    # .copy() so the column assignments below act on an independent frame, not a slice.
    stations = stations[stations.index.isin(stations_with_data)].copy()

    stations['x'], stations['y'] = reproject(stations['LATITUDE'], stations['LONGITUDE'])

    step_size = (stations['x'].max() - stations['x'].min()) / d
    if not step_size > 0:
        raise ValueError(
            "cannot bin stations: the x extent divided by d is {!r}".format(step_size))

    # Bin stations. The same step is used on both axes (square cells), but each
    # axis gets edges derived from its own extent, and the labels are built
    # from those very edges so their count always matches the number of bins.
    x_edges = _bin_edges(stations['x'], step_size)
    y_edges = _bin_edges(stations['y'], step_size)
    stations['x_group'] = pd.cut(stations['x'], x_edges, right=False,
                                 labels=_bin_labels(x_edges, step_size))
    stations['y_group'] = pd.cut(stations['y'], y_edges, right=False,
                                 labels=_bin_labels(y_edges, step_size))
    stations['xy_group'] = stations['x_group'].astype('str') + ', ' + stations['y_group'].astype('str')

    # Find hashing from xy_group
    stations['hash'] = stations['xy_group'].apply(hash_it)

    # Look up the hash of every ghcn row through the station ID in its index.
    # Grouping on this array avoids copying and sorting the whole frame.
    hash_per_row = stations['hash'].reindex(ghcn_df.index).to_numpy()

    # Create the folder that will hold the csvs for that specific step size
    new_folder = 'BinnedCsvs_d{}'.format(d)
    os.makedirs(new_folder, exist_ok=True)

    # For each hash (in sorted order), save the matching ghcn rows to csv
    for hashid, df_by_bin in ghcn_df.groupby(hash_per_row, sort=True):
        df_by_bin.to_csv(os.path.join(new_folder, '{}.csv'.format(hashid)))

    stations.to_csv('BinSize_d{}.csv'.format(d))

In [2]:
# Load ghcn data; Bin_and_Split reads `ghcn_df` as a global
ghcn_df = pd.read_hdf('GHCND_10Year.h5')

# Station IDs found in the index of the ghcn data, i.e. the stations that
# passed the initial requirements from `GHCN-D Downloader`.
# Bin_and_Split reads this global to filter the station list.
stations_with_data = set(ghcn_df.index)

# Values passed to Bin_and_Split as `d`: the x extent of the stations is
# divided by each value to get the bin width, so larger values mean smaller bins.
step_sizes = [400, 200, 100, 50, 25, 18, 12.5]

# Each call writes its own BinnedCsvs_d<value> folder and BinSize_d<value>.csv
for s in tqdm(step_sizes):
    Bin_and_Split(s)

100%|██████████| 3/3 [2:33:46<00:00, 3120.31s/it]
