### Final imputation code

In [None]:
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

### Utility functions

In [None]:
# use haversine formula to calculate shortest distance over the earth's surface
# https://en.wikipedia.org/wiki/Haversine_formula

def latlong_distance(lat1, long1, lat2, long2):
    """Return the great-circle distance in km between two points.

    Uses the haversine formula with an Earth radius of 6371 km.

    Parameters
    ----------
    lat1, long1 : latitude / longitude of the first point, in degrees.
    lat2, long2 : latitude / longitude of the second point, in degrees.

    Returns
    -------
    Distance along the sphere's surface, in kilometres.
    """
    earth_radius_km = 6371

    # Work in radians from here on.
    phi1 = np.radians(lat1)
    phi2 = np.radians(lat2)
    dphi = phi2 - phi1
    dlambda = np.radians(long2 - long1)

    # haversine term: sin^2(dphi / 2) + cos(phi1) * cos(phi2) * sin^2(dlambda / 2)
    hav = np.sin(dphi / 2) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(dlambda / 2) ** 2

    # central angle between the two points: 2 * arcsin(sqrt(hav))
    central_angle = 2 * np.arcsin(np.sqrt(hav))

    # arc length = radius * central angle
    return earth_radius_km * central_angle

In [None]:
def closest_n_search(latlong, latlongs, **kwargs):
    """Find the single nearest coordinate to ``latlong`` among ``latlongs``.

    Despite the name, only ONE neighbor is returned (no weights, no dict).

    Parameters
    ----------
    latlong : tuple ``(lat, long)`` of the target point, in degrees.
    latlongs : iterable of ``(lat, long)`` candidate tuples, in degrees.
    **kwargs : accepted for call compatibility; currently ignored.

    Returns
    -------
    ``(nearest, distance)`` where ``nearest`` is the ``(lat, long)`` tuple of
    the closest candidate and ``distance`` is the value returned by
    ``latlong_distance`` for it. When no candidate lies inside the search
    window, returns ``(tuple(), float('inf'))``.
    """
    target_lat, target_long = latlong
    min_latlong = tuple()
    min_distance = float('inf')

    for (lat, long) in latlongs:
        # Cheap pre-filter: only candidates within 10 degrees of the target on
        # both axes are worth the full distance computation.
        if not (np.abs(lat - target_lat) <= 10 and np.abs(long - target_long) <= 10):
            continue

        dist = latlong_distance(target_lat, target_long, lat, long)

        # `<=` means that on an exact tie the later candidate wins.
        if not min_latlong or dist <= min_distance:
            min_latlong = (lat, long)
            min_distance = dist

    return min_latlong, min_distance

### Import Data

In [None]:
# Load the HDF5 file 'data_before_impute.h5' into `data`
# (presumably the pre-imputation snapshot, going by the file name).
data = pd.read_hdf('data_before_impute.h5')

In [None]:
# Load the HDF5 file 'full.h5' into `data`.
# NOTE(review): this rebinds `data`; if the cell that reads
# 'data_before_impute.h5' was run first, that frame is replaced here.
data = pd.read_hdf('full.h5')

### Pre-calculate the closest neighbor (and its distance) for every coordinate

In [None]:
# Pre-compute, for every grid point, its single nearest neighbor and the distance to it.

# Unique (lat, long) combinations present in the data
latlongs = data[['grid_lat', 'grid_lon']].values
latlongs = [(ll[0], ll[1]) for ll in latlongs]
latlongs = list(set(latlongs))

# key = (target_lat, target_long), value = ((n_lat, n_long), dist)
latlong_matches = {}
for i, latlong in enumerate(latlongs):
    # Search among all other points; the target itself is left out of the candidates.
    latlong_match = closest_n_search(latlong, latlongs[:i] + latlongs[i + 1:])
    latlong_matches[latlong] = latlong_match

# NOTE(review): the sample key below is hard-coded; this lookup raises KeyError
# if (36.0, -86.5) is not one of the grid points in `data`.
print(
    "example of coordinates at a similar distance to multiple points, target point :(36.0, -86.5), closest neighbor:",
    latlong_matches[(36.0, -86.5)],
)

# Distance component of every match
dist_closest_n = [match[1] for match in latlong_matches.values()]

print(f"Average distance with geo-imputing ref (km): {np.mean(dist_closest_n)}")
print(f"Min. distance with geo-imputing ref (km): {np.min(dist_closest_n)}")
print(f"Max. distance with geo-imputing ref (km): {np.max(dist_closest_n)}")

#"Distribution of distance to closest neighbor (km)", plt.hist(dist_closest_n, bins=[0,100,200,300,400,500,600,700,800])

In [None]:
# Histogram of the nearest-neighbor distances, saved to 'Figure_3.png'.
sns.set(style="darkgrid")
fig, ax = plt.subplots(figsize=(9, 5))  # single axes; nrows/ncols default to 1

# No `ax=` is passed, so seaborn is expected to draw on the current axes,
# i.e. the one created just above.
# NOTE(review): `distplot` is deprecated in recent seaborn releases — confirm
# the installed version still provides it.
ax = sns.distplot(dist_closest_n, bins=9, kde=False)
ax.set_xlabel('Distance to closest FC (km)')
ax.set_ylabel('Count')
fig.suptitle("Histogram of distance to closest neighbor of FCs", fontsize=15)
fig.savefig('Figure_3.png')

### Resize data to ensure there are data records for all dates, zip5 and time (4 times a day)

In [None]:
# Fill in missing 'Date' and 'Time' values row by row.
# NOTE(review): `time` is not defined in the visible part of this notebook —
# presumably the collection of the 4 daily time slots; confirm it is defined
# before this cell runs.
for index, row in data.iterrows():
    if pd.isna(row['Date']):
        # A missing Date is copied from the row's own date_key.
        data.at[index, 'Date'] = row['date_key']
    if pd.isna(row['Time']):
        # The full-frame mask is only built for rows that actually need it,
        # instead of once per row regardless.
        f = (data['date_key'] == row['date_key']) & (data['zip5'] == row['zip5'])
        # Time slots not yet present for this (date_key, zip5); `data` is read
        # live, so slots assigned on earlier iterations are already excluded.
        missing_times = list(set(time) - set(data[f]['Time']))
        # Raises IndexError if every slot is already taken for this group.
        data.at[index, 'Time'] = missing_times[0]

### Drop columns with > 80% missing data

In [None]:
# Remove the columns listed below (per the notebook heading, those with more
# than 80% missing data). `drop` raises KeyError if any label is absent.
data = data.drop(
    columns=[
        "ForecastRange",
        "Categorical_Snow_surface",
        "Composite_reflectivity_entire_atmosphere",
        "Graupel_snow_pellets_hybrid",
        "Graupel_snow_pellets_isobaric",
        "Snow_mixing_ratio_hybrid",
        "Snow_mixing_ratio_isobaric",
        "Geopotential_height_potential_vorticity_surface",
        "u_component_of_wind_potential_vorticity_surface",
        "v_component_of_wind_potential_vorticity_surface",
        "Categorical_Ice_Pellets_surface",
        "Ice_water_mixing_ratio_hybrid",
        "Ice_water_mixing_ratio_isobaric",
        "Precipitation_rate_surface",
        "Vertical_velocity_geometric_isobaric",
        "Ice_growth_rate_altitude_above_msl",
        "Land_sea_coverage_nearest_neighbor_land1sea0_surface",
        "Pressure_potential_vorticity_surface",
        "Temperature_potential_vorticity_surface",
        "Vertical_Speed_Shear_potential_vorticity_surface",
        "Rain_mixing_ratio_hybrid",
        "Total_cloud_cover_isobaric",
        "Cloud_mixing_ratio_hybrid",
        "Categorical_Freezing_Rain_surface",
        "Categorical_Rain_surface",
        "Visibility_surface",
    ]
)

### Spatial Imputation

In [None]:
def spatial_impute(lat, long, date, time, colname, data):
    """Fetch ``colname`` from the nearest neighboring grid point at the same date/time.

    The neighbor is looked up in the module-level ``latlong_matches`` dict,
    keyed by ``(lat, long)``; a missing key raises ``KeyError``.

    Parameters
    ----------
    lat, long : grid coordinates of the row being imputed.
    date : value matched against the ``date_key`` column.
    time : value matched against the ``Time`` column.
    colname : name of the column whose value is wanted.
    data : DataFrame with ``grid_lat``, ``grid_lon``, ``date_key``, ``Time``
        and ``colname`` columns.

    Returns
    -------
    ``(value, dist)``: ``value`` is ``colname`` from the first matching
    neighbor row, or NaN when there is no neighbor or no matching row (it may
    also be NaN if the neighbor's own value is missing). ``dist`` is the
    stored distance to the neighbor. No distance cut-off is applied here;
    that is left to the caller.
    """
    n_latlong, dist = latlong_matches[(lat, long)]

    # The neighbor search yields an empty tuple when no grid point fell inside
    # its search window; indexing it below would raise IndexError.
    if len(n_latlong) < 2:
        return np.nan, dist

    neighbor_rows = data[
        (data['grid_lat'] == n_latlong[0])
        & (data['grid_lon'] == n_latlong[1])
        & (data['date_key'] == date)
        & (data['Time'] == time)
    ]

    # The neighbor has no record for this date/time.
    if neighbor_rows.shape[0] == 0:
        return np.nan, dist

    return neighbor_rows[colname].values[0], dist

In [None]:
# Code for all features to impute

# Columns whose missing values are filled from the nearest neighboring grid point
cols_to_impute = [
    'Snow_depth_surface',
    'Haines_Index_surface',
    'u_component_of_wind_altitude_above_msl',
    'v_component_of_wind_altitude_above_msl',
    'Soil_temperature_depth_below_surface_layer',
    'Temperature_altitude_above_msl',
    'Volumetric_Soil_Moisture_Content_depth_below_surface_layer',
    'Field_Capacity_surface',
    'Water_equivalent_of_accumulated_snow_depth_surface',
    'Wilting_Point_surface',
]

for colname in cols_to_impute:
    # Positional index of the column, needed for the .iloc write below
    colind = list(data.columns).index(colname)
    print('Impute for column', colname)

    # Rows are processed in order and `data` is updated in place, so a value
    # imputed earlier can be picked up when a later row uses it as a neighbor.
    for i in range(data.shape[0]):
        data_target = data.iloc[i, :]
        if not np.isnan(data_target[colname]):
            continue  # value present, nothing to impute

        print(i, '=>impute')
        lat = data_target['grid_lat']
        long = data_target['grid_lon']
        date = data_target['date_key']
        time = data_target['Time']
        i_value, dist = spatial_impute(lat, long, date, time, colname, data)

        # A successful spatial imputation needs a non-NaN neighbor value AND a
        # neighbor closer than 200. spatial_impute already yields NaN when the
        # neighbor has no value, but it does not apply the distance limit, so
        # that is enforced here.
        if dist > 200:
            i_value = np.nan

        # Flag the row, then store the result. The flag lives in the
        # third-from-last column ("any_impute_col" per the original note); the
        # write is positional, so it depends on column order. The flag is set
        # even when the imputed value ends up NaN.
        data.iloc[i, -3] = 1
        data.iloc[i, colind] = i_value

#### Temporal Interpolation / Imputation

In [None]:
#data = pd.read_hdf('imputeddata.h5')

In [None]:
# Time-series interpolation
# pd.to_datetime and DataFrame.set_index both return new objects, so their
# results must be assigned to have any effect.
data['datetime'] = pd.to_datetime(data['datetime'])
data = data.set_index('datetime')

# Fill gaps separately for each grid point, in both directions.
# NOTE(review): method='linear' treats rows as equally spaced and ignores the
# index, so rows are assumed to be in chronological order within each grid
# point — confirm, or sort by the index first / use method='time'.
data_interpolate = data.groupby(['grid_lat', 'grid_lon']).apply(
    lambda x: x.interpolate(method='linear', limit_direction='both')
)
