# In [68]:  (notebook cell prompt left over from the export)
import numpy as np
import pandas as pd
import datetime as dt
import time


def str_to_utc(s):
    """Convert a ``'YYYY-MM-DD HH:MM:SS'`` string to integer epoch seconds.

    The string is parsed into a naive datetime and passed through
    ``time.mktime``, which interprets the broken-down time in the machine's
    local time zone. ``utctimetuple()`` on a naive datetime does not convert
    anything; it only forces the DST flag to 0.

    NOTE(review): despite the name, the result is therefore local-time based
    and depends on the TZ of the machine running this -- confirm that this is
    what the fixed hour offsets used later in the file rely on.
    """
    parsed = dt.datetime.strptime(s, '%Y-%m-%d %H:%M:%S')
    broken_down = parsed.utctimetuple()
    seconds = time.mktime(broken_down)
    return int(seconds)

def utc_to_str(t):
    """Format epoch seconds *t* as ``'YYYY-MM-DD HH:MM:SS'``.

    ``datetime.fromtimestamp`` is called without a ``tz`` argument, so the
    text is rendered in the machine's local time zone.
    """
    moment = dt.datetime.fromtimestamp(t)
    text = moment.strftime('%Y-%m-%d %H:%M:%S')
    return text


# Columns of the raw CSV that are discarded right after loading.
to_drop = ['agency_cd', 'site_no', 'tz_cd', 'GH_Inst_cd', 'year', 'month', 'week', 'yday', 'hour', 'byhour',
           'byday', 'byweek', 'bymonth']

# Load the raw readings and attach an integer epoch-seconds column derived
# from the 'dateTime' strings (see str_to_utc for the time-zone caveat).
data = pd.read_csv("gh.csv")
data['ts_tmp'] = data['dateTime'].apply(str_to_utc)
data.drop(to_drop, axis=1, inplace=True)

# Build a regular grid with a 900 s (15 min) step running from the first to
# the last timestamp, then left-join the readings onto it. Grid points with
# no matching reading end up as NaN rows.
# NOTE(review): this assumes the rows of gh.csv are in chronological order
# (t[0] is the earliest and t[-1] the latest timestamp) -- confirm.
t = data['ts_tmp'].values
idx = pd.DataFrame(t[0] + 900*np.arange((t[-1] - t[0]) // 900 + 1), columns=['timestamp'])
D = idx.merge(data, how='left', left_on='timestamp', right_on='ts_tmp')
D.drop('ts_tmp', axis=1, inplace=True)

# Exactly three columns must remain at this point, otherwise this assignment
# raises: the grid timestamp, the original 'dateTime' string and one value
# column (named 'gh_raw' from here on).
D.columns = ['timestamp', 'date', 'gh_raw']

# Integer bucket ids used for the aggregations further down.
# NOTE(review): the +2*3600 shift presumably moves day/week boundaries to a
# local midnight, and the -86400*4 shift presumably aligns week boundaries to
# a weekday (epoch day 0 is a Thursday) -- confirm both against the data.
D['hour_id'] = D['timestamp'].values // 3600
D['day_id']  = (D['timestamp'].values + 2*3600) // 86400
# The original line ended with a stray '\' continuation that would have
# spliced any statement added on the following line into this expression.
D['week_id'] = (D['timestamp'].values - 86400*4 + 2*3600) // (86400*7)


def find_skips(x):
    """Locate runs of NaN that are enclosed by valid samples on both sides.

    Parameters
    ----------
    x : array-like of float
        One-dimensional series in which missing samples are NaN.

    Returns
    -------
    list of [int, int]
        One ``[start, end]`` pair per gap, where ``start`` is the index of the
        last non-NaN sample before the gap and ``end`` is the index of the
        first non-NaN sample after it.

    Notes
    -----
    The scan starts at index 1, so the first sample is never compared with
    ``x[-1]`` (Python's negative indexing would otherwise wrap around to the
    last element and could report a bogus ``[-1, 0]`` span). Gaps that touch
    the beginning or the end of the series have no anchor on one side and are
    not reported.
    """
    missing = np.isnan(np.asarray(x, dtype=float))
    skips = []
    start = None  # index of the valid sample preceding the currently open gap
    for i in range(1, missing.size):
        if missing[i] and not missing[i - 1]:
            start = i - 1
        elif missing[i - 1] and not missing[i]:
            # A gap just closed; keep it only if it also had a left anchor.
            if start is not None:
                skips.append([start, i])
            start = None
    return skips

# Gap spans of the raw series on the regular grid, kept at module level.
raw_levels = D['gh_raw'].values
skips = find_skips(raw_levels)


def _first_unit_root(coeffs):
    """Return the first root of the polynomial *coeffs* lying in [0, 1]."""
    r = np.roots(coeffs)
    return r[np.logical_and(r <= 1, r >= 0)][0]


def bezier_interpolator(x_a, y_a, dy_a, x_b, y_b, dy_b, x):
    """Interpolate between two end points with a Bezier curve.

    Parameters
    ----------
    x_a, y_a, dy_a : float
        Abscissa, value and slope at the left end point.
    x_b, y_b, dy_b : float
        Abscissa, value and slope at the right end point.
    x : array-like
        Abscissae at which the curve is evaluated.

    Returns
    -------
    numpy.ndarray
        Interpolated values, one per entry of ``x``.

    Behaviour
    ---------
    * Fewer than four points in ``x``: every point gets the midpoint value
      ``0.5 * (y_a + y_b)``.
    * Slopes with the same sign (or exactly equal slopes): cubic Bezier whose
      inner control points sit on the two end tangents, at the abscissae
      ``x[n // 3]`` and ``x[n - n // 3]``.
    * Otherwise: quadratic Bezier whose single control point is the
      intersection of the two end tangents.

    For each requested abscissa the curve parameter ``t`` is found by solving
    ``x(t) = x[i]`` with ``np.roots`` and taking the first root in [0, 1].

    Raises
    ------
    IndexError
        If no root in [0, 1] exists for some abscissa (for example when the
        tangent intersection of the quadratic case lies outside
        ``[x_a, x_b]``).

    Notes
    -----
    The polynomial coefficients of ``x(t)`` carry the correct sign on the
    ``x_a`` terms (``+3*x_a`` for the cubic t^2 term, ``+x_a`` for the
    quadratic t^2 term), so the result is correct for any ``x_a``, not only
    ``x_a == 0``. Equal slopes are sent to the cubic branch because the two
    tangents are then parallel and the quadratic control point would require a
    division by zero.
    """
    x = np.asarray(x, dtype=float)
    n = x.size

    if n < 4:
        return np.full(n, 0.5 * (y_a + y_b))

    t = np.zeros(n)

    if dy_a * dy_b > 0 or dy_a == dy_b:
        # Cubic Bezier curve.
        x_1 = x[n // 3]
        x_2 = x[n - n // 3]
        y_1 = y_a + dy_a * (x_1 - x_a)
        y_2 = y_b + dy_b * (x_2 - x_b)
        # Power-basis coefficients of x(t) - x[i]; only the constant term
        # depends on i, so the first three are computed once.
        head = [x_b - 3 * x_2 + 3 * x_1 - x_a,
                3 * x_2 - 6 * x_1 + 3 * x_a,
                3 * x_1 - 3 * x_a]
        for i in range(n):
            t[i] = _first_unit_root(head + [x_a - x[i]])
        return ((1 - t) ** 3 * y_a + 3 * t * (1 - t) ** 2 * y_1
                + 3 * (1 - t) * t ** 2 * y_2 + t ** 3 * y_b)

    # Quadratic Bezier curve: control point at the tangent intersection.
    x_1 = (y_b - y_a + dy_a * x_a - dy_b * x_b) / (dy_a - dy_b)
    y_1 = y_a + dy_a * (x_1 - x_a)
    head = [x_b - 2 * x_1 + x_a,
            2 * x_1 - 2 * x_a]
    for i in range(n):
        t[i] = _first_unit_root(head + [x_a - x[i]])
    return (1 - t) ** 2 * y_a + 2 * t * (1 - t) * y_1 + t ** 2 * y_b


def _mean_slope(dy, lo, hi):
    """Mean of the finite entries of ``dy[lo:hi]`` (bounds clipped at 0), or 0.0 if there are none."""
    window = dy[max(lo, 0):max(hi, 0)]
    window = window[np.isfinite(window)]
    return window.mean() if window.size else 0.0


def impute(z, gaps=None):
    """Fill gaps of a series with Bezier-interpolated values rounded to 0.01.

    Parameters
    ----------
    z : array-like of float
        Series with NaN for missing samples. It is not modified.
    gaps : iterable of [start, end], optional
        Gap spans, where ``start`` / ``end`` are the indices of the valid
        samples enclosing each gap. Defaults to the module-level ``skips``.

    Returns
    -------
    numpy.ndarray
        Copy of ``z`` with the samples strictly between ``start`` and ``end``
        replaced by interpolated values.

    Notes
    -----
    * The end slopes are means of first differences of the unfilled input:
      the 16 differences before ``start`` and differences ``end+2`` to
      ``end+17`` after the gap.
      NOTE(review): why the trailing window skips the first two differences
      after ``end`` is not visible here -- confirm it is intentional.
    * Slope windows are clipped at the array bounds and ignore NaN entries
      (e.g. from a neighbouring gap); a window with no usable entry yields a
      slope of 0.
    * Only the missing samples are written; the valid anchor at ``start`` is
      left untouched. Writing ``y[start:end]`` would overwrite it, and for
      gaps shorter than four samples it would be replaced by the midpoint
      value.
    * Spans that are not fully inside the array (such as a ``start`` of -1)
      are skipped.
    """
    if gaps is None:
        gaps = skips
    y = np.array(z, dtype=float)
    dy = np.diff(y)  # computed once, from the series before any filling
    for a, b in gaps:
        if not (0 <= a < b < y.size):
            continue
        dy_a = _mean_slope(dy, a - 16, a)
        dy_b = _mean_slope(dy, b + 2, b + 18)
        # The curve is sampled on local abscissae 0 .. b-a-1, with the left
        # anchor at 0 and the right anchor at b-a.
        x = np.arange(b - a)
        filled = bezier_interpolator(0, y[a], dy_a, b - a, y[b], dy_b, x)
        y[a + 1:b] = np.round(100 * filled[1:]) / 100
    return y
  
    
# Gap-filled series next to the raw one.
D['gh'] = impute(D['gh_raw'].values)

# Attach the per-hour, per-day and per-week mean of the filled series to every
# row by joining each bucket's mean back onto the bucket id column.
for bucket, label in (('hour_id', 'gh_hour'), ('day_id', 'gh_day'), ('week_id', 'gh_week')):
    tmp = D.groupby(bucket)['gh'].mean().to_frame(label)
    D = D.merge(tmp, left_on=bucket, right_index=True)

# Write the final table (the DataFrame index is written as well).
D.to_csv('data.csv')