In [2]:
import pandas as pd
import matplotlib.pyplot as plt

Set up dataframes

In [6]:
import os 
os.chdir('new_data')
!ls

KHNXd20170115t0000.cfradial KHNXd20170127t0400.cfradial
KHNXd20170115t0200.cfradial KHNXd20170127t0600.cfradial
KHNXd20170115t0400.cfradial KHNXd20170127t0800.cfradial
KHNXd20170115t0600.cfradial KHNXd20170127t1000.cfradial
KHNXd20170115t0800.cfradial KHNXd20170127t1200.cfradial
KHNXd20170115t1000.cfradial KHNXd20170127t1400.cfradial
KHNXd20170115t1200.cfradial KHNXd20170127t1600.cfradial
KHNXd20170115t1400.cfradial KHNXd20170127t1800.cfradial
KHNXd20170115t1600.cfradial KHNXd20170127t2000.cfradial
KHNXd20170115t1800.cfradial KHNXd20170127t2200.cfradial
KHNXd20170115t2000.cfradial KHNXd20170128t0000.cfradial
KHNXd20170115t2200.cfradial KHNXd20170128t0200.cfradial
KHNXd20170116t0000.cfradial KHNXd20170128t0400.cfradial
KHNXd20170116t0200.cfradial KHNXd20170128t0600.cfradial
KHNXd20170116t0400.cfradial KHNXd20170128t0800.cfradial
KHNXd20170116t0600.cfradial KHNXd20170128t1000.cfradial
KHNXd20170116t0800.cfradial KHNXd20170128t1200.cfradial
KHNXd20170116t1000.cfradial KHNXd20170128t1400.c

In [31]:
# Load the radar readings and the snow-sensor readings from pickle files.
# 'refl_final.pkl' is resolved against the current working directory and
# '../sr_df.pkl' against its parent (an earlier cell chdir's into 'new_data').
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
radar_df = pd.read_pickle('refl_final.pkl')
sr_df = pd.read_pickle('../sr_df.pkl')

In [32]:
# 41585 -- presumably a row count noted by the author (TODO confirm: before or after de-duplication?)
# Discard radar rows that are exact duplicates of an earlier row.
radar_df = radar_df.drop_duplicates()

In [33]:
# Keep only the sensors whose closest radar is KHNX, re-indexed from 0.
srr_df = (
    sr_df[sr_df['r_closest'] == 'khnx']
    .reset_index()
    .drop(columns='index')
)

# Convert the imperial columns to metric (ft -> m uses the 0.305 approximation,
# in -> mm uses 25.4), then drop the imperial originals.
srr_df = srr_df.assign(
    elev_m=srr_df['elev_ft'] * 0.305,
    today_mm=srr_df['today_in'] * 25.4,
    r_elev_delta_m=srr_df['r_elev_delta_ft'] * 0.305,
).drop(columns=['elev_ft', 'today_in', 'r_elev_delta_ft'])

# One group per sensor station code.
srr_grouped = srr_df.groupby('st_code')

Set up feature columns, drawing from https://www.kaggle.com/c/how-much-did-it-rain-ii/data.
For now, start with:

- `Id`
- `minutes_past` - convert to `hours_past`
- `radar_dist_km`
- `Ref` - Radar reflectivity in dBZ
- `RhoHV` - Correlation coefficient (unitless)
- `Kdp` - Specific differential phase (deg/km)

Get the snow deltas (in mm) for each pair of consecutive daily readings.

In [35]:
# Keep an untouched copy of the metric sensor frame.
srr_df_backup = srr_df.copy()

# For every sensor, 'expected_mm' is the change in 'today_mm' from the previous
# row of that sensor. The first row of each sensor has no predecessor (its diff
# is NaN), so it is dropped.
# NOTE(review): this relies on each sensor's rows already being in date order -- TODO confirm.
all_sensors = [
    srr_grouped.get_group(station)
    .assign(expected_mm=lambda readings: readings['today_mm'].diff())
    .iloc[1:]
    for station in srr_grouped.groups.keys()
]

# From here on sr_df holds the per-sensor deltas rather than the raw pickle contents.
sr_df = pd.concat(all_sensors)
del srr_grouped

In [36]:
# email correspondence with Randall Osterhuber snowlab@att.net. he believes readings are at 8 am
sensor_reading_time = 8.0  # hour of day at which the sensor is read

# Shift every reading date by the reading hour. The offset is built with the
# `hours=` keyword instead of the string '8.0H', because the upper-case 'H'
# unit alias is deprecated in recent pandas releases.
# NOTE: this cell is not idempotent -- every re-run adds another
# `sensor_reading_time` hours to sr_df['date'].
sr_df['date'] = sr_df['date'] + pd.Timedelta(hours=sensor_reading_time)

In [37]:
# Match radar readings with each snow reading's expected_mm.
# A radar reading is attached to a snow reading when it was taken strictly
# less than 24 hours (and strictly more than 0 hours) before that snow reading.
radar_grouped = radar_df.groupby('st_code')
sr_grouped = sr_df.groupby('st_code')

# Window bounds, built with keywords rather than the deprecated 'H' string alias.
max_lookback = pd.Timedelta(hours=24)
no_lookback = pd.Timedelta(0)

radar_expected = []
for sensor in radar_grouped.groups.keys():
    # A station with radar data but no snow readings has nothing to match;
    # skip it instead of letting get_group raise KeyError.
    if sensor not in sr_grouped.groups:
        continue

    # radar readings at different times, all associated with this sensor
    radar_readings = radar_grouped.get_group(sensor)

    # snow readings for the same sensor
    snow_readings = sr_grouped.get_group(sensor)

    # day_id is the position of the snow reading within this sensor's readings
    for i, (row_label, snow_reading) in enumerate(snow_readings.iterrows()):
        time_since_radar = snow_reading['date'] - radar_readings['time']

        # select all radar readings within the 24 h before the snow reading
        selected = (time_since_radar < max_lookback) & (time_since_radar > no_lookback)

        # columns are added to the selection below, hence the copy
        radar_select = radar_readings.loc[selected].copy()
        radar_select['expected_mm'] = snow_reading['expected_mm']

        # round to 15 minutes for grouping (vectorized; to_timedelta guarantees
        # a timedelta64 column even when nothing is selected)
        radar_select['time_until_reading'] = \
            pd.to_timedelta(time_since_radar.loc[selected]).dt.round('15min')
        # the same quantity in hours, to one decimal place
        radar_select['time_until_reading_hr'] = \
            (radar_select['time_until_reading'].dt.total_seconds() / 3600).round(1)
        radar_select['day_id'] = i
        radar_expected.append(radar_select)


final cleanup

In [38]:
# Stack the per-reading selections into one frame, then:
#   * renumber the rows 0..n-1,
#   * copy 'time' into 'gate_time' (the copy is kept, 'time' is dropped below),
#   * snap 'alt_m' to the nearest multiple of 1000,
#   * discard rows that are exact duplicates.
radar_exp = (
    pd.concat(radar_expected)
    .reset_index()
    .drop(columns='index')
    .assign(
        gate_time=lambda frame: frame['time'],
        alt_m=lambda frame: (frame['alt_m'] / 1000).round() * 1000,
    )
    .drop_duplicates()
    .drop(columns='time')
)

# The list of per-reading frames is no longer needed.
del radar_expected

### Finalize radar data

In [58]:
# kill reflectivity values below cutoff. try this later, let NN determine features for now
# noise_cutoff = 12 # dbz
# radar_expected_backup = radar_expected.copy()
# radar_expected[radar_expected['refl'] < noise_cutoff]

In [39]:
# merge with sr_df to get sensor info like elevation
radar_exp = radar_exp.merge(sr_df[['st_code', 'elev_m', 'r_dist_km', 'r_elev_delta_m']].drop_duplicates(), \
                                    on='st_code')

In [40]:
# One group per (sensor, day, rounded time-until-reading) combination.
# Alternative considered by the author: group on ['st_code', 'day_id'] only.
rg = radar_exp.groupby(
    by=['st_code', 'day_id', 'time_until_reading_hr'],
)

In [41]:
# Number of distinct groups in `rg`.
len(rg.groups)

9563

### Consolidate altitudes 
Each group should become a single row.

In [42]:
%%time
rd_rows = []
for g in rg.groups.keys():
    a_group = rg.get_group(g)
    features = ('refl', 'velocity', 'xcorr_ratio', 'difphase')
    new_row = a_group.iloc[0].drop(['alt_m', 'refl', 'velocity', 'xcorr_ratio', 'difphase'])
    for row in a_group.iterrows():
        row = row[1]
        alt_label = str(int(row['alt_m']))
        for f in features:
            label = f + '_' + alt_label
            new_row[label] = row[f]
    rd_rows.append(new_row)
#     break

CPU times: user 2min 47s, sys: 2.5 s, total: 2min 50s
Wall time: 2min 54s


In [43]:
# Turn the list of consolidated rows into a frame with a fresh 0..n-1 index.
composite_df = pd.DataFrame(rd_rows).reset_index(drop=True)

In [47]:
# Show the grouping keys next to the target value.
composite_df.loc[:, ['day_id', 'time_until_reading_hr', 'expected_mm']]

Unnamed: 0,day_id,time_until_reading_hr,expected_mm
0,13,0.0,5.08
1,13,2.0,5.08
2,13,4.0,5.08
3,13,6.0,5.08
4,13,8.0,5.08
5,14,1.8,5.08
6,14,2.0,5.08
7,14,4.0,5.08
8,14,6.0,5.08
9,14,7.8,5.08


In [52]:
os.chdir('..')
!ls

KHNXd20170115t0000-err.txt process_model_data.ipynb
KHNXd20170115t0200-err.txt process_radar_data.ipynb
composite_df.pkl           process_sensor_data.ipynb
[1m[36mdata_temp[m[m                  rdr_coords.pkl
get_radar_data.ipynb       [1m[36mref[m[m
get_snow_data.ipynb        refl_new.pkl
googleapikey.txt           results_exploration.ipynb
khnx_radar.png             rf_results.pkl
kojak-pres.key             rnn_results.pkl
model.png                  snow_in_chart.png
model_ens_snow_data.ipynb  snow_scraped_df.pkl
model_nn_snow_data.ipynb   sr_df.pkl
[1m[36mnew_data[m[m


In [53]:
# Persist the consolidated feature table as 'composite_df.pkl' in the current
# working directory.
composite_df.to_pickle('composite_df.pkl')