# Description
Open scraped json data and do EDA
# Imports

In [1]:
import json
import re
from pandas.io.json import json_normalize
import numpy as np
import pandas as pd
import altair as alt
import matplotlib.pyplot as plt
import seaborn as sns

# Parameters

In [34]:
alt.renderers.enable(embed_options={'theme': 'vox'})

RendererRegistry.enable('default')

In [2]:
data_path = r'../../data/ots_snowfall_data.json'

# Load Data

In [3]:
with open(data_path, 'r') as json_file:
    json_file_read = json_file.read()
    json_file_clean = json.loads(json_file_read)
    raw_df = pd.DataFrame(json_file_clean)
raw_df.tail()

Unnamed: 0,station,what_data,year,url,data
5431,Bruce Mound,base,2018,https://www.onthesnow.com/wisconsin/bruce-moun...,"{'Dec 2018': {'15': 12, '16': 12, '17': 12, '1..."
5432,Sleeping Giant,base,2018,https://www.onthesnow.com/wyoming/sleeping-gia...,"{'Dec 2018': {'14': 30, '15': 28, '16': 26, '1..."
5433,Elko SnoBowl,base,2018,https://www.onthesnow.com/nevada/elko-snobowl/...,{}
5434,Eagle Point,base,2018,https://www.onthesnow.com/utah/eagle-point/his...,"{'Dec 2018': {'28': 28, '29': 28, '30': 28, '3..."
5435,Pine Knob,base,2018,https://www.onthesnow.com/michigan/pine-knob-s...,"{'Nov 2018': {'30': 24}, 'Dec 2018': {'1': 24,..."


In [113]:
def data_exploder(df, station):
    """create dataframe of station data in tabular form
    todo: make more efficient for loading all data at once"""

    def cleaner(df, data_pt):
        """pull data out of json into columns"""
        return (pd.json_normalize(df.loc[df.what_data == data_pt, 'data'])
                .sum()
                .T.to_frame()
                .rename(columns={0: data_pt}))

    station_df = (df.query('station == @station')[['data', 'what_data']])
    base_df = cleaner(station_df, 'base')
    snowfall_df = cleaner(station_df, 'snowfall')
    station_data = pd.concat([base_df, snowfall_df], axis=1)
    station_data['station'] = station
    dt_format = "%b %Y.%d"
    station_data.index = pd.to_datetime(station_data.index, format=dt_format)
    return station_data

In [118]:
def annualizer(df):
    """grounds and aggregates by day of year"""
    df.index.name = 'timestamp'

    def day_standardizer(day):
        """moves start of year forward so ski season isn't split"""
        return (day+100) % 366
    df_by_yr = (df
                .reset_index()
                .assign(year=lambda x: x.timestamp.dt.year)
                .assign(month=lambda x: x.timestamp.dt.month)
                .assign(day=lambda x: x.timestamp.dt.day)
                .assign(dayofyr=lambda x: 
                        day_standardizer(x.timestamp.dt.dayofyear))
                .assign(ski_yr=lambda x: (x.dayofyr/365 + (x.year - 2010)).astype('int'))
                .reset_index()
                .set_index('dayofyr')
                .filter(['dayofyr', 'timestamp', 'base', 
                         'station', 'snowfall', 'ski_yr'])
                #.groupby(['dayofyr'])
                #.agg([np.mean, 'std'])
                .reset_index()
                )
    #df_by_yr.columns = df_by_yr.columns.map('_'.join)
    return df_by_yr

In [119]:
all_data_df = pd.concat([data_exploder(raw_df, x) for x in raw_df.station.unique()])
all_data_df.shape

(284909, 3)

In [120]:
all_data_full_df = all_data_df.pipe(annualizer)
all_data_full_df.head()

Unnamed: 0,dayofyr,timestamp,base,station,snowfall,ski_yr
0,70,2010-12-02,36.0,Showdown Montana,,0
1,71,2010-12-03,36.0,Showdown Montana,,0
2,72,2010-12-04,38.0,Showdown Montana,6.0,0
3,73,2010-12-05,38.0,Showdown Montana,,0
4,75,2010-12-07,42.0,Showdown Montana,,0


In [200]:
# Pandas Groupby multiple items ALWAYS resets index to multiindex. UGH.
last_days_tmp = (all_data_full_df
                 .groupby(['station', 'ski_yr'], as_index=False)
                 [['dayofyr']]
                 .idxmax()
                 .reset_index()
                 .rename(columns={'dayofyr': 'index_'})  # srly wtf. index grabbed random col name
                )
last_days_df = (all_data_full_df
                .merge(right=last_days_tmp, 
                       left_index=True,
                       right_on='index_')
               )
last_days_df.head()

Unnamed: 0,dayofyr,timestamp,base,station_x,snowfall,ski_yr_x,station_y,ski_yr_y,index_
1944,99,2010-12-31,35.0,Showdown Montana,,0,Showdown Montana,0,26
1945,200,2011-04-10,69.0,Showdown Montana,,1,Showdown Montana,1,91
1946,198,2012-04-07,59.0,Showdown Montana,,2,Showdown Montana,2,219
1947,197,2013-04-07,48.0,Showdown Montana,1.0,3,Showdown Montana,3,340
1948,196,2014-04-06,100.0,Showdown Montana,1.0,4,Showdown Montana,4,465


### test data load

In [None]:
raw_df.pipe(data_exploder, 'Kirkwood').pipe(annualizer).query('timestamp=="2014-01-01"')

# Explore Data with Viz

In [182]:
def base_plotter(name=None, df=raw_df):
    station_data = (df
                    .pipe(data_exploder, name)
                    .pipe(annualizer))
    base_chart = (alt.Chart(station_data)
              .mark_line().encode(x='dayofyr:Q',
                                  y=alt.Y('base:Q', 
                                          aggregate='mean',
                                         ),
                                  #color='year:T',
                                  tooltip=['base_mean:Q']
                                  )
              .properties(title=alt.TitleParams(
                    text=[f'Snow Base Average at {name}'],
                    subtitle=['starts from begining of ski season']  ))
              )
    band = (alt.Chart(station_data)
            .mark_errorband(extent='ci')
            .encode(x='dayofyr:Q', 
                    y='base:Q',)
           )

    combined_chart = base_chart + band

    combined_chart.display()
    return combined_chart
_ = base_plotter('Arapahoe Basin')

# Timeseries Modeling
## ARIMA and extensions

### Modeling regions as single series
Concat all base ts from a region into single long series

In [None]:
pd.plotting.autocorrelation_plot(df["R"].resample("1y").median())

ARIMAX using snowfall data with powder -> packed powder and melt inference

#### Other models
GARCH, Gaussian Process, hidden Markov

### Panel Models

# TF LSTM models