In [1]:
import numpy as np
import pandas as pd
import altair as alt
import seaborn as sns
import re

In [2]:
# Pass the 'fivethirtyeight' theme to the renderer through its embed options.
theme_options = {'theme': 'fivethirtyeight'}
alt.renderers.enable(embed_options=theme_options)

RendererRegistry.enable('default')

In [3]:
# Read every HTML table on the Wikipedia comparison page (first row as header)
# and keep the one at position 4, which holds the resort statistics.
url = r'https://en.wikipedia.org/wiki/Comparison_of_North_American_ski_resorts'
wiki_tables = pd.read_html(url, header=0)
stations_df = wiki_tables[4]
stations_df.head()

Unnamed: 0,Resort name and website,Nearest city,State/province,Peak elevation (ft),Base elevation (ft),Vertical drop (ft),Skiable acreage,Total trails,Total lifts,Avg annual snowfall (in),Adult weekend lift ticket window price (USD),Date statistics updated
0,Ski Bromont,Bromont,Quebec,1854,590.0,1264.0,450,141.0,9.0,190,$54,"December 1, 2019[1]"
1,Apex Mountain Resort,Penticton,British Columbia,7197,5197.0,2000.0,1112,79.0,4.0,236,$65,"November 30, 2019[2]"
2,Canyon Ski Area,Red Deer,Alberta,2950,2412.0,538.0,80,23.0,6.0,45,$35,"November 30, 2019[3]"
3,Fernie Alpine Resort,Fernie,British Columbia,7000,3450.0,3550.0,2500,142.0,10.0,360,$90,"November 30, 2019[4]"
4,Marble Mountain Ski Resort,Steady Brook,Newfoundland and Labrador,1791,1759.0,32.0,230,40.0,5.0,192,$34,"December 1, 2019[5]"


In [4]:
# Region -> list of the states/provinces that belong to it.
# Names are written with spaces, exactly as they appear in the table's
# "State/province" column (e.g. "British Columbia"), because the lookup
# done with Series.map is an exact string match.
region_map = {'East': ['New York', 'Vermont', 'Pennsylvania', 'West Virginia'],
              'Colorado': ['Colorado'],
              'Utah': ['Utah'],
              'Sierras': ['California', 'Nevada'],
              'Cascades': ['Washington', 'Oregon', 'British Columbia'],
              'Other': ['Alabama', 'Alaska', 'Alberta', 'Arizona',
                        'Connecticut', 'Idaho', 'Illinois',
                        'Indiana', 'Iowa', 'Maine', 'Maryland',
                        'Massachusetts',
                        'Michigan', 'Minnesota', 'Missouri', 'Montana',
                        'New Hampshire', 'New Jersey', 'New Mexico',
                        'Newfoundland and Labrador', 'North Carolina',
                        'North Dakota', 'Nova Scotia', 'Ohio', 'Ontario',
                        'Quebec', 'Rhode Island', 'South Dakota', 'Tennessee',
                        'Virginia', 'Wisconsin',
                        'Wyoming']
              }

# Inverted lookup: state/province -> region.
# The outer loop (regions) must come first in the comprehension so that
# `states` is bound before the inner loop iterates over it.
region_reverse_map = {state: region
                      for region, states in region_map.items()
                      for state in states}

In [5]:
def junk_cleaner(ser):
    """Remove cruft that prevents numeric conversion of values stored as str.

    Every value is cast to str, then everything from the first "[" onward
    (footnote markers such as "[12]"), all commas and all backticks are
    removed before the result is converted with ``pd.to_numeric``.

    Parameters
    ----------
    ser : pandas.Series
        Column to clean; any dtype that can be cast to str.

    Returns
    -------
    pandas.Series
        Numeric series; values that still cannot be parsed become NaN
        (``errors='coerce'``).
    """
    cleaned = (ser.astype("str")
               # The footnote pattern is a regular expression, so regex=True
               # must be explicit: newer pandas treats patterns as literals
               # by default and would leave the "[..]" markers in place.
               .str.replace(r"\[.*", "", regex=True)
               # Plain single-character removals; no regex needed.
               .str.replace(",", "", regex=False)
               .str.replace("`", "", regex=False))
    return pd.to_numeric(cleaned, errors='coerce')
        
# Tidy the scraped table into the frame used by every chart below:
#   1. normalise column names to snake_case,
#   2. strip footnote markers from resort names,
#   3. convert the text-valued numeric columns with junk_cleaner,
#   4. derive the mid-mountain elevation and the region,
#   5. keep only the columns of interest, drop incomplete rows, and keep
#      resorts outside the "Other" region with more than 100 skiable acres.
stations2_df = (stations_df
               .rename(columns=lambda x: (x
                                          .strip()
                                          .lower()
                                          .replace(" ", "_")
                                          .replace("(", "")
                                          .replace(")", "")
                                          .replace("/", "_")))
               .rename(columns={'resort_name_and_website': 'name'})
               # Raw string avoids the invalid "\[" escape; bracket access
               # makes it explicit that 'name' is a column.
               .assign(name=lambda x: x['name'].str.replace(r"\[.*", "", regex=True))
               .assign(peak_elevation_ft=lambda x: junk_cleaner(x.peak_elevation_ft))
               .assign(skiable_acreage=lambda x: junk_cleaner(x.skiable_acreage))
               .assign(avg_annual_snowfall_in=lambda x: junk_cleaner(x.avg_annual_snowfall_in))
               # Midpoint between base and peak.
               .assign(avg_elevation_ft=lambda x: (x.peak_elevation_ft + x.base_elevation_ft)/2)
               # States/provinces missing from region_reverse_map get NaN here
               # and are removed by the dropna below.
               .assign(region=lambda x: x.state_province.map(region_reverse_map))
               .sort_values(by=['region', 'state_province', 'avg_elevation_ft'])
               .filter(items=['name', 'region', 'state_province',
                              'peak_elevation_ft', 'base_elevation_ft',
                              'skiable_acreage', 'total_lifts',
                              'avg_annual_snowfall_in', 'avg_elevation_ft'])
               .dropna(how='any')
               .query('region != "Other" & skiable_acreage > 100')
               )

In [6]:
# Base chart: one floating bar per resort running from base to peak
# elevation, coloured by average annual snowfall. It is filtered per region
# when the combined chart is built.
elev_chart = (alt.Chart(stations2_df)
         .mark_bar(size=18)
         .encode(
             x=alt.X('name:N',
                     # The sort field is the bare column name as it exists in
                     # the data ('avg_elevation_ft'), without a type suffix.
                     sort=alt.EncodingSortField(
                            field='avg_elevation_ft',
                            order='ascending'),
                     title=None
                    ),
             y=alt.Y('base_elevation_ft:Q',
                     # Don't force the axis to start at 0 ft.
                     scale=alt.Scale(zero=False),
                     axis=alt.Axis(title='Elevation (ft)')),
             y2='peak_elevation_ft:Q',
             color=alt.Color('avg_annual_snowfall_in:Q', scale=alt.Scale(
                scheme='greenblue', )),
             opacity=alt.value(0.8),
             # Each field is listed once.
             tooltip=['name:N', 'base_elevation_ft:Q',
                      'avg_elevation_ft:Q', 'peak_elevation_ft:Q',
                      'skiable_acreage:N', 'total_lifts:N',
                      'avg_annual_snowfall_in:Q']
                        )
        .properties(
            width=800,
            height=150,)
             )
# Build one copy of the base chart per region (filtered to that region and
# titled after it), then stack the copies vertically.
region_names = stations2_df['region'].unique()
region_charts = [
    elev_chart
    .transform_filter(alt.FieldEqualPredicate(field='region', equal=region_name))
    .properties(title=alt.TitleParams(text=f'{region_name} Stations'))
    for region_name in region_names
]

# Title shown above the whole stack.
overall_title = alt.TitleParams(
    text=['Ski Stations:', "Base to Peak Elevations"],
    subtitle=["Grouped by state and by region",
              "Only some states included"],
    fontSize=20,
    font='Consolas',
    anchor='middle',
    color='darkblue',
)

combined_chart = (
    alt.vconcat(*region_charts)
    .configure_axisBottom(labelAngle=310)
    .configure_legend(orient='right')
    .resolve_axis(x='shared')
    .properties(title=overall_title)
)

# Write a standalone HTML copy, then display the chart as the cell output.
combined_chart.save(r"../resources/altitude_and_snowfall_by_region.html")
combined_chart

In [7]:
# Box plot of the mid-mountain elevation distribution, one box per region.
grouped_elevation_plt = (alt.Chart(stations2_df)
                         # Opacity is a fraction between 0 and 1, so 70%
                         # is written as 0.7.
                         .mark_boxplot(size=30,
                                       opacity=0.7)
                         .encode(x=alt.X('region:N'),
                                 y='avg_elevation_ft:Q',
                                 color=alt.Color(
                                     'region:N',
                                     scale=alt.Scale(scheme='dark2')))
                         .properties(width=400,
                                     title=alt.TitleParams(
                                         text=['Average Station Elevation'],
                                         subtitle=["Grouped by state and by region",
                                                   "Only some states included"],
                                         fontSize=20,
                                         font='Consolas',
                                         anchor='middle',
                                         color='darkblue',))
                         )
grouped_elevation_plt

In [8]:
def width_finder(df):
    """Find the width that makes each resort's chart area equal its skiable acreage.

    A resort is drawn as a rectangle whose height is its vertical drop
    (peak minus base elevation), so the width is
    ``skiable_acreage / (peak_elevation_ft - base_elevation_ft)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``peak_elevation_ft``, ``base_elevation_ft`` and
        ``skiable_acreage``. It is not modified.

    Returns
    -------
    pandas.Series
        Widths, indexed like ``df`` and named ``'area'``.
    """
    # Work on local values instead of adding helper columns to the caller's
    # frame, which had side effects and could trigger SettingWithCopyWarning.
    height = df.peak_elevation_ft - df.base_elevation_ft
    return (df.skiable_acreage / height).rename('area')

# buffer between chart rectangles so resorts aren't smushed together
buffer = .08

# Lay the resorts out left to right in order of mid-mountain elevation.
# Each rectangle keeps its full computed width (x1 - x0 == area_width) and
# is separated from its neighbour by exactly one `buffer`.
stations_wwidth_df = (stations2_df
                      .sort_values('avg_elevation_ft')
                      .assign(area_width=lambda x:
                              width_finder(x.loc[:,['peak_elevation_ft',
                                    'base_elevation_ft', 'skiable_acreage']]))
                      # Left edge = total width of all earlier resorts plus
                      # one buffer for every gap so far (including a leading one).
                      .assign(x0=lambda x: (x.area_width.cumsum().shift(fill_value=0)
                                            + buffer * np.arange(1, len(x) + 1)))
                      # Right edge = left edge plus the resort's own width.
                      .assign(x1=lambda x: x.x0 + x.area_width)
                     )

# Legend-driven highlight. The legend is generated by the 'region' colour
# encoding, so the selection has to be defined on that same field.
selection = alt.selection_multi(fields=['region'], bind='legend')

# Every resort drawn as a rectangle: vertical extent is base -> peak
# elevation, horizontal extent is x0 -> x1 from stations_wwidth_df.
# Rectangles are coloured by region, and regions not picked in the legend
# selection are faded.
elev_ww_chart = (alt
                 .Chart(stations_wwidth_df)
                 .mark_rect()
                 .encode(x=alt.X('x0:Q', title='Skiable Acres',
                                 axis=alt.Axis(grid=False)),
                         x2='x1:Q',
                         y=alt.Y('base_elevation_ft:Q',
                                 # Don't force the axis to start at 0 ft.
                                 scale=alt.Scale(zero=False),
                                 axis=alt.Axis(title='Elevation (ft)',
                                 grid=True)),
                         y2='peak_elevation_ft:Q',
                         color='region:N',
                         # Full opacity when selected, 0.2 otherwise.
                         opacity=alt.condition(
                             selection, alt.value(1), alt.value(0.2)),
                         # Each field is listed once.
                         tooltip=['name:N', 'base_elevation_ft:Q',
                                  'avg_elevation_ft:Q', 'peak_elevation_ft:Q',
                                  'skiable_acreage:N', 'total_lifts:N',
                                  'avg_annual_snowfall_in:Q']
                        )
                 # The x positions are only layout coordinates, so the tick
                 # labels and ticks on the bottom axis are hidden.
                 .configure_axisBottom(labels=False, ticks=False)
                 .properties(
                            width=800,
                            height=300,
                            title=alt.TitleParams(
                                 text=['Stations Scaled by skiable area'],
                                 fontSize=20))
                 .add_selection(selection)
                )
# Write a standalone HTML copy, then display the chart as the cell output.
elev_ww_chart.save(r"../resources/elevation_area.html")
elev_ww_chart

In [9]:
# Same data as stations2_df, with `region` stored as a categorical column.
region_as_category = stations2_df.region.astype('category')
stations3_df = stations2_df.assign(region=region_as_category)

# Box plot of average annual snowfall per region, with pan/zoom enabled.
snow_title = alt.TitleParams(text=['Avg Snowfall Distribution'], fontSize=20)
snow_box = (
    alt.Chart(stations3_df)
    .mark_boxplot(size=40)
    .encode(
        x=alt.X('region:N'),
        y='avg_annual_snowfall_in:Q',
        color=alt.Color('region:N', scale=alt.Scale(scheme='dark2')),
    )
    .properties(width=400, title=snow_title)
    .interactive()
)
snow_box

In [12]:
# Persist the cleaned, region-tagged frame as a parquet file.
processed_path = r"../data/processed/stationswki.parquet"
stations3_df.to_parquet(processed_path)