In [1]:
import numpy as np
import pandas as pd
import altair as alt
import re
url = r'https://en.wikipedia.org/wiki/Comparison_of_North_American_ski_resorts'
stations_df = pd.read_html(url,header=0)[4]

In [2]:
stations_df.head()

Unnamed: 0,Resort name and website,Nearest city,State/province,Peak elevation (ft),Base elevation (ft),Vertical drop (ft),Skiable acreage,Total trails,Total lifts,Avg annual snowfall (in),Adult weekend lift ticket window price (USD),Date statistics updated
0,Ski Bromont,Bromont,Quebec,1854,590.0,1264.0,450,141.0,9.0,190,$54,"December 1, 2019[1]"
1,Apex Mountain Resort,Penticton,British Columbia,7197,5197.0,2000.0,1112,79.0,4.0,236,$65,"November 30, 2019[2]"
2,Canyon Ski Area,Red Deer,Alberta,2950,2412.0,538.0,80,23.0,6.0,45,$35,"November 30, 2019[3]"
3,Fernie Alpine Resort,Fernie,British Columbia,7000,3450.0,3550.0,2500,142.0,10.0,360,$90,"November 30, 2019[4]"
4,Marble Mountain Ski Resort,Steady Brook,Newfoundland and Labrador,1791,1759.0,32.0,230,40.0,5.0,192,$34,"December 1, 2019[5]"


In [37]:
region_map = {'East': ['New_York', 'Vermont', 'Pennsylvania', 'West_Virginia'],
              'Colorado': ['Colorado'],
              'Utah': ['Utah'],
              'Sierras': ['California', 'Nevada'],
              'Cascades': ['Washington', 'Oregon', 'British_Columbia'],
              'Other': ['Alabama', 'Alaska', 'Alberta', 'Arizona',
                        'Connecticut', 'Idaho', 'Illinois',
                        'Indiana', 'Iowa', 'Maine', 'Maryland', 
                        'Massachusetts',
                        'Michigan', 'Minnesota', 'Missouri', 'Montana',
                        'New Hampshire', 'New Jersey', 'New Mexico',
                        'Newfoundland_and_Labrador', 'North_Carolina', 
                        'North_Dakota', 'Nova Scotia', 'Ohio', 'Ontario',
                        'Quebec', 'Rhode_Island', 'South_Dakota', 'Tennessee',
                        'Virginia', 'Wisconsin',
                        'Wyoming']
              }
region_reverse_map = {}
for region, states in region_map.items():
    for state in states:
        region_reverse_map.update({state: region})
#region_reverse_map = {state: region for state in states for region, states in region_map.items()}
region_reverse_map

{'New_York': 'East',
 'Vermont': 'East',
 'Pennsylvania': 'East',
 'West_Virginia': 'East',
 'Colorado': 'Colorado',
 'Utah': 'Utah',
 'California': 'Sierras',
 'Nevada': 'Sierras',
 'Washington': 'Cascades',
 'Oregon': 'Cascades',
 'British_Columbia': 'Cascades',
 'Alabama': 'Other',
 'Alaska': 'Other',
 'Alberta': 'Other',
 'Arizona': 'Other',
 'Connecticut': 'Other',
 'Idaho': 'Other',
 'Illinois': 'Other',
 'Indiana': 'Other',
 'Iowa': 'Other',
 'Maine': 'Other',
 'Maryland': 'Other',
 'Massachusetts': 'Other',
 'Michigan': 'Other',
 'Minnesota': 'Other',
 'Missouri': 'Other',
 'Montana': 'Other',
 'New Hampshire': 'Other',
 'New Jersey': 'Other',
 'New Mexico': 'Other',
 'Newfoundland_and_Labrador': 'Other',
 'North_Carolina': 'Other',
 'North_Dakota': 'Other',
 'Nova Scotia': 'Other',
 'Ohio': 'Other',
 'Ontario': 'Other',
 'Quebec': 'Other',
 'Rhode_Island': 'Other',
 'South_Dakota': 'Other',
 'Tennessee': 'Other',
 'Virginia': 'Other',
 'Wisconsin': 'Other',
 'Wyoming': 'Other'

In [59]:
def junk_remover(x):
    if type(x) == str:
        result = re.sub("\[.*", "", x).replace(",", "")
    else:
        result = x
    return float(result)
        
stations2_df = (stations_df
               .rename(columns=lambda x: (x
                                          .strip()
                                          .lower()
                                          .replace(" ", "_")
                                          .replace("(", "")
                                          .replace(")", "")
                                          .replace("/", "_")))
               .rename(columns={'resort_name_and_website': 'name'})
               .assign(name=lambda x: x.name.str.replace("\[.*", "", regex=True))
               .assign(peak_elevation_ft=lambda x: x.peak_elevation_ft.apply(junk_remover))
               .assign(avg_elevation_ft=lambda x: (x.peak_elevation_ft + x.base_elevation_ft)/2)
               .assign(region=lambda x: x.state_province.map(region_reverse_map))
               .sort_values(by=['state_province', 'avg_elevation_ft'])
               .filter(items=['name', 'region', 'state_province',
                              'peak_elevation_ft', 'base_elevation_ft', 
                              'skiable_acreage', 'total_lifts', 
                              'avg_annual_snowfall_in', 'avg_elevation_ft'])
               .dropna(how='any')
               .query('region != "Other"')
               )

Index(['name', 'region', 'state_province', 'peak_elevation_ft',
       'base_elevation_ft', 'skiable_acreage', 'total_lifts',
       'avg_annual_snowfall_in', 'avg_elevation_ft'],
      dtype='object')

In [61]:
alt.Chart(stations2_df).mark_bar().encode(
    x=alt.X('name:N', sort=alt.EncodingSortField(
             field='avg_elevation_feet',
             order='ascending')),
    y=alt.Y('base_elevation_ft:Q',
            scale=alt.Scale(zero=False)),
    y2='peak_elevation_ft:Q',
    color='region:N'
)