# 1. Data Collection & Web Scraping

### Getting the list of all US resorts from skiresort.info

In [1]:
import requests
from bs4 import BeautifulSoup

# Accumulates one {'name': ..., 'url': ...} dict per resort across all listing pages
resorts = []

# Gets a specified number of resorts from the skiresort.info resort list
def get_resorts_from_url(url, length):
    """Collect up to `length` resort links from one listing page.

    Args:
        url: Address of the listing page to download.
        length: Maximum number of resorts to return from this page.

    Returns:
        A list of ``{'name': ..., 'url': ...}`` dicts in page order, built from
        the ``<a class="h3">`` tags whose href contains 'ski-resort'.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    # A timeout keeps the cell from hanging forever on a stalled connection
    response = requests.get(url, headers=headers, timeout=10)
    # Fail loudly instead of silently parsing an error page into an empty list
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    temp_resorts = []

    # Find all a tags with class 'h3' and href containing 'ski-resort'
    for a in soup.find_all('a', class_='h3'):
        # Count collected resorts, not anchors inspected, so that non-resort
        # links on the page do not use up part of the requested quota
        if len(temp_resorts) >= length:
            break

        href = a.get('href')
        if href and 'ski-resort' in href:
            temp_resorts.append({'name': a.text.strip(), 'url': href})

    return temp_resorts

# Each listing page paired with the number of entries to take from it.
# NOTE(review): page/4 is never requested; confirm that this is intentional.
listing_pages = (
    ("https://www.skiresort.info/ski-resorts/usa/", 50),
    ("https://www.skiresort.info/ski-resorts/usa/page/2/", 200),
    ("https://www.skiresort.info/ski-resorts/usa/page/3/", 200),
    ("https://www.skiresort.info/ski-resorts/usa/page/5/", 81),
)

# Scrape the pages in order and append their resorts to the shared list
for page_url, page_limit in listing_pages:
    resorts.extend(get_resorts_from_url(page_url, page_limit))

print(f"Total resorts extracted: {len(resorts)}")

Total resorts extracted: 531


### Getting resort info from skiresort.info
 - price, elevation, vertical distance, total trail length, number of chairlifts, location

In [2]:
import re
import time

# Names of the fifty US states, used to recognise a state in a resort's location links
us_states = [
    'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California',
    'Colorado', 'Connecticut', 'Delaware', 'Florida', 'Georgia',
    'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa',
    'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland',
    'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri',
    'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey',
    'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio',
    'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina',
    'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont',
    'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming',
]

# Running count shown in the progress messages
i = 1

# Fields filled in for every resort, in the order they should appear as columns.
# They are initialised up front so each record has the same keys even when a
# value is missing or the request fails.
RESORT_FIELDS = ('price', 'base_elevation_m', 'peak_elevation_m',
                 'vertical_drop_m', 'slope_km', 'state', 'num_lifts')

# Same User-Agent as the listing-page requests
REQUEST_HEADERS = {'User-Agent': 'Mozilla/5.0'}

# Longest names first, so 'West Virginia' is tried before 'Virginia' when
# substring-matching a location (otherwise 'Virginia' would win).
states_longest_first = sorted(us_states, key=len, reverse=True)


def _parse_price(price_text):
    """Return the first number in `price_text` as a float, or None if there is none."""
    match = re.search(r'\d+[.,]?\d*', price_text.replace(',', '.'))
    return float(match.group()) if match else None


def _parse_elevations(text):
    """Return the first three integers in `text` as a tuple, or None if fewer than three.

    Commas inside a number are treated as thousands separators. Each token must
    start with a digit, so stray punctuation cannot produce an empty number.
    """
    numbers = [int(token.replace(',', '')) for token in re.findall(r'\d[\d,]*', text)]
    return tuple(numbers[:3]) if len(numbers) >= 3 else None


def _parse_slope(text):
    """Return the first number in `text` as a float, or None if none can be parsed."""
    # Require at least one digit so a lone '.' or ',' is never taken as the number
    match = re.search(r'[\d,.]*\d[\d,.]*', text)
    if not match:
        return None
    try:
        return float(match.group().replace(',', ''))
    except ValueError:
        return None


def _match_state(location):
    """Return the US state named inside `location`, or None if there is none."""
    for state in states_longest_first:
        if state in location:
            return state
    return None


for position, resort in enumerate(resorts, start=1):
    # Start from a clean, complete record (also clears values from an earlier run)
    for field in RESORT_FIELDS:
        resort[field] = None

    try:
        response = requests.get(resort['url'], headers=REQUEST_HEADERS, timeout=10)
        # Treat HTTP errors as failures instead of parsing the error page
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Price info
        price_tag = soup.find('td', id='selTicketA')
        if price_tag:
            price_text = price_tag.text.strip()
            resort['price'] = _parse_price(price_text)
            if resort['price'] is None:
                print(f"Price not found in format for {resort['name']}: {price_text}")
        else:
            print(f"Price tag not found for {resort['name']}.")

        # Elevation info: the first three numbers are stored as base, peak and
        # vertical drop, in that order
        elevation_tag = soup.find('div', id='selAlti')
        if elevation_tag:
            elevations = _parse_elevations(elevation_tag.get_text(strip=True))
            if elevations:
                (resort['base_elevation_m'],
                 resort['peak_elevation_m'],
                 resort['vertical_drop_m']) = elevations
        else:
            print(f"No elevation info found for {resort['name']}")

        # Slope length
        slope_tag = soup.find('strong', id='selSlopetot')
        if slope_tag:
            resort['slope_km'] = _parse_slope(slope_tag.text)
        else:
            print(f"No slope length info found for {resort['name']}")

        # Location: look for a state name in the links of the "is located" paragraph
        paragraph = soup.find(string=lambda text: text and "is located" in text)
        if paragraph:
            parent = paragraph.find_parent()
            if parent:
                for link in parent.find_all("a"):
                    state = _match_state(link.get_text())
                    if state:
                        # A later link overrides an earlier one, as in the
                        # original loop (its `break` only left the inner loop)
                        resort['state'] = state
        else:
            print(f"No location info found for {resort['name']}")

        # Chairlift count
        lift_tag = soup.find('strong', id='selLiftstot')
        if lift_tag:
            match = re.search(r'\d+', lift_tag.text.strip())
            if match:
                resort['num_lifts'] = int(match.group())
        else:
            print(f"No lift info found for {resort['name']}")

        print(f"{position} / {len(resorts)} complete: {resort['name']}")

    except Exception as e:
        print(f"Error processing {resort['name']}: {e}")
        # Mark the whole record as missing so a failed resort is easy to spot
        for field in RESORT_FIELDS:
            resort[field] = None

    finally:
        # Pause after every request, including failed ones, so errors do not
        # turn into a burst of back-to-back requests against the site
        time.sleep(0.25)

1 / 531 complete: Big Sky Resort
2 / 531 complete: Vail
3 / 531 complete: Telluride
4 / 531 complete: Snowmass
5 / 531 complete: Mammoth Mountain
6 / 531 complete: Park City
7 / 531 complete: Deer Valley
8 / 531 complete: Beaver Creek
9 / 531 complete: Palisades Tahoe
10 / 531 complete: Breckenridge
11 / 531 complete: Keystone
12 / 531 complete: Sunday River
13 / 531 complete: Killington
14 / 531 complete: Winter Park Resort
15 / 531 complete: Bald Mountain – Sun Valley
16 / 531 complete: Heavenly
17 / 531 complete: Grand Targhee
18 / 531 complete: Snowbird
19 / 531 complete: Snowbasin
20 / 531 complete: Steamboat
21 / 531 complete: Alta
22 / 531 complete: Stowe
23 / 531 complete: Aspen Highlands
24 / 531 complete: Solitude
25 / 531 complete: Brighton
26 / 531 complete: Dollar Mountain – Sun Valley
27 / 531 complete: Copper Mountain
28 / 531 complete: Jackson Hole
29 / 531 complete: Crested Butte
30 / 531 complete: Buttermilk Mountain
31 / 531 complete: Mt. Bachelor
32 / 531 complete: 

### Converting to DataFrame for easy viewing and manipulation

In [53]:
import pandas as pd

# One row per scraped resort; the dict keys become the columns
resorts_df = pd.DataFrame(data=resorts)

# Leave the frame as the cell's last expression so the notebook renders it
resorts_df

Unnamed: 0,name,url,price,base_elevation_m,peak_elevation_m,vertical_drop_m,slope_km,state,num_lifts
0,Big Sky Resort,https://www.skiresort.info/ski-resort/big-sky-...,275.0,2073.0,3403.0,1330.0,250.0,Montana,40.0
1,Vail,https://www.skiresort.info/ski-resort/vail/,329.0,2457.0,3527.0,1070.0,234.0,Colorado,34.0
2,Telluride,https://www.skiresort.info/ski-resort/telluride/,260.0,2660.0,3815.0,1155.0,88.2,Colorado,19.0
3,Snowmass,https://www.skiresort.info/ski-resort/snowmass/,264.0,2473.0,3813.0,1340.0,237.0,Colorado,20.0
4,Mammoth Mountain,https://www.skiresort.info/ski-resort/mammoth-...,259.0,2424.0,3369.0,945.0,89.8,California,25.0
...,...,...,...,...,...,...,...,...,...
526,Mott Mountain,https://www.skiresort.info/ski-resort/mott-mou...,,,,,,,
527,Carson Ski Resort,https://www.skiresort.info/ski-resort/carson-s...,,,,,,,
528,North Bicentennial Park (temporarily closed),https://www.skiresort.info/ski-resort/north-bi...,,,,,,,
529,Huntington,https://www.skiresort.info/ski-resort/huntington/,,,,,,,


### Converting metric units to imperial (only because this is US-focused)

In [54]:
# Conversion factors
METERS_TO_FEET = 3.28084   # 1 m ≈ 3.28084 ft
KM_TO_MILES = 0.621371     # 1 km ≈ 0.621371 mi

# metric column -> (imperial column, conversion factor)
UNIT_CONVERSIONS = {
    'base_elevation_m': ('base_elevation_ft', METERS_TO_FEET),
    'peak_elevation_m': ('peak_elevation_ft', METERS_TO_FEET),
    'vertical_drop_m': ('vertical_drop_ft', METERS_TO_FEET),
    'slope_km': ('slope_mi', KM_TO_MILES),
}

# Work on a copy so the frame built in the previous cell is not modified in place
resorts_imperial = resorts_df.copy()

for metric_col, (imperial_col, factor) in UNIT_CONVERSIONS.items():
    # Only convert columns that are still metric. Once this cell has run, the
    # metric names are gone, so re-running it cannot apply the factors twice.
    if metric_col not in resorts_imperial.columns:
        continue

    # NOTE(review): rounding to whole units means slope lengths under half a
    # mile become 0; confirm that this precision is acceptable.
    resorts_imperial[metric_col] = (resorts_imperial[metric_col] * factor).round().astype('Int64')
    # Renaming in place keeps the column at its original position
    resorts_imperial = resorts_imperial.rename(columns={metric_col: imperial_col})

# Get rid of decimal in num_lifts
resorts_imperial['num_lifts'] = resorts_imperial['num_lifts'].astype('Int64')

resorts_df = resorts_imperial
resorts_df

Unnamed: 0,name,url,price,base_elevation_ft,peak_elevation_ft,vertical_drop_ft,slope_mi,state,num_lifts
0,Big Sky Resort,https://www.skiresort.info/ski-resort/big-sky-...,275.0,6801,11165,4364,155,Montana,40
1,Vail,https://www.skiresort.info/ski-resort/vail/,329.0,8061,11572,3510,145,Colorado,34
2,Telluride,https://www.skiresort.info/ski-resort/telluride/,260.0,8727,12516,3789,55,Colorado,19
3,Snowmass,https://www.skiresort.info/ski-resort/snowmass/,264.0,8114,12510,4396,147,Colorado,20
4,Mammoth Mountain,https://www.skiresort.info/ski-resort/mammoth-...,259.0,7953,11053,3100,56,California,25
...,...,...,...,...,...,...,...,...,...
526,Mott Mountain,https://www.skiresort.info/ski-resort/mott-mou...,,,,,,,
527,Carson Ski Resort,https://www.skiresort.info/ski-resort/carson-s...,,,,,,,
528,North Bicentennial Park (temporarily closed),https://www.skiresort.info/ski-resort/north-bi...,,,,,,,
529,Huntington,https://www.skiresort.info/ski-resort/huntington/,,,,,,,


### Download DataFrame as CSV

In [60]:
# Write the converted table to disk, leaving out the row index
resorts_df.to_csv(
    path_or_buf='skiresort_dot_info.csv',
    index=False,
)

**NOTE:** The CSV written here does not contain the final data for the ski resorts. Using a regular spreadsheet editor, I manually changed several ski resort names so that they match across the different sources (for example, "Brighton" was changed to "Brighton Mountain"). The final CSV for the data from this source is called `skiresort_dot_info_new`.