In [3]:
#pip install requests beautifulsoup4

In [1]:
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd




In [2]:
def get_soup(url, delay=1, headers=None, proxy=None):
    """
    Fetches and parses a webpage with BeautifulSoup, handling 403 errors
    by setting headers, delays, and optional proxies.

    Parameters:
        url (str): The URL of the webpage to scrape.
        delay (int): The delay (in seconds) between requests. Default is 1 second.
        headers (dict): Optional headers to include in the request.
        proxy (dict): Optional dictionary for proxies.

    Returns:
        BeautifulSoup object of the parsed page or None if request fails.
    """

    # Default headers to mimic a browser request if none are provided
    if headers is None:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }

    try:
        # Send the request with headers and optional proxy
        response = requests.get(url, headers=headers, proxies=proxy)
        
        # Check for 403 Forbidden error
        if response.status_code == 403:
            print("403 Forbidden: Access to the page is restricted.")
            return None
        
        # Add delay to avoid rapid requests
        time.sleep(delay)

        # Parse the content with BeautifulSoup
        soup = BeautifulSoup(response.content, "html.parser")
        return soup
    
    except requests.RequestException as e:
        print(f"An error occurred: {e}")
        return None


In [3]:
def scrape_list(url):
    try:
        # Set headers to mimic a browser request
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }
        
        # Send a GET request to the URL
        response = requests.get(url, headers=headers)
        response.raise_for_status()  # Raise an error for bad responses

        # Parse the HTML content
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find the list within the specified section and div
        standings_div = soup.find('div', class_='standings')
        if standings_div:
            list_items = standings_div.find_all('li')
            standings = [item.text.strip() for item in list_items if item.text.strip()]
            return standings
        else:
            print(f"No standings found for {url}")
            return None

    except requests.HTTPError as http_err:
        print(f"HTTP error occurred while scraping {url}: {http_err}")
        return None
    except requests.RequestException as req_err:
        print(f"Request error occurred while scraping {url}: {req_err}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred while scraping {url}: {e}")
        return None


In [4]:
# def scrape_multiple_urls(urls):
#     # Dictionary to store the results for each URL
#     results = {}
#     for url in urls:
#         print(f"Scraping data from {url}...")
#         data = scrape_list(url)
#         results[url] = data if data else "No data found"
#     return results

def scrape_multiple_urls_dict(urls):
    # Dictionary to store the results for each year
    results = {}
    for year, url in urls.items():
        print(f"Scraping data for the year {year} from {url}...")
        data = scrape_list(url)
        results[year] = data if data else "No data found"
    return results

In [5]:
urls = {
    1987: 'https://collegechampionships.usaultimate.org/d1-women/history/1987-d-i-women/',
    1988: 'https://collegechampionships.usaultimate.org/d1-women/history/1988-d-i-women/',
    1989: 'https://collegechampionships.usaultimate.org/d1-women/history/1989-d-i-women/',
    1990: 'https://collegechampionships.usaultimate.org/d1-women/history/1990-d-i-women/',
    1991: 'https://collegechampionships.usaultimate.org/d1-women/history/1991-d-i-women/',
    1992: 'https://collegechampionships.usaultimate.org/d1-women/history/1992-d-i-women/',
    1993: 'https://collegechampionships.usaultimate.org/d1-women/history/1993-d-i-women/',
    1994: 'https://collegechampionships.usaultimate.org/d1-women/history/1994-d-i-women/',
    1995: 'https://collegechampionships.usaultimate.org/d1-women/history/1995-d-i-women/',
    1996: 'https://collegechampionships.usaultimate.org/d1-women/history/1996-d-i-women/',
    1997: 'https://collegechampionships.usaultimate.org/d1-women/history/1997-d-i-women/',
    1998: 'https://collegechampionships.usaultimate.org/d1-women/history/1998-d-i-women/',
    1999: 'https://collegechampionships.usaultimate.org/d1-women/history/1999-d-i-women/',
    2000: 'https://collegechampionships.usaultimate.org/d1-women/history/2000-d-i-women/',
    2001: 'https://collegechampionships.usaultimate.org/d1-women/history/2001-d-i-women/',
    2002: 'https://collegechampionships.usaultimate.org/d1-women/history/2002-d-i-women/',
    2003: 'https://collegechampionships.usaultimate.org/d1-women/history/2003-d-i-women/',
    2004: 'https://collegechampionships.usaultimate.org/d1-women/history/2004-d-i-women/',
    2005: 'https://collegechampionships.usaultimate.org/d1-women/history/2005-d-i-women/',
    2006: 'https://collegechampionships.usaultimate.org/d1-women/history/2006-d-i-women/',
    2007: 'https://collegechampionships.usaultimate.org/d1-women/history/2007-d-i-women/',
    2008: 'https://collegechampionships.usaultimate.org/d1-women/history/2008-d-i-women/',
    2009: 'https://collegechampionships.usaultimate.org/d1-women/history/2009-d-i-women/',
    2010: 'https://collegechampionships.usaultimate.org/d1-women/history/2010-d-i-women/',
    2011: 'https://collegechampionships.usaultimate.org/d1-women/history/2011-d-i-college-womens/',
    2012: 'https://collegechampionships.usaultimate.org/d1-women/history/2012-d-i-college-womens/',
    2013: 'https://collegechampionships.usaultimate.org/d1-women/history/2013-d-i-college-womens/',
    2014: 'https://collegechampionships.usaultimate.org/d1-women/history/2014-d-i-college-womens/',
    2015: 'https://collegechampionships.usaultimate.org/d1-women/history/2015-d-i-women/',
    2016: 'https://collegechampionships.usaultimate.org/d1-women/history/2016-d-i-college-womens/',
    2017: 'https://collegechampionships.usaultimate.org/d1-women/history/2017-d-i-women/',
    2018: 'https://collegechampionships.usaultimate.org/d1-women/history/2018-d-i-women/',
    2019: 'https://collegechampionships.usaultimate.org/d1-women/history/2019-d-i-women/',
    # 2020 no season
    2021: 'https://collegechampionships.usaultimate.org/d1-women/history/2019-d-i-women-2/',
    2022: 'https://collegechampionships.usaultimate.org/d1-women/history/2019-d-i-women-2-2/',
    2023: 'https://collegechampionships.usaultimate.org/d1-women/history/2019-d-i-women-2-2-2/'
}

In [6]:
all_standings = scrape_multiple_urls_dict(urls)

Scraping data for the year 1987 from https://collegechampionships.usaultimate.org/d1-women/history/1987-d-i-women/...
Scraping data for the year 1988 from https://collegechampionships.usaultimate.org/d1-women/history/1988-d-i-women/...
Scraping data for the year 1989 from https://collegechampionships.usaultimate.org/d1-women/history/1989-d-i-women/...
Scraping data for the year 1990 from https://collegechampionships.usaultimate.org/d1-women/history/1990-d-i-women/...
Scraping data for the year 1991 from https://collegechampionships.usaultimate.org/d1-women/history/1991-d-i-women/...
Scraping data for the year 1992 from https://collegechampionships.usaultimate.org/d1-women/history/1992-d-i-women/...
Scraping data for the year 1993 from https://collegechampionships.usaultimate.org/d1-women/history/1993-d-i-women/...
Scraping data for the year 1994 from https://collegechampionships.usaultimate.org/d1-women/history/1994-d-i-women/...
Scraping data for the year 1995 from https://collegecham

In [7]:
for url, standings in all_standings.items():
    print(f"\nStandings for {url}:")
    print(standings)


Standings for 1987:
['1-Kansas', '2-California- Davis', '3-Humboldt State', '4-Massachusetts', '5-Cornell', '6-Earlham', '7-Vermont']

Standings for 1988:
['1-California-Santa Barbara', '2-California-Davis', '3T-Humboldt State', '3T-Oregon', '5T-Carleton College', '5T-Massachusetts', '7T-Cornell', '7T-Wisconsin', '9T-Florida', '9T-Kansas']

Standings for 1989:
['1-California-Davis', '2-Michigan', '3T-California- Santa Barbara', '3T-Carleton College', '?-Cornell', '?-North Carolina-Wilmington', '?-Oregon', '?-Pennsylvania', '?-SUNY-Binghampton', '?-Towson']

Standings for 1990:
['1-California-Santa Barbara', '2-Michigan', '3-Cornell', '4-Carleton College', '5-California-Davis', '6-Carnegie Mellon', '7T-Columbia', '7T-Kansas', '9T-Pennsylvania', '9T-Wesleyan']

Standings for 1991:
['1-California- Santa Barbara', '2-California', '3T-Carleton College', '3T-Cornell', '5T-North Carolina-Wilmington', '5T-Virginia', '7T-Columbia', '7T-Kansas', '9T-Carnegie Mellon', '9T-Tufts']

Standings for 

In [27]:
# Fix entries in 2022 standings
if 2022 in all_standings:
    all_standings[2022] = [
        entry.replace('T5 –Vermont', 'T5 – Vermont')
        .replace('T9 –Stanford', 'T9 – Stanford')
        .replace('T15 –William & Mary', 'T15 – William & Mary')
        for entry in all_standings[2022]
    ]

# Verify the fix
print("2022 standings after fix:")
for entry in all_standings[2022]:
    print(entry)

2022 standings after fix:
1 – North Carolina
2 – Colorado
T3 – Carleton College
T3 – California-Santa Barbara
T5 – British Columbia
T5 – Tufts
T5 – Washington
T5 – Vermont
T9 – California-Davis
T9 – California-San Diego
T9 – Florida State
T9 – Stanford
T13 – Georgia
T13 – Pittsburgh
T15 – William & Mary
T15 – Virginia
T17 – Purdue
T17 – Northeastern
T19 – SUNY-Binghamton
T19 – Colorado State


In [28]:
# Create empty lists to store our data
data = []

# Iterate through the standings dictionary
for year, standings in all_standings.items():
    if isinstance(standings, list):  # Only process if we have actual standings data
        for i, entry in enumerate(standings, 1):
            if year > 2017:
                # Try both types of dashes for newer years
                if ' – ' in entry:  # en dash
                    parts = entry.split(' – ', 1)
                elif ' - ' in entry:  # regular hyphen
                    parts = entry.split(' - ', 1)
                else:
                    parts = entry.split('-', 1)
            else:
                # Pre-2016 format
                parts = entry.split('-', 1)
            
            if len(parts) == 2:
                web_finish = parts[0].strip()
                team = parts[1].strip()
                
                data.append({
                    'URL': urls[year],
                    'Year': year,
                    'Team': team,
                    'Web_Finish': web_finish,
                    'List_Finish': i
                })

# Create DataFrame
df = pd.DataFrame(data)

In [29]:
df

Unnamed: 0,URL,Year,Team,Web_Finish,List_Finish
0,https://collegechampionships.usaultimate.org/d...,1987,Kansas,1,1
1,https://collegechampionships.usaultimate.org/d...,1987,California- Davis,2,2
2,https://collegechampionships.usaultimate.org/d...,1987,Humboldt State,3,3
3,https://collegechampionships.usaultimate.org/d...,1987,Massachusetts,4,4
4,https://collegechampionships.usaultimate.org/d...,1987,Cornell,5,5
...,...,...,...,...,...
547,https://collegechampionships.usaultimate.org/d...,2023,Victoria,T15,16
548,https://collegechampionships.usaultimate.org/d...,2023,Georgia,T17,17
549,https://collegechampionships.usaultimate.org/d...,2023,SUNY-Binghamton,T17,18
550,https://collegechampionships.usaultimate.org/d...,2023,Carnegie Mellon,T19,19


In [30]:
# # Export to CSV
# df.to_csv('college-womens-raw-rankings.csv', index=False)
# print("Data exported to 'college-womens-raw-rankings.csv'")

In [31]:
# Get unique team names and sort alphabetically
unique_teams_raw = sorted(df['Team'].unique())

# Print the teams
print("All raw teams in alphabetical order:")
for team in unique_teams_raw:
    print(team)

All raw teams in alphabetical order:
Arizona
Boston University
British Columbia
Brown
Bucknell
Calif0rnia
California
California- Davis
California- San Diego
California- Santa Barbara
California- Santa Cruz
California-Davis
California-San Diego
California-Santa Barbara
California-Santa Cruz
Carleton College
Carleton College-Syzygy
Carleton-Syzygy
Carnegie Mellon
Central Florida
Chicago
Claremont
Colorado
Colorado College
Colorado State
Columbia
Connecticut
Cornell
Dartmouth
Delaware
Duke
Earlham
East Carolina
Emory
Florida
Florida State
Georgia
Georgia Tech
Harvard
Humboldt State
Illinois
Indiana
Iowa
Iowa State
Kansas
MIT
Maryland
Massachusetts
Michigan
Michigan State
Middlebury
Minnesota
NYU
North Carolina
North Carolina State
North Carolina- Wilmington
North Carolina-Wilmington
Northeastern
Northwestern
Notre Dame
Oberlin
Ohio
Ohio State
Oregon
Ottawa
PIttsburgh
Penn State
Pennsylvania
Pittsburgh
Princeton
Purdue
Rice
Rutgers
SUNY-Binghampton
SUNY-Binghamton
Saint Louis
Smith
Sonoma 

In [32]:
def clean_team_name(team):
    # Apply the cleaning function to create new column
    team_fixes = {
        'Calif0rnia': 'California',
        'PIttsburgh': 'Pittsburgh',
        'Western Washington**': 'Western Washington',
        'SUNY-Binghampton': 'SUNY-Binghamton',
        'California- Davis': 'UC Davis',
        'California-Davis': 'UC Davis',
        'California- San Diego': 'UC San Diego',
        'California-San Diego': 'UC San Diego',
        'California- Santa Barbara': 'UC Santa Barbara',
        'California-Santa Barbara': 'UC Santa Barbara',
        'California- Santa Cruz': 'UC Santa Cruz',
        'California-Santa Cruz': 'UC Santa Cruz',
        'Carleton College-Syzygy': 'Carleton',
        'Carleton-Syzygy': 'Carleton',
        'Carleton College': 'Carleton'
    }
    
    return team_fixes.get(team, team)

In [33]:
# Apply the cleaning function to create new column
df['Team_Clean'] = df['Team'].apply(clean_team_name)

# Print unique teams after cleaning to verify changes
print("All teams after cleaning (alphabetically):")
for team in sorted(df['Team_Clean'].unique()):
    print(team)

All teams after cleaning (alphabetically):
Arizona
Boston University
British Columbia
Brown
Bucknell
California
Carleton
Carnegie Mellon
Central Florida
Chicago
Claremont
Colorado
Colorado College
Colorado State
Columbia
Connecticut
Cornell
Dartmouth
Delaware
Duke
Earlham
East Carolina
Emory
Florida
Florida State
Georgia
Georgia Tech
Harvard
Humboldt State
Illinois
Indiana
Iowa
Iowa State
Kansas
MIT
Maryland
Massachusetts
Michigan
Michigan State
Middlebury
Minnesota
NYU
North Carolina
North Carolina State
North Carolina- Wilmington
North Carolina-Wilmington
Northeastern
Northwestern
Notre Dame
Oberlin
Ohio
Ohio State
Oregon
Ottawa
Penn State
Pennsylvania
Pittsburgh
Princeton
Purdue
Rice
Rutgers
SUNY-Binghamton
Saint Louis
Smith
Sonoma State
Southern California
Stanford
Swarthmore
Texas
Texas A&M
Texas-Dallas
Towson
Truman State
Tufts
UC Davis
UC San Diego
UC Santa Barbara
UC Santa Cruz
UCLA
Vermont
Victoria
Virginia
Wake Forest
Washington
Washington University
Wesleyan
West Chester
Wes

In [34]:
df

Unnamed: 0,URL,Year,Team,Web_Finish,List_Finish,Team_Clean
0,https://collegechampionships.usaultimate.org/d...,1987,Kansas,1,1,Kansas
1,https://collegechampionships.usaultimate.org/d...,1987,California- Davis,2,2,UC Davis
2,https://collegechampionships.usaultimate.org/d...,1987,Humboldt State,3,3,Humboldt State
3,https://collegechampionships.usaultimate.org/d...,1987,Massachusetts,4,4,Massachusetts
4,https://collegechampionships.usaultimate.org/d...,1987,Cornell,5,5,Cornell
...,...,...,...,...,...,...
547,https://collegechampionships.usaultimate.org/d...,2023,Victoria,T15,16,Victoria
548,https://collegechampionships.usaultimate.org/d...,2023,Georgia,T17,17,Georgia
549,https://collegechampionships.usaultimate.org/d...,2023,SUNY-Binghamton,T17,18,SUNY-Binghamton
550,https://collegechampionships.usaultimate.org/d...,2023,Carnegie Mellon,T19,19,Carnegie Mellon


In [35]:
# # Create new dataframe without URL and Team columns
# df_clean = df.drop(['URL', 'Team', 'Web_Finish'], axis=1)

# # Rename Team_Clean to Team for clarity
# df_clean = df_clean.rename(columns={'Team_Clean': 'Team', 'List_Finish': 'Rank'})

# # Create Time column accounting for missing 2020 season
# df_clean['Time'] = df_clean.apply(lambda row: 
#     row['Year'] - 1983 if row['Year'] < 2020 
#     else row['Year'] - 1984, axis=1)

# # Reorder columns
# df_clean = df_clean[['Team', 'Time', 'Rank', 'Year']]

# # Verify the first few rows to check the mapping
# df_clean

# # Export to CSV
# df_clean.to_csv('ultimate_standings_clean.csv', index=False)
# print("Data exported to 'ultimate_standings_clean.csv'")

In [36]:
def transform_rank_format(df, column_name):
    def convert_format(rank):
        # If it's already in T# format, keep it
        if isinstance(rank, str) and rank.startswith('T'):
            return rank
        
        # If it ends with T, convert to T# format
        if isinstance(rank, str) and rank.endswith('T'):
            number = rank[:-1]  # Remove the T
            return f'T{number}'
        
        return rank  # Return unchanged if no T

    # Apply the conversion to the specified column
    df[column_name] = df[column_name].apply(convert_format)
    
    return df

In [37]:
# Create new dataframe without URL and Team columns
df_T = df.drop(['URL', 'Team'], axis=1)

# Apply the conversion to the specified column
df = transform_rank_format(df_T, 'Web_Finish')

# Rename Team_Clean to Team for clarity
df_T = df_T.rename(columns={'Team_Clean': 'Team', 'List_Finish': 'Rank', 'Web_Finish': 'T_Rank'})

# Reorder columns
df_T = df_T[['Team', 'Year', 'Rank', 'T_Rank']]

# Manually change a rank value for UNC Wilmington in 2021
df_T.loc[(df_T['Team'] == 'UNC Wilmington') & (df_T['Year'] == 2021), 'T_Rank'] = 'T9'

# Verify the first few rows to check the mapping
df_T

# Export to CSV
df_T.to_csv('college-womens-rankings.csv', index=False)
print("Data exported to 'college-womens-rankings.csv'")

Data exported to 'college-womens-rankings.csv'


In [38]:
# # Read the CSV
# df_with_less_columns_for_tableau = pd.read_csv('ultimate_standings_clean.csv')

# # Group by year and check rank sequences
# found_issues = False
# for year in sorted(df_with_less_columns_for_tableau['Year'].unique()):
#     year_data = df_with_less_columns_for_tableau[df_with_less_columns_for_tableau['Year'] == year].sort_values('Rank')
#     ranks = year_data['Rank'].tolist()
#     expected_ranks = list(range(1, len(ranks) + 1))
    
#     if ranks != expected_ranks:
#         found_issues = True
#         print(f"\nYear {year} has non-sequential ranks:")
#         print("Expected:", expected_ranks)
#         print("Actual:", ranks)
#         print("\nFull data for this year:")
#         print(year_data[['Team', 'Rank']].to_string())

# if not found_issues:
#     print("All years have sequential ranks without gaps! ✓")

In [39]:
df

Unnamed: 0,Year,Web_Finish,List_Finish,Team_Clean
0,1987,1,1,Kansas
1,1987,2,2,UC Davis
2,1987,3,3,Humboldt State
3,1987,4,4,Massachusetts
4,1987,5,5,Cornell
...,...,...,...,...
547,2023,T15,16,Victoria
548,2023,T17,17,Georgia
549,2023,T17,18,SUNY-Binghamton
550,2023,T19,19,Carnegie Mellon


In [40]:
# # Make a big df with logo urls

# # Read in the csv with the team names and urls
# team_logos_df = pd.read_csv('team_logos.csv')

# team_logos_df

# # Remove duplicates from team_logos_df before merging
# team_logos_df_unique = team_logos_df.drop_duplicates('team_name')

# team_logos_df_unique

In [41]:
# # Merge the two dfs on the team name

# # This will:
# # Match rows where df['Team_Clean'] equals team_logos_df['team_name']
# # Keep all rows from df (due to left join)
# # Add all columns from team_logos_df
# # Optionally remove the duplicate team name column

# # Merge using left_on and right_on to specify the different column names
# df_merged = pd.merge(
#     df, 
#     team_logos_df_unique, 
#     left_on='Team_Clean',
#     right_on='team_nam
#     how='left'
# )

# df_merged

In [42]:
# # Export the merged DataFrame to a CSV file
# df_merged.to_csv('ultimate_standings_merged.csv', index=False)
# print("Data exported to 'ultimate_standings_merged.csv'")

In [43]:
# # Add missing logos manually

# teams_with_missing_logos = [
#     #'Arizona',
#     #'Boston College',
#     #'Carnegie Mellon',
#     #'Case Western Reserve',
#     #'Chabot Community College',
#     'Colorado College',
#     'Cornell',
#     'Dartmouth',
#     'Delaware',
#     'East Carolina',
#     'Eastern Michigan',
#     'Florida State',
#     'George Washington',
#     'Glassboro',
#     'Indiana',
#     'Iowa',
#     'Kansas',
#     'Las Positas College',
#     'Luther',
#     'Michigan State',
#     'Middlebury',
#     'Minnesota-Duluth',
#     'North Texas',
#     'Notre Dame',
#     'Oberlin',
#     'Ohio',
#     'Penn',
#     'Princeton',
#     "Queen's",
#     'Rice',
#     'Rutgers',
#     'Saint Louis',
#     'Salisbury',
#     'Suny-Albany',
#     'Suny-Binghampton',
#     'Suny-Purchase',
#     'Sw Missouri State',
#     'Swarthmore',
#     'Syracuse',
#     'Texas State',
#     'UC Davis',
#     'Utah',
#     'Virginia',
#     'Wesleyan',
#     'Whitman',
#     'Williams',
#     'Wilmington',
#     'Winona State',
#     'Yale'
# ]

In [44]:
# # Get unique teams and sort them
# unique_teams = sorted(df_merged['Team_Clean'].unique())


# # Print them to verify
# for team in unique_teams:
#     print(team)

In [45]:
# Adding team results manually, for 2024

# Read in the existing CSV
df_T = pd.read_csv('college-womens-rankings.csv')

# Remove any existing 2024 entries
df_T = df_T[df_T['Year'] != 2024]

# Create new data
new_data = {
    'Team': [
        'North Carolina', 'Stanford', 'Colorado', 'Vermont', 'British Columbia',
        'Carleton', 'Oregon', 'Tufts', 'UC San Diego', 'Michigan',
        'Pennsylvania', 'UC Santa Barbara', 'Colorado State', 'Western Washington', 'SUNY Binghamton',
        'Washington', 'UC Santa Cruz', 'Georgia', 'Utah', 'Victoria'
    ],
    'Year': [2024] * 20,
    'Rank': list(range(1, 21)),
    'T_Rank': [
        '1', '2', 'T3', 'T3', 'T5', 'T5', 'T5', 'T5', 'T9', 'T9',
        'T9', 'T9', 'T13', 'T13', 'T15', 'T15', 'T17', 'T17', 'T19', 'T19'
    ]
}

# Convert to DataFrame
new_df = pd.DataFrame(new_data)

# Append to existing DataFrame
df_T = pd.concat([df_T, new_df], ignore_index=True)

# Export to CSV
df_T.to_csv('college-womens-rankings.csv', index=False)