In [3]:
#pip install requests beautifulsoup4

In [2]:
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd



In [69]:
def get_soup(url, delay=1, headers=None, proxy=None):
    """
    Fetches and parses a webpage with BeautifulSoup, handling 403 errors
    by setting headers, delays, and optional proxies.

    Parameters:
        url (str): The URL of the webpage to scrape.
        delay (int): The delay (in seconds) between requests. Default is 1 second.
        headers (dict): Optional headers to include in the request.
        proxy (dict): Optional dictionary for proxies.

    Returns:
        BeautifulSoup object of the parsed page or None if request fails.
    """

    # Default headers to mimic a browser request if none are provided
    if headers is None:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }

    try:
        # Send the request with headers and optional proxy
        response = requests.get(url, headers=headers, proxies=proxy)
        
        # Check for 403 Forbidden error
        if response.status_code == 403:
            print("403 Forbidden: Access to the page is restricted.")
            return None
        
        # Add delay to avoid rapid requests
        time.sleep(delay)

        # Parse the content with BeautifulSoup
        soup = BeautifulSoup(response.content, "html.parser")
        return soup
    
    except requests.RequestException as e:
        print(f"An error occurred: {e}")
        return None


In [70]:
# soup = get_soup(URL, delay=2)
# if soup:
#     print(soup.prettify())


In [71]:
def scrape_list(url):
    try:
        # Set headers to mimic a browser request
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }
        
        # Send a GET request to the URL
        response = requests.get(url, headers=headers)
        response.raise_for_status()  # Raise an error for bad responses

        # Parse the HTML content
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find the list within the specified section and div
        standings_div = soup.find('div', class_='standings')
        if standings_div:
            list_items = standings_div.find_all('li')
            standings = [item.text.strip() for item in list_items if item.text.strip()]
            return standings
        else:
            print(f"No standings found for {url}")
            return None

    except requests.HTTPError as http_err:
        print(f"HTTP error occurred while scraping {url}: {http_err}")
        return None
    except requests.RequestException as req_err:
        print(f"Request error occurred while scraping {url}: {req_err}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred while scraping {url}: {e}")
        return None


In [72]:
# def scrape_multiple_urls(urls):
#     # Dictionary to store the results for each URL
#     results = {}
#     for url in urls:
#         print(f"Scraping data from {url}...")
#         data = scrape_list(url)
#         results[url] = data if data else "No data found"
#     return results

def scrape_multiple_urls_dict(urls):
    # Dictionary to store the results for each year
    results = {}
    for year, url in urls.items():
        print(f"Scraping data for the year {year} from {url}...")
        data = scrape_list(url)
        results[year] = data if data else "No data found"
    return results

In [73]:
urls = {
    1984: 'https://collegechampionships.usaultimate.org/d1-men/history/1984-d-i-men/',
    1985: 'https://collegechampionships.usaultimate.org/d1-men/history/1985-d-i-men/',
    1986: 'https://collegechampionships.usaultimate.org/d1-men/history/1986-d-i-men/',
    1987: 'https://collegechampionships.usaultimate.org/d1-men/history/1987-d-i-men/',
    1988: 'https://collegechampionships.usaultimate.org/d1-men/history/1988-d-i-men/',
    1989: 'https://collegechampionships.usaultimate.org/d1-men/history/1989-d-i-men/',
    1990: 'https://collegechampionships.usaultimate.org/d1-men/history/1990-d-i-men/',
    1991: 'https://collegechampionships.usaultimate.org/d1-men/history/1991-d-i-men/',
    1992: 'https://collegechampionships.usaultimate.org/d1-men/history/1992-d-i-men/',
    1993: 'https://collegechampionships.usaultimate.org/d1-men/history/1993-d-i-men/',
    1994: 'https://collegechampionships.usaultimate.org/d1-men/history/1994-d-i-men/',
    1995: 'https://collegechampionships.usaultimate.org/d1-men/history/1995-d-i-men/',
    1996: 'https://collegechampionships.usaultimate.org/d1-men/history/1996-d-i-men/',
    1997: 'https://collegechampionships.usaultimate.org/d1-men/history/1997-d-i-men/',
    1998: 'https://collegechampionships.usaultimate.org/d1-men/history/1998-d-i-men/',
    1999: 'https://collegechampionships.usaultimate.org/d1-men/history/1999-d-i-men/',
    2000: 'https://collegechampionships.usaultimate.org/d1-men/history/2000-d-i-men/',
    2001: 'https://collegechampionships.usaultimate.org/d1-men/history/2001-d-i-men/',
    2002: 'https://collegechampionships.usaultimate.org/d1-men/history/2002-d-i-men/',
    2003: 'https://collegechampionships.usaultimate.org/d1-men/history/2003-d-i-men/',
    2004: 'https://collegechampionships.usaultimate.org/d1-men/history/2004-d-i-men/',
    2005: 'https://collegechampionships.usaultimate.org/d1-men/history/2005-d-i-men/',
    2006: 'https://collegechampionships.usaultimate.org/d1-men/history/2006-d-i-men/',
    2007: 'https://collegechampionships.usaultimate.org/d1-men/history/2007-d-i-men/',
    2008: 'https://collegechampionships.usaultimate.org/d1-men/history/2008-d-i-men/',
    2009: 'https://collegechampionships.usaultimate.org/d1-men/history/2009-di-men/',
    2010: 'https://collegechampionships.usaultimate.org/d1-men/history/2010-d1-men/',
    2011: 'https://collegechampionships.usaultimate.org/d1-men/history/2011-d1-men/',
    2012: 'https://collegechampionships.usaultimate.org/d1-men/history/2012-d1-men/',
    2013: 'https://collegechampionships.usaultimate.org/d1-men/history/2013-d1-men/',
    2014: 'https://collegechampionships.usaultimate.org/d1-men/history/2014-d1-men/',
    2015: 'https://collegechampionships.usaultimate.org/d1-men/history/2015-d1-men/',
    2016: 'https://collegechampionships.usaultimate.org/d1-men/history/2016-d1-men/',
    2017: 'https://collegechampionships.usaultimate.org/d1-men/history/2017-d1-men/',
    2018: 'https://collegechampionships.usaultimate.org/d1-men/history/2018-d-i-men/',
    2019: 'https://collegechampionships.usaultimate.org/d1-men/history/2019-d-i-men/',
    # 2020 no season
    2021: 'https://collegechampionships.usaultimate.org/d1-men/history/2019-d-i-men-2/',
    2022: 'https://collegechampionships.usaultimate.org/d1-men/history/2019-d-i-men-2-2/',
    2023: 'https://collegechampionships.usaultimate.org/d1-men/history/2019-d-i-men-2-2-2/',
}

In [74]:
all_standings = scrape_multiple_urls_dict(urls)

Scraping data for the year 1984 from https://collegechampionships.usaultimate.org/d1-men/history/1984-d-i-men/...
Scraping data for the year 1985 from https://collegechampionships.usaultimate.org/d1-men/history/1985-d-i-men/...
Scraping data for the year 1986 from https://collegechampionships.usaultimate.org/d1-men/history/1986-d-i-men/...
Scraping data for the year 1987 from https://collegechampionships.usaultimate.org/d1-men/history/1987-d-i-men/...
Scraping data for the year 1988 from https://collegechampionships.usaultimate.org/d1-men/history/1988-d-i-men/...
Scraping data for the year 1989 from https://collegechampionships.usaultimate.org/d1-men/history/1989-d-i-men/...
Scraping data for the year 1990 from https://collegechampionships.usaultimate.org/d1-men/history/1990-d-i-men/...
Scraping data for the year 1991 from https://collegechampionships.usaultimate.org/d1-men/history/1991-d-i-men/...
Scraping data for the year 1992 from https://collegechampionships.usaultimate.org/d1-men

In [75]:
for url, standings in all_standings.items():
    print(f"\nStandings for {url}:")
    print(standings)


Standings for 1984:
['1- Stanford', '2-Glassboro', '3T-Massachusetts', '3T-Pennsylvania', '?-Chabot Community College', '?-Kansas', '?-Ohio', '?-Syracuse', '?-Texas', '?-Tufts']

Standings for 1985:
['1-Pennsylvania', '2-Massachusetts', '3T-Cornell', '3T-SW Missouri State', '5T-Stanford', '5T-Texas', '7T-Cal Poly SLO', '7T-Kansas', '9T-MIT', '9T-Oregon', '11T-Central Florida', '11T-Princeton']

Standings for 1986:
['1-Massachusetts', '2-Stanford', '3T-California-Santa Barbara', '3T-Cornell', '5T-Oregon', '5T-SW Missouri State', '7T-Kansas', '7T-MIT', '9T-Carnegie Mellon', '9T-Princeton', '11T-Georgia', '11T-Texas']

Standings for 1987:
['1-Chabot Community College', '2-California-Santa Barbara', '3T-Cal Poly SLO', '3T-Cornell', '5T-SW Missouri State', '5T-Texas', '7T-Georgia Tech', '7T-SUNY-Purchase', '9T-Carnegie Mellon', '9T-Princeton', '11T-Kansas', '11T-Michigan', '13T-East Carolina', '13T-Vermont']

Standings for 1988:
['1-California-Santa Barbara', '2-Texas', '3-Stanford', '4-Co

In [76]:
# Fix Pittsburgh entry in 2022 standings
if 2022 in all_standings:
    all_standings[2022] = [
        entry.replace('T3 –Pittsburgh', 'T3 – Pittsburgh')  # Note the en dash (–) not hyphen (-)
        for entry in all_standings[2022]
    ]

# Verify the fix
print("2022 standings after fix:")
for entry in all_standings[2022]:
    print(entry)

2022 standings after fix:
1 – North Carolina
2 – Brown
T3 – Colorado
T3 – Pittsburgh
T5 – Minnesota
T5 – California
T5 – Georgia
T5 – Cal Poly-SLO
T9 – Vermont
T9 – Washington
T9 – Texas
T9 – Auburn
T13 – Utah State
T13 – Wisconsin
T15 – Michigan
T15 – North Carolina State
T17 – William & Mary
T17 – Washington University
T19 – Connecticut
T19 – Ohio State


In [77]:
# Create empty lists to store our data
data = []

# Iterate through the standings dictionary
for year, standings in all_standings.items():
    if isinstance(standings, list):  # Only process if we have actual standings data
        for i, entry in enumerate(standings, 1):
            if year > 2016:
                # Try both types of dashes for newer years
                if ' – ' in entry:  # en dash
                    parts = entry.split(' – ', 1)
                elif ' - ' in entry:  # regular hyphen
                    parts = entry.split(' - ', 1)
                else:
                    parts = entry.split('-', 1)
            else:
                # Pre-2016 format
                parts = entry.split('-', 1)
            
            if len(parts) == 2:
                web_finish = parts[0].strip()
                team = parts[1].strip()
                
                data.append({
                    'URL': urls[year],
                    'Year': year,
                    'Team': team,
                    'Web_Finish': web_finish,
                    'List_Finish': i
                })

# Create DataFrame
df = pd.DataFrame(data)

In [78]:
# Export to CSV
df.to_csv('ultimate_standings_raw.csv', index=False)
print("Data exported to 'ultimate_standings_raw.csv'")

Data exported to 'ultimate_standings_raw.csv'


In [79]:
# Get unique team names and sort alphabetically
unique_teams_raw = sorted(df['Team'].unique())

# Print the teams
print("All raw teams in alphabetical order:")
for team in unique_teams_raw:
    print(team)

All raw teams in alphabetical order:
Arizona
Auburn
Boston College
British Columbia
Brown
Cal Poly SLO
Cal Poly- SLO
Cal Poly-SLO
California
California -Santa Cruz
California- Davis
California- San Diego
California- Santa Barbara
California- Santa Cruz
California-Davis
California-San Diego
California-Santa Barbara
California-Santa Crus\z
California-Santa Cruz
Carleton
Carleton College
Carleton-CUT
Carnegie Mellon
Case Western Reserve
Central Florida
Chabot Community College
Cincinnati
Colorado
Colorado College
Colorado State
Columbia
Connecticut
Cornell
Dartmouth
Delaware
Duke
East Carolina
Eastern Michigan
Florida
Florida State
George Washington
Georgetown
Georgia
Georgia Tech
Glassboro
Harvard
Harvarrd
Illinois
Indiana
Iowa
Iowa State
Kansas
LSU
Las Positas
Luther
MIT
Maryland
Massachusetts
Massachussets
Michigan
Michigan State
Middlebury
Minnesota
Minnesota- Duluth
North Carolina
North Carolina State
North Carolina- Wilmington
North Carolina-Wilmington
North Texas
Northeastern
North

In [80]:
def clean_team_name(name):
    # First, strip whitespace and convert to title case
    name = name.strip().title()
    
    # Fix specific typos
    name = name.replace("Harvarrd", "Harvard")
    name = name.replace("California-Santa Crus\\z", "California-Santa Cruz")
    name = name.replace("Massachussets", "Massachusetts")
    name = name.replace("Sw Missouri State", "SW Missouri State")
    
    # Standardize hyphenation
    name = name.replace(" - ", "-")
    name = name.replace("- ", "-")
    name = name.replace(" -", "-")
    
    # Standardize common school names
    replacements = {
        "California-Santa Barbara": "UC Santa Barbara",
        "California-Davis": "UC Davis",
        "California-San Diego": "UC San Diego",
        "California-Santa Cruz": "UC Santa Cruz",
        "California-Santa Crus\Z": "UC Santa Cruz",
        "Cal Poly Slo": "Cal Poly SLO",
        "Cal Poly-Slo": "Cal Poly SLO",
        "Cal Poly- Slo": "Cal Poly SLO",
        "North Carolina-Wilmington": "UNC Wilmington",
        "North Carolina- Wilmington": "UNC Wilmington",
        #"Massachusetts": "UMass",
        "Carleton College": "Carleton",
        "Carleton-Cut": "Carleton",
        "William and Mary": "William & Mary",
        "William And Mary": "William & Mary",
        "Washington University": "WashU",
        "Pennsylvania": "Penn",
        "Southwest Missouri State": "SW Missouri State",
        "Las Positas": "Las Positas College",
        "Suny-": "SUNY-",  # Ensure SUNY prefix is capitalized
        "North Carolina State": "NC State",
        "Queen’S": "Queen's",
        "Mit":"MIT",
        "Lsu":"LSU",
        "Wilmington":"UNC Wilmington",
        "North Carolina-Wilmington":"UNC Wilmington",
        "Suny-Purchase":"SUNY Purchase",
        "Suny-Binghamton":"SUNY Binghamton",
        "Suny-Albany":"SUNY Albany",
        "SUNY-Albany":"SUNY Albany",
        "SUNY-Binghampton":"SUNY Binghamton",
        "SUNY-Purchase":"SUNY Purchase"
    }
    
    for old, new in replacements.items():
        if name.upper() == old.upper():  # Case-insensitive replacement
            name = new
    
    return name

In [81]:
# Apply the cleaning function to create new column
df['Team_Clean'] = df['Team'].apply(clean_team_name)

# Print unique teams after cleaning to verify changes
print("All teams after cleaning (alphabetically):")
for team in sorted(df['Team_Clean'].unique()):
    print(team)

All teams after cleaning (alphabetically):
Arizona
Auburn
Boston College
British Columbia
Brown
Cal Poly SLO
California
Carleton
Carnegie Mellon
Case Western Reserve
Central Florida
Chabot Community College
Cincinnati
Colorado
Colorado College
Colorado State
Columbia
Connecticut
Cornell
Dartmouth
Delaware
Duke
East Carolina
Eastern Michigan
Florida
Florida State
George Washington
Georgetown
Georgia
Georgia Tech
Glassboro
Harvard
Illinois
Indiana
Iowa
Iowa State
Kansas
LSU
Las Positas College
Luther
MIT
Maryland
Massachusetts
Michigan
Michigan State
Middlebury
Minnesota
Minnesota-Duluth
NC State
North Carolina
North Texas
Northeastern
Northwestern
Notre Dame
Oberlin
Ohio
Ohio State
Oregon
Oregon State
Penn
Penn State
Pittsburgh
Princeton
Queen's
Rice
Rutgers
SUNY Albany
SUNY Binghamton
SUNY Purchase
SW Missouri State
Saint Louis
Salisbury
Stanford
Swarthmore
Syracuse
Texas
Texas A&M
Texas State
Tufts
Tulane
UC Davis
UC San Diego
UC Santa Barbara
UC Santa Cruz
UNC Wilmington
Utah
Utah St

In [82]:
df

Unnamed: 0,URL,Year,Team,Web_Finish,List_Finish,Team_Clean
0,https://collegechampionships.usaultimate.org/d...,1984,Stanford,1,1,Stanford
1,https://collegechampionships.usaultimate.org/d...,1984,Glassboro,2,2,Glassboro
2,https://collegechampionships.usaultimate.org/d...,1984,Massachusetts,3T,3,Massachusetts
3,https://collegechampionships.usaultimate.org/d...,1984,Pennsylvania,3T,4,Penn
4,https://collegechampionships.usaultimate.org/d...,1984,Chabot Community College,?,5,Chabot Community College
...,...,...,...,...,...,...
615,https://collegechampionships.usaultimate.org/d...,2023,Washington,T15,16,Washington
616,https://collegechampionships.usaultimate.org/d...,2023,Tufts,T17,17,Tufts
617,https://collegechampionships.usaultimate.org/d...,2023,Utah State,T17,18,Utah State
618,https://collegechampionships.usaultimate.org/d...,2023,Cornell,T19,19,Cornell


In [83]:
# # Create new dataframe without URL and Team columns
# df_clean = df.drop(['URL', 'Team', 'Web_Finish'], axis=1)

# # Rename Team_Clean to Team for clarity
# df_clean = df_clean.rename(columns={'Team_Clean': 'Team', 'List_Finish': 'Rank'})

# # Create Time column accounting for missing 2020 season
# df_clean['Time'] = df_clean.apply(lambda row: 
#     row['Year'] - 1983 if row['Year'] < 2020 
#     else row['Year'] - 1984, axis=1)

# # Reorder columns
# df_clean = df_clean[['Team', 'Time', 'Rank', 'Year']]

# # Verify the first few rows to check the mapping
# df_clean

# # Export to CSV
# df_clean.to_csv('ultimate_standings_clean.csv', index=False)
# print("Data exported to 'ultimate_standings_clean.csv'")

In [84]:
def transform_rank_format(df, column_name):
    def convert_format(rank):
        # If it's already in T# format, keep it
        if isinstance(rank, str) and rank.startswith('T'):
            return rank
        
        # If it ends with T, convert to T# format
        if isinstance(rank, str) and rank.endswith('T'):
            number = rank[:-1]  # Remove the T
            return f'T{number}'
        
        return rank  # Return unchanged if no T

    # Apply the conversion to the specified column
    df[column_name] = df[column_name].apply(convert_format)
    
    return df

In [85]:
# Create new dataframe without URL and Team columns
df_T = df.drop(['URL', 'Team'], axis=1)

# Apply the conversion to the specified column
df = transform_rank_format(df_T, 'Web_Finish')

# Rename Team_Clean to Team for clarity
df_T = df_T.rename(columns={'Team_Clean': 'Team', 'List_Finish': 'Rank', 'Web_Finish': 'T_Rank'})

# Reorder columns
df_T = df_T[['Team', 'Year', 'Rank', 'T_Rank']]

# Manually change a rank value for UNC Wilmington in 2021
df_T.loc[(df_T['Team'] == 'UNC Wilmington') & (df_T['Year'] == 2021), 'T_Rank'] = 'T9'

# Verify the first few rows to check the mapping
df_T

# Export to CSV
df_T.to_csv('ultimate_standings_T.csv', index=False)
print("Data exported to 'ultimate_standings_T.csv'")

Data exported to 'ultimate_standings_T.csv'


In [86]:
# # Read the CSV
# df_with_less_columns_for_tableau = pd.read_csv('ultimate_standings_clean.csv')

# # Group by year and check rank sequences
# found_issues = False
# for year in sorted(df_with_less_columns_for_tableau['Year'].unique()):
#     year_data = df_with_less_columns_for_tableau[df_with_less_columns_for_tableau['Year'] == year].sort_values('Rank')
#     ranks = year_data['Rank'].tolist()
#     expected_ranks = list(range(1, len(ranks) + 1))
    
#     if ranks != expected_ranks:
#         found_issues = True
#         print(f"\nYear {year} has non-sequential ranks:")
#         print("Expected:", expected_ranks)
#         print("Actual:", ranks)
#         print("\nFull data for this year:")
#         print(year_data[['Team', 'Rank']].to_string())

# if not found_issues:
#     print("All years have sequential ranks without gaps! ✓")

In [87]:
df

Unnamed: 0,Year,Web_Finish,List_Finish,Team_Clean
0,1984,1,1,Stanford
1,1984,2,2,Glassboro
2,1984,T3,3,Massachusetts
3,1984,T3,4,Penn
4,1984,?,5,Chabot Community College
...,...,...,...,...
615,2023,T15,16,Washington
616,2023,T17,17,Tufts
617,2023,T17,18,Utah State
618,2023,T19,19,Cornell


In [88]:
# # Make a big df with logo urls

# # Read in the csv with the team names and urls
# team_logos_df = pd.read_csv('team_logos.csv')

# team_logos_df

# # Remove duplicates from team_logos_df before merging
# team_logos_df_unique = team_logos_df.drop_duplicates('team_name')

# team_logos_df_unique

In [89]:
# # Merge the two dfs on the team name

# # This will:
# # Match rows where df['Team_Clean'] equals team_logos_df['team_name']
# # Keep all rows from df (due to left join)
# # Add all columns from team_logos_df
# # Optionally remove the duplicate team name column

# # Merge using left_on and right_on to specify the different column names
# df_merged = pd.merge(
#     df, 
#     team_logos_df_unique, 
#     left_on='Team_Clean',
#     right_on='team_nam
#     how='left'
# )

# df_merged

In [90]:
# # Export the merged DataFrame to a CSV file
# df_merged.to_csv('ultimate_standings_merged.csv', index=False)
# print("Data exported to 'ultimate_standings_merged.csv'")

In [91]:
# # Add missing logos manually

# teams_with_missing_logos = [
#     #'Arizona',
#     #'Boston College',
#     #'Carnegie Mellon',
#     #'Case Western Reserve',
#     #'Chabot Community College',
#     'Colorado College',
#     'Cornell',
#     'Dartmouth',
#     'Delaware',
#     'East Carolina',
#     'Eastern Michigan',
#     'Florida State',
#     'George Washington',
#     'Glassboro',
#     'Indiana',
#     'Iowa',
#     'Kansas',
#     'Las Positas College',
#     'Luther',
#     'Michigan State',
#     'Middlebury',
#     'Minnesota-Duluth',
#     'North Texas',
#     'Notre Dame',
#     'Oberlin',
#     'Ohio',
#     'Penn',
#     'Princeton',
#     "Queen's",
#     'Rice',
#     'Rutgers',
#     'Saint Louis',
#     'Salisbury',
#     'Suny-Albany',
#     'Suny-Binghampton',
#     'Suny-Purchase',
#     'Sw Missouri State',
#     'Swarthmore',
#     'Syracuse',
#     'Texas State',
#     'UC Davis',
#     'Utah',
#     'Virginia',
#     'Wesleyan',
#     'Whitman',
#     'Williams',
#     'Wilmington',
#     'Winona State',
#     'Yale'
# ]

In [92]:
# # Get unique teams and sort them
# unique_teams = sorted(df_merged['Team_Clean'].unique())


# # Print them to verify
# for team in unique_teams:
#     print(team)

In [3]:
# Adding team results manually, for 2024

# Read in the existing CSV
df_T = pd.read_csv('ultimate_standings_T.csv')

# Remove any existing 2024 entries
df_T = df_T[df_T['Year'] != 2024]

# Create new data
new_data = {
    'Team': [
        'Brown', 'Cal Poly SLO', 'Colorado', 'North Carolina', 'Georgia',
        'Minnesota', 'NC State', 'Oregon', 'Massachusetts', 'Pittsburgh',
        'Texas', 'Michigan', 'Oregon State', 'California', 'Vermont',
        'WashU', 'Penn State', 'Carleton', 'Alabama-Huntsville', 'Ottawa'
    ],
    'Year': [2024] * 20,
    'Rank': list(range(1, 21)),
    'T_Rank': [
        '1', '2', 'T3', 'T3', 'T5', 'T5', 'T5', 'T5', 'T9', 'T9',
        'T9', 'T9', 'T13', 'T13', 'T15', 'T15', 'T17', 'T17', 'T19', 'T19'
    ]
}

# Convert to DataFrame
new_df = pd.DataFrame(new_data)

# Append to existing DataFrame
df_T = pd.concat([df_T, new_df], ignore_index=True)

# Export to CSV
df_T.to_csv('ultimate_standings_T.csv', index=False)

In [10]:
# Read in the mens and womens csvs
df_mens = pd.read_csv('college-mens-rankings.csv')
df_womens = pd.read_csv('college-womens-rankings.csv')

# Look at the Team column and see if there are any mismatches between men and women
mens_teams = set(df_mens['Team'].unique())
womens_teams = set(df_womens['Team'].unique())

print("Teams in men's but not women's:")
print(mens_teams - womens_teams)
print("\nTeams in women's but not men's:")
print(womens_teams - mens_teams)

# Define standardization dictionary
name_standardization = {
    'North Carolina-Wilmington': 'UNC Wilmington',
    'North Carolina- Wilmington': 'UNC Wilmington',
    'Pennsylvania': 'Penn',
    'Washington University': 'WashU',
    'North Carolina State': 'NC State',
    'Cal Poly-SLO': 'Cal Poly SLO',
    'Cal Poly- SLO': 'Cal Poly SLO',
    'SUNY-Binghamton': 'SUNY Binghamton',
    'Texas-Dallas': 'UT Dallas',
    'Southern California': 'USC'
}

# Apply standardization to both datasets
df_mens['Team'] = df_mens['Team'].replace(name_standardization)
df_womens['Team'] = df_womens['Team'].replace(name_standardization)

# Add a 'Division' column to both
df_mens['Division'] = "College Men's"
df_womens['Division'] = "College Women's"

# Concatenate both datasets
df_combined = pd.concat([df_mens, df_womens], ignore_index=True)

# Sort by Year, Division, and Rank
df_combined = df_combined.sort_values(['Division','Year', 'Rank'])

# Export to csv called college-rankings-combined.csv
df_combined.to_csv('college-rankings-combined.csv', index=False)

# Print summary
print("\nCombined dataset shape:", df_combined.shape)
print("\nYears covered:", sorted(df_combined['Year'].unique()))
print("\nTotal number of teams:", len(df_combined['Team'].unique()))

Teams in men's but not women's:
{'Texas State', 'Auburn', 'Cincinnati', 'George Washington', 'Boston College', 'Case Western Reserve', 'Oregon State', 'WashU', 'LSU', 'Utah State', "Queen's", 'Virginia Tech', 'Penn', 'UNC Wilmington', 'SW Missouri State', 'Glassboro', 'Salisbury', 'Minnesota-Duluth', 'NC State', 'Alabama-Huntsville', 'Eastern Michigan', 'Chabot Community College', 'Tulane', 'Cal Poly SLO', 'Winona State', 'Georgetown', 'SUNY Albany', 'Syracuse', 'North Texas', 'Las Positas College', 'Luther', 'SUNY Purchase'}

Teams in women's but not men's:
{'Purdue', 'Sonoma State', 'Claremont', 'North Carolina- Wilmington', 'Smith', 'Texas-Dallas', 'Southern California', 'North Carolina-Wilmington', 'Bucknell', 'West Chester', 'North Carolina State', 'UCLA', 'Truman State', 'Washington University', 'Emory', 'Towson', 'Earlham', 'Pennsylvania', 'Wake Forest', 'Humboldt State', 'Chicago', 'SUNY-Binghamton', 'NYU', 'Boston University'}

Combined dataset shape: (1212, 5)

Years covered: