In [27]:
#pip install requests beautifulsoup4

In [1]:
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd
import os



In [12]:
def scrape_logo_urls(url):
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "Accept-Encoding": "gzip, deflate, br",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "DNT": "1",
            "Referer": "https://ultiworld.com/ranking/",
        }
        
        # Add a session to maintain cookies
        session = requests.Session()
        response = session.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        
        # Add a small delay to avoid overwhelming the server
        time.sleep(2)
        
        response = requests.get(url, headers=headers)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Select all spans with the data-bg attribute within the specified structure
        spans = soup.select('body main div div.reference-ranking table.table tbody tr td.ranking__team span[data-bg]')
        
        # Initialize a list to store logo data
        logo_data = []
        
        for span in spans:
            # Extract the logo URL from the data-bg attribute
            logo_url = span.get('data-bg', None)
            
            # Extract the team name from the <a> tag within the span
            a_tag = span.find('a')
            team_name = a_tag.text.strip() if a_tag else 'Unknown Team'
            team_url = a_tag['href'] if a_tag and 'href' in a_tag.attrs else None
            
            # Append the extracted data to the list
            logo_data.append({
                'team_name': team_name,
                'team_url': team_url,
                'logo_url': logo_url
            })
        
        # Convert the list of dictionaries to a DataFrame
        return pd.DataFrame(logo_data)

    except requests.HTTPError as http_err:
        print(f"HTTP error occurred while scraping {url}: {http_err}")
    except requests.RequestException as req_err:
        print(f"Request error occurred while scraping {url}: {req_err}")
    except Exception as e:
        print(f"An unexpected error occurred while scraping {url}: {e}")
    
    return None

In [13]:
scrape_logo_urls('https://ultiworld.com/ranking/161780/college-d-i-mens-rankings-6-5-24/')

HTTP error occurred while scraping https://ultiworld.com/ranking/161780/college-d-i-mens-rankings-6-5-24/: 403 Client Error: Forbidden for url: https://ultiworld.com/ranking/161780/college-d-i-mens-rankings-6-5-24/


In [4]:
def download_logos(df, folder_name='logos', delay=0.5):
    """
    Download images from URLs stored in a DataFrame
    """
    # Create logos directory in the current working directory
    logos_dir = os.path.join(os.getcwd(), folder_name)
    os.makedirs(logos_dir, exist_ok=True)
    
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    
    # Add a column for the local file path
    df['local_path'] = ''
    
    for idx, row in df.iterrows():
        try:
            # Extract the file extension from the URL
            file_extension = os.path.splitext(row['logo_url'])[1]
            
            # Generate filename using the team name
            filename = os.path.join(logos_dir, f"{row['team_name']}{file_extension}")
            
            # Skip downloading if the file already exists
            if os.path.exists(filename):
                print(f"File already exists, skipping download: {filename}")
                df.at[idx, 'local_path'] = filename
                continue
            
            # Download the image
            response = requests.get(row['logo_url'], headers=headers)
            response.raise_for_status()
            
            # Save the image
            with open(filename, 'wb') as f:
                f.write(response.content)
            
            # Store the local path in the DataFrame
            df.at[idx, 'local_path'] = filename
            print(f"Downloaded: {filename}")
            
            # Add a small delay
            time.sleep(delay)
            
        except requests.HTTPError as http_err:
            print(f"HTTP error occurred while downloading {row['logo_url']}: {http_err}")
        except requests.RequestException as req_err:
            print(f"Request error occurred while downloading {row['logo_url']}: {req_err}")
        except Exception as e:
            print(f"An unexpected error occurred while downloading {row['logo_url']}: {e}")
    
    return df

In [5]:
# # Define the URL
# url = 'https://ultiworld.com/ranking/161780/college-d-i-mens-rankings-6-5-24/'

# # Scrape the logo URLs into a DataFrame
# logo_df = scrape_logo_urls(url)

# # Check if the DataFrame is not None and has data
# if logo_df is not None and not logo_df.empty:
#     print("Scraped data:")
#     print(logo_df.head())  # Display the first few rows to verify the data

#     # Download the logos
#     logo_df = download_logos(logo_df)

#     # Save the DataFrame with local paths to a CSV file
#     logo_df.to_csv('team_logos.csv', index=False)
#     print("Logos downloaded and data saved to 'team_logos.csv'.")
# else:
#     print("No data found to download.")

In [6]:
# Define the list of URLs
urls = [
    'https://ultiworld.com/ranking/161780/college-d-i-mens-rankings-6-5-24/',
    'https://ultiworld.com/ranking/143018/college-d-i-mens-rankings-end-of-2023-season/',
    'https://ultiworld.com/ranking/119526/college-d-i-mens-rankings-end-of-2022-season/',
    'https://ultiworld.com/ranking/110899/college-d-i-mens-rankings-12-1-21/',
    'https://ultiworld.com/ranking/89868/college-d-i-mens-rankings-3-11-20/',
    'https://ultiworld.com/ranking/79575/college-d-i-mens-rankings-2019-season-final/',
    'https://ultiworld.com/ranking/66334/college-d-i-mens-rankings-2018-season-week-of-5-9-18/',
    'https://ultiworld.com/ranking/61700/college-d-i-mens-rankings-2018-season-preseason/',
    'https://ultiworld.com/ranking/50496/college-d-i-mens-rankings-2017-season-end-of-fall/',
    'https://ultiworld.com/ranking/38352/college-d-i-mens-rankings-2016-season-post-ccc/',
]

# Initialize an empty DataFrame to store all logo data
all_logo_data = pd.DataFrame()

# Loop through each URL
for url in urls:
    # Scrape the logo URLs into a DataFrame
    logo_df = scrape_logo_urls(url)

    # Check if the DataFrame is not None and has data
    if logo_df is not None and not logo_df.empty:
        print(f"Scraped data from {url}:")
        print(logo_df.head())  # Display the first few rows to verify the data

        # Append the data to the all_logo_data DataFrame
        all_logo_data = pd.concat([all_logo_data, logo_df], ignore_index=True)
    else:
        print(f"No data found to download from {url}.")

HTTP error occurred while scraping https://ultiworld.com/ranking/161780/college-d-i-mens-rankings-6-5-24/: 403 Client Error: Forbidden for url: https://ultiworld.com/ranking/161780/college-d-i-mens-rankings-6-5-24/
No data found to download from https://ultiworld.com/ranking/161780/college-d-i-mens-rankings-6-5-24/.
HTTP error occurred while scraping https://ultiworld.com/ranking/143018/college-d-i-mens-rankings-end-of-2023-season/: 403 Client Error: Forbidden for url: https://ultiworld.com/ranking/143018/college-d-i-mens-rankings-end-of-2023-season/
No data found to download from https://ultiworld.com/ranking/143018/college-d-i-mens-rankings-end-of-2023-season/.
HTTP error occurred while scraping https://ultiworld.com/ranking/119526/college-d-i-mens-rankings-end-of-2022-season/: 403 Client Error: Forbidden for url: https://ultiworld.com/ranking/119526/college-d-i-mens-rankings-end-of-2022-season/
No data found to download from https://ultiworld.com/ranking/119526/college-d-i-mens-rank

In [7]:
all_logo_data

In [8]:
# Check if all_logo_data is not empty
if not all_logo_data.empty:
    # Convert the team names to a list
    team_names = all_logo_data['team_name'].tolist()
    
    # Print each team name on a new line with indentation
    print("Team Names List:")
    for name in team_names:
        print(f"    {name}")
else:
    print("No team data available.")

No team data available.


In [9]:
# Change all instances of a team name manually

# Method 2: Replace with a dictionary of multiple changes
name_changes = {
    'Washington University': 'WashU',
    'Pennsylvania': 'Penn',
    'California-Davis': 'UC Davis',
    'California-Santa Cruz': 'UC Santa Cruz',
    'California-San Diego': 'UC San Diego',
    'California-Santa Barbara': 'UC Santa Barbara',
    'North Carolina-Wilmington': 'UNC Wilmington',
    'Texas-Dallas': 'UT Dallas'
}
# Apply the changes
all_logo_data['team_name'] = all_logo_data['team_name'].replace(name_changes)

# Verify the changes
all_logo_data['team_name'].value_counts()

KeyError: 'team_name'

In [22]:
# If there is any data, proceed to download
if not all_logo_data.empty:
    # Download the logos
    all_logo_data = download_logos(all_logo_data)

    # Save the DataFrame with local paths to a CSV file
    all_logo_data.to_csv('team_logos.csv', index=False)
    print("Logos downloaded and data saved to 'team_logos.csv'.")
else:
    print("No data found to download from any URL.")

File already exists, skipping download: /Users/elistandard/Documents/CS_Projects/Github/ultimateDynasties/logos/Brown.jpeg
File already exists, skipping download: /Users/elistandard/Documents/CS_Projects/Github/ultimateDynasties/logos/Cal Poly SLO.jpg
File already exists, skipping download: /Users/elistandard/Documents/CS_Projects/Github/ultimateDynasties/logos/North Carolina.png
File already exists, skipping download: /Users/elistandard/Documents/CS_Projects/Github/ultimateDynasties/logos/Georgia.jpeg
File already exists, skipping download: /Users/elistandard/Documents/CS_Projects/Github/ultimateDynasties/logos/Colorado.png
File already exists, skipping download: /Users/elistandard/Documents/CS_Projects/Github/ultimateDynasties/logos/Oregon.png
File already exists, skipping download: /Users/elistandard/Documents/CS_Projects/Github/ultimateDynasties/logos/Minnesota.png
File already exists, skipping download: /Users/elistandard/Documents/CS_Projects/Github/ultimateDynasties/logos/Massac

In [23]:
# Add data to the dataframe

# Method 3: Add multiple rows at once
new_data = {
    'team_name': ['Carnegie Mellon', 
                  'Team2',
                  'Team3'
                  ],

    'logo_url': ['https://cdn.ultiworld.com/wordpress/wp-content/uploads/2023/02/carnegie_mellon_logo-300x300.png', 
                 'URL2', 
                 'URL3'
                 ],
    # Add other columns as needed
}
new_rows = pd.DataFrame(new_data)
all_logo_data = pd.concat([all_logo_data, new_rows], ignore_index=True)