In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

In [2]:
import os

# Resolve the image directory to an absolute path *before* changing into it.
# With a relative IMAGE_DIR, any os.path.join(IMAGE_DIR, name) evaluated after
# the chdir below would be resolved against the new working directory and
# point at ./data/images/data/images/<name> instead of the real file.
# os.path.join(..., "") keeps the trailing separator the original value had.
IMAGE_DIR = os.path.join(os.path.abspath("./data/images/"), "")

# All later relative paths (downloaded files, "../euro_team_data.csv") are
# resolved against this directory. The cell must be run from the project root;
# re-running it without restarting the kernel resolves the relative path
# against the already-changed working directory.
os.chdir(IMAGE_DIR)

In [3]:
# HTTP request headers identifying this scraper (tool name/version plus a
# contact address) to the servers it talks to.
headers = {}
headers["User-Agent"] = "UniversitySchoolProjectImageScraper/1.0 corkr933@student.liu.se"

In [4]:
def download_image(url, filename, timeout=30):
    """Download ``url`` and store the response body in ``filename``.

    The body is streamed into a temporary ``<filename>.part`` file which is
    renamed to ``filename`` only after the transfer completed, so a failed or
    interrupted download never leaves a truncated image under the final name.

    Args:
        url: Address of the image to fetch.
        filename: Destination path; a relative path is resolved against the
            current working directory.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.
            Without it a stalled server would block the call indefinitely.

    Returns:
        None. Success is reported with a printed message; request errors
        (``requests.exceptions.RequestException``) are printed and swallowed.
        Other exceptions, e.g. ``OSError`` while writing, propagate.
    """
    partial_path = f"{filename}.part"
    try:
        # Using the response as a context manager releases the streamed
        # connection even when the body is not read to the end.
        with requests.get(url, stream=True, headers=headers, timeout=timeout) as response:
            response.raise_for_status()

            with open(partial_path, 'wb') as f:
                for block in response.iter_content(1024):
                    # Skip empty chunks instead of aborting the download.
                    if block:
                        f.write(block)

        # Publish the file under its final name only once it is complete.
        os.replace(partial_path, filename)
        print(f"Image downloaded: {filename}")
    except requests.exceptions.RequestException as e:
        print(f"Error downloading image: {e}")
    finally:
        # After a successful rename the partial file no longer exists; after
        # any failure this removes the incomplete data.
        if os.path.exists(partial_path):
            os.remove(partial_path)

In [5]:
def extract_cc_info(file_url, timeout=30):
    """Look up attribution details for an image from its file page.

    Fetches ``https://en.wikipedia.org`` + ``file_url`` and takes the text of
    the first ``<a>`` element whose ``title`` attribute starts with ``User:``
    as the username.

    Args:
        file_url: Site-relative path of the file page (it is appended
            directly to the host name, so it should start with ``/``).
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Returns:
        A ``(username, license_info)`` tuple. ``username`` is ``"Unknown"``
        when no matching link is found or when the page could not be fetched
        (the error is printed, mirroring ``download_image``).
    """
    # NOTE(review): the licence is a fixed label, it is NOT read from the file
    # page. Images under a different licence will be labelled incorrectly.
    license_info = "CC_BY-SA_4.0"

    try:
        # Send the same identifying headers as the image downloads do.
        response = requests.get(
            f"https://en.wikipedia.org{file_url}",
            headers=headers,
            timeout=timeout,
        )
        # Do not try to extract a username from an HTTP error page.
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching file page {file_url}: {e}")
        return "Unknown", license_info

    soup = BeautifulSoup(response.content, 'html.parser')
    user_element = soup.find('a', title=lambda t: t and t.startswith('User:'))
    username = user_element.text.strip() if user_element else "Unknown"
    return username, license_info

In [39]:
def scrape_player_images(url, player_name):
    """Download the first image found on a player's page.

    The page at ``url`` is searched for ``<a class="mw-file-description">``
    links; the first one that contains an ``<img class="mw-file-element">``
    is used. The image is saved in the current working directory as
    ``<player>_<username>_<license>.jpg`` (spaces and characters that are
    invalid in file names replaced by ``_``), unless that file already exists.

    Args:
        url: Address of the page to scrape.
        player_name: Player name used in messages and in the file name; a
            ``" (captain)"`` marker is removed first.

    Returns:
        None. Progress and skip reasons are printed.
    """
    player_name = player_name.replace(" (captain)", "")

    # Identify the scraper and bound the wait, like the image download does.
    response = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')

    image_links = soup.find_all('a', class_='mw-file-description')
    if not image_links:
        print(f"No image found for {player_name}. Skipping...")
        return

    for image_link in image_links:
        image_tag = image_link.find('img', class_='mw-file-element')
        if not image_tag:
            continue

        image_url = image_tag['src']
        if image_url.startswith("//"):
            # Protocol-relative URL: make it absolute.
            image_url = "https:" + image_url

        file_url = image_link['href']
        username, license_info = extract_cc_info(file_url)

        # NOTE(review): the extension is always .jpg, whatever format the
        # source image actually has.
        filename = f"{player_name}_{username}_{license_info}.jpg"
        filename = filename.replace(" ", "_")
        filename = re.sub(r'[\\/*?:"<>|]', "_", filename)

        # Check the exact path download_image() writes to. Joining the
        # relative IMAGE_DIR here would be resolved against the working
        # directory (already changed to IMAGE_DIR) and never match, so every
        # image would be downloaded again on each run.
        if os.path.isfile(filename):
            print(f"Image for {player_name} already exists. Skipping...")
        else:
            download_image(image_url, filename)
        break  # only the first usable image is wanted

In [8]:
def scrape_team_images(url, team_name):
    """Download the first image found on a team's page.

    The page at ``url`` is searched for ``<a class="mw-file-description">``
    links; the first one that contains an ``<img class="mw-file-element">``
    is used. The image is saved in the current working directory as
    ``<team>_<username>_<license>.jpg`` (spaces and characters that are
    invalid in file names replaced by ``_``), unless that file already exists.

    Args:
        url: Address of the page to scrape.
        team_name: Team name used in messages and in the file name.

    Returns:
        None. Progress and skip reasons are printed.
    """
    # Identify the scraper and bound the wait, like the image download does.
    response = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')

    image_links = soup.find_all('a', class_='mw-file-description')
    if not image_links:
        print(f"No image found for {team_name}. Skipping...")
        return

    for image_link in image_links:
        image_tag = image_link.find('img', class_='mw-file-element')
        if not image_tag:
            continue

        image_url = image_tag['src']
        if image_url.startswith("//"):
            # Protocol-relative URL: make it absolute.
            image_url = "https:" + image_url

        file_url = image_link['href']
        username, license_info = extract_cc_info(file_url)

        # NOTE(review): the extension is always .jpg, whatever format the
        # source image actually has.
        filename = f"{team_name}_{username}_{license_info}.jpg"
        filename = filename.replace(" ", "_")
        filename = re.sub(r'[\\/*?:"<>|]', "_", filename)

        # Check the exact path download_image() writes to. Joining the
        # relative IMAGE_DIR here would be resolved against the working
        # directory (already changed to IMAGE_DIR) and never match, so every
        # image would be downloaded again on each run.
        if os.path.isfile(filename):
            print(f"Image for {team_name} already exists. Skipping...")
        else:
            download_image(image_url, filename)
        break  # only the first usable image is wanted

In [10]:
def scrape_images_from_csv(csv_file):
    """Scrape one image per row of a CSV file listing teams.

    Every row's ``Team`` value is turned into the address
    ``https://en.wikipedia.org/wiki/<Team>_national_football_team`` (spaces
    replaced by underscores) and handed to ``scrape_team_images``.

    Args:
        csv_file: Path of a CSV file readable by ``pandas.read_csv`` that has
            a ``Team`` column.
    """
    teams = pd.read_csv(csv_file)

    # Player images are currently not scraped. The disabled variant read the
    # 'Player' column, built https://en.wikipedia.org/wiki/<Player> and called
    # scrape_player_images instead.
    for _, record in teams.iterrows():
        team_name = record['Team']
        page_slug = team_name.replace(' ', '_')
        scrape_team_images(
            f"https://en.wikipedia.org/wiki/{page_slug}_national_football_team",
            team_name,
        )

In [11]:
# Relative path: it is resolved against the current working directory, which
# the setup cell changes to IMAGE_DIR.
csv_filename = "../euro_team_data.csv"

# Run the scrape for every team listed in the CSV.
scrape_images_from_csv(csv_file=csv_filename)

Image downloaded: Germany_DatBot_CC_BY-SA_4.0.jpg
Image downloaded: Scotland_Foxtrot1985_CC_BY-SA_4.0.jpg
Image downloaded: Hungary_Thommy_CC_BY-SA_4.0.jpg
Image downloaded: Switzerland_User_Marc_Mongenet_CC_BY-SA_4.0.jpg
Image downloaded: Spain_DatBot_CC_BY-SA_4.0.jpg
Image downloaded: Croatia_Inkwina_CC_BY-SA_4.0.jpg
Image downloaded: Italy_Unknown_CC_BY-SA_4.0.jpg
Image downloaded: Albania_Xhulianoo_CC_BY-SA_4.0.jpg
Image downloaded: Slovenia_Minorax_CC_BY-SA_4.0.jpg
Image downloaded: Denmark_Unknown_CC_BY-SA_4.0.jpg
Image downloaded: Serbia_DatBot_CC_BY-SA_4.0.jpg
Image downloaded: England_DatBot_CC_BY-SA_4.0.jpg
Image downloaded: Poland_Denelson83_CC_BY-SA_4.0.jpg
Image downloaded: Netherlands_DatBot_CC_BY-SA_4.0.jpg
Image downloaded: Austria_Anomie_CC_BY-SA_4.0.jpg
Image downloaded: France_Nathanlg94_CC_BY-SA_4.0.jpg
Image downloaded: Belgium_DatBot_CC_BY-SA_4.0.jpg
Image downloaded: Slovakia_S.A._Julio_CC_BY-SA_4.0.jpg
Image downloaded: Romania_RandyFitz_CC_BY-SA_4.0.jpg
Image d