In [36]:
import requests
from bs4 import BeautifulSoup
import urllib.request
import pandas as pd
import re

In [5]:
import os 
IMAGE_DIR = "./data/images/"
os.chdir(IMAGE_DIR)

In [33]:
headers = {
    "User-Agent": "UniversitySchoolProjectImageScraper/1.0 corkr933@student.liu.se"
}

In [34]:
def download_image(url, filename):
    try:
        response = requests.get(url, stream=True, headers=headers)
        response.raise_for_status()  

        with open(filename, 'wb') as f:
            for block in response.iter_content(1024):
                if not block:
                    break
                f.write(block)

        print(f"Image downloaded: {filename}")
    except requests.exceptions.RequestException as e:
        print(f"Error downloading image: {e}")

In [30]:
def extract_cc_info(file_url):
    response = requests.get(f"https://en.wikipedia.org{file_url}")
    soup = BeautifulSoup(response.content, 'html.parser')
    user_element = soup.find('a', title=lambda t: t and t.startswith('User:'))
    username = user_element.text.strip() if user_element else "Unknown"
    license_info = "CC_BY-SA_4.0"
    return username, license_info

In [39]:
def scrape_images(url, player_name):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    image_links = soup.find_all('a', class_='mw-file-description')

    if not image_links:  
        print(f"No image found for {player_name}. Skipping...")
        return

    for image_link in image_links:
        image_tag = image_link.find('img', class_='mw-file-element')
        if image_tag:
            image_url = image_tag['src']
            if image_url.startswith("//"):
                image_url = "https:" + image_url 
            
            file_url = image_link['href']
            username, license_info = extract_cc_info(file_url)

            filename = f"{player_name}_{username}_{license_info}.jpg"
            filename = filename.replace(" ", "_")  
            filename = re.sub(r'[\\/*?:"<>|]', "_", filename)
            if os.path.isfile(os.path.join(IMAGE_DIR, filename)):
                print(f"Image for {player_name} already exists. Skipping...")
            else:
                download_image(image_url, filename)
            break  # Exit the loop after downloading the first image

In [26]:
def scrape_images_from_csv(csv_file):
    df = pd.read_csv(csv_file)

    for _, row in df.iterrows():
        player_name = row['Player']
        url_to_scrape = f"https://en.wikipedia.org/wiki/{player_name.replace(' ', '_')}"  
        scrape_images(url_to_scrape, player_name)

In [38]:
csv_filename = "../euro_player_data.csv"  
scrape_images_from_csv(csv_filename)

Image downloaded: Bernd_Leno_Granada_CC_BY-SA_4.0.jpg
Image downloaded: Marc-André_ter_Stegen_Unknown_CC_BY-SA_4.0.jpg
Image downloaded: Oliver_Baumann_Silesia711_CC_BY-SA_4.0.jpg
Image downloaded: Antonio_Rüdiger_Granada_CC_BY-SA_4.0.jpg
No image found for David Raum. Skipping...
Image downloaded: Jonathan_Tah_Unknown_CC_BY-SA_4.0.jpg
Image downloaded: Joshua_Kimmich_Steffen_Prößdorf_CC_BY-SA_4.0.jpg
No image found for Robin Koch. Skipping...
Image downloaded: Waldemar_Anton_Jeollo_CC_BY-SA_4.0.jpg
Image downloaded: Maximilian_Mittelstädt_Steindy_CC_BY-SA_4.0.jpg
Image downloaded: Benjamin_Henrichs_Fuguito_CC_BY-SA_4.0.jpg
Image downloaded: Pascal_Groß_FlickreviewR_2_CC_BY-SA_4.0.jpg
Image downloaded: Toni_Kroos_Unknown_CC_BY-SA_4.0.jpg
Image downloaded: Jamal_Musiala_ArsenalGhanaPartey_CC_BY-SA_4.0.jpg
Image downloaded: Chris_Führich_Jeollo_von_VfB-exklusiv.de_CC_BY-SA_4.0.jpg
Image downloaded: Florian_Wirtz_Pyaet_CC_BY-SA_4.0.jpg
No image found for İlkay Gündoğan (captain). Skipping