In [1]:
# import relevant packages and modules
import time

import pandas as pd
import pygetwindow as gw
from selenium import webdriver
from selenium.common.exceptions import (
    InvalidSelectorException,
    NoSuchElementException,
    NoSuchFrameException,
    TimeoutException,
)
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [2]:
# Start a Chrome session through a local chromedriver binary and load the
# alumni page. NOTE: the driver path is an absolute, machine-specific
# location and must be adjusted when running on another machine.
chrome_driverpath = "C:/chromedriver-win64/chromedriver.exe"
website1 = "https://www.transfermarkt.com/manchester-united/alumni/verein/985#google_vignette"

service = Service(executable_path=chrome_driverpath)
driver1 = webdriver.Chrome(service=service)

# Load the page first, then maximise the browser window.
driver1.get(website1)
driver1.maximize_window()

In [3]:
# Dismiss the consent dialog, which is rendered inside an iframe.
# `wait` polls for up to 10 seconds per condition.
wait = WebDriverWait(driver1, 10)

try:
    # The consent iframe is located by an id containing "sp_message"
    iframe = wait.until(EC.presence_of_element_located((By.XPATH, '//iframe[contains(@id, "sp_message")]')))
    driver1.switch_to.frame(iframe)
    print("✅ Switched to iframe.")

    # Wait for the Accept button inside the iframe and click it
    accept_btn = wait.until(EC.element_to_be_clickable((By.XPATH, '//button[contains(text(), "Accept All") or contains(text(), "Accept")]')))
    accept_btn.click()
    print("✅ Accept button clicked.")

except TimeoutException:
    print("⚠️ Timed out waiting for iframe or accept button — maybe already accepted or not shown.")

finally:
    # Always return to the top-level document. Previously this only happened
    # on the success path, so a timeout on the Accept button (after the frame
    # switch had already succeeded) left the driver inside the iframe and
    # every later element lookup searched the wrong document.
    driver1.switch_to.default_content()
    print("✅ Switched back to main content.")

# Continue scraping here...


✅ Switched to iframe.
✅ Accept button clicked.
✅ Switched back to main content.


In [8]:
# Click the second tab label inside the "tm-tabs" container once it is
# clickable (the message printed below indicates this is the detailed view).
detailed_tab_locator = (By.XPATH, '(//div[@class="tm-tabs"]/a/div/span)[2]')
detailed_button = wait.until(EC.element_to_be_clickable(detailed_tab_locator))
detailed_button.click()
print("✅ On Detailed Page")

✅ On Detailed Page


In [9]:
def close_google_ad_popup(driver, timeout=10):
    """Try to dismiss a pop-up rendered inside one of the page's iframes.

    Every ``<iframe>`` found on the current page is entered in turn and
    searched for a clickable close/dismiss control. The first one found is
    clicked.

    Args:
        driver: Selenium WebDriver instance controlling the page.
        timeout: Seconds to wait for *each* selector inside *each* iframe.
            The worst case is therefore roughly
            ``timeout * len(selectors) * number_of_iframes`` seconds.

    Returns:
        True if a dismiss control was clicked, False if none was found or if
        any unexpected error occurred (the error is printed, not raised).

    The driver is always switched back to the top-level document before
    returning.
    """
    # Candidate locators for a close control, tried in order in each iframe.
    dismiss_selectors = [
        (By.XPATH, '//div[contains(@class, "close") or contains(@class, "dismiss")]'),
        # The union is parenthesised so the text filter applies to both
        # <button> and role="button" elements. Without the parentheses the
        # predicate only bound to the <div> branch and a bare `//button`
        # matched every button in the frame.
        (By.XPATH, '(//button | //div[@role="button"])[contains(text(), "Close") or contains(text(), "Dismiss")]'),
    ]

    try:
        wait = WebDriverWait(driver, timeout)
        iframes = driver.find_elements(By.TAG_NAME, "iframe")

        for iframe in iframes:
            try:
                driver.switch_to.frame(iframe)

                for by, selector in dismiss_selectors:
                    try:
                        dismiss_btn = wait.until(EC.element_to_be_clickable((by, selector)))
                        dismiss_btn.click()
                        # The `finally` below restores the default content.
                        return True
                    except TimeoutException:
                        # Nothing matched this selector in time; try the next.
                        continue

                # Leave this iframe before entering the next one.
                driver.switch_to.default_content()
            except NoSuchFrameException:
                # The frame is no longer available; skip it. This name must be
                # imported from selenium.common.exceptions, otherwise reaching
                # this clause raises NameError instead of skipping the frame.
                driver.switch_to.default_content()
                continue
        return False
    except Exception as e:
        # Best-effort helper: report the problem and let the caller carry on.
        print(f"⚠️ Error closing pop-up: {str(e)}")
        return False
    finally:
        driver.switch_to.default_content()

In [10]:
# One independent, initially empty accumulator list per scraped column.
all_names = []
all_positions = []
all_countries = []
all_dobs = []
all_teams = []

In [11]:
# Number of result pages to visit (the loop runs for pages 1..MAX_PAGES).
MAX_PAGES = 49
# Fixed pause, in seconds, after clicking "next" before the following page
# is scraped.
PAGE_DELAY_SECONDS = 20

# The wait object does not change between pages, so build it once.
wait = WebDriverWait(driver1, 10)

for page in range(1, MAX_PAGES + 1):
    print(f"On page {page}")

    # If an ad overlay blocks the page, close_google_ad_popup(driver1) can be
    # called here before scraping.

    # Scrape data
    try:
        # Player names
        player_names = driver1.find_elements(By.XPATH, '//tbody/tr/td[1]/table[1][@class="inline-table"]/tbody/tr/td[2]')
        # Position
        player_positions = driver1.find_elements(By.XPATH, '//tbody/tr/td[1]/table[1][@class="inline-table"]/tbody/tr[2]/td')
        # Country (read from the flag image's title attribute)
        player_countries = driver1.find_elements(By.XPATH, '//table[@class="items"]/tbody/tr/td[5]/img')
        # Date of birth
        player_dobs = driver1.find_elements(By.XPATH, '//*[@id="yw1"]/table/tbody/tr/td[3]')
        # Teams
        player_teams = driver1.find_elements(By.XPATH, '//table[@class="items"]/tbody/tr/td[8]')

        # Read every value BEFORE touching the accumulators. Reading .text can
        # raise (e.g. if the page re-renders); extending the global lists one
        # by one while reading would leave them with different lengths.
        page_names = [n.text for n in player_names]
        page_positions = [p.text for p in player_positions]
        page_countries = [c.get_attribute('title') for c in player_countries]
        page_dobs = [d.text for d in player_dobs]
        page_teams = [t.text for t in player_teams]

        # Warn immediately if the columns of this page do not line up, rather
        # than discovering it only after all pages have been scraped.
        page_counts = {
            'names': len(page_names),
            'positions': len(page_positions),
            'countries': len(page_countries),
            'dobs': len(page_dobs),
            'teams': len(page_teams),
        }
        if len(set(page_counts.values())) != 1:
            print(f"⚠️ Page {page}: column counts differ {page_counts}; rows may be misaligned.")

        # Append data (nothing below can fail part-way)
        all_names.extend(page_names)
        all_positions.extend(page_positions)
        all_countries.extend(page_countries)
        all_dobs.extend(page_dobs)
        all_teams.extend(page_teams)

        print(f"✅ Page {page} scraped: {len(player_names)} players.")
    except Exception as e:
        print(f"❌ Error extracting data on page {page}: {e}")

    # Navigate to next page
    try:
        next_button = wait.until(EC.element_to_be_clickable((By.XPATH, '//a[@title="Go to the next page"]')))
        # Click through JavaScript rather than WebElement.click().
        driver1.execute_script("arguments[0].click();", next_button)
    except Exception as e:
        # No usable "next" link: stop paginating.
        print(f"❌ Next button error on page {page}: {e}")
        break

    time.sleep(PAGE_DELAY_SECONDS)

On page 1
✅ Page 1 scraped: 25 players.
On page 2
✅ Page 2 scraped: 25 players.
On page 3
✅ Page 3 scraped: 25 players.
On page 4
✅ Page 4 scraped: 25 players.
On page 5
✅ Page 5 scraped: 25 players.


In [64]:
# Build the DataFrame only when all five scraped columns have the same length;
# otherwise report the individual lengths and leave `df` untouched.
scraped_columns = {
    'Name': all_names,
    'Position': all_positions,
    'Country': all_countries,
    'DOB': all_dobs,
    'Team': all_teams,
}
distinct_lengths = {len(values) for values in scraped_columns.values()}

if len(distinct_lengths) != 1:
    print("❌ Error: Lists have unequal lengths, cannot create DataFrame")
    print(f"Lengths: Names={len(all_names)}, Positions={len(all_positions)}, Countries={len(all_countries)}, DOBs={len(all_dobs)}, Teams={len(all_teams)}")
else:
    df = pd.DataFrame(scraped_columns)
    print("✅ DataFrame created successfully")
    print(df.head())

✅ DataFrame created successfully
                Name          Position   Country           DOB  \
0  Cristiano Ronaldo    Centre-Forward  Portugal   Feb 5, 1985   
1     Raphaël Varane       Centre-Back    France  Apr 25, 1993   
2         Paul Pogba  Central Midfield    France  Mar 15, 1993   
3       Jadon Sancho       Left Winger   England  Mar 25, 2000   
4       Gerard Piqué       Centre-Back     Spain   Feb 2, 1987   

                                   Team  
0                            First Team  
1                            First Team  
2  First Team/Reserves Team/Youth Teams  
3                            First Team  
4                            First Team  


In [63]:
# Print the number of collected values per column, in the order:
# names, positions, countries, dates of birth, teams.
for column_values in (all_names, all_positions, all_countries, all_dobs, all_teams):
    print(len(column_values))

1225
1225
1225
1225
1225


In [66]:
def has_duplicates(lst):
    """Return True if ``lst`` contains at least one repeated element.

    Elements must be hashable, because the check compares the length of the
    sequence with the number of distinct elements in it.
    """
    total_count = len(lst)
    distinct_count = len(set(lst))
    return total_count > distinct_count
# Check whether any player name occurs more than once in the scraped list.
print(has_duplicates(all_names)) # Output: True

True


In [72]:
# Flag every name that already appeared in an earlier row, then count the flags.
name_is_repeat = df["Name"].duplicated()
name_is_repeat.sum()

7

In [74]:
# Show every row whose Name is exactly 'Cristiano Ronaldo'.
df.loc[df["Name"] == 'Cristiano Ronaldo']

Unnamed: 0,Name,Position,Country,DOB,Team
0,Cristiano Ronaldo,Centre-Forward,Portugal,"Feb 5, 1985",First Team


In [77]:
# Write the scraped table to a CSV file in the working directory, without the
# DataFrame index column.
df.to_csv(
    "MUN_Players_Alumni_May2025.csv",
    index=False,
)