In [55]:
# Load necessary libraries
suppressPackageStartupMessages({
    library(rvest)
    library(tidyverse)
    library(stringr)
    library(worldfootballR)
    library(purrr)
})

https://www.appsilon.com/post/webscraping-dynamic-websites-with-r

https://github.com/JaseZiv/worldfootballR/blob/main/R/player_transfer_history.R#L35

In [2]:
# Base URL of the Transfermarkt "most valuable players" ranking
base_url <- "https://www.transfermarkt.com/spieler-statistik/wertvollstespieler/marktwertetop"

# Number of ranking pages to scrape (8 pages for 200 players)
n_pages <- 8L

# Scrape one ranking page.
#
# Args:
#   url: Address of a single ranking page.
#
# Returns:
#   A data frame with the columns Player, Profile_Link and Market_Value,
#   holding one row per table row that has a non-empty player name.
scrape_ranking_page <- function(url) {
  rows <- read_html(url) %>% html_nodes("table.items tbody tr")

  # html_node() applied to a node set gives one result per row (NA after
  # html_text()/html_attr() when the cell is absent), so the vectors below
  # stay aligned row by row without an explicit loop.
  name_nodes <- rows %>% html_node("td.hauptlink a")
  names_found <- name_nodes %>% html_text(trim = TRUE)
  hrefs_found <- name_nodes %>% html_attr("href")
  values_found <- rows %>%
    html_node("td.rechts.hauptlink") %>%
    html_text(trim = TRUE)

  # Keep only rows where a player name exists
  keep <- !is.na(names_found) & names_found != ""
  site_root <- rep("https://www.transfermarkt.com", sum(keep))

  data.frame(
    Player = names_found[keep],
    # rep() above keeps paste0() zero-length when no row qualifies
    Profile_Link = paste0(site_root, hrefs_found[keep]),
    Market_Value = values_found[keep],
    stringsAsFactors = FALSE
  )
}

# Step 1: Scrape every page into a preallocated list (no vector growing)
page_results <- vector("list", n_pages)
for (page_num in seq_len(n_pages)) {
  # The first page has no query string; later pages are addressed by ?page=
  url <- if (page_num == 1) base_url else paste0(base_url, "?page=", page_num)
  page_results[[page_num]] <- scrape_ranking_page(url)

  # Optional: Print progress
  print(paste("Page", page_num, "completed"))
}

# Step 2: Combine into a Data Frame
scraped_rows <- bind_rows(page_results)

# Plain vectors kept for the later cells that read them (e.g. profile_links)
players <- scraped_rows$Player
profile_links <- scraped_rows$Profile_Link
market_values <- scraped_rows$Market_Value

# Step 3: Clean the data (there are duplicates for each player but
# Market_Value is NA for the duplicate rows)
top_players <- scraped_rows %>%
  filter(!is.na(Market_Value))

[1] "Page 1 completed"
[1] "Page 2 completed"
[1] "Page 3 completed"
[1] "Page 4 completed"
[1] "Page 5 completed"
[1] "Page 6 completed"
[1] "Page 7 completed"
[1] "Page 8 completed"


Next, we write two functions that extract the relevant data we want.

In [3]:
# Look up a player's earliest recorded transfer and return the club he left
# together with the destination country of that move.
#
# Args:
#   player_url: Transfermarkt profile URL, passed on to
#     tm_player_transfer_history().
#
# Returns:
#   A table with exactly one row and the columns team_from and country_to.
#   Both values are NA when no transfer history is available, so the results
#   stay aligned one-to-one with the input URLs when this function is mapped
#   over many players.
get_first_club_and_country <- function(player_url) {
    # Get the transfer history
    transfer_history <- tm_player_transfer_history(player_url = player_url)

    # No history at all: return an NA placeholder instead of a zero-row result,
    # which would silently disappear when the per-player results are stacked
    if (is.null(transfer_history) || nrow(transfer_history) == 0) {
        return(tibble(team_from = NA_character_, country_to = NA_character_))
    }

    # Fallback for a missing country_to: the first non-NA country_from in the
    # order the history was returned, or NA when every country_from is missing
    known_countries <- transfer_history$country_from[!is.na(transfer_history$country_from)]
    fallback_country <- if (length(known_countries) > 0) {
        known_countries[1]
    } else {
        NA_character_
    }

    # Fill the gaps, then keep only the earliest transfer by date
    transfer_history %>%
        mutate(country_to = ifelse(is.na(country_to), fallback_country, country_to)) %>%
        arrange(transfer_date) %>%
        slice(1) %>%
        select(team_from, country_to)
}

# Fetch a player's Transfermarkt bio and keep only the fields used later on.
#
# Args:
#   player_url: Transfermarkt profile URL, passed on to tm_player_bio().
#
# Returns:
#   The bio reduced to the columns citizenship, position, foot and
#   current_club, in that order.
get_player_info <- function(player_url) {
    wanted_fields <- c("citizenship", "position", "foot", "current_club")

    tm_player_bio(player_url = player_url) %>%
        select(all_of(wanted_fields))
}

In [4]:
# profile_links contains repeated URLs; keep the first occurrence of each one
unique_profile_urls <- profile_links[!duplicated(profile_links)]

In [5]:
# Call each helper once per unique profile URL, then stack the per-player
# results row-wise into one data frame per helper
first_club_data <- unique_profile_urls %>%
    map(get_first_club_and_country) %>%
    bind_rows()
player_info_data <- unique_profile_urls %>%
    map(get_player_info) %>%
    bind_rows()

The next step is to join the data. We use the player URLs as the join key so that the datasets are matched correctly.

In [70]:
# Tag each per-player table with the URL it was fetched for. The rows are in
# the same order as unique_profile_urls, so the URLs are attached by position.
attach_profile_link <- function(tbl) {
    tbl %>% mutate(Profile_Link = unique_profile_urls)
}

first_club_data <- attach_profile_link(first_club_data)
player_info_data <- attach_profile_link(player_info_data)

# Left-join the bio first and the first-club data second onto top_players,
# always matching on Profile_Link
final_data <- reduce(
    list(top_players, player_info_data, first_club_data),
    function(joined, extra) left_join(joined, extra, by = "Profile_Link")
)

Only two players have NA in the column `country_to`. Further investigation shows that this happens when a player has only ever been associated with clubs in a single country. The two players in the dataframe are Lamine Yamal and Gavi, both of whom have only been associated with Barcelona.

Thus we fill in these values manually.

In [71]:
# Set country_to to Spain for the two Barcelona-only players; every other
# row keeps its existing value
final_data <- final_data %>%
    mutate(
        country_to = if_else(
            Player %in% c("Gavi", "Lamine Yamal"),
            "Spain",
            country_to
        )
    )

After looking through the dataframe, I found that some values in `country_to` are incorrect. This comes from the raw data, so we need to correct them manually by creating a club-to-country mapping.

In [72]:
# Manual club-to-country mapping: the two vectors are matched by position,
# i.e. the n-th club in team_from belongs to the n-th country in country_to
club_country_mapping <- data.frame(
    team_from = c("Godset Yth.", "IDV U20", "Tháder Youth", "Argentinos U20", "River Plate U20", "CA Atalaya",
                  "NK Tresnjevka", "Videoton Jgd.", "Ult. Strikers", "Sporting Yth.", "IFK AT U19", "ACBB Youth",
                  "Mariadorf Yth.", "Rentistas U19", "Peñarol Yth.", "Belgrano U20", "Hörsholm U IK", "Desportivo U20",
                  "Beerschot Yth", "IF Bromma U17", "Avaí U20", "São Paulo U17", "Real Madrid U18", "Standard Liège",
                  "Espanyol Youth", "Benfica Yth", "Casa Sports", "WSV '30 Yth.", "Vianense For.", "Zulte W. Youth",
                  "Zeeburgia Yth.", "Atlético Yth.", "Reading Yth.", "NK Krsko U17", "Desportivo U20", "Right to Dream",
                  "Coritiba U20", "Newell's U20", "CA Fénix U19", "Whitecaps Res", "Frontale Jgd.", "AZ Alkmaar Yout", 
                  "Paris FC Yth.", "VAFC U19", "PSG Youth", "Gondomar SC CJ", "PA Classics", "SV Sallern Jgd.", "Real Sapphire", 
                  "Génération Foot", "Rayo U19", "Ottawa ISC", "Frontale U18", "Dooryong ES", "Gagra Acad.", "FCK Youth", "Pancratius Yth.", 
                  "Abcoude Yth.", "Wattensch. Yth.", "Spartaan Yth.", "OS Abobo", "St. Kevins Boys", "Anderlecht Yth.", "OH Leuven Yth.", "Salitas", 
                  "Tottenham Yth.", "FC Porto Youth", "Recife U17", "Málaga CF Youth", "J3 Amilly Jgd.", "Cornellà Youth", "GC Zürich Jgd.", "Alta Lisboa S15", 
                  "Valencia CF You", "PSV Mannh. Yth.", "Club Brugge Yth"),
    
    country_to = c("Norway", "Ecuador", "Spain", "Argentina", "Argentina", "Argentina", "Croatia",
                   "Hungary", "Nigeria", "Portugal", "Sweden", "France", "Germany", "Uruguay", "Uruguay",
                   "Argentina", "Denmark", "Brazil", "Belgium", "Sweden", "Brazil", "Brazil", "Spain", "Belgium",
                   "Spain", "Portugal", "Senegal", "Netherlands", "Portugal", "Belgium", "Netherlands", "Spain",
                   "England", "Slovenia", "Brazil", "Ghana", "Brazil", "Argentina", "Uruguay", "Canada", "Japan",
                   "Netherlands", "France", "France", "France", "Portugal", "USA", "Germany", "Nigeria", "Senegal", 
                   "Spain", "Canada", "Japan", "Korea, South", "Georgia", "Denmark", "Netherlands", "Netherlands", "Germany",
                   "Netherlands", "Ivory Coast", "Ireland", "Belgium", "Belgium", "Burkina Faso", "England", "Portugal", "Brazil", 
                   "Spain", "France", "Spain", "Switzerland", "Portugal", "Spain", "Germany", "Belgium")
) %>%
    # Clean whitespace so the keys compare equal to the trimmed player data
    mutate(team_from = trimws(team_from)) %>%
    # "Desportivo U20" is listed twice above (both times as Brazil). Keep one
    # row per club/country pair so the join cannot duplicate player rows.
    distinct()

# Join with the mapping and update `country_to`
final_data <- final_data %>%
    mutate(team_from = trimws(team_from)) %>%
    left_join(
        club_country_mapping,
        by = "team_from",
        suffix = c("", "_mapped"),
        # Each club must map to a single country; a conflicting duplicate in
        # the mapping now raises an error instead of multiplying rows
        relationship = "many-to-one"
    ) %>%
    mutate(
        # Always prioritize the manually mapped value when one exists
        country_to = coalesce(country_to_mapped, country_to)
    ) %>%
    select(-country_to_mapped)  # Remove the temporary mapped column

We remove any dual citizenship that is displayed, so that this column represents the country each player plays for. After that, we rename the columns and then save the dataframe.

In [73]:
# Return the text before the first space of each string
first_space_token <- function(x) {
    map_chr(str_split(x, " "), function(parts) parts[1])
}

# Reduce citizenship to its first space-separated token; values without a
# space are left untouched.
# NOTE(review): only the text before the first space is kept, so a country
# name that itself contains a space would be cut to its first word - confirm
# how tm_player_bio() separates multiple citizenships.
final_data <- final_data %>%
    mutate(
        citizenship = ifelse(
            str_detect(citizenship, " "),
            first_space_token(citizenship),
            citizenship
        )
    )

# Rename all nine columns by position
new_column_names <- c(
    "player", "url", "mv", "citizenship", "position",
    "foot", "current_club", "youth_club", "youth_club_country"
)
names(final_data) <- new_column_names

In [79]:
# Number of distinct youth-club countries (NA, if present, counts as one value)
n_distinct(final_data$youth_club_country)