In [8]:
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [46]:
def process_page_of_players(url, timeout=30):
    """
    This function takes a URL for a fifa index page and returns the list of players and their numeric ID

    Parameters
    ----------
    url : str
        Address of one page of the player listing.
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook forever.

    Returns
    -------
    list of (name, player_id) tuples
        ``name`` is the ``title`` attribute of the first link in a table row
        and ``player_id`` is the row's ``data-playerid`` attribute. The list
        is empty when the page does not exist (HTTP 404) or has no table body.

    Raises
    ------
    requests.HTTPError
        For any HTTP error status other than 404.
    """
    # first we get the page
    response = requests.get(url, timeout=timeout)

    # a missing page is treated as "no players here" rather than as a failure,
    # because callers page through the listing without knowing where it ends
    if response.status_code == 404:
        return []
    # any other error status is a real problem and should not be parsed as data
    response.raise_for_status()

    # then we find the table with the player names in it
    soup = BeautifulSoup(response.text, 'html.parser')
    table = soup.find('tbody')
    if table is None:
        return []

    # then we read each row of the table and extract the name and the player id and add it to our list
    players_and_ids = []
    for row in table.find_all('tr'):
        player_id = row.get('data-playerid')
        link = row.find('a')
        # rows without an id or without a link cannot describe a player, so skip them
        if player_id is None or link is None:
            continue
        players_and_ids.append((link.get('title'), player_id))

    # we finally return the list of players
    return players_and_ids

In [188]:
def process_skill_cards(soup):
    """
    Read every skill 'card' on a player page and return the skills as a dict.

    Parameters
    ----------
    soup : parsed page (or any tag) that contains the skill cards.

    Returns
    -------
    dict
        Maps skill name to its value. Numeric skills keep the text of their
        value span (a string such as '87'); characteristics that have no
        value span are stored with the integer 1, as a boolean flag.
    """
    # the skills are separated into 'cards' of similar skills, so we need to get each card and read the skills listed
    cards = soup.find_all('div', {'class':'col-12 col-md-4 item'})
    skill_lists = {}
    for card in cards:
        # once we've got a card we read each line of it separately
        for skill in card.find_all('p'):
            spans = skill.find_all('span')
            if spans:
                # some skills are numeric - e.g. Sprint Speed is 87
                # the value is read from the second span when there is one
                # (presumably the first is a wrapper - TODO confirm); a line
                # with a single span falls back to that span instead of
                # failing with an IndexError
                value_span = spans[1] if len(spans) > 1 else spans[0]
                skill_value = value_span.text
                # drop the value and the one character separating it from the name
                skill_name = skill.text[:-len(skill_value)-1]
            else:
                # but some are just characteristics - e.g good dribbler
                # for these guys we give them a value of 1, as it's a boolean classification
                skill_value = 1
                skill_name = skill.text
            skill_lists[skill_name] = skill_value

    return skill_lists

def process_main_card(soup):
    """
    Pull the headline attributes out of the main player card.

    The card is the second 'card mb-5' div on the page and every value is
    located by its position inside that card.

    Returns a dict with the keys rating, potential, age, attacking_work_rate,
    defensive_work_rate (all text taken from the page) plus weak_foot and
    skill_moves (integer counts of filled stars).
    """
    def first_span_text(tag):
        # text of the first span found inside the given tag
        return tag.findAll('span')[0].text

    def count_filled_stars(tag):
        # the "fas" in the class indicates a filled star, so counting them gives the score out of 5
        return len(tag.findAll('i', {'class':'fas fa-star fa-lg'}))

    # the main skills section is all custom markup, so each value is picked out individually
    player_card = soup.findAll('div', {'class':'card mb-5'})[1]
    detail_rows = player_card.findAll('p')

    # the first span of the heading holds the rating and the potential, separated by one space
    rating, potential = first_span_text(player_card.find('h5')).split(' ')

    # the age is just a number in the fifth row
    age = first_span_text(detail_rows[4])

    # work rates read [ATTACK WR] / [DEFENSE WR]; splitting on spaces leaves the slash in the middle, which is ignored
    work_rate_parts = first_span_text(detail_rows[6]).split(' ')
    attacking_work_rate, _, defensive_work_rate = work_rate_parts

    # weak foot and skill moves are shown as stars out of 5
    weak_foot = count_filled_stars(detail_rows[7])
    skill_moves = count_filled_stars(detail_rows[8])

    main_card = {}
    main_card['rating'] = rating
    main_card['potential'] = potential
    main_card['age'] = age
    main_card['attacking_work_rate'] = attacking_work_rate
    main_card['defensive_work_rate'] = defensive_work_rate
    main_card['weak_foot'] = weak_foot
    main_card['skill_moves'] = skill_moves
    return main_card

def process_player_page(player_url, timeout=30):
    """
    This function takes a url and gets the skills listed on the fifa stats player page

    Parameters
    ----------
    player_url : str
        Address of a single player's page.
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang a long scraping run.

    Returns
    -------
    pandas.Series
        The values from the main card followed by the values from the skill
        cards, indexed by name. If both sources use the same name, the skill
        card value wins.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    # get the web page
    response = requests.get(player_url, timeout=timeout)
    # fail here with a clear HTTP error rather than later while parsing an error page
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # extract the skill values
    skill_cards = process_skill_cards(soup)
    main_card = process_main_card(soup)
    combined_skills = {**main_card, **skill_cards}

    return pd.Series(combined_skills)
    

In [50]:
list_of_players_and_ids = []
# I don't know how many pages of players there are, so I just set a limit of 20
max_n_pages_to_check = 20

for i in range(1, max_n_pages_to_check + 1):
    epl_players_url = f'https://www.fifaindex.com/players/{i}/?league=13&order=desc'
    # We read each web page and extract the list of players
    new_players = process_page_of_players(epl_players_url)
    # A page with no players is treated as the end of the listing, so we stop asking for more
    if not new_players:
        break
    # And then update our list (extending in place avoids re-copying the whole list on every page)
    list_of_players_and_ids.extend(new_players)
    # Report the page only once it really has been downloaded and processed
    print(f'downloaded {i} of {max_n_pages_to_check}')

downloaded 1 of 20
downloaded 2 of 20
downloaded 3 of 20
downloaded 4 of 20
downloaded 5 of 20
downloaded 6 of 20
downloaded 7 of 20
downloaded 8 of 20
downloaded 9 of 20
downloaded 10 of 20
downloaded 11 of 20
downloaded 12 of 20
downloaded 13 of 20
downloaded 14 of 20
downloaded 15 of 20
downloaded 16 of 20
downloaded 17 of 20
downloaded 18 of 20
downloaded 19 of 20


In [192]:
# Now we've got the id's we can download individual player level data
# Make sure the output folder exists, otherwise saving the first CSV fails on a fresh checkout
os.makedirs('players', exist_ok=True)

n_players = len(list_of_players_and_ids)
# `player_id` rather than `id`, so the builtin id() is not shadowed for the rest of the notebook
for i, (name, player_id) in enumerate(list_of_players_and_ids):
    player_url = f'https://www.fifaindex.com/player/{player_id}'
    # Download and process the data
    player_data = process_player_page(player_url)

    # Save it as a CSV
    player_data.to_csv(rf'players/{name.replace(" ", "_")}.csv')

    # This bit just prints a status update every 10 players
    if i % 10 == 0:
        print(f'{i}  out of {n_players}')

0  out of 570
10  out of 570
20  out of 570
30  out of 570
40  out of 570
50  out of 570
60  out of 570
70  out of 570
80  out of 570
90  out of 570
100  out of 570
110  out of 570
120  out of 570
130  out of 570
140  out of 570
150  out of 570
160  out of 570
170  out of 570
180  out of 570
190  out of 570
200  out of 570
210  out of 570
220  out of 570
230  out of 570
240  out of 570
250  out of 570
260  out of 570
270  out of 570
280  out of 570
290  out of 570
300  out of 570
310  out of 570
320  out of 570
330  out of 570
340  out of 570
350  out of 570
360  out of 570
370  out of 570
380  out of 570
390  out of 570
400  out of 570
410  out of 570
420  out of 570
430  out of 570
440  out of 570
450  out of 570
460  out of 570
470  out of 570
480  out of 570
490  out of 570
500  out of 570
510  out of 570
520  out of 570
530  out of 570
540  out of 570
550  out of 570
560  out of 570
