# **Scrape Teams Data (from fifaindex.com)**

**OBJECTIVE**: Using the BeautifulSoup package, scrape football teams' FIFA attributes from the fifaindex.com website (only the five major European leagues). 

#### Imports

In [None]:
from bs4 import BeautifulSoup 
import requests
import re
import time
from random import uniform
from tqdm.notebook import tqdm_notebook
import json
from random import choice
from urllib3.util import Retry
from requests.adapters import HTTPAdapter
import urllib3.exceptions
import os
import pandas as pd

## **Building Functions**

#### Make Requests

This function makes each call with a randomly chosen set of headers and waits a random sleep time before it (good practice in web scraping applications). The function also handles some of the possible exceptions that might be raised. 

In [None]:
def make_request(pageURL):
    """Fetch a page politely: random delay, random browser headers and retries.

    A random pause of up to 2.25 seconds precedes every call and one of several
    browser-like header sets is picked at random (good practice in web scraping
    applications). Failed connections are retried with back-off before giving up.

    Parameters:
    - pageURL: web page's URL

    Returns:
    - the ``requests.Response`` obtained for ``pageURL``. If the request still
      fails after the retries, a description of the failure is appended to the
      global ``errors`` list and an empty ``requests.Response`` (whose ``.text``
      is ``''``) is returned instead of raising.
    """
    global errors
    headers_list = [
          {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:77.0) Gecko/20100101 Firefox/77.0",
           "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
           "Accept-Language": "en-US,en;q=0.5",
           "Referer": "https://www.google.com/",
           "DNT": "1",
           "Connection": "keep-alive",
           "Upgrade-Insecure-Requests": "1"
           },
          {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0",
               "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
               "Accept-Language": "en-US,en;q=0.5",
               "Accept-Encoding": "gzip, deflate, br",
               "Referer": "https://www.google.com/",
               "DNT": "1",
               "Connection": "keep-alive",
               "Upgrade-Insecure-Requests": "1"
           },
          {"Connection": "keep-alive",
               "DNT": "1",
               "Upgrade-Insecure-Requests": "1",
               "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
               "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
               "Sec-Fetch-Site": "none",
               "Sec-Fetch-Mode": "navigate",
               "Sec-Fetch-Dest": "document",
               "Referer": "https://www.google.com/",
               "Accept-Encoding": "gzip, deflate, br",
               "Accept-Language": "en-GB,en-US;q=0.9,en;q=0.8"
           },
          {"Connection": "keep-alive",
               "Upgrade-Insecure-Requests": "1",
               "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
               "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
               "Sec-Fetch-Site": "same-origin",
               "Sec-Fetch-Mode": "navigate",
               "Sec-Fetch-User": "?1",
               "Sec-Fetch-Dest": "document",
               "Referer": "https://www.google.com/",
               "Accept-Encoding": "gzip, deflate, br",
               "Accept-Language": "en-US,en;q=0.9"
           }]

    # Random pause so that consecutive calls do not hammer the server.
    time.sleep(uniform(0, 2.25))
    headers = choice(headers_list)
    retry = Retry(total=8, connect=8, backoff_factor=2)

    try:
        # The context manager closes the session's pooled connections once the
        # (fully downloaded) response has been obtained.
        with requests.Session() as session:
            session.mount('https://', HTTPAdapter(max_retries=retry))
            return session.get(pageURL, headers=headers, timeout=12)
    except urllib3.exceptions.MaxRetryError:
        errors.append('requests.packages.urllib3.exceptions.MaxRetryError')
    except requests.ConnectionError:
        errors.append('requests.ConnectionError')
    except urllib3.exceptions.ConnectionError:
        errors.append('urllib3.exceptions.ConnectionError')
    except Exception as ex:
        # Deliberate best-effort boundary: one bad page must not abort a long
        # scraping run, so the failure is logged in ``errors`` instead.
        errors.append(str(ex))

    # Placeholder with an empty body: callers can still read ``.text`` without
    # this function issuing a second, unguarded network request.
    return requests.Response()

#### Get All Pages

Given a list of the seasons and leagues to consider, this function builds the URLs of the specific fifaindex.com webpages to scrape from.

In [None]:
def get_allpages(seasons, championship):
    """Build the fifaindex.com team-listing URL for every season/league pair.

    The URLs are added to the global ``all_pages`` set, which is also returned.

    Parameters:
    - seasons: list formatted considering fifaindex.com season type
    - championship: list formatted considering fifaindex.com league type
    """
    global all_pages
    base_url = 'https://www.fifaindex.com/teams/'
    all_pages.update(
        base_url + season + '/?' + league + '&order=desc'
        for season in seasons
        for league in championship
    )
    return all_pages

#### Get Teams' Links

Given a webpage's URL, this function returns the URLs of all the teams listed on it.

In [None]:
def get_teamlinks(page_url):
    """Collect the team profile links found on a fifaindex.com teams page.

    Every link is rewritten so that it ends with the season segment of
    ``page_url`` (e.g. ``/fifa15_17/``) and is added to the global
    ``teams_links`` set.

    Parameters:
    - page_url: page URL; it must contain a ``/fifa.../`` season segment

    Returns:
    - the global ``teams_links`` set, updated in place

    Raises:
    - IndexError: if ``page_url`` contains no ``/fifa.../`` segment
    """
    global teams_links
    # Season segment of the listing page, reused as the ending of every link.
    right_ending = re.findall(r'/fifa.*/', page_url)[0]
    html = make_request(pageURL=page_url)
    bs = BeautifulSoup(html.text, 'lxml')
    # The href filter guarantees every returned anchor has a '/team/...' href.
    for link in bs.find_all('a', href=re.compile(r'^/team/')):
        # Normalise first and let the set de-duplicate. Testing the raw href
        # against a set of normalised links could wrongly skip a link.
        teams_links.add(re.sub(r'/fifa.*/$', right_ending, link.attrs['href']))
    return teams_links


#### Scrape Team's Attributes

Given a specific team webpage, this function returns the desired information using the BeautifulSoup package.

In [None]:
def scrape_team(website_page):
    """Scrape one fifaindex.com team profile into a dictionary.

    Parameters:
    - website_page: team profile on fifaindex.com

    Returns:
    - dict that always holds 'TeamName', 'TeamRoster' and 'LoanedPlayers', plus
      'ObservationDate', 'RivalTeam', 'TransferBudget' and one entry per scraped
      attribute whenever the corresponding element is present on the page.

    Raises:
    - ValueError: if the page has no <h1> heading (it is not a team profile),
      if the transfer budget is not a number, or if there are fewer attribute
      values than attribute labels.
    """
    # Request
    team_html = make_request(pageURL=website_page)
    soup = BeautifulSoup(team_html.text, 'lxml')
    if soup.h1 is None:
        # Signal an unusable page with the exception type the scraping driver
        # handles, instead of an AttributeError on ``None``.
        raise ValueError('No team profile found at ' + website_page)

    # Team information
    dd = {}
    dd['TeamName'] = re.sub(' FIFA.+', '', soup.h1.text).strip()  # team name
    # Observation Date
    for date in soup.select('.dropdown:nth-child(3) .dropdown-toggle'):
        dd['ObservationDate'] = date.text.strip()
    # Rival team (check the tag itself: ``find`` returns None when it is absent)
    rival_team = soup.find('a', class_='link-team')
    if rival_team is not None:
        dd['RivalTeam'] = rival_team.text
    # Team attributes
    for i in range(2, 5):
        for info in soup.select('.list-group-item:nth-child(' + str(i) + ')'):
            match = re.match(r"(\D+)(\d{1,2})", info.text, re.I)
            if match is not None:  # skip items that are not "<label><number>"
                dd[match.group(1)] = match.group(2)
    # Transfer budget
    transfer_budget = soup.find('span', class_='data-currency-euro')
    if transfer_budget is not None:
        dd['TransferBudget'] = int(re.sub(r'[.€]', '', transfer_budget.text))
    # Team Attributes + Players Roles
    labels = [re.sub(r'\d', '', value.text).strip()
              for value in soup.select('.card-body p')]
    for i in range(1, min(3, len(labels))):  # exception in 2010-2018 period
        if labels[i] == 'Passing':
            labels[i] = 'BuildupPassing'
    if len(labels) > 1 and labels[1] == 'Width':  # exception in 2019- period
        labels[1] = 'DefensiveWidth'
    values = [vv.text for vv in soup.select('.card-body .float-right')]
    if len(values) < len(labels):
        raise ValueError('Fewer attribute values than labels at ' + website_page)
    for label, value in zip(labels, values):
        # Plain substring operations: the scraped value is text, not a regex.
        if value in label:
            dd[label.replace(value, '').strip()] = value
        else:
            dd[label] = value
    # Team Players + Loaned Players
    dd['TeamRoster'] = [player.text
                        for player in soup.select('td:nth-child(6) .link-player')]
    dd['LoanedPlayers'] = [loaned.text
                           for loaned in soup.select('td:nth-child(4) .link-player')]

    return dd

## **Final Functions**

In [None]:
def Open_Create_file(path, seas, champ):
    """Load the cached team profile links, or scrape them and create the cache.

    Parameters:
    - path: JSON file holding the links; if it already exists it is simply
      loaded, otherwise the links are scraped and written to it
    - seas: seasons formatted list
    - champ: leagues formatted list

    Returns:
    - list with all teams' profile links
    """
    global all_pages
    if os.path.isfile(path):
        # Read with the same encoding the cache is written with below.
        with open(path, encoding='utf-8') as j_file:
            return json.load(j_file)

    get_allpages(seasons=seas, championship=champ)
    scraped_links = set()  # stays empty when there is no page to visit
    for page in tqdm_notebook(all_pages, total=len(all_pages)):
        scraped_links = get_teamlinks(page)
    # A sorted list gives a reproducible cache file and the same return type
    # as the branch that loads the cache.
    all_teams_pages = sorted(scraped_links)
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(all_teams_pages, f, ensure_ascii=False, indent=4)
    return all_teams_pages

In [None]:
def get_final_list(ALL_team_player_pages):
    """Scrape every team profile and collect the results in a list.

    Parameters:
    - ALL_team_player_pages: every team profile link (relative to
      https://www.fifaindex.com)

    Returns:
    - list with one entry per link, in the same order: the dictionary returned
      by ``scrape_team`` or, when the page could not be parsed,
      ``{'ERROR': <link>}``
    """
    final_list = []
    for team_page in tqdm_notebook(ALL_team_player_pages, total=len(ALL_team_player_pages)):
        try:
            team_info = scrape_team('https://www.fifaindex.com' + team_page)
        except (ValueError, AttributeError, IndexError):
            # A missing tag or a short list on an unexpected page shows up as
            # AttributeError/IndexError; record the link rather than abort.
            team_info = {'ERROR': team_page}
        final_list.append(team_info)
    return final_list

In [None]:
def final_dataframe(path, seas, champ):
    """Given the season and leagues formatted lists, returns a Dataframe with all teams infos.

    Prints 'No Error' when every team page was scraped, '!!!Error!!!' when at
    least one page failed (such rows only carry an 'ERROR' value).

    Parameters (same as Open_Create_file()):
    - path: to check if the wanted object already exists (if it does the function just open the file)
    - seas: seasons formatted list
    - champ: leagues formatted list

    Returns:
    - pandas DataFrame with one row per team profile link
    """
    all_teams_pages = Open_Create_file(path, seas, champ)
    final = get_final_list(all_teams_pages)
    # Failed pages still produce a record, so comparing lengths alone can never
    # reveal them: look for the 'ERROR' marker as well.
    has_failures = any('ERROR' in team_info for team_info in final)
    if len(all_teams_pages) == len(final) and not has_failures:
        print('No Error')
    else:
        print('!!!Error!!!')
    df = pd.DataFrame(final)
    return df

# **Final Operations**

In this section all the previously presented functions are used to obtain a csv file with all teams' attributes.

##### Seasons and Leagues lists

In [None]:
# Shared state filled in by the scraping functions: request failures, listing
# page URLs and team profile links. Re-running this cell resets all three.
errors, all_pages, teams_links = [], set(), set()

## Defining Seasons and Leagues (TO BE CUSTOMIZED!!)

LEAGUES

In [None]:
# Major Leagues (fifaindex.com 'league=<id>' query filters)
leagues = ['league=' + str(league_id) for league_id in (31, 13, 16, 19, 53)]
# Minor Leagues
minor_leagues = ['league=' + str(league_id) for league_id in (14, 17, 20, 32, 54)]

SEASONS

In [None]:
# FOR ANNUAL DATA
# If possible always 1st observation in September when the market is closed else August
years = [
    'fifa10_6', 'fifa11_7', 'fifa12_8', 'fifa13_11', 'fifa14_12',
    'fifa15_16', 'fifa16_19', 'fifa17_75', 'fifa18_175', 'fifa19_280',
    'fifa20_358', 'fifa21_421', 'fifa22_487',
]

# FOR WEEKLY DATA
# Editions up to FIFA15 are listed by hand; from FIFA16 on, every update id in
# the given half-open range is added.
weeks = {'fifa10_6', 'fifa11_7', 'fifa12_8', 'fifa12_9', 'fifa13_11', 'fifa14_12', 'fifa15_16'}
weeks.update(
    edition + '_' + str(update_id)
    for edition, first_id, stop_id in (
        ('fifa16', 19, 60),    # FIFA16
        ('fifa17', 74, 144),   # FIFA17
        ('fifa18', 174, 246),  # FIFA18
        ('fifa19', 279, 344),  # FIFA19
        ('fifa20', 354, 416),  # FIFA20
        ('fifa21', 420, 465),  # FIFA21
        ('fifa22', 487, 528),  # FIFA22
    )
    for update_id in range(first_id, stop_id)
)

# FOR UPDATE: just add new weeks
new_weeks = ['fifa15_17']

# **RUN TO GET DATAFRAME!!**

In [None]:
# Build the teams DataFrame for the chosen seasons and leagues; 'proof.json' is
# the file the team profile links are loaded from or saved to.
df = final_dataframe(
    path='proof.json',
    seas=new_weeks,
    champ=minor_leagues,
)

In [None]:
# Save the scraped teams to disk, without the DataFrame index column.
df.to_csv(
    "teams.csv",
    index=False,
    encoding='utf-8',
)