In [None]:
# Import libraries
import pandas as pd
import requests
from bs4 import BeautifulSoup as BS
from datetime import datetime, date, timedelta
from collections import defaultdict
import re
import json

In [None]:
# Request headers sent with every HTTP call: a desktop-browser User-Agent string
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.60 Safari/537.17',
}

# Accumulator for the scraped data: column name -> list of values (one entry per transfer)
players = defaultdict(list)
# Empty mapping; not referenced by any of the cells visible in this notebook
teams_links = dict()

In [None]:
def number_of_pages(text_to_split):
    '''
    Pull the page count out of the title text of the "last page" button.

    Params:
        text_to_split(string): Title text such as "Go the last page (page 19)"
    Return:
        int : The first run of ASCII digits found in the text. For instance 19
    Raises:
        IndexError: if the text contains no digits at all
    '''
    digit_runs = re.findall('[0-9]+', text_to_split)
    first_number = digit_runs[0]
    return int(first_number)

In [None]:
def parse_web(url, current_date):
    '''
        Scrape every transfer listed for one day and append it to the module-level
        ``players`` defaultdict.

        The first request (``url``) is only used to read how many result pages the
        day has. Every page (1 .. last, inclusive) is then requested from
        transfermarkt.com and, for each row of the ``table.items`` table, the name,
        position, age, origin club and its league, new club and its league, cost and
        the transfer date are appended to the matching list in ``players``.
        After the last page the accumulated data is written to
        ``Summer22_FootballTransfers.json``.

        Params:
            url(string): request url of the first page for that day
            current_date(datetime): day whose transfers are scraped
        Return:
            defaultdict: the module-level ``players`` mapping (column name -> list)
        Raises:
            requests.HTTPError: if a page answers with an HTTP error status
            IndexError / KeyError: if a table row does not have the expected cells
    '''
    # Seconds to wait for the server before giving up instead of hanging forever
    request_timeout = 30
    # Format the day once; interpolating the datetime itself would render
    # "YYYY-MM-DD HH:MM:SS" and put a space and a time into the URL path
    date_text = current_date.strftime('%Y-%m-%d')

    r = requests.get(url, headers=headers, timeout=request_timeout)
    r.raise_for_status()
    soup = BS(r.text, 'lxml')
    # Read the page count from the "last page" button. When the button is absent
    # (presumably because all results fit on one page) scrape a single page.
    last_page_link = soup.select_one(
        'li.tm-pagination__list-item.tm-pagination__list-item--icon-last-page > a')
    pages = number_of_pages(last_page_link['title']) if last_page_link is not None else 1

    # range() excludes its stop value, so pages + 1 is needed to reach the last page
    for x in range(1, pages + 1):
        r = requests.get(f"https://www.transfermarkt.com/transfers/transfertagedetail/statistik/top/land_id_zu/0/land_id_ab/0/leihe//datum/{date_text}/sort//plus/1/page/{x}", headers=headers, timeout=request_timeout)
        r.raise_for_status()
        soup = BS(r.text, 'lxml')
        # Get the table items
        for tr in soup.select('table.items>tbody>tr'):
            # Direct <td> children of the row, looked up once and reused for every field
            cells = tr.find_all('td', recursive=False)
            player_cells = cells[0].select('td')
            origin_cells = cells[3].select('td')
            new_club_cells = cells[4].select('td')

            # Extract the whole row before touching ``players`` so that a malformed
            # row raises without leaving the column lists with different lengths
            row = {
                'name': player_cells[1].text.strip(),
                'position': player_cells[2].text.strip(),
                'age': cells[1].text.strip(),
                'origin_club': origin_cells[0].select('img')[0]['title'].strip(),
                'league_origin_club': origin_cells[2].text.strip(),
                'new_club': new_club_cells[0].select('img')[0]['title'].strip(),
                'league_new_club': new_club_cells[2].text.strip(),
                'cost': cells[5].text.strip(),
                'date_of_transfer': date_text,
            }
            for column, value in row.items():
                players[column].append(value)
        # Short progress line instead of dumping the whole accumulated dict
        print(f"{date_text}: page {x}/{pages} done, {len(players['name'])} transfers collected so far")
    # Save data in json
    with open('Summer22_FootballTransfers.json', 'w', encoding='utf-8') as file_json:
        json.dump(players, file_json, indent=4, ensure_ascii=False)
    return players

In [None]:
def check_date(first_market_day, current_date):
    '''
        Walk backwards one day at a time, starting at current_date, and scrape each
        day with parse_web until first_market_day is reached.

        For every day the url of that day's first result page is built and handed to
        parse_web together with the date.

        NOTE(review): first_market_day itself is not scraped (the loop stops as soon
        as it is reached) - confirm this exclusive lower bound is intended.

        Params:
            first_market_day(datetime): day at which the backwards walk stops
            current_date(datetime): most recent day, scraped first
        Return:
            The value returned by the last parse_web call, or an empty defaultdict
            when current_date is not later than first_market_day (nothing to scrape)
    '''
    # Defined up front so the return below works even when the loop never runs
    players_dict = defaultdict(list)
    # ">" instead of "!=": a start date earlier than (or not day-aligned with)
    # first_market_day would otherwise never compare equal and loop forever
    while current_date > first_market_day:
        # Make url
        url_base = f"https://www.transfermarkt.es/transfers/transfertagedetail/statistik/top/land_id_zu/0/land_id_ab/0/leihe//datum/{current_date.strftime('%Y-%m-%d')}/page/1"
        # Call function to get data
        players_dict = parse_web(url_base, current_date)
        current_date = current_date - timedelta(days=1)
    return players_dict

In [None]:
# DISABLED ALTERNATIVE: scrape from a specific date (fromDay) up to yesterday.
# The definition below is wrapped in a triple-quoted string, so it is never executed.
# To use it, remove the surrounding quotes; note that a later definition with the same
# name in this cell would replace it.
# NOTE(review): as written, players_dict is never assigned when total_days == 0, so the
# final return would fail in that case.
'''def calculate_days_to_scrap(fromDay):
    today = date.today()-timedelta(days=1)
    today = datetime(today.year, today.month, today.day)
    total_days = (today - fromDay).days
    if total_days !=0:
        first_day = today - timedelta(days=(total_days))
        players_dict = check_date(first_day, today)
    return players_dict
'''

# USE THIS FUNCTION TO SCRAPE ONLY THE SUMMER TRANSFER WINDOW
def calculate_days_to_scrap(fromDay, lastDay=None):
    '''
        Scrape a fixed window of days by delegating to check_date.

        check_date walks backwards one day at a time starting at lastDay and stops
        when it reaches fromDay.

        Params:
            fromDay(datetime): day at which the backwards walk stops
            lastDay(datetime, optional): most recent day, scraped first.
                Defaults to 2022-07-02 when omitted.
        Return:
            Whatever check_date returns for that window
    '''
    if lastDay is None:
        # Default kept from the original hard-coded value
        lastDay = datetime.strptime('2022-07-02', '%Y-%m-%d')
    return check_date(fromDay, lastDay)

In [None]:
# Day passed as fromDay: check_date walks backwards and stops when it reaches this date
players_dict = calculate_days_to_scrap(
    datetime.strptime('2022-06-30', '%Y-%m-%d')
)

In [None]:
# Turn the scraped columns into a DataFrame and save it as CSV without the index column
df_players = pd.DataFrame(data=players_dict)
df_players.to_csv('Summer22_FootballTransfers.csv', index=False)