In [None]:
#notebook for processing euro football player transfer data
#this performs two functions:

# 1. scrape data from https://www.soccernews.com/soccer-transfers/german-bundesliga-transfers/ (and the equivalent EPL, La Liga and Serie A pages) to .csv
# 2. reformat .csv to format required by tableau
#       - convert team name to city name using lookup dictionary
#       - convert city name to geographic coordinates using Nominatim geocoding service

# the format required by tableau to plot lines on a map is:

# origin-destination | station | path ID

# each path has two entries in the final table for tableau
# we also need to find longitude and latitude here, using Nominatim
# because Tableau will find cities in the US by default (e.g. Paris, Texas)
# and these will be incorrect

In [9]:
import os
import time
import urllib.request
from datetime import datetime

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim

In [4]:
#lookup dictionary for converting team name -> home city name
#teams that are missing here are geocoded using the team name itself
#
#values are passed to Nominatim as "<city>, <region>", so a value may already
#carry a state/country qualifier to disambiguate it
#
#NOTE(review): a few entries look ambiguous for geocoding and should be
#confirmed against the generated coordinates: "Aston Villa" -> "Aston",
#"Instituto" -> "Cordoba", "Asteras Tripoli" -> "Tripoli",
#"Deportivo" -> "Vitoria-Gasteiz", "San Lorenzo" -> "San Lorenzo, Argentina"
team_to_city_dict = {
    "AC Milan": "Milan",
    "Ajax": "Amsterdam",
    "Alanyaspor": "Alanya",
    "Al Ahli": "Jeddah, Saudi Arabia",
    "Arsenal": "London",
    "Asteras Tripoli": "Tripoli",
    "Aston Villa": "Aston",
    "Atalanta": "Bergamo",
    "Athletic Bilbao": "Bilbao",
    "Atletico Madrid": "Madrid",
    "Atletico MG": "Belo Horizonte, Minas Gerais ",
    "Atletico PR": "Curitiba, Parana, Brazil",
    "Atletico San Luis": "San Luis",
    "ADO Den Haag": "Den Haag",
    "AS Roma": "Rome",
    "Bahia": "Salvador, Bahia ",
    "Bayer Leverkusen": "Leverkusen",
    "Bayern Munich": "Munich",
    "Boca Juniors": "La Boca",
    "Benfica": "Lisbon",
    "Besiktas": "Istanbul",
    "Bristol City": "Bristol",
    "Boavista": "Porto",
    "Borussia Dortmund": "Dortmund",
    "Bor. Dortmund": "Dortmund",
    "BSC Young Boys": "Bern",
    "Celta Vigo": "Vigo",
    "Celta de Vigo": "Vigo",
    "Celtic": "Glasgow",
    "Chievo Verona": "Verona",
    "Club Brugge": "Bruges",
    "Club Maldonado": "Maldonado",
    "FC Copenhagen": "Copenhagen",
    "Fiorentina": "Florence",
    "Club Nacional": "Montevideo, Uruguay",
    "Crvena Zvezda": "Belgrade",
    "Crystal Palace": "London",
    "Deportivo": "Vitoria-Gasteiz",
    "Deportivo Alaves": "Vitoria-Gasteiz",
    "Deportivo La Coruña": "La Coruna",
    "Dynamo Dresden": "Dresden",
    "Dinamo Zagreb": "Zagreb",
    "Eintracht Frankfurt": "Frankfurt",
    "Espanyol": "Barcelona",
    "Everton": "Liverpool",
    "Excelsior": "Rotterdam",
    "FC Barcelona": "Barcelona",
    "FC Dallas": "Dallas",
    "FC Koln": "Koln",
    "FC Luzern": "Lucerne",
    "FC Porto": "Porto",
    "Fenerbahce": "Istanbul",
    "Feyenoord": "Rotterdam",
    "Flamengo": "Rio de Janeiro",
    "Fluminense": "Laranjeiras, Rio de Janeiro, State of Rio de Janeiro, ",
    "Fortuna Dusseldorf": "Dusseldorf",
    "Galatasaray": "Istanbul",
    "Guingamp": "Guingamp, Brittany",
    "Guangzhou Evergrande": "Guangzhou, Guangdong",
    "Gimnàstic": "Tarragona",
    "Grasshoppers": "Zurich",
    "Hannover 96": "Hannover",
    "Hellas Verona": "Verona",
    "Hertha Berlin": "Berlin, Germany",
    "Hoffenheim": "Heidelberg",
    "Internacional": "Porto Alegre",
    "Instituto": "Cordoba",
    "Inter Milan": "Milan",
    # was misspelled "Sao Paolo"
    "Ituano": "Sao Paulo",
    "Jahn Regensburg": "Regensburg",
    "Juventus": "Turin",
    "Kasimpasa": "Istanbul",
    "KRC Genk": "Genk",
    "KV Mechelen": "Mechelen",
    "LASK Linz": "Linz",
    "Lazio": "Rome",
    "Leeds United": "Leeds",
    "Mainz 05": "Mainz",
    "Manchester City": "Manchester",
    "Man United": "Manchester",
    "NAC Breda": "Breda",
    "Napoli": "Naples",
    "Newcastle United": "Newcastle",
    "Nordsjaelland": "Copenhagen",
    "Norwich City": "Norwich",
    "Olympiacos": "Athens",
    "Olympique Lyon": "Lyon",
    "Ostersunds": "Ostersund",
    "PAOK": "Thessaloniki",
    "Partick Thistle": "Glasgow",
    "Piast Gliwice": "Gliwice",
    "PEC Zwolle": "Zwolle",
    "PSG": "Paris",
    "PSV": "Eindhoven",
    "PSV Eindhoven": "Eindhoven",
    "QPR": "London",
    "Rayo Vallecano": "Madrid",
    "Rapid Vienna": "Vienna",
    "RB Leipzig": "Leipzig",
    "RB Salzburg": "Salzburg",
    "Real Betis": "Seville",
    "Real Madrid": "Madrid",
    "Real Madrid Castilla": "Madrid",
    "Real Sociedad": "San Sebastian",
    "Real Valladolid": "Valladolid",
    "Rennais": "Rennes",
    "Roma": "Rome",
    "Sampdoria": "Genoa",
    "San Lorenzo": "San Lorenzo, Argentina",
    "Schalke 04": "Gelsenkirchen",
    "SPAL": "Ferrara",
    "Spal": "Ferrara",
    "Spezia Calcio": "La Spezia",
    "Sporting Gijon": "Gijon",
    "Sheffield United": "Sheffield",
    "Slavia Prague": "Prague",
    "Spartak Moscow": "Moscow",
    "Sporting Lisbon": "Lisbon",
    "Sporting CP": "Lisbon",
    # was "Rennais" (the club's adjective, not the city); same city as "Rennais" above
    "Stade Rennais": "Rennes",
    "Standard Liege": "Liege",
    "Steaua Bucaresti": "Bucharest",
    "Stoke": "Stoke-on-Trent",
    "Swansea City": "Swansea",
    "Tigre": "Buenos Aires",
    "Tottenham": "London",
    "UD Las Palmas": "Las Palmas",
    # was truncated to "Udin"
    "Udinese": "Udine",
    "Union Berlin": "Berlin",
    "Velez Sarsfield": "Buenos Aires",
    "Waasland-Beveren": "Beveren",
    "Werder Bremen": "Bremen",
    "West Brom": "West Bromwich",
    "West Ham United": "London",
    "Wolves": "Wolverhampton",
    "Young Boys": "Bern",
    "Zenit": "St. Petersburg",
    "Zenit St. Petersburg": "St. Petersburg",
}

In [5]:
#search regions appended to the city name when querying Nominatim
#they are tried one after another, in the order listed, until a match is found
#the final empty string means "no region qualifier"
regions = [
    "Europe",
    "Germany",
    "France",
    "UK",
    "Italy",
    "Spain",
    "Austria",
    "Serbia",
    "Denmark",
    "Brazil",
    "Argentina",
    "Greece",
    "",
]

In [6]:
#scrape transfer data from https://www.soccernews.com
def scrape_to_csv(url, output_filename, year="2019", timeout=30):
    """Scrape a soccernews.com transfer table and save it as a .csv file.

    Every <tr> after the first one (the table header) is read as one transfer.
    Reading stops at the first row whose first cell is "When": that is the
    header of a second table (record-setting transfers) that must not be
    included.

    Parameters
    ----------
    url : str
        Address of the transfer page to download.
    output_filename : str
        Path of the .csv file to write.
    year : str or int, optional
        Year appended to the date cell of each row (the row text itself is
        combined with this value to form the "Date" column). Defaults to "2019".
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    response = requests.get(url, timeout=timeout)
    # fail loudly instead of silently parsing an error page into an empty table
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    columns = ["Date", "Player Position", "Player Name", "From", "To", "Transfer Details"]
    records = []
    # the first <tr> is the table header, so start from the second one
    for row in soup.find_all("tr")[1:]:
        # use the row text directly: re-parsing it as HTML and calling str()
        # would turn "&" into "&amp;" inside team names
        cells = [line for line in row.get_text().split("\n") if len(line) > 0]
        if not cells:
            continue
        if cells[0] == "When":
            #this signals that we reached the end of the table
            #there are some record-setting expensive transfers after it
            #do not include these
            break
        if len(cells) < 5:
            # incomplete row, cannot be mapped onto the expected columns
            continue
        # the second cell holds "<player name> <position>"
        name_and_position = cells[1].split()
        if not name_and_position:
            continue
        records.append({
            "Date": cells[0] + ' ' + str(year),
            "Player Position": name_and_position[-1],
            "Player Name": " ".join(name_and_position[:-1]),
            "From": cells[2],
            "To": cells[3],
            "Transfer Details": cells[4],
        })
    # explicit columns keep the header row even when nothing was scraped
    pd.DataFrame(records, columns=columns).to_csv(output_filename)

In [7]:
#convert team names to city name
#then convert city name to (latitude, longitude) using Nominatim
#also determine if city is in specific countries
#(country code reported by Nominatim when available, simple bounding box otherwise)

def team_to_location(team, geolocator):
    """Geocode the home city of a team.

    The team name is translated to a city with team_to_city_dict (the team
    name itself is used when there is no entry). The city is then searched in
    each entry of regions, in order, until the geocoder returns a match.

    Parameters
    ----------
    team : str
        Team name as it appears in the transfer table.
    geolocator : geopy.geocoders.Nominatim
        Geocoder used for the lookups.

    Returns
    -------
    dict or None
        Dict with the keys 'Team', 'City', 'Latitude', 'Longitude', 'In UK',
        'In Germany', 'In Spain' and 'In Italy', or None (after printing a
        warning) when the city could not be found in any region.
    """
    city = team_to_city_dict.get(team, team)

    # start from None so an empty regions list is reported as "not found"
    # instead of raising NameError
    location = None
    for region in regions:
        # the empty region means "no qualifier": do not send a dangling ", "
        query = city + ", " + region if region else city
        location = geolocator.geocode(query, timeout=10, addressdetails=True)
        # pause after every request to stay within the service's rate limit
        time.sleep(2)
        if location is not None:
            break

    if location is None:
        print("Warning - could not find this city anywhere: " + city)
        return None

    lat = location.latitude
    lon = location.longitude

    # with addressdetails=True the raw Nominatim answer is expected to carry an
    # 'address' dict with a two-letter 'country_code'; read it defensively
    raw = getattr(location, "raw", None)
    address = raw.get("address") if isinstance(raw, dict) else None
    country_code = address.get("country_code") if isinstance(address, dict) else None

    if country_code:
        country_code = country_code.lower()
        in_uk = country_code == "gb"
        in_germany = country_code == "de"
        in_spain = country_code == "es"
        in_italy = country_code == "it"
    else:
        # fallback: rough bounding boxes; these are only approximate
        # (e.g. the German box starts at 49N and so misses Munich)
        in_uk = -6 <= lon <= 2 and 50 <= lat <= 56
        in_germany = 6 <= lon <= 15 and 49 <= lat <= 55
        in_spain = -7 <= lon <= 4 and 36 <= lat <= 44
        in_italy = 6 <= lon <= 18 and 36 <= lat <= 48

    return {
        'Team': team,
        'City': city,
        'Latitude': lat,
        'Longitude': lon,
        'In UK': in_uk,
        'In Germany': in_germany,
        'In Spain': in_spain,
        'In Italy': in_italy,
    }

In [22]:
# Scratch cell: bounds of the Jul-Aug 2019 window (the same two dates are
# rebuilt inside convert_for_tableau) and a sample date parsed with the
# abbreviated-month format "%b %d %Y".
dmin = datetime.strptime("2019-07-01", "%Y-%m-%d")
dmax = datetime.strptime("2019-08-31", "%Y-%m-%d")

d = datetime.strptime('Aug 20 2019', "%b %d %Y")

In [23]:
# Display the sample date parsed in the previous cell.
d

datetime.datetime(2019, 8, 20, 0, 0)

In [24]:
# Display the month number of the upper date bound.
dmax.month

8

In [22]:
def convert_for_tableau(filename, league):
    """Convert a scraped transfer .csv into the two-rows-per-path table for tableau.

    For every transfer dated between 2019-07-01 and 2019-08-31 (inclusive)
    with both a "From" and a "To" team, two rows are produced: one carrying
    the location of the origin team and one carrying the location of the
    destination team. Both rows share the same "Path ID", which is what
    tableau needs to draw a line on a map. The result is written next to the
    input file as "<filename without extension>_tableau.csv".

    "Category" encodes the direction of the transfer relative to the league's
    home country: +1 if the origin is abroad, +2 if the destination is abroad.
    A value of 3 should not normally occur and points to a geocoding error.

    Parameters
    ----------
    filename : str
        Path of the .csv file to read; it must provide the columns "Date",
        "Player Name", "Player Position", "From", "To" and "Transfer Details".
    league : str
        One of "EPL", "Bundesliga", "LaLiga", "SerieA" (case-insensitive).
        Any other value yields Category 0 for every transfer.
    """
    #remove filename extension
    base_name = os.path.splitext(filename)[0]

    geolocator = Nominatim(user_agent="soccer_transfers")

    #only entries in Jul - August 2019 are kept
    window_start = datetime(2019, 7, 1)
    window_end = datetime(2019, 8, 31)
    # "Aug 20 2019" is the form parsed in this notebook's scratch cell; the
    # numeric form "8 20 2019" is what this function originally expected
    date_formats = ("%b %d %Y", "%m %d %Y")

    # name of the location flag that marks the home country of this league
    home_flag = {
        "epl": "In UK",
        "bundesliga": "In Germany",
        "laliga": "In Spain",
        "seriea": "In Italy",
    }.get(league.lower())

    # each lookup costs several throttled geocoding requests and the same
    # teams appear in many rows, so remember the result per team
    # (failed lookups are remembered too, so the warning is printed once)
    location_cache = {}

    def locate(team):
        if team not in location_cache:
            location_cache[team] = team_to_location(team, geolocator)
        return location_cache[team]

    dat_orig = pd.read_csv(filename)
    total = dat_orig.shape[0]
    #list of dicts, converted to a dataframe at the end
    dat_conv = []

    for i, row in dat_orig.iterrows():
        print("Processing entry %d/%d" % (i + 1, total))
        #.csv from excel can have empty rows: skip them, but keep going so
        #that valid rows after an empty one are not lost
        if not isinstance(row["Player Name"], str):
            continue

        #skip entries that are not in Jul -  August 2019
        transfer_date = None
        if isinstance(row["Date"], str):
            for fmt in date_formats:
                try:
                    transfer_date = datetime.strptime(row["Date"].strip(), fmt)
                    break
                except ValueError:
                    # not this format, try the next one
                    continue
        if transfer_date is None:
            print("Warning - could not parse date: %r" % (row["Date"],))
            continue
        if not (window_start <= transfer_date <= window_end):
            continue

        #ignore some incomplete entries
        if not all(isinstance(row[field], str) for field in ["From", "To"]):
            continue
        if any(row[field] == "nan" for field in ["From", "To"]):
            continue

        fromloc = locate(row["From"])
        toloc = locate(row["To"])
        if fromloc is None or toloc is None:
            continue

        from_foreign = home_flag is not None and not fromloc[home_flag]
        to_foreign = home_flag is not None and not toloc[home_flag]
        #these should normally be mutually exclusive
        #but we can have geocoding errors
        #so we may see some instances of category = 3
        #check number of these in tableau
        category = (1 if from_foreign else 0) + (2 if to_foreign else 0)

        #the final dataframe for tableau will have two rows per single transfer
        #one with location data from origin, and one with location data from destination
        #this is required for tableau to plot lines on a map

        #set common fields of the two entries first
        orig_entry = {
            "Player Name": row["Player Name"],
            "Player Position": row["Player Position"],
            "Transfer Details": row["Transfer Details"],
            "Path ID": base_name + "-" + str(i),
            "League": league,
            "Category": category,
        }

        #make copy
        dest_entry = orig_entry.copy()

        #set fields that differ between origin and destination entry
        orig_entry["Origin-Destination"] = "Origin"
        dest_entry["Origin-Destination"] = "Destination"
        orig_entry.update(fromloc)
        dest_entry.update(toloc)

        #add to list
        dat_conv.append(orig_entry)
        dat_conv.append(dest_entry)

    dat_conv = pd.DataFrame(dat_conv)
    dat_conv.to_csv(base_name + "_tableau.csv", index=False)

In [None]:
# Download the Bundesliga transfer table and save it as bundesliga.csv (needs network access).
scrape_to_csv("https://www.soccernews.com/soccer-transfers/german-bundesliga-transfers/", "bundesliga.csv")

In [None]:
# Download the English Premier League transfer table and save it as epl.csv (needs network access).
scrape_to_csv("https://www.soccernews.com/soccer-transfers/english-premier-league-transfers/", "epl.csv")

In [205]:
# Download the La Liga transfer table and save it as laliga.csv (needs network access).
scrape_to_csv("https://www.soccernews.com/soccer-transfers/spanish-la-liga-transfers/", "laliga.csv")

In [208]:
# Download the Serie A transfer table and save it as seriea.csv (needs network access).
scrape_to_csv("https://www.soccernews.com/soccer-transfers/italian-serie-a-transfers/","seriea.csv")

In [11]:
# Build bundesliga_tableau.csv from bundesliga.csv. Slow: every geocoding request is followed by a 2 s pause.
convert_for_tableau("bundesliga.csv", "Bundesliga")

Processing entry 1/116
Processing entry 2/116
Processing entry 3/116
Processing entry 4/116
Processing entry 5/116
Processing entry 6/116
Processing entry 7/116
Processing entry 8/116
Processing entry 9/116
Processing entry 10/116
Processing entry 11/116
Processing entry 12/116
Processing entry 13/116
Processing entry 14/116
Processing entry 15/116
Processing entry 16/116
Processing entry 17/116
Processing entry 18/116
Processing entry 19/116
Processing entry 20/116
Processing entry 21/116
Processing entry 22/116
Processing entry 23/116
Processing entry 24/116
Processing entry 25/116
Processing entry 26/116
Processing entry 27/116
Processing entry 28/116
Processing entry 29/116
Processing entry 30/116
Processing entry 31/116
Processing entry 32/116
Processing entry 33/116
Processing entry 34/116
Processing entry 35/116
Processing entry 36/116
Processing entry 37/116
Processing entry 38/116
Processing entry 39/116
Processing entry 40/116
Processing entry 41/116
Processing entry 42/116
P

In [12]:
# Build epl_tableau.csv from epl.csv. Slow: every geocoding request is followed by a 2 s pause.
convert_for_tableau("epl.csv", "EPL")

Processing entry 1/110
Processing entry 2/110
Processing entry 3/110
Processing entry 4/110
Processing entry 5/110
Processing entry 6/110
Processing entry 7/110
Processing entry 8/110
Processing entry 9/110
Processing entry 10/110
Processing entry 11/110
Processing entry 12/110
Processing entry 13/110
Processing entry 14/110
Processing entry 15/110
Processing entry 16/110
Processing entry 17/110
Processing entry 18/110
Processing entry 19/110
Processing entry 20/110
Processing entry 21/110
Processing entry 22/110
Processing entry 23/110
Processing entry 24/110
Processing entry 25/110
Processing entry 26/110
Processing entry 27/110
Processing entry 28/110
Processing entry 29/110
Processing entry 30/110
Processing entry 31/110
Processing entry 32/110
Processing entry 33/110
Processing entry 34/110
Processing entry 35/110
Processing entry 36/110
Processing entry 37/110
Processing entry 38/110
Processing entry 39/110
Processing entry 40/110
Processing entry 41/110
Processing entry 42/110
P

In [20]:
# Build seriea_tableau.csv from seriea.csv. Slow: every geocoding request is followed by a 2 s pause.
convert_for_tableau("seriea.csv", "SerieA")

Processing entry 1/181
Processing entry 2/181
Processing entry 3/181
Processing entry 4/181
Processing entry 5/181
Processing entry 6/181
Processing entry 7/181
Processing entry 8/181
Processing entry 9/181
Processing entry 10/181
Processing entry 11/181
Processing entry 12/181
Processing entry 13/181
Processing entry 14/181
Processing entry 15/181
Processing entry 16/181
Processing entry 17/181
Processing entry 18/181
Processing entry 19/181
Processing entry 20/181
Processing entry 21/181
Processing entry 22/181
Processing entry 23/181
Processing entry 24/181
Processing entry 25/181
Processing entry 26/181
Processing entry 27/181
Processing entry 28/181
Processing entry 29/181
Processing entry 30/181
Processing entry 31/181
Processing entry 32/181
Processing entry 33/181
Processing entry 34/181
Processing entry 35/181
Processing entry 36/181
Processing entry 37/181
Processing entry 38/181
Processing entry 39/181
Processing entry 40/181
Processing entry 41/181
Processing entry 42/181
P

In [21]:
# Build laliga_tableau.csv from laliga.csv. Slow: every geocoding request is followed by a 2 s pause.
convert_for_tableau("laliga.csv", "LaLiga")

Processing entry 1/154
Processing entry 2/154
Processing entry 3/154
Processing entry 4/154
Processing entry 5/154
Processing entry 6/154
Processing entry 7/154
Processing entry 8/154
Processing entry 9/154
Processing entry 10/154
Processing entry 11/154
Processing entry 12/154
Processing entry 13/154
Processing entry 14/154
Processing entry 15/154
Processing entry 16/154
Processing entry 17/154
Processing entry 18/154
Processing entry 19/154
Processing entry 20/154
Processing entry 21/154
Processing entry 22/154
Processing entry 23/154
Processing entry 24/154
Processing entry 25/154
Processing entry 26/154
Processing entry 27/154
Processing entry 28/154
Processing entry 29/154
Processing entry 30/154
Processing entry 31/154
Processing entry 32/154
Processing entry 33/154
Processing entry 34/154
Processing entry 35/154
Processing entry 36/154
Processing entry 37/154
Processing entry 38/154
Processing entry 39/154
Processing entry 40/154
Processing entry 41/154
Processing entry 42/154
P