In [176]:
import requests
from bs4 import BeautifulSoup

import pandas as pd

# Root of the site; every relative link scraped in this notebook is joined onto it.
baseUrl = "https://www.transfermarkt.co.uk"

# Send a browser-like User-Agent with every request. Per the original author's
# note, the site blocks clients it takes for scraping tools.
headers = {'User-Agent':
           'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2526.106 Safari/537.36'}

# Load the league page whose table cells are searched for teams further down.
page = baseUrl + "/premier-league/startseite/wettbewerb/GB1"
# requests applies no timeout by default; without one a stalled connection
# would hang this cell indefinitely.
pageTree = requests.get(page, headers=headers, timeout=30)
# Stop here on an HTTP error (e.g. a block page) instead of parsing the error
# body and silently producing an empty team list.
pageTree.raise_for_status()
pageSoup = BeautifulSoup(pageTree.content, 'html.parser')

class Team(object):
    """Simple attribute container describing one team.

    All six constructor arguments are optional, default to ``None`` and are
    stored unchanged on the instance under the same names: ``name``,
    ``smallName``, ``link``, ``codeName``, ``idx`` and ``city``.
    """

    def __init__(self, name=None, smallName=None, link=None, codeName=None, idx=None, city=None):
        # Display names.
        self.name, self.smallName = name, smallName
        # Identifiers used to address the team.
        self.link, self.codeName, self.idx = link, codeName, idx
        # Location.
        self.city = city

teams = []

# Each matching <td> on the league page holds one team's name and profile link.
for teamContainer in pageSoup.find_all('td', {"class": "hauptlink no-border-links hide-for-small hide-for-pad"}):
    name = teamContainer.text  # team name exactly as rendered (may carry trailing whitespace)
    link = teamContainer.find('a', {"class": "vereinprofil_tooltip"})  # anchor pointing at the team page
    hrefParts = link['href'].split('/')  # split once and reuse for both fields
    codeName = hrefParts[1]  # second path component of the href
    idx = hrefParts[4]  # fifth path component of the href

    # Start every team with an empty city. Without this reset a team whose
    # facts page yields no usable table would inherit the previous team's city
    # (or raise NameError if it is the first team).
    city = ""

    # Fetch the team's facts page to look up its location. This lookup is
    # best-effort: on failure the city simply stays empty.
    pageLocation = baseUrl + "/" + codeName + "/datenfakten/verein/" + idx
    # requests applies no timeout by default; bound the wait per team.
    pageLocationTree = requests.get(pageLocation, headers=headers, timeout=30)
    pageLocationSoup = BeautifulSoup(pageLocationTree.content, 'html.parser')

    for teamInfo in pageLocationSoup.find_all('table', {"class": "profilheader"}):
        info = teamInfo.find_all('td')
        # Only read the third cell when it exists, so a differently shaped
        # table cannot raise IndexError.
        if len(info) > 2:
            # Keep the text before the first "(" and drop its first 7 characters.
            # NOTE(review): presumably a fixed-width prefix in that cell - confirm against the page.
            city = info[2].text.split('(')[0][7:]

    # smallName is left empty here; link is stored as an absolute URL.
    teams.append(Team(name, "", baseUrl + link['href'], codeName, idx, city))

In [178]:
# Fill in each team's short name from the league page cells tagged
# "show-for-small show-for-pad". Cells are paired with entries of `teams`
# purely by position, so both sequences must be in the same order.
smallNameCells = pageSoup.find_all('td', {"class": "hauptlink no-border-links show-for-small show-for-pad"})
for position, smallNameCell in enumerate(smallNameCells):
    teams[position].smallName = smallNameCell.text

In [179]:
# Fix team cities manually.
# The overrides are keyed by team name instead of by position in `teams`, so
# they stay correct if the league page lists the clubs in a different order,
# and they cannot raise IndexError when fewer than 20 teams were scraped.
cityByTeamName = {
    "Manchester City": "Manchester",
    "Liverpool FC": "Liverpool",
    "Chelsea FC": "London",
    "Tottenham Hotspur": "London",
    "Manchester United": "Manchester",
    "Arsenal FC": "London",
    "Everton FC": "Liverpool",
    "Leicester City": "Leicester",
    "West Ham United": "London",
    "Southampton FC": "Southampton",
    "Fulham FC": "London",
    "Crystal Palace": "London",
    "Wolverhampton Wanderers": "Wolverhampton",
    "AFC Bournemouth": "Bournemouth",
    "Burnley FC": "Burnley",
    "Brighton & Hove Albion": "Brighton",
    "Newcastle United": "Newcastle",
    "Watford FC": "Watford",
    "Huddersfield Town": "Huddersfield",
    "Cardiff City": "Cardiff",
}

for team in teams:
    # Scraped names carry trailing spaces / non-breaking spaces, so normalise
    # before the lookup. The stored name itself is left untouched.
    cleanName = team.name.strip()
    # Teams without an override keep whatever city was scraped for them.
    if cleanName in cityByTeamName:
        team.city = cityByTeamName[cleanName]

In [180]:
# Write team data to file: one line per team, fields separated with "!".
# The file is opened in binary mode because each line is encoded to UTF-8
# bytes before it is written. The `with` block guarantees the handle is
# flushed and closed even if a write fails.
with open("teamData.txt", "wb") as f:
    for team in teams:
        fields = [team.name, team.smallName, team.link, team.codeName, team.idx, team.city]
        line = "!".join(fields) + "\n"
        f.write(line.encode('utf-8', 'ignore'))

In [102]:
# get players of all teams
class Player(object):
    """Simple attribute container describing one player.

    All five constructor arguments are optional, default to ``None`` and are
    stored unchanged on the instance under the same names: ``name``, ``link``,
    ``codeName``, ``idx`` and ``teams``.
    """

    def __init__(self, name=None, link=None, codeName=None, idx=None, teams=None):
        # Identity of the player.
        self.name, self.link = name, link
        self.codeName, self.idx = codeName, idx
        # Stays None unless the caller supplies a value or assigns one later.
        self.teams = teams

players = []

# Visit every team page and collect the players linked from it.
for team in teams:
    pagePlayers = team.link
    # requests applies no timeout by default; bound the wait per team page.
    pagePlayersTree = requests.get(pagePlayers, headers=headers, timeout=30)
    # Surface HTTP errors instead of parsing an error page and silently
    # recording no players for this team.
    pagePlayersTree.raise_for_status()
    pagePlayersSoup = BeautifulSoup(pagePlayersTree.content, 'html.parser')

    playerAnchors = pagePlayersSoup.find_all('a', {"class": "spielprofil_tooltip"})
    # Keep the 1st, 3rd, 5th, ... anchor only.
    # NOTE(review): presumably every player is linked twice in a row on the
    # page - confirm against the markup.
    for playerInfo in playerAnchors[::2]:
        name = playerInfo.text
        idx = playerInfo['id']
        link = playerInfo['href']
        codeName = link.split('/')[1]  # second path component of the href
        # `teams` is not passed, so it stays None until it is assigned later.
        players.append(Player(name, baseUrl + link, codeName, idx))

In [157]:
# Collect the name of every team, in the same order as `teams`.
tNames = [scrapedTeam.name for scrapedTeam in teams]

[u'Manchester City ', u'Liverpool FC ', u'Chelsea FC ', u'Tottenham Hotspur ', u'Manchester United ', u'Arsenal FC ', u'Everton FC ', u'Leicester City ', u'West Ham United ', u'Southampton FC ', u'Fulham FC \xa0', u'Crystal Palace ', u'Wolverhampton Wanderers \xa0', u'AFC Bournemouth ', u'Burnley FC ', u'Brighton & Hove Albion ', u'Newcastle United ', u'Watford FC ', u'Huddersfield Town ', u'Cardiff City \xa0']


In [160]:
# Get each player's past teams from the transfer history on their profile page.
for player in players:
    pagePlayersTeams = player.link
    # requests applies no timeout by default; bound the wait per player page.
    pagePlayersTeamsTree = requests.get(pagePlayersTeams, headers=headers, timeout=30)
    # Surface HTTP errors explicitly rather than failing later on a missing element.
    pagePlayersTeamsTree.raise_for_status()
    pagePlayersTeamsSoup = BeautifulSoup(pagePlayersTeamsTree.content, 'html.parser')

    transferBox = pagePlayersTeamsSoup.find('div', {"class": "box transferhistorie"})

    tempPlayerTeams = []
    # find() returns None when the page has no transfer-history box. Such a
    # player gets an empty team list instead of aborting the whole loop with
    # AttributeError and leaving the remaining players at teams=None.
    if transferBox is not None:
        # find player teams
        for playerTeam in transferBox.find_all('td', {"class": "hauptlink no-border-links hide-for-small vereinsname"}):
            teamName = playerTeam.find('a', {"class": "vereinprofil_tooltip"})
            # Skip cells without a team link and teams already recorded;
            # first-seen order is preserved.
            if teamName and teamName.text not in tempPlayerTeams:
                tempPlayerTeams.append(teamName.text)
    player.teams = tempPlayerTeams

In [161]:
# Write players data to file: one line per player, fields separated with "!",
# with the player's teams as a comma-separated list in the last field.
# The file is opened in binary mode because each line is encoded to UTF-8
# bytes before it is written. The `with` block guarantees the handle is
# flushed and closed even if a write fails.
with open("playersData.txt", "wb") as f:
    for player in players:
        # A player whose teams were never filled in (still None) is written
        # with an empty team list instead of crashing the export.
        teamList = ",".join(player.teams or [])
        line = "!".join([player.name, player.link, player.codeName, player.idx, teamList]) + "\n"
        f.write(line.encode('utf-8', 'ignore'))