In [1]:
import os

# retrieve local copies of raw and prepared data from dvc
# check out the readme for instructions about how to gain access

# os.system("dvc pull")

# Location of the local transfermarkt-datasets checkout.
# Defaults to ~/transfermarkt-datasets; set the TRANSFERMARKT_DATASETS_HOME
# environment variable to use a checkout that lives somewhere else.
# An unset or empty variable falls back to the default, and a leading "~"
# is expanded in either case.
home = os.path.expanduser(
    os.environ.get("TRANSFERMARKT_DATASETS_HOME") or "~/transfermarkt-datasets"
)

# Make the checkout the working directory. The asset paths logged by the cells
# below (data/raw/...) are relative — presumably they are resolved against the
# current working directory.
os.chdir(home)

In [2]:
# Initialize the asset runner. `td` is the entry point the following cells use
# to build assets (`td.build_assets()`) and to look them up (`td.assets[...]`).
# NOTE(review): `pd` is not referenced in the cells visible here — confirm it is
# needed further down before keeping the import.

import pandas as pd
from transfermarkt_datasets.transfermarkt_datasets import TransfermarktDatasets

td = TransfermarktDatasets()

In [3]:
# Kick off processing: this loads and post-processes the data.
# Per the log printed below, the raw files are read season by season
# (data/raw/<season>/<asset>.json).
# After processing, raw and prepared datasets are available as pandas dataframes.

td.build_assets()

transfermarkt_datasets - Start processing assets
Name               Path      Seasons
-----------------  --------  ------------------------------------------------------------
players            data/raw  [2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021]
player_valuations  data/raw  [2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021]
games              data/raw  [2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021]
clubs              data/raw  [2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021]
appearances        data/raw  [2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021]
competitions       data/raw  [2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021]
asset - Reading raw data from data/raw/2012/players.json
asset - Reading raw data from data/raw/2013/players.json
asset - Reading raw data from data/raw/2014/players.json
asset - Reading raw data from data/raw/2015/players.json
asset - Reading raw data from data/raw/2016/players.json

In [4]:
# List the available assets as a text table with one row per asset
# (columns: Name, Path, Seasons).

print(td.prettify_asset_processors())

Name               Path      Seasons
-----------------  --------  ------------------------------------------------------------
players            data/raw  [2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021]
player_valuations  data/raw  [2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021]
games              data/raw  [2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021]
clubs              data/raw  [2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021]
appearances        data/raw  [2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021]
competitions       data/raw  [2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021]


In [5]:
# Get a reference to a single asset; `td.assets` is indexed by asset name.
# Printing the asset shows a short summary: its name and its season range.

players = td.assets["players"]
print(players)

Asset(name=players,season=2012..2021)


In [6]:
# Get a reference to the pandas dataframe holding an asset's prepared data
# (`prep_df`), then show summary statistics for it.
# `clubs` is reused by the next cell to pull the raw data.

clubs = td.assets["clubs"]

prep = clubs.prep_df

prep.describe()

Unnamed: 0,total_market_value,squad_size,average_age,foreigners_number,foreigners_percentage,national_team_players,stadium_seats
count,365.0,400.0,371.0,400.0,358.0,400.0,400.0
mean,89.554904,24.6075,25.55283,10.8575,44.900838,4.9075,25428.9625
std,149.475461,7.978885,1.627803,6.436284,19.328392,4.987096,17770.337895
min,1.1,0.0,20.0,0.0,3.8,0.0,0.0
25%,9.0,24.0,24.6,6.0,29.2,1.0,12045.0
50%,25.11,26.0,25.6,11.0,46.2,3.0,20560.0
75%,93.8,28.25,26.65,15.0,58.0,8.0,33286.25
max,863.37,40.0,30.1,29.0,100.0,22.0,99354.0


In [7]:
# If needed, the asset's raw data can be pulled into a dataframe as well.
# The log below shows this call reading data/raw/<season>/clubs.json for every
# season; the resulting frame includes `season` and `season_file` columns.
clubs.get_stacked_data()

asset - Reading raw data from data/raw/2012/clubs.json
asset - Reading raw data from data/raw/2013/clubs.json
asset - Reading raw data from data/raw/2014/clubs.json
asset - Reading raw data from data/raw/2015/clubs.json
asset - Reading raw data from data/raw/2016/clubs.json
asset - Reading raw data from data/raw/2017/clubs.json
asset - Reading raw data from data/raw/2018/clubs.json
asset - Reading raw data from data/raw/2019/clubs.json
asset - Reading raw data from data/raw/2020/clubs.json
asset - Reading raw data from data/raw/2021/clubs.json


Unnamed: 0,type,href,parent,total_market_value,squad_size,average_age,foreigners_number,foreigners_percentage,national_team_players,stadium_name,stadium_seats,net_transfer_record,coach_name,season,season_file
0,club,/kv-kortrijk/startseite/verein/601,"{'type': 'competition', 'country_id': '19', 'c...",23.63,30,24.6,23.0,76.7 %,7,Guldensporenstadion,9.399 Seats,+£198Th.,Hein Vanhaezebrouck,2012,data/raw/2012/clubs.json
1,club,/ksc-lokeren/startseite/verein/498,"{'type': 'competition', 'country_id': '19', 'c...",,0,,,,0,Daknamstadion,12.000 Seats,+-0,Peter Maes,2012,data/raw/2012/clubs.json
2,club,/vitoria-setubal-fc/startseite/verein/1085,"{'type': 'competition', 'country_id': '136', '...",2.59,28,26.0,9.0,32.1 %,2,Estádio do Bonfim,18.642 Seats,+-0,José Mota,2012,data/raw/2012/clubs.json
3,club,/sv-zulte-waregem/startseite/verein/3508,"{'type': 'competition', 'country_id': '19', 'c...",18.65,30,24.9,15.0,50.0 %,3,Regenboogstadion,12.250 Seats,£-180Th.,Francky Dury,2012,data/raw/2012/clubs.json
4,club,/moreirense-fc/startseite/verein/979,"{'type': 'competition', 'country_id': '136', '...",13.28,26,26.8,18.0,69.2 %,3,E. C. J. de Almeida Freitas,6.153 Seats,+£5.58m,Casquilha,2012,data/raw/2012/clubs.json
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1931,club,/bayer-04-leverkusen/startseite/verein/15,"{'type': 'competition', 'country_id': '40', 'c...",391.01,27,24.8,20.0,74.1 %,14,BayArena,30.210 Seats,£-11.03m,Gerardo Seoane,2021,data/raw/2021/clubs.json
1932,club,/spartak-moskau/startseite/verein/232,"{'type': 'competition', 'country_id': '141', '...",92.66,24,24.5,6.0,25.0 %,8,Otkrytie Bank Arena,45.360 Seats,£-10.82m,Paolo Vanoli,2021,data/raw/2021/clubs.json
1933,club,/arminia-bielefeld/startseite/verein/10,"{'type': 'competition', 'country_id': '40', 'c...",57.20,25,24.9,16.0,64.0 %,4,SchücoArena,26.515 Seats,£-8.37m,Frank Kramer,2021,data/raw/2021/clubs.json
1934,club,/vfl-wolfsburg/startseite/verein/82,"{'type': 'competition', 'country_id': '40', 'c...",212.85,28,25.0,18.0,64.3 %,12,Volkswagen Arena,30.000 Seats,£-53.85m,Florian Kohfeldt,2021,data/raw/2021/clubs.json
