In [20]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

import pandas as pd
import numpy as np

In [136]:
class Bundesliga(object):

    '''
    Web scraper for the match statistics pages of www.bundesliga.com.

    Example: Bundesliga().get_spieltag("2015/2016", 12)
    Output: pandas DataFrame with all match and team stats for the requested
    matchday ("Spieltag") of the requested season.

    Each instance owns one Chrome session. ``get_spieltag`` always quits that
    session before it returns or raises, so create a new instance per call.
    '''

    # Deliberately kept as module-level globals (not class attributes): code
    # outside the class may read them under these names.
    global selenium_path, url

    selenium_path = '/Users/Felix/Dropbox/03_Python/_seleniumdriver/chrome_mac/chromedriver' # change path if using other machine
    # IMPORTANT: this link always opens the statistics page of the most recent
    # matchday; it is only the entry point from which season and matchday are
    # then selected.
    url = "http://www.bundesliga.com/de/bundesliga/spieltag/2014-2015/?d=2#"

    def __init__(self, driver_path=None, start_url=None):
        '''Start a Chrome session.

        Args:
            driver_path: optional path to the chromedriver executable;
                defaults to the module-level ``selenium_path``.
            start_url: optional entry URL; defaults to the module-level ``url``.
        '''
        self.url = start_url or url
        self.driver_path = driver_path or selenium_path
        self.driver = self._init_driver()

    def get_spieltag(self, saison, spieltag):

        '''Collect the statistics of every available match of one matchday.

        Args:
            saison: season label exactly as shown in the season drop-down,
                e.g. "2012/2013" (string).
            spieltag: matchday number, e.g. 3 (integer).

        Returns:
            pandas DataFrame with one "Home" and one "Away" row per match.
            Columns are Datum, Titel, Saison, Spieltag, Spiel, Teamname, Tore
            and Coach plus one column per team statistic. An empty DataFrame
            is returned when the page lists no match links.

        Raises:
            Any exception raised by Selenium or by the page script lookup is
            propagated. The browser session is quit in every case.
        '''

        timeout = 5
        timestep = 0.05

        XPATH_SEASON = "//select[@id='season']/option[text()= %r]" %saison
        XPATH_SPIELTAG = "//*[contains(@class, 'has-data') and text() ='%d']" %spieltag
        XPATH_GAMES = "//*[contains(text(), 'mehr zum Spiel')]" # tab "mehr zum Spiel"
        XPATH_DATE = "//span[@class='longDate']"
        XPATH_GAMEHEADER = "//h3/span[@class='longHead']"

        game_frames = []  # one two-row frame per match, concatenated once at the end

        try:
            ### Load the Bundesliga page that carries the statistics
            self._load_homepage()

            ### Select the requested season and confirm with the load button
            self._wait_visible(XPATH_SEASON, timeout, timestep).click()
            self._wait_visible("//input[@id='load']", timeout, timestep).click()

            ### Select the requested matchday
            self._wait_visible(XPATH_SPIELTAG, timeout, timestep).click()

            ### Number of matches with data (e.g. Sunday matches may not be available yet)
            anz_spiele = len(self.driver.find_elements(By.XPATH, XPATH_GAMES))

            ### Date and kick-off time of the matches, in page order
            datum_uhrzeiten = [datum.text for datum in self.driver.find_elements(By.XPATH, XPATH_DATE)]

            ### Visit every available match (usually 9)
            for i in range(anz_spiele):

                # Look the links up again on every pass: the page is
                # re-rendered after returning to the overview.
                self.driver.find_elements(By.XPATH, XPATH_GAMES)[i].click()

                # Wait until the match title is visible before reading data
                self._wait_visible(XPATH_GAMEHEADER, timeout, timestep)

                spiel_header = self.driver.find_elements(By.XPATH, XPATH_GAMEHEADER)[0].text
                spiel_datum = datum_uhrzeiten[i]

                # The page exposes its data as the JavaScript variable matchData
                result = self.driver.execute_script(""" return matchData""")

                game_frames.append(
                    self._build_match_frame(result, spiel_header, spiel_datum, saison, spieltag, i + 1)
                )

                ## Go back to the matchday overview
                self._wait_visible(XPATH_SPIELTAG, timeout, timestep).click()
        finally:
            # Quit on success AND on failure so no Chrome process is leaked.
            self._quit_driver()

        if not game_frames:
            return pd.DataFrame()
        return pd.concat(game_frames, axis = 0)

    @staticmethod
    def _build_match_frame(result, spiel_header, spiel_datum, saison, spieltag, spiel_nr):
        '''Turn one ``matchData`` dict into a two-row (Home/Away) DataFrame.'''
        home_key, away_key = str(result["homeID"]), str(result["awayID"])
        home, away = result["teams"][home_key], result["teams"][away_key]

        stats_home = home["teamStats"]
        stats_away = away["teamStats"]

        df_home = pd.DataFrame([list(stats_home.values())], index = ["Home"], columns = list(stats_home.keys())).T
        df_away = pd.DataFrame([list(stats_away.values())], index = ["Away"], columns = list(stats_away.keys())).T

        # Default (inner) merge on the statistic names: only statistics that
        # are present for both teams are kept.
        df_stats = df_home.merge(df_away, left_index=True, right_index=True).T

        df_other_stats = pd.DataFrame({"Datum": [spiel_datum, spiel_datum],
                                       "Titel": [spiel_header, spiel_header],
                                       "Saison": [saison, saison],
                                       "Spieltag": [spieltag, spieltag],
                                       "Spiel": [spiel_nr, spiel_nr],
                                       "Teamname": [home["teamName"], away["teamName"]],
                                       "Tore": [home["teamScore"], away["teamScore"]],
                                       "Coach": [home["coach"], away["coach"]]},
                                      index = ["Home", "Away"])

        return pd.concat([df_other_stats, df_stats], axis = 1)

    def _wait_visible(self, xpath, timeout=5, timestep=0.05):
        '''Wait until the element at ``xpath`` is visible and return it.'''
        return WebDriverWait(self.driver, timeout, timestep).until(
            EC.visibility_of_element_located((By.XPATH, xpath))
        )

    def _init_driver(self):

        '''Initializes the Selenium Chrome webdriver (maximized window).'''
        chrome_options = webdriver.ChromeOptions()
        # NOTE(review): value 2 presumably means "block" for cookies and images - confirm.
        chrome_options.add_experimental_option("prefs", {"profile.default_content_settings.cookies": 2, "profile.managed_default_content_settings.images": 2})
        try:
            # Selenium 4 signature: the driver path is wrapped in a Service.
            from selenium.webdriver.chrome.service import Service
            driver = webdriver.Chrome(service=Service(self.driver_path), options=chrome_options)
        except (ImportError, TypeError):
            # Selenium 3 signature (no ``service`` keyword).
            driver = webdriver.Chrome(self.driver_path, chrome_options=chrome_options)
        driver.maximize_window()
        return driver

    def _load_homepage(self):

        '''Load the entry URL and accept the cookie banner (both best-effort).

        Failures are reported on stdout instead of raised; a page that did not
        load surfaces later as a failing wait in the caller.
        '''
        try:
            self.driver.get(self.url)
        except Exception:
            print("Could not load url")
            return

        ## Accept cookies
        XPATH_COOKIE = "//span[contains(@class, 'ok')]"
        try:
            self._wait_visible(XPATH_COOKIE, 5, 0.25).click()
        except Exception:
            # A missing banner is not a load failure, so report it separately.
            print("Could not accept cookie banner")

    def _quit_driver(self):
        '''Close the browser and end the webdriver session.'''
        self.driver.quit()
  

## Lade Daten von Bundesliga.DE

In [137]:
### Scrape every matchday of every season from bundesliga.de
seasons = ["2009/2010","2010/2011","2011/2012","2012/2013", "2013/2014","2014/2015","2015/2016", "2016/2017"]
spieltage = np.arange(1,35,1)

# One frame per (season, matchday). They are concatenated once at the end
# instead of growing all_data inside the loop.
matchday_frames = []
try:
    for season in seasons:
        print(season)
        for spieltag in spieltage:

            app = None  # stays None if the constructor itself fails
            try:
                app = Bundesliga()
                data = app.get_spieltag(season,spieltag)
                print(spieltag, " loaded")
            except Exception:
                # Keep a placeholder row so missing matchdays stay visible in the result.
                data = pd.DataFrame([season, spieltag, "Data not loaded"], index = ["Saison", "Spieltag", "Teamname"] ).T
                print(spieltag, " not loaded. Error!")
                # Close the browser of the failed scraper. The previous code
                # called quit() on an undefined name `driver`, which raised
                # NameError and aborted the whole run.
                if app is not None:
                    try:
                        app.driver.quit()
                    except Exception:
                        pass  # session may already be closed; nothing left to clean up

            matchday_frames.append(data)
finally:
    # Build all_data even when the run is interrupted, so partial results survive.
    all_data = pd.concat(matchday_frames, axis = 0) if matchday_frames else pd.DataFrame()

2009/2010
1  loaded
2  loaded
3  loaded
4  loaded
5  loaded
6  loaded
7  loaded
8  loaded
9  loaded
10  loaded
11  loaded
12  loaded
13  loaded
14  loaded
15  loaded
16  loaded
17  loaded
18  loaded
19  loaded
20  loaded
21  loaded
22  loaded
23  loaded
24  loaded
25  loaded
26  loaded
27  loaded
28  loaded
29  loaded
30  loaded
31  loaded
32  loaded
33  loaded
34  loaded
2010/2011
1  loaded
2  loaded
3  loaded
4  loaded
5  loaded
6  loaded
7  loaded
8  loaded
9  loaded
10  loaded
11  loaded
12  loaded
13  loaded
14  loaded
15  loaded
16  loaded
17  loaded
18  loaded
19  loaded
20  loaded
21  loaded
22  loaded
23  loaded
24  loaded
25  loaded
26  loaded
27  loaded
28  loaded
29  loaded
30  loaded
31  loaded
32  loaded
33  loaded
34  loaded
2011/2012
1  loaded
2  loaded
3  loaded
4  loaded
5  loaded
6  loaded
7  loaded
8  loaded
9  loaded
10  loaded
11  loaded
12  loaded
13  loaded
14  loaded
15  loaded
16  loaded
17  loaded
18  loaded
19  loaded
20  loaded
21  loaded
22  loaded
23  loa

In [138]:
import pickle
# Serialize the raw scrape result to disk (binary pickle, highest protocol)
with open('Spieltagsdaten_Bundesliga_de_raw.pickle', 'wb') as handle:
    pickle.Pickler(handle, protocol=pickle.HIGHEST_PROTOCOL).dump(all_data)