In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [None]:
# Render the member overview with a real browser, because the list view is
# only produced after clicking a button on the page.
chrome_options = webdriver.ChromeOptions()
# options not to use an actual Chrome window (delete the two lines if needed)
chrome_options.add_argument('--headless')
# Chrome documents this switch as "width,height" (comma separated).
chrome_options.add_argument('--window-size=1920,1080')

# add the path to your chromedriver
# `options=` replaces the deprecated `chrome_options=` keyword.
# NOTE(review): recent Selenium 4 releases expect the driver path via a
# Service object instead of a positional argument - confirm against the
# installed Selenium version.
driver = webdriver.Chrome('/path/to/chromedriver', options=chrome_options)

# Any failure now propagates with its real traceback (instead of being
# swallowed and resurfacing later as a NameError on `sel_soup`), and the
# browser is closed on both the success and the failure path.
try:
    driver.get('https://www.bundestag.de/abgeordnete/biografien')

    # explicit wait for button to appear
    button = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, '//*[@title="Listenansicht"]'))
    )
    button.click()

    # explicit wait for new element to verify page is loaded
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, '//*[@title="Bilderansicht"]'))
    )
    sel_soup = BeautifulSoup(driver.page_source, "html.parser")
finally:
    driver.quit()

# isolate relevant part of the html with relevant links to individual pages
# (attrs must be a dict; the former set literal only worked by accident)
html_list = sel_soup.find("ul", {"class": "bt-list-holder"})
if html_list is None:
    raise RuntimeError('no <ul class="bt-list-holder"> found in the rendered page')

In [12]:
# Turn every overlay link inside the list markup into an absolute URL and
# keep only the ones that point at a biography page.
url_list = [
    profile_url
    for profile_url in (
        "https://www.bundestag.de" + anchor.get("href")
        for anchor in html_list.find_all("a", {"class": "bt-open-in-overlay"})
    )
    if 'biografien' in profile_url
]

# number of member pages that will be scraped
len(url_list)

In [None]:
# Scrape every member page into one dict per member and build the frame
# once at the end. DataFrame.append was removed in pandas 2.0, and growing
# a frame row by row is quadratic anyway.
mdb_records = []

for url in tqdm(url_list):
    # Bound each request and fail with a clear HTTP error instead of
    # hanging or parsing an error page.
    r = requests.get(url, timeout=30)
    r.raise_for_status()

    temp_soup = BeautifulSoup(r.content, "html.parser")

    # get title section with name and party of the mdb
    title_div = temp_soup.find('div', attrs={'class': 'col-xs-8 col-md-9 bt-biografie-name'})
    heading = title_div.find("h3") if title_div is not None else None
    if heading is None:
        raise ValueError("no name/party heading found on {}".format(url))
    title = heading.text.strip()

    # isolate name and party from string; a title without a comma yields an
    # empty party instead of raising IndexError
    name, _comma, party = title.partition(",")
    record = {"name": name, "party": party.strip()}

    # loop over external link section of the page and get title and url
    # (the link title, lower-cased, becomes the column name)
    for link in temp_soup.find_all("a", {"class": "bt-link-extern"}):
        category = str(link.get("title")).lower()
        # stored directly so the outer loop variable `url` is not overwritten
        record[category] = str(link.get("href"))

    mdb_records.append(record)

# one row per member; columns missing for a member are filled with NaN
mdb_df = pd.DataFrame(mdb_records)



In [None]:
# inspect dataframe: display the first rows of the scraped table as a quick sanity check
mdb_df.head()

In [None]:
# clean and export data frame as csv
# reindex instead of mdb_df[[...]]: a platform that no scraped page linked to
# has no column in mdb_df, and plain column selection would raise KeyError.
# reindex keeps the fixed column order and fills such columns with NaN.
mdb_df_clean = mdb_df.reindex(
    columns=["name", "party", "homepage", "twitter", "instagram", "facebook", "tiktok", "youtube", "linkedin", "xing"]
)
mdb_df_clean.to_csv("mdb_social_media.csv")