
# Web scraping with an HTML parser: fetching data from several tables


In [1]:

# Fetch the site's robots.txt to check whether any of the pages requested below
# are disallowed for crawlers.
import os

# os.popen returns a file-like pipe object; the context manager closes it (and
# reaps the curl child process) as soon as the output has been read.
with os.popen("curl https://www.tennismagazin.de/robots.txt") as robots_pipe:
    print(robots_pipe.read())


User-agent: *



In [2]:

# importing libraries:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
from bs4 import BeautifulSoup
import requests
import re
from datetime import date


In [4]:

# Scrape the men's and women's world ranking tables from these two pages:
# https://www.tennismagazin.de/tennis-weltrangliste-herren/
# https://www.tennismagazin.de/tennis-weltrangliste-der-damen/
list_sites = ["herren", "der-damen"]

# German table headings used on the site -> English column names.
COLUMN_NAMES = {'Rang': 'Ranking', 'Veränderung': 'Ranking_Change', 'Name': 'Players_Name',
                'Alter': 'Age', 'Punkte': 'Points', 'Turniere': 'Tournaments'}
# Columns that are converted to integers after cleaning.
NUMERIC_COLUMNS = ['Ranking', 'Ranking_Change', 'Age', 'Points', 'Tournaments']
# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 30


def fetch_ranking_page(site):
    """Download one ranking page and return it as a parsed BeautifulSoup tree.

    Parameters
    ----------
    site : str
        URL suffix of the ranking page ("herren" or "der-damen").

    Raises
    ------
    requests.RequestException
        If the request fails or the server answers with an error status.
        The error is re-raised on purpose: continuing would either crash later
        on an undefined response or silently re-parse the previous site's page.
    """
    url = f'https://www.tennismagazin.de/tennis-weltrangliste-{site}/'
    try:
        resp = requests.get(url, timeout=REQUEST_TIMEOUT)
        # turn 4xx/5xx answers into exceptions instead of parsing an error page
        resp.raise_for_status()
    except requests.RequestException:
        print("Sorry, Scraping didn't work!")
        raise
    print("Scraping successful! 200 =", resp.status_code)
    return BeautifulSoup(resp.content, "html.parser")


def extract_table(soup):
    """Return ``(headings, rows)`` of the first table on the page.

    ``headings`` holds the texts of the ``<th>`` cells of the last row of the
    table head; ``rows`` holds one list of cell texts per remaining table row.

    Raises
    ------
    ValueError
        If the page contains no table or the table head has no row.
    """
    table = soup.table
    if table is None:
        raise ValueError("no <table> element found on the page")

    # the table head carries the column names
    theads = soup.find_all('thead')
    header_rows = theads[0].find_all('tr') if theads else []
    if not header_rows:
        raise ValueError("the ranking table has no header row")
    headings = [th.text.strip() for th in header_rows[-1].find_all('th')]

    # every other row is a data row; header rows are skipped by object identity
    # so this works whether or not the rows are wrapped in a <tbody>
    header_ids = {id(tr) for tr in header_rows}
    rows = []
    for tr in table.find_all('tr'):
        if id(tr) in header_ids:
            continue
        cells = [cell.text.strip() for cell in tr.find_all(['th', 'td'])]
        if cells:  # ignore rows without any cell
            rows.append(cells)
    return headings, rows


def build_ranking_frame(headings, rows):
    """Build a typed ranking DataFrame from scraped headings and rows.

    The German headings are mapped to English names, the "." thousands
    separator is removed from the numeric columns only (player names keep
    their dots), an empty ranking change becomes 0, and the numeric columns
    are converted to integers.
    """
    columns = [COLUMN_NAMES.get(heading, heading) for heading in headings]
    numeric_positions = {pos for pos, col in enumerate(columns) if col in NUMERIC_COLUMNS}
    cleaned = [
        [cell.replace(".", "") if pos in numeric_positions else cell
         for pos, cell in enumerate(cells)]
        for cells in rows
    ]
    ranking = pd.DataFrame(cleaned, columns=columns)

    # no ranking change gets the value 0; assign the result back instead of
    # calling replace(..., inplace=True) on the selected column, which does not
    # update the frame when pandas Copy-on-Write is active
    ranking["Ranking_Change"] = ranking["Ranking_Change"].replace({"": "0"})

    # convert the text columns to integers
    return ranking.astype({col: int for col in NUMERIC_COLUMNS})


# one date for all file names, fixed before the loop so every file of a run
# carries the same date
date_today = date.today()

# scraped frames by site, kept in memory for the merge below
rankings = {}

for site in list_sites:
    soup = fetch_ranking_page(site)
    headings, rows = extract_table(soup)
    df01 = build_ranking_frame(headings, rows)

    # save the dataframe as csv-file:
    df01.to_csv(f"{site}{date_today}.csv", index=False)

    print(df01.head())
    rankings[site] = df01

# give the columns of each frame individual names (the frames are taken from
# memory; re-reading the CSV files written a moment ago is not needed):
df_men = rankings["herren"].rename(
    columns={col: f"{col}_Men" for col in COLUMN_NAMES.values()})
df_women = rankings["der-damen"].rename(
    columns={col: f"{col}_Women" for col in COLUMN_NAMES.values()})

# putting the 2 frames side by side and saving the result:
df_ges = pd.concat([df_men, df_women], axis=1)
df_ges.to_csv(f"Ranking{date_today}_all.csv", index=False)


Scraping successful! 200 = 200
   Ranking  Ranking_Change      Players_Name  Age  Points  Tournaments
0        1               0    Novak Djokovic   36   11055           19
1        2               0    Carlos Alcaraz   20    8855           18
2        3               0   Daniil Medvedev   27    7555           21
3        4               0     Jannik Sinner   22    6490           22
4        5               0     Andrey Rublev   26    5010           25
Scraping successful! 200 = 200
   Ranking  Ranking_Change      Players_Name  Age  Points  Tournaments
0        1               0       Iga Swiatek   22    9880           19
1        2               0   Aryna Sabalenka   25    8905           16
2        3               1    Elena Rybakina   24    6811           18
3        4              -1        Coco Gauff   19    6660           19
4        5               0    Jessica Pegula   29    5905           20


In [None]:
# Show the men's ranking frame (its columns carry the `_Men` suffix).
df_men

In [None]:
# Show the women's ranking frame (its columns carry the `_Women` suffix).
df_women

In [None]:
# Show the combined frame: men's and women's columns concatenated side by side.
df_ges