
# Web scraping with an HTML parser: fetching the data of a single table


In [1]:

# Look at robots.txt to see whether any of the website's pages are prohibited from being requested.
# The file is fetched with `requests` instead of shelling out to curl via os.popen:
# that avoids depending on a curl binary and does not leave an unclosed pipe behind.
import os  # kept so the name stays available to any later cell that relies on it
import requests

# A timeout keeps the cell from hanging forever on an unresponsive server.
robots_resp = requests.get("https://www.tennismagazin.de/robots.txt", timeout=10)
print(robots_resp.text)


User-agent: *



In [2]:

# importing libraries:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
from bs4 import BeautifulSoup
import requests
import re


In [3]:

# Request the men's world-ranking page of the tennis website.
# The URL is assigned outside the `try` so it is always defined, and only
# network/HTTP errors are caught: a bare `except` would also hide typos and Ctrl-C.
url = 'https://www.tennismagazin.de/tennis-weltrangliste-herren/'
try:
    # A timeout keeps the cell from hanging forever on an unresponsive server.
    resp = requests.get(url, timeout=30)
    # Turn 4xx/5xx answers into exceptions; previously any status code was reported as success.
    resp.raise_for_status()
except requests.RequestException:
    print("Sorry, Scraping didn't work!")
    # Re-raise: the following cells need a valid `resp` and would otherwise
    # fail later with a confusing NameError.
    raise
else:
    print("Scraping successful! 200 =",resp.status_code)


Scraping successful! 200 = 200


In [4]:

# Build the BeautifulSoup tree from the raw response bytes using the "html.parser" backend.
soup = BeautifulSoup(markup=resp.content, features="html.parser")


In [5]:

# Find the table header: it holds the headings that become the column names.
theads = soup.find_all('thead')
if not theads:
    # Fail with a clear message instead of a bare IndexError on `[0]`.
    raise ValueError("No <thead> element found in the page; the site layout may have changed.")
html_thead = theads[0]
# All rows inside the table head.
html_tr = html_thead.find_all('tr')
# Start from an empty list so `headings` is defined even if the head contains no rows.
headings = []
for t in html_tr:
    # Heading cells of this header row.
    th = t.find_all('th')
    # Strip surrounding whitespace so the names match the stripped cell texts
    # collected from the table rows; with several header rows the last one wins.
    headings = [s.text.strip() for s in th]


In [6]:
# Inspect the raw <thead> element the column names are read from.
html_thead

<thead><tr><th>Rang</th><th>Veränderung</th><th>Name</th><th>Alter</th><th>Punkte</th><th>Turniere</th></tr></thead>

In [7]:
# Column names extracted from the table head.
headings

['Rang', 'Veränderung', 'Name', 'Alter', 'Punkte', 'Turniere']

In [8]:

# Collect every table row as a list of cell strings.

# NOTE: despite its name, `tbody` is the first <table> element of the page (header included).
tbody = soup.table
if tbody is None:
    raise ValueError("No <table> element found in the page; the site layout may have changed.")
# Select the <tr> rows explicitly instead of walking `.contents`: direct children may also be
# whitespace text nodes or a <tbody> wrapper, neither of which represents a single row.
html_text = tbody.find_all('tr')
# One list per row holding the stripped text of its th/td cells.
content = [
    [cell.text.strip() for cell in tr.find_all(['th','td'])]
    for tr in html_text
]


In [9]:
# Preview the first five parsed rows.
content[:5]

[['Rang', 'Veränderung', 'Name', 'Alter', 'Punkte', 'Turniere'],
 ['1', '', 'Novak Djokovic', '36', '11.055', '19'],
 ['2', '', 'Carlos Alcaraz', '20', '8.855', '18'],
 ['3', '', 'Daniil Medvedev', '27', '7.555', '21'],
 ['4', '', 'Jannik Sinner', '22', '6.490', '22']]

In [10]:
# Raw HTML of the entry at index 1 of html_text, to compare against the parsed row.
html_text[1]

<tr><td>1</td><td><span></span></td><td><label style="cursor:default;"><span class="flag-icon flag-icon-rs"></span> <a href="https://www.tennismagazin.de/profil/herren/novak-djokovic/">Novak Djokovic</a></label></td><td>36</td><td>11.055</td><td>19</td></tr>

In [11]:

# Build the DataFrame: the first entry of `content` is skipped and `headings` supplies the column names.
data_rows = content[1:]
df01 = pd.DataFrame(data=data_rows, columns=headings)


In [12]:

# Dimensions of the DataFrame as (rows, columns): 100 rows and 6 columns.
df01.shape


(100, 6)

In [13]:
# Display the DataFrame built from the scraped rows.
df01

Unnamed: 0,Rang,Veränderung,Name,Alter,Punkte,Turniere
0,1,,Novak Djokovic,36,11.055,19
1,2,,Carlos Alcaraz,20,8.855,18
2,3,,Daniil Medvedev,27,7.555,21
3,4,,Jannik Sinner,22,6.490,22
4,5,,Andrey Rublev,26,5.010,25
...,...,...,...,...,...,...
95,96,+20,James Duckworth,31,628,26
96,97,-1,Arthur Rinderknech,28,626,28
97,98,-7,Maximilian Marterer,28,626,21
98,99,+5,Hugo Gaston,23,626,29


In [14]:

# List the current column names.
df01.columns


Index(['Rang', 'Veränderung', 'Name', 'Alter', 'Punkte', 'Turniere'], dtype='object')

In [15]:

# Replace the German column headers with English names.
german_to_english = {
    'Rang': 'Ranking',
    'Veränderung': 'Ranking_Change',
    'Name': 'Players_Name',
    'Alter': 'Age',
    'Punkte': 'Points',
    'Turniere': 'Tournaments',
}
df01 = df01.rename(columns=german_to_english)


In [16]:

# Show the first five rows to check the renamed columns.
df01[:5]


Unnamed: 0,Ranking,Ranking_Change,Players_Name,Age,Points,Tournaments
0,1,,Novak Djokovic,36,11.055,19
1,2,,Carlos Alcaraz,20,8.855,18
2,3,,Daniil Medvedev,27,7.555,21
3,4,,Jannik Sinner,22,6.49,22
4,5,,Andrey Rublev,26,5.01,25


In [17]:

# Count missing values (NaN/None) per column.
# Note: isnull() does not flag empty strings, so blank Ranking_Change entries are not counted here.
df01.isnull().sum()


Ranking           0
Ranking_Change    0
Players_Name      0
Age               0
Points            0
Tournaments       0
dtype: int64

In [18]:

# Rows without a ranking change hold an empty string; store "0" there instead.
# The result is assigned back to the column: replace(..., inplace=True) on the
# `df01["Ranking_Change"]` selection is a chained in-place update, which pandas warns
# about and which no longer modifies `df01` once Copy-on-Write is active.
df01["Ranking_Change"] = df01["Ranking_Change"].replace({"": "0"})


In [19]:
# Display the DataFrame after replacing the empty Ranking_Change values.
df01

Unnamed: 0,Ranking,Ranking_Change,Players_Name,Age,Points,Tournaments
0,1,0,Novak Djokovic,36,11.055,19
1,2,0,Carlos Alcaraz,20,8.855,18
2,3,0,Daniil Medvedev,27,7.555,21
3,4,0,Jannik Sinner,22,6.490,22
4,5,0,Andrey Rublev,26,5.010,25
...,...,...,...,...,...,...
95,96,+20,James Duckworth,31,628,26
96,97,-1,Arthur Rinderknech,28,626,28
97,98,-7,Maximilian Marterer,28,626,21
98,99,+5,Hugo Gaston,23,626,29


In [20]:

# Cast the string columns that hold whole numbers to int in a single call;
# signed strings such as "+20" or "-1" in Ranking_Change are parsed as well.
df01 = df01.astype({
    "Ranking": int,
    "Ranking_Change": int,
    "Age": int,
    "Tournaments": int,
})


In [21]:

# Points use "." as a thousands separator (e.g. "11.055"); drop it so the values can be cast to int.
# regex=False makes "." a literal dot rather than the regex wildcard.
df01["Points"] = df01["Points"].str.replace(".", "", regex=False)


In [22]:

# Convert the cleaned Points strings to integers.
df01["Points"] = df01["Points"].astype(int)


In [23]:
# Summarise the column dtypes and non-null counts.
df01.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Ranking         100 non-null    int32 
 1   Ranking_Change  100 non-null    int32 
 2   Players_Name    100 non-null    object
 3   Age             100 non-null    int32 
 4   Points          100 non-null    int32 
 5   Tournaments     100 non-null    int32 
dtypes: int32(5), object(1)
memory usage: 2.9+ KB


In [24]:

# Summary statistics of the numeric columns.
df01.describe()


Unnamed: 0,Ranking,Ranking_Change,Age,Points,Tournaments
count,100.0,100.0,100.0,100.0,100.0
mean,50.5,0.67,26.87,1555.01,25.75
std,29.011492,5.00314,4.196331,1695.051631,3.599593
min,1.0,-19.0,19.0,621.0,18.0
25%,25.75,-1.0,24.0,736.5,23.0
50%,50.5,0.0,26.0,918.5,26.0
75%,75.25,3.0,29.0,1435.0,28.0
max,100.0,20.0,38.0,11055.0,34.0


In [25]:

# Display the DataFrame after all conversions.
df01


Unnamed: 0,Ranking,Ranking_Change,Players_Name,Age,Points,Tournaments
0,1,0,Novak Djokovic,36,11055,19
1,2,0,Carlos Alcaraz,20,8855,18
2,3,0,Daniil Medvedev,27,7555,21
3,4,0,Jannik Sinner,22,6490,22
4,5,0,Andrey Rublev,26,5010,25
...,...,...,...,...,...,...
95,96,20,James Duckworth,31,628,26
96,97,-1,Arthur Rinderknech,28,626,28
97,98,-7,Maximilian Marterer,28,626,21
98,99,5,Hugo Gaston,23,626,29
