# Scraping Data From a Real Website

In [None]:
# Install Beautiful Soup into the environment of the running kernel.
# NOTE(review): the version is not pinned, and requests / pandas (imported
# in later cells) are assumed to be installed already — confirm.
%pip install beautifulsoup4

In [None]:
# importing necessary libraries

from bs4 import BeautifulSoup
import requests

In [None]:
# Wikipedia page that holds the "largest companies in Europe by revenue" table.
url = "https://en.wikipedia.org/wiki/List_of_largest_companies_in_Europe_by_revenue"

# Download the page. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() turns an HTTP error (403, 404,
# 5xx, ...) into an exception here instead of letting an error page be parsed.
page = requests.get(url, timeout=30)
page.raise_for_status()

# Parse the HTML. Naming a concrete parser ("html.parser", bundled with Python)
# makes the result independent of which optional parsers are installed; the
# bare "html" feature lets Beautiful Soup pick whichever HTML parser it finds.
soup = BeautifulSoup(page.text, "html.parser")

In [None]:
# Print only the beginning of the parsed document to verify successful scraping;
# printing the whole page floods the notebook with the entire HTML source.
print(str(soup)[:1500])

<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-sticky-header-enabled vector-toc-available" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>List of largest companies in Europe by revenue - Wikipedia</title>
<script>(function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-

In [None]:
# Finding the specific table in the parsed content.
# A CSS selector matches the three classes regardless of their order or of any
# extra classes; a multi-class string passed to class_ only matches that exact
# attribute value.
table = soup.select_one("table.wikitable.sortable.col4right")

# Fail here with a clear message rather than with an AttributeError on None
# in a later cell.
if table is None:
    raise ValueError("Could not find the 'wikitable sortable col4right' table on the page")

In [None]:
# Print only the start of the table's HTML to confirm the right table was found;
# the full markup for every row is far too long to read in the notebook output.
print(str(table)[:1500])

<table class="wikitable sortable col4right">
<tbody><tr>
<th>Rank</th>
<th>Company</th>
<th>Industry</th>
<th>Revenue<br/>(US$ billions)</th>
<th width="150">Headquarters
</th></tr>
<tr>
<td>1</td>
<td><a href="/wiki/Volkswagen_Group" title="Volkswagen Group">Volkswagen</a></td>
<td>Automotive</td>
<td>348.408</td>
<td><span data-sort-value="Germany"><span class="flagicon"><span class="mw-image-border" typeof="mw:File"><span><img alt="" class="mw-file-element" data-file-height="600" data-file-width="1000" decoding="async" height="14" src="//upload.wikimedia.org/wikipedia/en/thumb/b/ba/Flag_of_Germany.svg/40px-Flag_of_Germany.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/b/ba/Flag_of_Germany.svg/60px-Flag_of_Germany.svg.png 2x" width="23"/></span></span> </span><a href="/wiki/Germany" title="Germany">Germany</a></span>
</td></tr>
<tr>
<td>2</td>
<td><a href="/wiki/Shell_plc" title="Shell plc">Shell</a></td>
<td>Oil and gas</td>
<td>323.183</td>
<td><span data-sort-value="Un

In [None]:
# Collect every header cell (<th>) of the table; their text becomes the
# column names further down.
europe_titles = table.find_all(name="th")

In [None]:
# Display the header cells found above. Each entry is still a parsed <th>
# element rather than a plain string, so the text has to be extracted next.
europe_titles

[<th>Rank</th>,
 <th>Company</th>,
 <th>Industry</th>,
 <th>Revenue<br/>(US$ billions)</th>,
 <th width="150">Headquarters
 </th>]

In [None]:
# Turn the <th> elements into plain strings: take each header's text and trim
# the surrounding whitespace / newlines.
europe_table_titles = [header.get_text().strip() for header in europe_titles]
print(europe_table_titles)

['Rank', 'Company', 'Industry', 'Revenue(US$ billions)', 'Headquarters']


## Scraping the rows from the table

In [None]:
# Importing pandas for data manipulation and analysis

import pandas as pd

In [None]:
# Despite its name, column_data holds the table's rows: every <tr> element,
# header row included.
column_data = table.find_all(name="tr")

In [None]:
# Display the rows collected above: the first <tr> holds the <th> header cells,
# the following ones hold the <td> data cells.
column_data

[<tr>
 <th>Rank</th>
 <th>Company</th>
 <th>Industry</th>
 <th>Revenue<br/>(US$ billions)</th>
 <th width="150">Headquarters
 </th></tr>,
 <tr>
 <td>1</td>
 <td><a href="/wiki/Volkswagen_Group" title="Volkswagen Group">Volkswagen</a></td>
 <td>Automotive</td>
 <td>348.408</td>
 <td><span data-sort-value="Germany"><span class="flagicon"><span class="mw-image-border" typeof="mw:File"><span><img alt="" class="mw-file-element" data-file-height="600" data-file-width="1000" decoding="async" height="14" src="//upload.wikimedia.org/wikipedia/en/thumb/b/ba/Flag_of_Germany.svg/40px-Flag_of_Germany.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/b/ba/Flag_of_Germany.svg/60px-Flag_of_Germany.svg.png 2x" width="23"/></span></span> </span><a href="/wiki/Germany" title="Germany">Germany</a></span>
 </td></tr>,
 <tr>
 <td>2</td>
 <td><a href="/wiki/Shell_plc" title="Shell plc">Shell</a></td>
 <td>Oil and gas</td>
 <td>323.183</td>
 <td><span data-sort-value="United Kingdom"><span class="fla

In [None]:
# Walk every row after the first one (index 0 is the header row) and collect
# the whitespace-trimmed text of each <td> cell as one list per row.
all_rows_data = []

for row in column_data[1:]:
    row_data = row.find_all("td")
    individual_row_data = []
    for cell in row_data:
        individual_row_data.append(cell.get_text().strip())
    all_rows_data.append(individual_row_data)

In [None]:
# Creating a DataFrame from the extracted data: one row per scraped table row,
# with the header texts as column names.
# all_rows_data already contains every row, so nothing is appended afterwards;
# writing the loop's last row to df.loc[len(df)] again would duplicate the
# final company in the frame.
df = pd.DataFrame(all_rows_data, columns=europe_table_titles)

In [None]:
# Check the first few rows. A bare last expression lets the notebook render
# the DataFrame as a table instead of the plain-text form print() produces.
df.head()

  Rank        Company      Industry Revenue(US$ billions)    Headquarters
0    1     Volkswagen    Automotive               348.408         Germany
1    2          Shell   Oil and gas               323.183  United Kingdom
2    3  TotalEnergies   Oil and gas               218.945          France
3    4       Glencore  Conglomerate               217.829     Switzerland
4    5             BP   Oil and gas               213.032  United Kingdom


In [148]:
# Display the DataFrame built from the scraped rows.
df

Unnamed: 0,Rank,Company,Industry,Revenue(US$ billions),Headquarters
0,1,Volkswagen,Automotive,348.408,Germany
1,2,Shell,Oil and gas,323.183,United Kingdom
2,3,TotalEnergies,Oil and gas,218.945,France
3,4,Glencore,Conglomerate,217.829,Switzerland
4,5,BP,Oil and gas,213.032,United Kingdom
...,...,...,...,...,...
95,96,Lufthansa,Transportation,40.455,Germany
96,97,ThyssenKrupp,Conglomerate,40.027,Germany
97,98,Inditex,Retail,35.799,Spain
98,99,Schneider Electric,Electrical equipment,38.812,France


Now we have our dataset as `df`.

In [149]:
# Show the first ten rows of the dataset.
df.head(10)

Unnamed: 0,Rank,Company,Industry,Revenue(US$ billions),Headquarters
0,1,Volkswagen,Automotive,348.408,Germany
1,2,Shell,Oil and gas,323.183,United Kingdom
2,3,TotalEnergies,Oil and gas,218.945,France
3,4,Glencore,Conglomerate,217.829,Switzerland
4,5,BP,Oil and gas,213.032,United Kingdom
5,6,Stellantis,Automotive,204.908,Netherlands
6,7,BMW,Automotive,168.902,Germany
7,8,Mercedes-Benz Group,Automotive,165.637,Germany
8,9,Électricité de France,Electric utility,151.04,France
9,10,Banco Santander,Financial services,137.244,Spain


In [150]:
# List the column names, which come from the table's header cells.
df.columns

Index(['Rank', 'Company', 'Industry', 'Revenue(US$ billions)', 'Headquarters'], dtype='object')

In [151]:
# Summarise the row count, non-null counts and dtype of every column.
# NOTE(review): the scraped cells are all text, so Rank and Revenue are stored
# as object dtype — presumably they should be converted (e.g. pd.to_numeric)
# before any numeric analysis; confirm.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Rank                   100 non-null    object
 1   Company                100 non-null    object
 2   Industry               100 non-null    object
 3   Revenue(US$ billions)  100 non-null    object
 4   Headquarters           100 non-null    object
dtypes: object(5)
memory usage: 4.0+ KB


In [152]:
# Descriptive summary of every column. Because the columns hold text, the
# summary reports count / unique / top / freq rather than numeric statistics.
df.describe()

Unnamed: 0,Rank,Company,Industry,Revenue(US$ billions),Headquarters
count,100,100,100,100.0,100
unique,100,99,32,100.0,17
top,1,Crédit Agricole,Financial services,348.408,Germany
freq,1,2,25,1.0,26
