# Web Scraping Multiple Pages

In [115]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import unidecode

#### Retrieve the Wikipedia page for "Python" and create a list of the links on that page: url = 'https://en.wikipedia.org/wiki/Python'

In [2]:
# Page whose internal links are collected in the cells below.
url = "https://en.wikipedia.org/wiki/Python"

In [3]:
# Download the page. requests waits indefinitely by default, so pass an
# explicit timeout (seconds) to keep the cell from hanging on a dead server.
response = requests.get(url, timeout=10)
# Last expression of the cell: shows the HTTP status code (200 on success).
response.status_code

200

In [4]:
# Parse the downloaded bytes with Python's built-in HTML parser.
soup = BeautifulSoup(response.content, features="html.parser")

In [14]:
# Turn every in-wiki link into an absolute URL. The selector only matches
# hrefs that already start with "/wiki", so the host prefix must NOT end with
# a slash; otherwise the result is "https://en.wikipedia.org//wiki/...".
links = [
    'https://en.wikipedia.org' + anchor['href']
    for anchor in soup.select('a[href^="/wiki"]')
]
# Last expression of the cell: number of links found.
len(links)

60

#### Find the number of titles that have changed in the United States Code since its last release point: url = 'https://uscode.house.gov/download/download.shtml'

In [18]:
# US Code download page.
url = 'https://uscode.house.gov/download/download.shtml'
# requests waits indefinitely by default; the explicit timeout (seconds)
# keeps the cell from hanging if the server never answers.
response = requests.get(url, timeout=10)
# Last expression of the cell: shows the HTTP status code.
response.status_code

200

In [22]:
# Build the parse tree for the US Code download page.
soup = BeautifulSoup(markup=response.content, features="html.parser")

In [21]:
# Count the elements with class "usctitlechanged" inside ".uscitemlist".
# select() already returns a list-like result, so it can be measured directly
# without copying it through a comprehension first.
len(soup.select('.uscitemlist .usctitlechanged'))

14

#### Create a Python list with the names on the FBI's Ten Most Wanted list: url = 'https://www.fbi.gov/wanted/topten'

In [23]:
# FBI "Ten Most Wanted" listing page.
url = 'https://www.fbi.gov/wanted/topten'
# requests waits indefinitely by default; the explicit timeout (seconds)
# keeps the cell from hanging if the server never answers.
response = requests.get(url, timeout=10)
# Last expression of the cell: shows the HTTP status code.
response.status_code

200

In [24]:
# Parse the FBI listing page with the standard-library HTML parser.
soup = BeautifulSoup(response.content, features="html.parser")

In [31]:
# Collect the link text of every "h3.title a" element under ".full-grid".
most_wanted_names = [
    name_link.get_text()
    for name_link in soup.select('.full-grid h3.title a')
]
# Last expression of the cell: how many names were collected.
len(most_wanted_names)

10

#### Display the 20 latest earthquakes reported by the EMSC (date, time, latitude, longitude and region name) as a pandas DataFrame: url = 'https://www.emsc-csem.org/Earthquake/'

In [56]:
# EMSC earthquake listing page.
url = 'https://www.emsc-csem.org/Earthquake/'
# requests waits indefinitely by default; the explicit timeout (seconds)
# keeps the cell from hanging if the server never answers.
response = requests.get(url, timeout=10)
# Last expression of the cell: shows the HTTP status code.
response.status_code

200

In [57]:
# Parse the EMSC earthquake table page.
soup = BeautifulSoup(markup=response.content, features="html.parser")

In [71]:
# Take the first 20 date/time links and split each text on whitespace, which
# yields a list whose first item is the date and second item is the time.
date_times = [
    stamp_link.get_text().split()
    for stamp_link in soup.select('#tbody td.tabev6 > b > a')[:20]
]

dates = [parts[0] for parts in date_times]
times = [parts[1] for parts in date_times]
# Last expression of the cell: number of rows captured.
len(date_times)

20

In [70]:
# The 5th and 6th cells of each table row are read for the first 20 rows and
# joined as "<value> <direction>" (e.g. "5.52 N").
lattitude_values = [
    cell.get_text().strip()
    for cell in soup.select('#tbody tr td:nth-child(5)')[:20]
]
directions = [
    cell.get_text().strip()
    for cell in soup.select('#tbody tr td:nth-child(6)')[:20]
]

lattitudes = [
    f'{value} {direction}'
    for value, direction in zip(lattitude_values, directions)
]
# Last expression of the cell: number of latitude strings built.
len(lattitudes)

20

In [74]:
# The 7th and 8th cells of each table row are read for the first 20 rows and
# joined as "<value> <direction>" (e.g. "126.63 E").
# Note: `directions` is rebound here, replacing the values read for latitude.
longitude_values = [
    cell.get_text().strip()
    for cell in soup.select('#tbody tr td:nth-child(7)')[:20]
]
directions = [
    cell.get_text().strip()
    for cell in soup.select('#tbody tr td:nth-child(8)')[:20]
]

longitudes = [
    f'{value} {direction}'
    for value, direction in zip(longitude_values, directions)
]
# Last expression of the cell: number of longitude strings built.
len(longitudes)

20

In [79]:
# Region names: stripped text of the first 20 ".tb_region" cells.
regions = [
    region_cell.get_text().strip()
    for region_cell in soup.select('#tbody .tb_region')[:20]
]
# Last expression of the cell: number of region names collected.
len(regions)

20

In [80]:
# Assemble the scraped columns into one table; keyword order fixes the
# column order.
# NOTE(review): the column name 'lattitude' is misspelled ("latitude"). It is
# kept unchanged because it is the name shown in the recorded output.
data = pd.DataFrame(dict(
    date=dates,
    time=times,
    lattitude=lattitudes,
    longitude=longitudes,
    region=regions,
))

# Preview the first five rows.
data.head()

Unnamed: 0,date,time,lattitude,longitude,region
0,2023-01-09,23:32:38.0,5.52 N,126.63 E,"MINDANAO, PHILIPPINES"
1,2023-01-09,23:25:53.4,43.16 S,173.11 E,"OFF E. COAST OF S. ISLAND, N.Z."
2,2023-01-09,23:23:52.3,31.67 N,104.41 W,WESTERN TEXAS
3,2023-01-09,23:00:00.4,35.60 N,25.64 E,"CRETE, GREECE"
4,2023-01-09,22:56:50.9,39.36 N,26.23 E,NEAR THE COAST OF WESTERN TURKEY


#### List all language names and number of related articles in the order they appear in wikipedia.org: url = 'https://www.wikipedia.org/'

In [144]:
# Wikipedia portal page listing the featured languages.
url = 'https://www.wikipedia.org/'
# requests waits indefinitely by default; the explicit timeout (seconds)
# keeps the cell from hanging if the server never answers.
response = requests.get(url, timeout=10)
# Last expression of the cell: shows the HTTP status code.
response.status_code

200

In [145]:
# Parse the Wikipedia portal page.
soup = BeautifulSoup(response.content, features="html.parser")

In [146]:
# Language names: text of the <strong> elements at positions 1 through 10.
# The element at index 0 is skipped (presumably a non-language heading;
# confirm against the live page).
languages = [
    strong_tag.get_text()
    for strong_tag in soup.select('strong')[1:11]
]

In [147]:
# Article counts for the featured languages, passed through unidecode
# (presumably to normalise non-ASCII spacing in the counts; confirm).
related_articles = [
    unidecode.unidecode(count_tag.get_text())
    for count_tag in soup.select('.central-featured-lang a small bdi')
]

In [148]:
# Pair each language with its article count as "<language>: <count>".
# Both sides are already strings, so a plain join builds the same text.
languages_articles = [
    ': '.join(pair) for pair in zip(languages, related_articles)
]

In [151]:
# Rewrite the second-to-last entry with count and language swapped
# ("<count>: <language>").
# NOTE(review): presumably a workaround for how the right-to-left language
# name renders next to the digits; confirm. The hard-coded index -2 depends
# on the current page order.
languages_articles[-2] = ': '.join((related_articles[-2], languages[-2]))
# Last expression of the cell: display the final list.
languages_articles

['English: 6 585 000+',
 '日本語: 1 354 000+',
 'Русский: 1 875 000+',
 'Français: 2 477 000+',
 'Deutsch: 2 751 000+',
 'Español: 1 823 000+',
 'Italiano: 1 785 000+',
 '中文: 1 323 000+',
 '941 000+: فارسی',
 'Português: 1 096 000+']

#### Create a list of the different kinds of datasets available on data.gov.uk: url = 'https://data.gov.uk/'

In [154]:
# data.gov.uk landing page.
url = 'https://data.gov.uk/'
# requests waits indefinitely by default; the explicit timeout (seconds)
# keeps the cell from hanging if the server never answers.
response = requests.get(url, timeout=10)
# Last expression of the cell: shows the HTTP status code.
response.status_code

200

In [155]:
# Parse the data.gov.uk landing page.
soup = BeautifulSoup(markup=response.content, features="html.parser")

In [159]:
# Dataset category names: text of every <h3> under ".govuk-grid-column-full".
# Left as a bare expression so the notebook displays the resulting list.
[
    heading.get_text()
    for heading in soup.select('.govuk-grid-column-full h3')
]

['Business and economy',
 'Crime and justice',
 'Defence',
 'Education',
 'Environment',
 'Government',
 'Government spending',
 'Health',
 'Mapping',
 'Society',
 'Towns and cities',
 'Transport',
 'Digital service performance',
 'Government reference data']

#### Display the top 10 languages by number of native speakers stored in a pandas dataframe: url = 'https://en.wikipedia.org/wiki/List_of_languages_by_number_of_native_speakers'

In [161]:
# Wikipedia list of languages ranked by number of native speakers.
url = 'https://en.wikipedia.org/wiki/List_of_languages_by_number_of_native_speakers'
# requests waits indefinitely by default; the explicit timeout (seconds)
# keeps the cell from hanging if the server never answers.
response = requests.get(url, timeout=10)
# Last expression of the cell: shows the HTTP status code.
response.status_code

200

In [162]:
# Parse the native-speakers list page.
soup = BeautifulSoup(response.content, features="html.parser")

In [178]:
# First ten language names: the first-child link in column 1 of the wikitable rows.
top10_languages = [
    name_link.get_text()
    for name_link in soup.select('.wikitable tbody tr td:nth-child(1) a:nth-child(1)')[:10]
]
# First ten values from column 2 of the same rows, with whitespace stripped.
natives_speakers = [
    count_cell.get_text().strip()
    for count_cell in soup.select('.wikitable tbody tr td:nth-child(2)')[:10]
]

In [179]:
# Show the two scraped columns side by side as a table. Left as a bare
# expression so the notebook renders the DataFrame.
pd.DataFrame(dict(
    language=top10_languages,
    nb_native_speakers=natives_speakers,
))

Unnamed: 0,language,nb_native_speakers
0,Mandarin Chinese,920.0
1,Spanish,475.0
2,English,373.0
3,Hindi,344.0
4,Bengali,234.0
5,Portuguese,232.0
6,Russian,154.0
7,Japanese,125.0
8,Yue Chinese,85.2
9,Vietnamese,84.6
