In [1]:
# 1. import libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd


### 1. A list of the different kinds of datasets available on data.gov.uk: url = 'https://data.gov.uk/'

In [2]:
# 1. URL of the data.gov.uk landing page that will be downloaded and parsed
url = 'https://www.data.gov.uk/'

In [3]:
# 2. download html with a get request

# A timeout (in seconds) keeps this cell from hanging forever when the
# server never answers; by default requests waits without any limit.
response = requests.get(url, timeout=10)
response.status_code # 200 status code means OK!

200

In [4]:
# 3.1. parse the downloaded response body into a BeautifulSoup tree (the 'soup')
soup = BeautifulSoup(response.content, "html.parser")

In [5]:
# 3.2. check that the html code looks like it should
#soup

In [6]:
# 4. retrieve/extract the desired info (here, you'll paste the "Selector")

soup.select('ul > li > h3 > a')


# Below is the full selector of a single item.
# Through trial and error it was shortened to the more general selector used above.
#'#main-content > div:nth-child(3) > div > ul > li:nth-child(10) > h3 > a'


[<a class="govuk-link" href="/search?filters%5Btopic%5D=Business+and+economy">Business and economy</a>,
 <a class="govuk-link" href="/search?filters%5Btopic%5D=Crime+and+justice">Crime and justice</a>,
 <a class="govuk-link" href="/search?filters%5Btopic%5D=Defence">Defence</a>,
 <a class="govuk-link" href="/search?filters%5Btopic%5D=Education">Education</a>,
 <a class="govuk-link" href="/search?filters%5Btopic%5D=Environment">Environment</a>,
 <a class="govuk-link" href="/search?filters%5Btopic%5D=Government">Government</a>,
 <a class="govuk-link" href="/search?filters%5Btopic%5D=Government+spending">Government spending</a>,
 <a class="govuk-link" href="/search?filters%5Btopic%5D=Health">Health</a>,
 <a class="govuk-link" href="/search?filters%5Btopic%5D=Mapping">Mapping</a>,
 <a class="govuk-link" href="/search?filters%5Btopic%5D=Society">Society</a>,
 <a class="govuk-link" href="/search?filters%5Btopic%5D=Towns+and+cities">Towns and cities</a>,
 <a class="govuk-link" href="/search?f

In [7]:
# Now we need to get the text from each link. We'll do it in the following steps

# Run the selector once and keep the matched <a> tags
# (querying the soup a second time just to count the matches is redundant).
topics_list = soup.select("ul > li > h3 > a")

# Pull the text out of every matched tag. Iterating over the result set
# directly adapts to however many matches there are, with no index bookkeeping.
topics = [link.get_text() for link in topics_list]

print(topics)


['Business and economy', 'Crime and justice', 'Defence', 'Education', 'Environment', 'Government', 'Government spending', 'Health', 'Mapping', 'Society', 'Towns and cities', 'Transport', 'Digital service performance', 'Government reference data']


In [8]:
# Now we create a DataFrame with a single 'topics' column holding the scraped values

datasets = pd.DataFrame({'topics':topics})

In [9]:
# Preview at most the first 15 rows of the topics table
datasets.head(15)

Unnamed: 0,topics
0,Business and economy
1,Crime and justice
2,Defence
3,Education
4,Environment
5,Government
6,Government spending
7,Health
8,Mapping
9,Society


### 2. Create a Python list with the names on the FBI's top ten Most Wanted list: url = 'https://www.fbi.gov/wanted/topten'

In [10]:
# URL of the FBI "top ten" wanted page that will be downloaded and parsed
url2 = 'https://www.fbi.gov/wanted/topten'

In [11]:
# Download the page. A timeout (in seconds) keeps this cell from hanging
# forever when the server never answers; by default requests waits without limit.
response2 = requests.get(url2, timeout=10)
response2.status_code # 200 status code means OK!

200

In [12]:
# Parse the downloaded response body into a BeautifulSoup tree
soup2 = BeautifulSoup(response2.content, "html.parser")

In [13]:
#soup2

In [14]:
# Select every <a> tag that is a direct child of an <h3> heading
soup2.select("h3>a")

[<a href="https://www.fbi.gov/wanted/topten/omar-alexander-cardenas">OMAR ALEXANDER CARDENAS</a>,
 <a href="https://www.fbi.gov/wanted/topten/alexis-flores">ALEXIS FLORES</a>,
 <a href="https://www.fbi.gov/wanted/topten/yulan-adonay-archaga-carias">YULAN ADONAY ARCHAGA CARIAS</a>,
 <a href="https://www.fbi.gov/wanted/topten/bhadreshkumar-chetanbhai-patel">BHADRESHKUMAR CHETANBHAI PATEL</a>,
 <a href="https://www.fbi.gov/wanted/topten/alejandro-castillo">ALEJANDRO ROSALES CASTILLO</a>,
 <a href="https://www.fbi.gov/wanted/topten/ruja-ignatova">RUJA IGNATOVA</a>,
 <a href="https://www.fbi.gov/wanted/topten/arnoldo-jimenez">ARNOLDO JIMENEZ</a>,
 <a href="https://www.fbi.gov/wanted/topten/jose-rodolfo-villarreal-hernandez">JOSE RODOLFO VILLARREAL-HERNANDEZ</a>,
 <a href="https://www.fbi.gov/wanted/topten/michael-james-pratt">MICHAEL JAMES PRATT</a>,
 <a href="https://www.fbi.gov/wanted/topten/rafael-caro-quintero">RAFAEL CARO-QUINTERO</a>]

In [15]:
# We repeat the same steps as in the previous example

# Run the selector once and keep the matched <a> tags
# (querying the soup a second time just to count the matches is redundant).
names_list = soup2.select("h3>a")

# Collect the text of every matched tag. Iterating over the result set
# directly adapts to however many matches there are, with no hard-coded count.
names = [link.get_text() for link in names_list]

print(names)

['OMAR ALEXANDER CARDENAS', 'ALEXIS FLORES', 'YULAN ADONAY ARCHAGA CARIAS', 'BHADRESHKUMAR CHETANBHAI PATEL', 'ALEJANDRO ROSALES CASTILLO', 'RUJA IGNATOVA', 'ARNOLDO JIMENEZ', 'JOSE RODOLFO VILLARREAL-HERNANDEZ', 'MICHAEL JAMES PRATT', 'RAFAEL CARO-QUINTERO']


In [16]:
# Store the scraped names in a DataFrame with a single 'names' column,
# then preview at most the first 10 rows
fbi_most_wanted = pd.DataFrame({'names':names})
fbi_most_wanted.head(10)

Unnamed: 0,names
0,OMAR ALEXANDER CARDENAS
1,ALEXIS FLORES
2,YULAN ADONAY ARCHAGA CARIAS
3,BHADRESHKUMAR CHETANBHAI PATEL
4,ALEJANDRO ROSALES CASTILLO
5,RUJA IGNATOVA
6,ARNOLDO JIMENEZ
7,JOSE RODOLFO VILLARREAL-HERNANDEZ
8,MICHAEL JAMES PRATT
9,RAFAEL CARO-QUINTERO


In [23]:
# convert names to lowercase

def lowercase(df, column='names'):
    """Lowercase the text stored in one column of ``df``.

    The column is overwritten on ``df`` itself (the frame is modified in
    place), and that same frame is returned so the call can be used as the
    last expression of a cell.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``column``.
    column : str, default 'names'
        Name of the text column to convert.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with ``column`` lowercased.

    Raises
    ------
    KeyError
        If ``column`` is not a column of ``df``.
    """
    # The vectorised .str accessor leaves missing values as missing, whereas
    # calling .lower() on each element one by one raises on None/NaN.
    df[column] = df[column].str.lower()

    return df

In [24]:
# Display the lowercased table. Note that lowercase() also overwrites the
# 'names' column of fbi_most_wanted itself, not just the returned frame.
lowercase(fbi_most_wanted)

Unnamed: 0,names
0,omar alexander cardenas
1,alexis flores
2,yulan adonay archaga carias
3,bhadreshkumar chetanbhai patel
4,alejandro rosales castillo
5,ruja ignatova
6,arnoldo jimenez
7,jose rodolfo villarreal-hernandez
8,michael james pratt
9,rafael caro-quintero
