In [34]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [35]:
# Web-archive (web.archive.org) copy of Wikipedia's "List of largest banks" page;
# the URL embeds a fixed snapshot id, so the scraped content should not drift over time.
url="https://web.archive.org/web/20200318083015/https://en.wikipedia.org/wiki/List_of_largest_banks"

## Download the page from the URL

In [36]:
def copy_page(url, filename='List-of-largest-banks.html', timeout=30):
    """Download the page at ``url`` and save its HTML to a local file.

    Parameters
    ----------
    url : str
        Address of the page to download.
    filename : str, optional
        Path of the file the HTML is written to (UTF-8 encoded).
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    str
        A short confirmation message, returned only after the file was written.

    Raises
    ------
    requests.HTTPError
        If the server answers with a status code outside the 2xx range.
        Nothing is written to disk in that case.
    """
    response = requests.get(url, timeout=timeout)

    # Fail loudly instead of reporting success for a page that was never saved;
    # otherwise later steps would read a missing or stale file.
    if not 200 <= response.status_code <= 299:
        raise requests.HTTPError(
            f"Failed to download {url}: HTTP {response.status_code}",
            response=response,
        )

    with open(filename, 'w', encoding="utf-8") as file:
        file.write(response.text)
    return "list-of-largest-banks downloaded"

## Parse the content using Beautiful Soup

In [37]:
def parse(file):
    """Read a saved HTML file and parse it into a BeautifulSoup document.

    Parameters
    ----------
    file : str or path-like
        Path of the HTML file to read.

    Returns
    -------
    BeautifulSoup
        The document, parsed with the ``html.parser`` backend.
    """
    # Decode explicitly as UTF-8: the page is saved with that encoding, and the
    # platform default used by a bare open() may differ (e.g. cp1252 on Windows),
    # which would garble or reject non-ASCII characters.
    with open(file, 'r', encoding="utf-8") as f:
        html_source = f.read()
    return BeautifulSoup(html_source, 'html.parser')

## Extract the 'By market capitalization' table from the parsed document

In [38]:
def extract(doc, table_index=2):
    """Collect bank names and market caps from one table of the parsed page.

    Parameters
    ----------
    doc : BeautifulSoup
        Parsed document; only its ``find_all`` method is used.
    table_index : int, optional
        Position of the wanted ``<tbody>`` among all ``<tbody>`` elements of
        the document. Defaults to 2, the table this notebook scrapes.

    Returns
    -------
    pandas.DataFrame
        Columns ``"Name"`` and ``"Market Cap (US$ Billion)"``, one row per
        matching table row. Values are the stripped cell text (strings); no
        numeric conversion is applied.

    Raises
    ------
    IndexError
        If the document has no ``<tbody>`` at ``table_index``.
    """
    column_names = ["Name", "Market Cap (US$ Billion)"]

    tables = doc.find_all('tbody')
    try:
        table = tables[table_index]
    except IndexError:
        raise IndexError(
            f"document has {len(tables)} <tbody> element(s); "
            f"cannot select index {table_index}"
        ) from None

    # Gather plain records first and build the frame once; concatenating a
    # one-row DataFrame per iteration copies the whole frame on every row.
    records = []
    for row in table.find_all('tr'):
        cells = row.find_all('td')
        # Only rows with exactly three <td> cells carry data; anything else
        # (e.g. a header row) is skipped. The first cell is not used.
        if len(cells) == 3:
            records.append({
                "Name": cells[1].text.strip(),
                "Market Cap (US$ Billion)": cells[2].text.strip(),
            })

    return pd.DataFrame(records, columns=column_names)


## Load the data into a JSON file

In [39]:
def load(data,targetfile):
    """Write ``data`` to ``<targetfile>.json`` as a list of row records.

    ``targetfile`` is the output path without its extension; ``.json`` is
    appended here. The file is pretty-printed with an indent of 5 spaces.
    """
    output_path = f"{targetfile}.json"
    data.to_json(output_path, orient='records', indent=5)

## Running the pipeline

In [40]:
# Download the page and save it locally as 'List-of-largest-banks.html'.
copy_page(url)

'list-of-largest-banks downloaded'

In [41]:
# Parse the saved HTML file into a BeautifulSoup document.
doc=parse('List-of-largest-banks.html')

In [42]:
# Pull the bank names and market caps out of the parsed document into a DataFrame.
df=extract(doc)

In [43]:
# Preview the first rows of the extracted table.
df.head()

Unnamed: 0,Name,Market Cap (US$ Billion)
0,JPMorgan Chase,390.934
1,Industrial and Commercial Bank of China,345.214
2,Bank of America,325.331
3,Wells Fargo,308.013
4,China Construction Bank,257.399


In [44]:
# Write the extracted table to 'market_cap.json'.
load(df,'market_cap')