In [65]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [84]:
# Base URL of the World Bank country listing. It is scraped for country names,
# and per-country pages are built from it by appending "/<country>?view=chart".
url = "https://data.worldbank.org/country"

In [85]:
def get_country_name(url, timeout=30):
    """
    Scrape a World Bank country-listing page and return the link texts found
    in its overview area (one entry per listed country).

    Parameters:
    url (str): The URL of the webpage to scrape.
    timeout (float): Seconds to wait for the server before giving up. Without
        a timeout a stalled connection would hang the notebook indefinitely.

    Returns:
    list: A list of strings, where each string represents the name of a country.

    Raises:
    requests.RequestException: If the request fails, times out, or the server
        answers with an HTTP error status.
    ValueError: If the page does not contain the expected containers
        (for example because the site layout changed).
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')

    # The country links live in <div id="main" class="wrapper"> -> <div class="overviewArea body">.
    main_div = soup.find('div', {'class': 'wrapper', "id": "main"})
    if main_div is None:
        raise ValueError(f"Could not find the main 'wrapper' div on {url!r}.")

    sub_main_div = main_div.find('div', {'class': 'overviewArea body'})
    if sub_main_div is None:
        raise ValueError(f"Could not find the 'overviewArea body' div on {url!r}.")

    return [a.text for a in sub_main_div.find_all('a')]



In [86]:
def fetch_countries_data(countries, timeout=30):
    """
    Download the World Bank indicator workbook for each requested country.

    For every name in ``countries`` the page ``<url>/<name lowercased>?view=chart``
    is fetched (``url`` is the module-level base URL), the Excel link is taken
    from the page's download menu, and the workbook's 'Data' sheet is loaded
    into a DataFrame.

    Parameters:
    countries (iterable of str): Country names to fetch.
    timeout (float): Seconds to wait for each country page before giving up.

    Returns:
    list: One DataFrame per country that was fetched successfully, in input
        order. Countries that fail are reported and skipped, so the list can
        be shorter than ``countries``; positions then no longer line up with
        the input.
    """
    countries_data = []
    for country in countries:
        slug = country.lower()
        try:
            modified_url = url + "/" + slug + "?view=chart"
            response = requests.get(modified_url, timeout=timeout)
            # Treat HTTP error pages as failures instead of parsing them.
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'lxml')
            download_div = soup.find("div", {'class': 'btn-item download'})
            # NOTE(review): the third link in the download menu is presumably
            # the Excel export - confirm against the live page.
            excel_download_link = download_div.find_all('a')[2].get("href")
            df = pd.read_excel(excel_download_link, skiprows=3, sheet_name='Data')
        except Exception as exc:
            # Best effort: keep going with the remaining countries, but do not
            # swallow KeyboardInterrupt/SystemExit and do not hide the cause.
            print(f"Failed to extract data for {slug.title()}.")
            print(f"    Reason: {exc!r}")
            continue
        countries_data.append(df)
        print(f"Successfuly extracted data for {slug.title()}.")
    return countries_data

In [87]:
def relevant_indicators(keyword, countries_data, case_sensitive=True):
    """
    Collect, for each DataFrame, the indicator names that contain ``keyword``.

    Parameters:
    keyword (str): Substring to look for in the "Indicator Name" column.
    countries_data (list): DataFrames that each have an "Indicator Name" column.
    case_sensitive (bool): When True (the default) the match is an exact
        substring match, so "trade" does not match "Trade (% of GDP)". Pass
        False to ignore case.

    Returns:
    list: One list of matching indicator names per input DataFrame, in the
        same order as ``countries_data``.
    """
    needle = keyword if case_sensitive else keyword.lower()

    all_relevant_indicators = []
    for df in countries_data:
        matches = []
        for indicator in df["Indicator Name"]:
            # Missing names come through as NaN/None; `in` would raise TypeError on them.
            if not isinstance(indicator, str):
                continue
            haystack = indicator if case_sensitive else indicator.lower()
            if needle in haystack:
                matches.append(indicator)
        all_relevant_indicators.append(matches)
    return all_relevant_indicators
        

In [88]:
def data_extractor(countries_data, all_relevant_indicators, years, countries=None):
    """
    Build the per-country result tables: the selected indicators, restricted
    to the requested year columns.

    Parameters:
    countries_data (list): One DataFrame per country, each with an
        "Indicator Name" column and one column per requested year.
    all_relevant_indicators (list): For each DataFrame, the indicator names to keep.
    years (sequence of str): Year column labels to keep.
    countries (sequence of str, optional): Country names, positionally aligned
        with ``countries_data``. When omitted, the module-level ``countries``
        variable is used, which is what this function always did before the
        parameter existed.

    Returns:
    dict: Maps each country name to a DataFrame with columns
        ["Indicator Name", *years], containing only the selected indicators.
        Rows whose year columns are all NaN are dropped and the index is reset.

    Raises:
    NameError: If ``countries`` is not passed and no module-level ``countries`` exists.
    IndexError: If there are fewer DataFrames or indicator lists than countries.
    """
    if countries is None:
        # Backward compatibility with callers that rely on the notebook global.
        try:
            countries = globals()["countries"]
        except KeyError:
            raise NameError("name 'countries' is not defined") from None

    countries = list(countries)
    if len(countries_data) < len(countries) or len(all_relevant_indicators) < len(countries):
        # Pairing by position would otherwise mislabel data before failing.
        raise IndexError(
            f"Got {len(countries)} countries but {len(countries_data)} data frames and "
            f"{len(all_relevant_indicators)} indicator lists; they must line up one-to-one."
        )

    year_columns = list(years)
    columns = ["Indicator Name"] + year_columns

    final_data = {}
    for name, df, wanted in zip(countries, countries_data, all_relevant_indicators):
        selected = df.loc[df["Indicator Name"].isin(wanted), columns]
        final_data[f"{name}"] = selected.dropna(subset=year_columns, how="all").reset_index(drop=True)
    return final_data

In [89]:
def compile_pipeline(countries,keyword,years):
    """
    Run the whole pipeline: download, pick indicators, slice to the given years.

    Parameters:
    countries (list): Country names to download data for.
    keyword (str): Substring used to select indicator names.
    years (list): Year column labels to keep in the result.

    Returns:
    dict: Whatever ``data_extractor`` returns for the downloaded data.

    Note:
    ``countries`` is only forwarded to ``fetch_countries_data``; it is not
    passed on to ``data_extractor``, which therefore takes its country names
    from the module-level ``countries`` variable. Keep the two in sync.
    """
    frames = fetch_countries_data(countries)
    matches = relevant_indicators(keyword, frames)
    return data_extractor(frames, matches, years)
    

In [90]:
# Names of all countries listed on the World Bank page, i.e. the candidates for
# `countries`. Not referenced by the cells visible below.
all_countries = get_country_name(url)

In [95]:
# Pipeline inputs: which countries to download, which substring selects the
# indicators, and which year columns (as strings, 2010 through 2023) to keep.
countries = ["India"]
keyword = "trade"
years = [str(year) for year in range(2010, 2024)]

In [96]:
# Run the full pipeline; the result is keyed by country name, one filtered
# DataFrame per country.
final_data = compile_pipeline(countries,keyword,years)

Successfuly extracted data for India.
