#### Acquiring and Processing Information on the World's Largest Banks

You have been hired as a data engineer by research organization. Your boss has asked you to create a code that can be used to compile the list of the top 10 largest banks in the world ranked by market capitalization in billion USD. Further, the data needs to be transformed and stored in USD, GBP, EUR and INR as well, in accordance with the exchange rate information that has been made available to you as a CSV file. The processed information table is to be saved locally in a CSV format and as a database table.

Your job is to create an automated system to generate this information so that the same can be executed in every financial quarter to prepare the report.

##### Extract the data 

In [2]:
# Import the libraries for web scraping and data manipulation
import requests
import sqlite3
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime
print("Project libraries have been imported successfully")

# Store final output data and all logs
log_file = "code_log.txt"
target_file = "Largest_banks_data.csv"

# Initialize all known entities
data_url = 'https://en.wikipedia.org/wiki/List_of_largest_banks'
csv_url = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMSkillsNetwork-PY0221EN-Coursera/labs/v2/exchange_rate.csv'
table_attribs = ["Name", "MC_USD_Billions"]
table_attribs_final = ["Name", "MC_USD_Billion", "MC_GBP_Billion", "MC_EUR_Billion", "MC_INR_Billion"]

db_name = "Banks.db"
table_name = "Largest_banks"
csv_path = '~/Documents/IBM-Data-Engineering-Professional/Course 3 - Python Project for Data Engineering/Extract, Transform and Load Market Capitalization data/Largest_banks_data.csv'

# # Useful functions for ETL operations on Market capitalization data
# # Extraction
# def extract(data_url, table_attribs):
#     html_page = requests.get(url).text
#     data = BeautifulSoup(html_page, 'html.parser')

#     # Using find_all() function
#     extracted_data = pd.DataFrame(columns = table_attribs)
#     tables = data.find_all("tbody")
#     rows = tables[2].find_all("tr")
#     for row in rows:
#         col = row.find_all("td")
#         if len(col) != 0: 
#             if col[0].find('a') is not None and "—" not in col[2]:
#                 data_dict = {"Country": col[0].a.contents[0], 
#                             "GDP_USD_millions": col[2].contents[0]}
#                 df1 = pd.DataFrame(data_dict, index=[0])
#                 extracted_data = pd.concat([extracted_data, df1], ignore_index = True)
#     return extracted_data


# # Transformation
# def transform(df): 
#     ''' This function converts the GDP information from Currency
#     format to float value, transforms the information of GDP from
#     USD (Millions) to USD (Billions) rounding to 2 decimal places.
#     The function returns the transformed dataframe.'''
#     estimate = df["GDP_USD_millions"].values.tolist()
#     estimate = [round(int(i.replace(',', '')) / 1000, 2) for i in estimate]
#     df_estimate = pd.DataFrame(estimate, columns=['GDP_USD_billions'])
#     df = df.join(df_estimate)
#     return df[["Country", "GDP_USD_billions"]] 

# # Loading and Logging
# def load_to_csv(df, csv_path):
#     ''' This function saves the final dataframe as a `CSV` file 
#     in the provided path. Function returns nothing.'''
#     df.to_csv(csv_path)

# def load_to_db(df, sql_connection, table_name):
#     ''' This function saves the final dataframe as a database table
#     with the provided name. Function returns nothing.'''
#     df.to_sql(table_name, sql_connection, if_exists= "replace", index = False)
    
# def run_query(query_statement, sql_connection):
#     ''' This function runs the stated query on the database table and
#     prints the output on the terminal. Function returns nothing. '''
#     print(query_statement)
#     return pd.read_sql(query_statement, sql_connection)
    
def log_progress(message):
    ''' This function logs the mentioned message at a given stage of the code execution to a log file. Function returns nothing'''
    timestamp_format = '%Y-%h-%d-%H:%M:%S' # Year-Monthname-Day-Hour-Minute-Second 
    now = datetime.now() # get current timestamp 
    timestamp = now.strftime(timestamp_format) 
    with open("./code_log.txt", "a") as f: 
        f.write(timestamp + ', ' + message + '\n')   


Project libraries have been imported successfully


In [2]:
# Testing ETL operations and log progress
# Log the initialization of the ETL process 
log_progress("Preliminaries complete. Initiating ETL process")
extracted_data = extract(url, table_attribs)
 
# Log the completion of the Extraction process and begin transformation process
log_progress("Data extraction complete. Initiating Transformation process")
transformed_data = transform(extracted_data) 
print("Transformed Data") 
print(transformed_data) 
 
# Log the completion of the Transformation process and begin loading process
log_progress("Data transformation complete. Initiating loading process")
load_to_csv(transformed_data, csv_path)

log_progress("Data saved to CSV file") 

# Use SQLite3 to create and connect to a new database World_Economies.db
sql_connection = sqlite3.connect(db_name)

log_progress("SQL Connection initiated")

load_to_db(transformed_data, sql_connection, table_name)

log_progress("Data loaded to Database as table. Running the query")

query_statement = f"SELECT * FROM {table_name} WHERE GDP_USD_billions >= 100"
print(run_query(query_statement, sql_connection))
 
# Log the completion of the process 
log_progress("Process Complete") 

sql_connection.close()

Transformed Data
              Country  GDP_USD_billions
0       United States          26854.60
1               China          19373.59
2               Japan           4409.74
3             Germany           4308.85
4               India           3736.88
..                ...               ...
186  Marshall Islands              0.29
187             Palau              0.26
188          Kiribati              0.25
189             Nauru              0.15
190            Tuvalu              0.07

[191 rows x 2 columns]
SELECT * FROM Countries_by_GDP WHERE GDP_USD_billions >= 100
          Country  GDP_USD_billions
0   United States          26854.60
1           China          19373.59
2           Japan           4409.74
3         Germany           4308.85
4           India           3736.88
..            ...               ...
64          Kenya            118.13
65         Angola            117.88
66           Oman            104.90
67      Guatemala            102.31
68       Bulgaria     

In [None]:
def extract(data_url, table_attribs):
    html_page = requests.get(url).text
    data = BeautifulSoup(html_page, 'html.parser')

    # Using find_all() function
    extracted_data = pd.DataFrame(columns = table_attribs)
    tables = data.find_all("tbody")
    rows = tables[2].find_all("tr")
    for row in rows:
        col = row.find_all("td")
        if len(col) != 0: 
            if col[0].find('a') is not None and "—" not in col[2]:
                data_dict = {"Country": col[0].a.contents[0], 
                            "GDP_USD_millions": col[2].contents[0]}
                df1 = pd.DataFrame(data_dict, index=[0])
                extracted_data = pd.concat([extracted_data, df1], ignore_index = True)
    return extracted_data

In [75]:
html_page = requests.get(data_url).text
data = BeautifulSoup(html_page, 'html.parser')
# Using find_all() function
extracted_data = pd.DataFrame(columns = table_attribs)
tables = data.find_all("tbody")
rows = tables[0].find_all("tr")
for row in rows:
    col = row.find_all("td")
    if len(col) != 0:
        data_dict = {"Name": col[1].contents[2].contents,
                    "MC_USD_Billions": col[2].contents[0]}
        df1 = pd.DataFrame(data_dict, index = [0])
        extracted_data = pd.concat([extracted_data, df1], ignore_index = True)
print(extracted_data)


                                      Name MC_USD_Billions
0                           JPMorgan Chase        419.25\n
1                          Bank of America        231.52\n
2  Industrial and Commercial Bank of China        194.56\n
3               Agricultural Bank of China        160.68\n
4                                HDFC Bank        157.91\n
5                              Wells Fargo        155.87\n
6                        HSBC Holdings PLC        148.90\n
7                           Morgan Stanley        140.83\n
8                  China Construction Bank        139.82\n
9                            Bank of China        136.81\n
