In [None]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import sqlite3
from datetime import datetime

In [None]:
# Wikipedia page that extract() downloads and parses.
url = 'https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)'
# Column names handed to extract() for the scraped DataFrame.
table_attribs = ["Country", "GDP_USD_millions"]
# SQLite database file opened with sqlite3.connect().
# NOTE(review): 'Word' looks like a typo for 'World' — confirm before renaming the file.
db_name = 'Word_Economies.db'
# Table name used both when loading the frame into SQLite and in the query.
table_name = 'Countries_by_GDP'
# Destination path handed to load_to_csv().
csv_path = './Countries_by_GDP.csv'

In [None]:
def extract(url, table_attribs):
    """Scrape country names and GDP figures from the third table on a page.

    Parameters
    ----------
    url : str
        Address of the HTML page to download.
    table_attribs : list of str
        Column names (and order) of the returned DataFrame. The scraped
        records use the keys "Country" and "GDP_USD_millions".

    Returns
    -------
    pandas.DataFrame
        One row per table row that has at least two <td> cells, a link in
        the first cell, and no em dash in the second cell. Values are the
        raw cell texts (strings).

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    IndexError
        If the page contains fewer than three <table> elements.
    """
    # A timeout keeps the cell from hanging forever on a stalled connection;
    # raise_for_status surfaces HTTP errors instead of parsing an error page.
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    tables = soup.find_all('table')
    if len(tables) < 3:
        raise IndexError(
            f"expected at least 3 <table> elements at {url}, found {len(tables)}"
        )

    # Collect plain dicts and build the frame once: concatenating inside the
    # loop copies the whole frame on every row.
    records = []
    for row in tables[2].find_all('tr'):
        cells = row.find_all('td')
        if len(cells) < 2:
            continue  # header or separator rows carry no <td> data cells
        gdp_text = cells[1].get_text(strip=True)
        # Keep only rows whose first cell holds a link and whose GDP cell
        # is not the em-dash placeholder.
        if cells[0].find('a') is None or '—' in gdp_text:
            continue
        records.append({
            "Country": cells[0].get_text(strip=True),
            "GDP_USD_millions": gdp_text,
        })

    return pd.DataFrame(records, columns=table_attribs)

In [None]:
# Scrape the page at `url` into a DataFrame with the columns listed in table_attribs.
df = extract(url, table_attribs)

In [None]:
def transform(df):
    """Convert the GDP column from comma-formatted millions to numeric billions.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a "GDP_USD_millions" column of strings such as
        "26,854,599".

    Returns
    -------
    pandas.DataFrame
        A new frame in which that column holds floats divided by 1000,
        rounded to 2 decimals, and is renamed "GDP_USD_billions". Column
        order is preserved and the input frame is left unmodified.

    Raises
    ------
    KeyError
        If `df` has no "GDP_USD_millions" column.
    """
    gdp_billions = (
        df["GDP_USD_millions"]
        .str.replace(",", "", regex=False)  # drop thousands separators
        .astype(float)
        .div(1000)
        .round(2)
    )
    # assign() returns a copy, so the caller's frame never ends up holding
    # billions-scaled values under the "millions" column name.
    return df.assign(GDP_USD_millions=gdp_billions).rename(
        columns={"GDP_USD_millions": "GDP_USD_billions"}
    )

In [None]:
# Rebind df to transform()'s result, whose GDP column is named GDP_USD_billions.
# Re-running this cell without re-running the extract cell raises KeyError,
# because df no longer has a GDP_USD_millions column.
df = transform(df)

In [None]:
def load_to_csv(df, csv_path):
    """Write the DataFrame to a CSV file.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to save.
    csv_path : str or path-like
        Destination file; an existing file is overwritten.

    Notes
    -----
    The row index is not written, matching load_to_db (index=False), so the
    file contains only the frame's columns and reads back without an
    "Unnamed: 0" column.
    """
    df.to_csv(csv_path, index=False)

In [None]:
# Save the transformed frame to the CSV file at csv_path.
load_to_csv(df, csv_path)

In [None]:
# Open the SQLite database file named by db_name (created if it does not exist).
# The connection is reused by later cells and closed in the final cell.
sql_connection = sqlite3.connect(db_name)
def load_to_db(df, sql_connection, table_name):
    """Store the DataFrame as a table in the given database.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to store.
    sql_connection : connection object
        Open database connection handed to ``DataFrame.to_sql``.
    table_name : str
        Name of the target table; an existing table of that name is replaced.

    The frame's index is not written as a column.
    """
    df.to_sql(
        name=table_name,
        con=sql_connection,
        if_exists='replace',
        index=False,
    )
# Store df as table_name in the open database, replacing any existing table of that name.
load_to_db(df, sql_connection, table_name)

In [None]:
def log_pogress(message, log_file="./etl_project_log.txt"):
    """Append a timestamped message to the ETL log file.

    Parameters
    ----------
    message : str
        Text to record.
    log_file : str or path-like, optional
        File to append to; defaults to "./etl_project_log.txt". The file is
        created if it does not exist.

    Each entry is one line of the form
    ``YYYY-Mon-DD-HH:MM:SS : message``.
    """
    # %b is the documented abbreviated-month directive; the previous %h is a
    # platform-specific alias that Python does not guarantee everywhere.
    timestamp_format = "%Y-%b-%d-%H:%M:%S"
    timestamp = datetime.now().strftime(timestamp_format)
    with open(log_file, "a") as f:
        f.write(f"{timestamp} : {message}\n")

In [None]:
# Select every row of the loaded table whose GDP_USD_billions is at least 100.
query_statement = f"SELECT * FROM {table_name} WHERE GDP_USD_billions >= 100"
def run_query(query_statement, sql_connection):
    """Print a SQL statement, run it, and print the rows it returns.

    Parameters
    ----------
    query_statement : str
        SQL text to execute.
    sql_connection : connection object
        Open database connection handed to ``pandas.read_sql``.

    Returns nothing; the statement and the resulting DataFrame are written
    to standard output.
    """
    # Echo the statement before executing it so it is visible even if the
    # query itself fails.
    print(query_statement)
    result_frame = pd.read_sql(sql=query_statement, con=sql_connection)
    print(result_frame)

# Record the pipeline's progress in the log file, then print the query and its result.
log_pogress('Data loaded to Database as table. Running the query')
run_query(query_statement, sql_connection)

In [None]:
# Close the SQLite connection opened earlier; running further queries requires reconnecting.
sql_connection.close()