In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import sqlite3
from datetime import datetime

In [2]:
# --- ETL configuration ---
# Page that is downloaded and scraped by extract().
url = 'https://en.wikipedia.org/wiki/List_of_largest_companies_in_the_United_States_by_revenue'
# Column names assigned, in order, to the cells of each scraped table row.
# Names carry no surrounding whitespace so they can be used directly as
# DataFrame keys and as CSV / SQL column names.
table_attribs = ["Rank", "Name", "Industry", "Revenue_USD_millions", "Revenue_growth_%", "Employees", "Headquarters"]
# SQLite database file and the table the data is loaded into.
db_name = 'Largest_Companies.db'
table_name = 'companies'
# Destination of the CSV export.
csv_path = './Largest_companies.csv'

def extract(url, table_attribs):
    """Download ``url`` and return its second HTML table as a DataFrame.

    Parameters
    ----------
    url : str
        Address of the page to scrape.
    table_attribs : list of str
        Column names for the resulting frame; every data row must contain
        exactly this many ``<td>`` cells.

    Returns
    -------
    pandas.DataFrame
        One row per table row that has at least one ``<td>`` cell, with every
        value kept as stripped text.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    IndexError
        If the page contains fewer than two ``<table>`` elements.
    ValueError
        If a row's cell count does not match ``table_attribs``.
    """
    # Bound the wait and fail loudly on HTTP errors instead of parsing an
    # error page as if it were the data.
    page = requests.get(url, timeout=30)
    page.raise_for_status()

    soup = BeautifulSoup(page.text, 'html.parser')
    tables = soup.find_all('table')

    # The table is selected by position (second on the page), so a layout
    # change on the source page will silently pick a different table.
    # NOTE(review): only cell text is kept; if the page marks decreases with
    # an icon rather than a minus sign, that sign is lost here - confirm.
    records = []
    for row in tables[1].find_all('tr'):
        cols = row.find_all('td')
        # Rows without any <td> (e.g. header rows built from <th>) are skipped.
        if cols:
            records.append([col.text.strip() for col in cols])

    # Build the frame once instead of appending row by row.
    return pd.DataFrame(records, columns=table_attribs)

In [3]:
# Scrape the source page into a DataFrame using the configured column names.
df = extract(url, table_attribs)

In [4]:
# Inspect column names, non-null counts and dtypes of the extracted frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100 entries, 0 to 99
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Rank                  100 non-null    object
 1   Name                  100 non-null    object
 2   Industry              100 non-null    object
 3   Revenue_USD_millions  100 non-null    object
 4   Revenue_growth_%      100 non-null    object
 5   Employees             100 non-null    object
 6   Headquarters          100 non-null    object
dtypes: object(7)
memory usage: 6.2+ KB


In [5]:
def transform(df, numeric_cols=None):
    """Return a copy of ``df`` with its numeric text columns parsed to float.

    Thousands separators (``,``) and percent signs (``%``) are removed and a
    Unicode minus sign is normalised to ``-`` before conversion.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose numeric columns still hold text. It is not modified.
    numeric_cols : iterable of str, optional
        Columns to convert. Defaults to the fourth to sixth columns
        (``df.columns[3:6]``), which is what earlier callers relied on.

    Returns
    -------
    pandas.DataFrame
        New frame in which the selected columns have dtype ``float64``.

    Raises
    ------
    ValueError
        If a cleaned value cannot be parsed as a number.
    """
    if numeric_cols is None:
        numeric_cols = df.columns[3:6]

    # Work on a copy so re-running the cell never alters the caller's frame.
    result = df.copy()
    for col in numeric_cols:
        as_text = result[col].astype(str)
        for token in (',', '%'):
            as_text = as_text.str.replace(token, '', regex=False)
        # float() does not understand U+2212, which web pages often use for
        # negative numbers.
        as_text = as_text.str.replace('\u2212', '-', regex=False)
        result[col] = as_text.astype('float64')
    return result

In [6]:
# Parse the numeric text columns to float, then re-check the dtypes.
df = transform(df)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100 entries, 0 to 99
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Rank                  100 non-null    object 
 1   Name                  100 non-null    object 
 2   Industry              100 non-null    object 
 3   Revenue_USD_millions  100 non-null    float64
 4   Revenue_growth_%      100 non-null    float64
 5   Employees             100 non-null    float64
 6   Headquarters          100 non-null    object 
dtypes: float64(3), object(4)
memory usage: 6.2+ KB


In [7]:
def load_to_csv(df, csv_path):
    """Write ``df`` to ``csv_path`` as CSV, omitting the index column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to export.
    csv_path : str or path-like
        Destination file; an existing file is overwritten.
    """
    # Honour the caller-supplied path rather than a fixed file name.
    df.to_csv(csv_path, index=False)

# Open the SQLite database file named by db_name (created if it does not
# exist yet); the connection is closed in the notebook's final cell.
sql_connection = sqlite3.connect(db_name)

def load_to_sql(df, sql_connection, table_name):
    """Store ``df`` in the database table ``table_name``.

    Any existing table of that name is dropped and recreated; the frame's
    index is not written.
    """
    df.to_sql(
        name=table_name,
        con=sql_connection,
        if_exists='replace',
        index=False,
    )


In [None]:
# Persist the transformed frame to a CSV file and to the SQLite table.
load_to_csv(df, csv_path)
load_to_sql(df, sql_connection, table_name)

In [12]:
def log_progress(message, log_file="./etl_project_log.txt"):
    """Append a timestamped ``message`` line to the ETL log file.

    Each line has the form ``<year>-<month abbr>-<day>-<HH:MM:SS> : <message>``.

    Parameters
    ----------
    message : str
        Text describing the pipeline step being logged.
    log_file : str or path-like, optional
        File to append to; defaults to ``./etl_project_log.txt``.
    """
    # %b is the documented directive for the abbreviated month name; %h is a
    # platform-dependent alias of it.
    timestamp_format = "%Y-%b-%d-%H:%M:%S"
    timestamp = datetime.now().strftime(timestamp_format)
    # Fixed encoding so the log does not depend on the locale default.
    with open(log_file, "a", encoding="utf-8") as f:
        f.write(f"{timestamp} : {message}\n")

In [13]:
def run_query(query_statement, sql_connection):
    """Echo ``query_statement``, run it on ``sql_connection`` and print the rows.

    The statement is printed before it is executed; the result is read into
    a DataFrame and printed as plain text. Nothing is returned.
    """
    print(query_statement)
    result_frame = pd.read_sql(sql=query_statement, con=sql_connection)
    print(result_frame)

# Select companies whose revenue growth exceeds 30. The column name is
# wrapped in square brackets because it contains a '%' character.
query_statement = f"SELECT * from {table_name} WHERE [Revenue_growth_%] > 30"

# Record the step in the log file, then execute and print the query.
log_progress("Running the query")
run_query(query_statement, sql_connection)


SELECT * from companies WHERE [Revenue_growth_%] > 30
  Rank              Name            Industry  Revenue_USD_millions  \
0    12   JPMorgan Chase  Financial services              239425.0   
1    18  Bank of America          Financials              171912.0   
2    21        Citigroup          Financials              156820.0   
3    34      Wells Fargo          Financials              115340.0   
4    35    Goldman Sachs          Financials              108418.0   
5    41   Morgan Stanley          Financials               96194.0   
6    65           Nvidia          Technology               60922.0   

   Revenue_growth_%  Employees               Headquarters  
0              54.7   309926.0    New York City, New York  
1              49.4   212985.0  Charlotte, North Carolina  
2              55.1   237925.0    New York City, New York  
3              39.2   226000.0  San Francisco, California  
4              57.8    45300.0    New York City, New York  
5              45.9    80

In [14]:
# Release the SQLite connection now that loading and querying are finished.
sql_connection.close()