# Extract all tables in `sales` schema

In [1]:
# Enable IPython's autoreload extension (mode 2) so that edits to imported
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import json
from pyspark.sql.functions import lit
from datetime import datetime, timedelta
from concurrent.futures import ThreadPoolExecutor

In [3]:
# Load the notebook settings from config.json (expected next to the notebook).
# The encoding is pinned to UTF-8 so the file is decoded the same way on every
# platform instead of depending on the locale's default encoding.
with open('config.json', 'r', encoding='utf-8') as f:
    config = json.load(f)

## Control variables

In [4]:
# Target location of the extracted tables: <user>_raw.<schema>.<table>.
schema = config["general"]["schema"]
catalog = f'{config["general"]["user"]}_raw'

# SQL Server connection details. Password, host and port are read from a
# secret scope through dbutils; user name and database come from config.json.
secret_scope = "antonio_junior_adw"
db_password = dbutils.secrets.get(scope=secret_scope, key="pswd_mssql")
db_host = dbutils.secrets.get(scope=secret_scope, key="ip_mssql")
db_port = dbutils.secrets.get(scope=secret_scope, key="port_mssql")
db_user = config["general"]["db_user"]
database = config["general"]["database"]

# One extraction is run per key of the "tables" section of the config.
tables_list = list(config["tables"].keys())

# Lower bound applied to the ModifiedDate filter of every extraction.
# Previously the 7-day cutoff was computed and then unconditionally overwritten
# by the fixed date; the switch below keeps that effective value
# ("2007-01-01 00:00:00") while making the choice explicit.
# NOTE(review): extract_data writes with mode("overwrite"), so disabling
# full_load would replace each target table with only the recent rows --
# confirm this is intended before switching.
full_load = True
incremental_window_days = 7
if full_load:
    last_modified_date = "2007-01-01 00:00:00"
else:
    window_start = datetime.today() - timedelta(days=incremental_window_days)
    last_modified_date = window_start.strftime('%Y-%m-%d 00:00:00')

Connection variables

In [5]:
# JDBC URL for the SQL Server source. The options after host:port select the
# database and enable encryption while trusting the server certificate.
jdbc_options = ";".join([
    f"databaseName={database}",
    "encrypt=true",
    "trustServerCertificate=true",
])
jdbc_url = f"jdbc:sqlserver://{db_host}:{db_port};{jdbc_options};"

# Credentials and driver class handed to spark.read.jdbc.
connection_properties = dict(
    user=db_user,
    password=db_password,
    driver="com.microsoft.sqlserver.jdbc.SQLServerDriver",
)

Create schema

In [6]:
# Make sure the target schema exists before any table is written into it.
create_schema = "CREATE SCHEMA IF NOT EXISTS {}.{}".format(catalog, schema)
spark.sql(create_schema)

## Extraction steps

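For each configured table, read from SQL Server the rows whose `ModifiedDate` is on or after `last_modified_date`, add an `extract_date` column, and overwrite the corresponding Delta table.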

In [7]:
def extract_data(table_name):
    """Copy one source table into its Delta table.

    Reads over JDBC the rows of ``{schema}.{table_name}`` whose ``ModifiedDate``
    is on or after ``last_modified_date``, adds an ``extract_date`` column set
    to the timestamp taken when the column is added, and writes the result to
    ``{catalog}.{schema}.{table_name}`` in Delta format with mode "overwrite".

    Uses the notebook-level ``spark``, ``jdbc_url``, ``connection_properties``,
    ``catalog``, ``schema`` and ``last_modified_date``.

    Args:
        table_name: Name of the table, used for both source and target.

    Returns:
        None.
    """
    target_table = f"{catalog}.{schema}.{table_name}"
    print("Extracting data to:", table_name)

    # Wrapped as an aliased subquery so the filter is part of the SQL text
    # passed as the JDBC "table".
    source_query = (
        f"(SELECT * FROM {schema}.{table_name} "
        f"WHERE ModifiedDate >= '{last_modified_date}') AS subquery"
    )

    (
        spark.read.jdbc(
            url=jdbc_url,
            table=source_query,
            properties=connection_properties,
        )
        .withColumn("extract_date", lit(datetime.today()))
        .write.mode("overwrite")
        .format("delta")
        .saveAsTable(target_table)
    )

In [10]:
# Run the extractions concurrently, at most 8 at a time.
# Each table is submitted explicitly and its future is kept: with a bare
# executor.map(...) whose result is never consumed, an exception raised inside
# extract_data is never re-raised and the cell appears to succeed.
with ThreadPoolExecutor(max_workers=8) as executor:
    extraction_futures = [
        (table, executor.submit(extract_data, table)) for table in tables_list
    ]

# Leaving the with-block waits for every submitted extraction, so each future
# is finished here and .exception() returns immediately.
failed_extractions = [
    (table, future.exception())
    for table, future in extraction_futures
    if future.exception() is not None
]

if failed_extractions:
    for table, error in failed_extractions:
        print(f"Extraction failed for {table}: {error!r}")
    failed_names = [table for table, _error in failed_extractions]
    # Fail the cell so a partial load cannot go unnoticed; chain the first
    # error to keep its traceback.
    raise RuntimeError(
        f"{len(failed_extractions)} of {len(extraction_futures)} "
        f"table extractions failed: {failed_names}"
    ) from failed_extractions[0][1]

Extracting data to: CountryRegionCurrency
Extracting data to: Currency
Extracting data to: CurrencyRate
Extracting data to: Customer
Extracting data to: PersonCreditCard
Extracting data to: SalesOrderDetail
Extracting data to: SalesOrderHeader
Extracting data to: SalesOrderHeaderSalesReason
Extracting data to: SalesPerson
Extracting data to: SalesPersonQuotaHistory
Extracting data to: SalesReason
Extracting data to: SalesTaxRate
Extracting data to: SalesTerritory
Extracting data to: SalesTerritoryHistory
Extracting data to: ShoppingCartItem
Extracting data to: SpecialOffer
Extracting data to: SpecialOfferProduct
Extracting data to: Store
