In [1]:
from pandas import set_option
from pyspark.sql import SparkSession

# Show every column when a wide pandas frame is rendered (None removes the cap).
# 'display.max_columns' is the registered option key; the dotted spelling
# 'display.max.columns' only resolved through pandas' regex key matching.
set_option('display.max_columns', None)

# Configure the session builder with the SQL Server connector package,
# then get the existing session or create a new one.
builder = SparkSession.builder.config(
    'spark.jars.packages',
    'com.microsoft.azure:spark-mssql-connector_2.12:1.1.0',
)
spark = builder.getOrCreate()

In [6]:
from pyspark.sql import DataFrame
from typing import Optional

# limit characters for writing to database
def varchar(dataFrame: DataFrame, width: int=1024) -> Optional[str]:
    """Build a column-type override declaring every string column as VARCHAR(width).

    The result is used in this notebook as the value of the
    ``createTableColumnTypes`` write option.

    Args:
        dataFrame: frame whose ``dtypes`` pairs (column name, type name) are inspected.
        width: length declared for each string column.

    Returns:
        A comma-separated list such as ``'`a` VARCHAR (1024), `b` VARCHAR (1024)'``,
        or ``None`` when the frame has no string columns.
    """
    columns = []
    for name, dtype in dataFrame.dtypes:
        if dtype != 'string':
            continue
        # Backtick-quote the name (doubling embedded backticks) so columns that
        # contain spaces, punctuation or reserved words parse as one identifier.
        quoted = '`' + name.replace('`', '``') + '`'
        columns.append(f'{quoted} VARCHAR ({width})')
    return ', '.join(columns) if columns else None

In [11]:
import os

# Root data directory, with one sub-directory per layer.
PATH = '/tmp/statements'
BRONZE, SILVER = (os.path.join(PATH, layer) for layer in ('bronze', 'silver'))

In [17]:
import os

# Connection settings shared by every database write. The password is read
# from the PASSWORD environment variable, so a missing variable raises KeyError here.
writeOptions = dict(
    url='jdbc:sqlserver://',
    server='localhost',
    database='DataWarehouse',
    user='SA',
    password=os.environ['PASSWORD'],
)

In [40]:
%%time
# Register every entry found under SILVER as a temporary view named after the entry.
names = os.listdir(SILVER)

for name in names:
    silver_frame = spark.read.parquet(os.path.join(SILVER, name))
    silver_frame.createOrReplaceTempView(name)

CPU times: user 19 ms, sys: 5.88 ms, total: 24.8 ms
Wall time: 1.04 s


['activities',
 'bridge',
 'calls',
 'contacts',
 'emails',
 'employee_calls',
 'employees',
 'requests',
 'statement_activities',
 'statement_activity_call_bridge',
 'statement_emails_recieved',
 'statement_requests',
 'statements',
 'statements_recieved',
 'vendor_contacts']

In [23]:
%%time

# Temporary view name -> destination table name used for the 'dbtable' option.
nameMap = {
    'activities': 'STATEMENT_ACTIVITIES',
    'bridge': 'STATEMENT_ACTIVITY_CALL_BRIDGE',
    'calls': 'EMPLOYEE_CALLS',
    'contacts': 'VENDOR_CONTACTS',
    'emails': 'STATEMENT_EMAILS_RECIEVED',
    'employees': 'EMPLOYEES',
    'requests': 'STATEMENT_REQUESTS',
    'statements': 'STATEMENTS_RECIEVED'
}

# Explicit dry-run switch: the writers are configured for every table, but
# nothing is saved unless this flag is set to True.
SAVE_TABLES = False

for name, tableName in nameMap.items():
    table = spark.table(name)
    records = table.count()
    print(f'table {name} contains {records:,} records')

    # Declare string columns as VARCHAR(256); frames without string columns
    # are written with the shared options unchanged.
    columnTypes = varchar(table, 256)
    options = {**writeOptions, 'createTableColumnTypes': columnTypes} if columnTypes else writeOptions

    writer = (
        table
        .write
        .format('com.microsoft.sqlserver.jdbc.spark')
        .options(**options)
        .option('dbtable', tableName)
        .mode('overwrite')
    )

    if SAVE_TABLES:
        writer.save()

table activities contains 211,695 records
table bridge contains 1,572 records
table calls contains 2,530 records
table contacts contains 303,660 records
table emails contains 137,909 records
table employees contains 468 records
table requests contains 533,579 records
table statements contains 234,527 records
CPU times: user 38.2 ms, sys: 8.52 ms, total: 46.7 ms
Wall time: 1.05 s


In [42]:
# for name, tableName in nameMap.items():
#     if name.upper() != tableName:
#         spark.read.table(name).createOrReplaceTempView(tableName)
#         spark.catalog.dropTempView(name)

# [table.name for table in spark.catalog.listTables()]

In [20]:
# Count the bronze activity records per ActivityType.
# The path is built from BRONZE so it follows the notebook's path configuration.
(
    spark
    .read
    .parquet(os.path.join(BRONZE, 'StatementRequestActivityRecords'))
    .groupBy('ActivityType')
    .count()
    .toPandas()
)

Unnamed: 0,ActivityType,count
0,,2031727
1,Called Vendor,201735
2,Note Only,77859
3,Emailed Vendor,15315
4,Received Call / Email,12992


In [30]:
# (
#     spark
#     .read
#     .parquet('/tmp/statements/bronze/StatementRequestActivityRecords')
#     .limit(5)
#     .toPandas()
# )

#### [DATEADD](https://www.w3schools.com/sql/func_sqlserver_dateadd.asp)

#### [FORMAT](https://www.mssqltips.com/sqlservertip/2655/format-sql-server-dates-with-format-function/)

#### [CONVERT](https://www.w3schools.com/sql/func_sqlserver_convert.asp)

```sql
SELECT FORMAT(DATEADD(SECOND, 30, '1900-01-01'), 'HH:mm:ss');

SELECT GETDATE();

SELECT CONVERT(datetime, '2022-06-09 07:35:00', 121);

SELECT FORMAT(CONVERT(datetime, '2022-06-09 13:35:00', 121), 'hhtt');

SELECT DATEPART(HOUR, GETDATE());

```

- EmployeeName
- InboundCalls
    - count(*)
- OutboundCalls
    - count(*)
- AverageInboundDuration
- AverageOutboundDuration
- StatementsReceived
- EmailsReceived
    

In [45]:
# Preview five rows of EMPLOYEES (id, name and role).
query = """
SELECT
    EMPLOYEE_ID,
    EMPLOYEE_NAME,
    ROLE
FROM
    EMPLOYEES
"""

employees_preview = spark.sql(query).limit(5)
employees_preview.toPandas()

Unnamed: 0,EMPLOYEE_ID,EMPLOYEE_NAME,ROLE
0,194728,Andi Prins,Audit Supervisor
1,194729,Bob VanGoor,Audit Supervisor
2,194730,Colleen Kretowicz,Audit Supervisor
3,194731,Cindy Allen,WNC Auditor
4,194732,Dan Hutchins,Audit Manager


In [89]:
# Per-employee summary of connected calls over the trailing day: total inbound
# and outbound talk time, call counts, and the first/last call timestamps.
#
# Durations are rendered as HH:mm:ss with integer arithmetic rather than by
# round-tripping through from_unixtime/to_utc_timestamp: that approach is only
# correct when the session time zone equals the hard-coded 'America/New_York'
# and wraps around once a total reaches 24 hours.
# NOTE(review): CALL_DURATION is treated as seconds, as the original query did - confirm.
query = """
WITH
CALLS AS (
    SELECT
        EMPLOYEE_ID,
        CALL_DATE,
        CASE IS_OUTGOING
            WHEN false
                THEN CALL_DURATION
        END as INBOUND_DURATION,
        CASE IS_OUTGOING
            WHEN true
                THEN CALL_DURATION
        END as OUTBOUND_DURATION
    FROM
        EMPLOYEE_CALLS
    WHERE
        CALL_RESULT = 'Call connected'
        AND CALL_DATE BETWEEN
            current_timestamp() - interval '1 day'
            AND current_timestamp()
),
TOTALS AS (
    SELECT
        EMPLOYEE_ID,
        coalesce(cast(floor(sum(INBOUND_DURATION)) as int), 0) as INBOUND_SECONDS,
        coalesce(cast(floor(sum(OUTBOUND_DURATION)) as int), 0) as OUTBOUND_SECONDS,
        count(INBOUND_DURATION) as INBOUND_CALLS,
        count(OUTBOUND_DURATION) as OUTBOUND_CALLS,
        MIN(CALL_DATE) as EARLIEST_CALL_DATE,
        MAX(CALL_DATE) as LATEST_CALL_DATE
    FROM
        CALLS
    GROUP BY
        EMPLOYEE_ID
)
SELECT
    EMPLOYEE_ID,
    format_string(
        '%02d:%02d:%02d',
        INBOUND_SECONDS div 3600,
        (INBOUND_SECONDS % 3600) div 60,
        INBOUND_SECONDS % 60
    ) as INBOUND_TIME,
    format_string(
        '%02d:%02d:%02d',
        OUTBOUND_SECONDS div 3600,
        (OUTBOUND_SECONDS % 3600) div 60,
        OUTBOUND_SECONDS % 60
    ) as OUTBOUND_TIME,
    INBOUND_CALLS,
    OUTBOUND_CALLS,
    EARLIEST_CALL_DATE,
    LATEST_CALL_DATE
FROM
    TOTALS
"""

(
    spark
    .sql(query)
    .limit(5)
    .toPandas()
)

Unnamed: 0,EMPLOYEE_ID,INBOUND_TIME,OUTBOUND_TIME,INBOUND_CALLS,OUTBOUND_CALLS,EARLIEST_CALL_DATE,LATEST_CALL_DATE
0,12611481,00:14:21,00:00:00,1,0,2022-06-08 09:40:22,2022-06-08 09:40:22
1,26118045,00:00:00,00:11:50,0,3,2022-06-08 10:50:57,2022-06-08 11:33:18
2,46530851,00:00:00,00:26:24,0,22,2022-06-08 09:35:41,2022-06-08 12:28:13
3,32302927,00:00:00,00:19:24,0,20,2022-06-08 09:48:25,2022-06-08 13:47:48
4,22196308,00:00:00,00:01:13,0,2,2022-06-08 12:30:31,2022-06-08 12:30:58


```sql
WITH
STATEMENTS AS(
    SELECT
        REFERENCE_ID,
        EMAIL_ID,
        FORMAT(STATEMENT_DATE, 'hhtt') AS BUCKET
    FROM
        STATEMENTS_RECIEVED
    WHERE
        STATEMENT_DATE BETWEEN
            CAST(GETDATE() - 1 AS DATE)
            AND CAST(GETDATE() AS DATE)
)
SELECT *
FROM
    STATEMENTS
PIVOT (
    COUNT(EMAIL_ID)
    FOR BUCKET IN (
        [12AM],
        [01AM],
        [02AM],
        [03AM],
        [04AM],
        [05AM],
        [06AM],
        [07AM],
        [08AM],
        [09AM],
        [10AM],
        [11AM],
        [12PM],
        [01PM],
        [02PM],
        [03PM],
        [04PM],
        [05PM],
        [06PM],
        [07PM],
        [08PM],
        [09PM],
        [10PM],
        [11PM]
    )
) AS PIVOT_TABLE
```

In [159]:
# Statements received per REFERENCE_ID over the trailing day, with one count
# column per hour-of-day bucket.

# Hour labels in the 'hha' layout used for BUCKET below: 12AM, 01AM, ..., 11AM, 12PM, ..., 11PM.
HOUR_BUCKETS = [
    f'{hour % 12 or 12:02d}{"AM" if hour < 12 else "PM"}'
    for hour in range(24)
]

# One conditional-sum column per bucket, generated instead of hand-copied 24 times.
bucket_columns = ',\n'.join(
    f"        sum(CASE BUCKET WHEN '{bucket}' THEN 1 ELSE 0 END) as {bucket}"
    for bucket in HOUR_BUCKETS
)

query = f"""
WITH
STATEMENTS AS (
    SELECT
        REFERENCE_ID,
        count(*) as RECEIVED_STATEMENTS,
{bucket_columns}
    FROM (
        SELECT
            REFERENCE_ID,
            date_format(STATEMENT_DATE, 'hha') as BUCKET
        FROM
            STATEMENTS_RECIEVED
        WHERE
            STATEMENT_DATE BETWEEN
                current_timestamp() - interval '1 day'
                AND current_timestamp()
    )
    GROUP BY
        REFERENCE_ID
)
SELECT *
FROM
    STATEMENTS
"""

(
    spark
    .sql(query)
    .limit(10)
    .toPandas()
)

Unnamed: 0,REFERENCE_ID,RECEIVED_STATEMENTS,12AM,01AM,02AM,03AM,04AM,05AM,06AM,07AM,08AM,09AM,10AM,11AM,12PM,01PM,02PM,03PM,04PM,05PM,06PM,07PM,08PM,09PM,10PM,11PM
0,1231710,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
1,1270303,2,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0
2,1247997,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
3,1280330,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
4,1204399,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
5,1165339,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0
6,1305138,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
7,1341102,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
8,1222088,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
9,1319865,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0


In [157]:
# Check what the 'hha' pattern produces for a sample timestamp.
query = (
    "SELECT date_format(to_timestamp('2022-06-08 09:57:35'), 'hha') "
    "as formatted"
)

formatted = spark.sql(query)
formatted.show()

+---------+
|formatted|
+---------+
|     09AM|
+---------+



In [161]:
# Emails received per REFERENCE_ID over the trailing day
# (count(EMAIL_DATE) counts rows with a non-null EMAIL_DATE).
query = """
WITH
EMAILS AS (
    SELECT
        REFERENCE_ID,
        count(EMAIL_DATE) as RECEIVED_EMAILS
    FROM
        STATEMENT_EMAILS_RECIEVED
    WHERE
        EMAIL_DATE BETWEEN
            current_timestamp() - interval '1 day'
            AND current_timestamp()
    GROUP BY
        REFERENCE_ID
)
SELECT *
FROM
    EMAILS
"""

recent_emails = spark.sql(query).limit(10)
recent_emails.toPandas()

Unnamed: 0,REFERENCE_ID,RECEIVED_EMAILS
0,1239143,1
1,1238171,1
2,1276989,1


In [None]:
# Stop the Spark session once the notebook is finished.
spark.stop()