In [1]:
from pandas import set_option
from pyspark.sql import SparkSession

# Render every column when a pandas frame is displayed (None removes the limit).
# The full option name is used on purpose: pandas treats the key as a regex, so
# the dotted spelling 'display.max.columns' only resolved through partial matching.
set_option('display.max_columns', None)

# Create (or reuse) the Spark session, requesting the Microsoft SQL Server
# connector through the spark.jars.packages setting.
spark = SparkSession.builder.config(
    'spark.jars.packages',
    'com.microsoft.azure:spark-mssql-connector_2.12:1.1.0',
).getOrCreate()

In [2]:
from pyspark.sql import DataFrame
from typing import Optional

def varchar(dataFrame: DataFrame, width: int=1024) -> Optional[str]:
    """Build a column-type list that limits string columns to ``width`` characters.

    Each column of ``dataFrame`` whose dtype is 'string' contributes one
    '<name> VARCHAR (<width>)' entry; the entries are joined with ', '.

    Returns:
        The joined definition string, or None when the frame has no string columns.
    """
    definitions = []
    for name, dtype in dataFrame.dtypes:
        if dtype != 'string':
            continue
        definitions.append(f'{name} VARCHAR ({width})')

    if not definitions:
        return None
    return ', '.join(definitions)

In [3]:
import os

# Root directory of the statements data and its bronze / silver sub-directories.
PATH = '/tmp/statements'
BRONZE, SILVER = (os.path.join(PATH, layer) for layer in ('bronze', 'silver'))

In [4]:
import os

# Connection options shared by every write to SQL Server. The password is read
# from the PASSWORD environment variable (a KeyError is raised when it is unset).
writeOptions = dict(
    url='jdbc:sqlserver://',
    server='localhost',
    database='DataWarehouse',
    user='SA',
    password=os.environ['PASSWORD'],
)

In [5]:
%%time
# Register every entry of the silver directory as a temp view named after the entry.
names = os.listdir(SILVER)

for name in names:
    path = os.path.join(SILVER, name)
    silverFrame = spark.read.parquet(path)
    silverFrame.createOrReplaceTempView(name)

CPU times: user 13.6 ms, sys: 4.57 ms, total: 18.1 ms
Wall time: 7.97 s


In [6]:
%%time

# Temp-view name -> destination table name in the data warehouse.
nameMap = dict(
    activities='STATEMENT_ACTIVITIES',
    bridge='STATEMENT_ACTIVITY_CALL_BRIDGE',
    calls='EMPLOYEE_CALLS',
    contacts='VENDOR_CONTACTS',
    emails='STATEMENT_EMAILS_RECIEVED',
    employees='EMPLOYEES',
    requests='STATEMENT_REQUESTS',
    statements='STATEMENTS_RECIEVED',
)

# Write every registered view to its SQL Server table, overwriting the table
# if it already exists. The row count is printed before each write.
for name, tableName in nameMap.items():
    table = spark.table(name)
    records = table.count()
    print(f'table {name} contains {records:,} records')

    # Limit string columns to 256 characters; when the view has no string
    # columns varchar() returns None and the shared options are used unchanged.
    columnTypes = varchar(table, 256)
    options = {**writeOptions, 'createTableColumnTypes': columnTypes} if columnTypes else writeOptions

    (
        table
        .write
        .format('com.microsoft.sqlserver.jdbc.spark')
        .options(**options)
        .option('dbtable', tableName)
        .mode('overwrite')
        .save()
    )

table activities contains 2,629,428 records
table bridge contains 1,633 records
table calls contains 2,652 records
table contacts contains 303,678 records
table emails contains 279,991 records
table employees contains 936 records
table requests contains 535,364 records
table statements contains 464,673 records
CPU times: user 58.7 ms, sys: 19.2 ms, total: 77.9 ms
Wall time: 2min 34s


In [50]:
# Re-register each temp view under its warehouse table name and drop the old
# view; views whose upper-cased name already equals the table name are skipped.
for name, tableName in nameMap.items():
    if name.upper() == tableName:
        continue
    renamedView = spark.read.table(name)
    renamedView.createOrReplaceTempView(tableName)
    spark.catalog.dropTempView(name)

# Display the names of the views that are registered now.
[view.name for view in spark.catalog.listTables()]

['employee_calls',
 'employees',
 'statement_activities',
 'statement_activity_call_bridge',
 'statement_emails_recieved',
 'statement_requests',
 'statements_recieved',
 'vendor_contacts']

In [181]:
# query = """
# SELECT
#     *,
#     row_number()
#         OVER (
#             PARTITION BY
#                 EMPLOYEE_ID,
#                 REFERENCE_ID
#             ORDER BY
#                 ACTIVITY_DATE DESC
#         ) AS EMPLOYEE_REFERENCE_RANK
# FROM
#     STATEMENT_ACTIVITIES
# """

# (
#     spark
#     .sql(query)
#     .write
#     .mode('overwrite')
#     .parquet('/tmp/activities')
# )

In [9]:
# (
#     spark
#     .read
#     .parquet('/tmp/statements/bronze/StatementRequestActivityRecords')
#     .groupBy('ActivityType')
#     .count()
#     .toPandas()
# )

In [30]:
# (
#     spark
#     .read
#     .parquet('/tmp/statements/bronze/StatementRequestActivityRecords')
#     .limit(5)
#     .toPandas()
# )

#### [DATEADD](https://www.w3schools.com/sql/func_sqlserver_dateadd.asp)

#### [FORMAT](https://www.mssqltips.com/sqlservertip/2655/format-sql-server-dates-with-format-function/)

#### [CONVERT](https://www.w3schools.com/sql/func_sqlserver_convert.asp)

```sql
SELECT FORMAT(DATEADD(SECOND, 30, '1900-01-01'), 'HH:mm:ss');

SELECT GETDATE();

SELECT CONVERT(datetime, '2022-06-09 07:35:00', 121);

SELECT FORMAT(CONVERT(datetime, '2022-06-09 13:35:00', 121), 'hhtt');

SELECT DATEPART(HOUR, GETDATE());

```

- EmployeeName
- InboundCalls
    - count(*)
- OutboundCalls
    - count(*)
- AverageInboundDuration
- AverageOutboundDuration
- StatementsReceived
- EmailsReceived
    

In [51]:
# Preview the id, name and role columns of the EMPLOYEES view (first five rows).
query = """
SELECT
    EMPLOYEE_ID,
    EMPLOYEE_NAME,
    ROLE
FROM
    EMPLOYEES
"""

spark.sql(query).limit(5).toPandas()

Unnamed: 0,EMPLOYEE_ID,EMPLOYEE_NAME,ROLE
0,194728,Andi Prins,Audit Supervisor
1,194729,Bob VanGoor,Audit Supervisor
2,194730,Colleen Kretowicz,Audit Supervisor
3,194731,Cindy Allen,WNC Auditor
4,194732,Dan Hutchins,Audit Manager


In [89]:
# Per-employee summary of connected calls during the last day: total inbound /
# outbound talk time, call counts, and the earliest / latest call timestamps.
#
# CALLS splits CALL_DURATION into an inbound and an outbound column (NULL for
# the other direction), so sum() / count() only see calls of that direction.
# TOTALS aggregates per employee; the floored sums are the total whole seconds
# (0 when the employee had no call in that direction).
#
# The HH:mm:ss text is built with integer arithmetic. The previous version
# round-tripped the seconds through from_unixtime / to_utc_timestamp with a
# hard-coded 'America/New_York' zone, which is only correct when the Spark
# session time zone is New York, and wrapped totals of 24 hours or more.
query = """
WITH
CALLS AS (
    SELECT
        EMPLOYEE_ID,
        CALL_DATE,
        CASE IS_OUTGOING
            WHEN false
                THEN CALL_DURATION
        END as INBOUND_DURATION,
        CASE IS_OUTGOING
            WHEN true
                THEN CALL_DURATION
        END as OUTBOUND_DURATION
    FROM
        EMPLOYEE_CALLS
    WHERE
        CALL_RESULT = 'Call connected'
        AND CALL_DATE BETWEEN
            current_timestamp() - interval '1 day'
            AND current_timestamp()
),
TOTALS AS (
    SELECT
        EMPLOYEE_ID,
        coalesce(cast(floor(sum(INBOUND_DURATION)) as int), 0) as INBOUND_SECONDS,
        coalesce(cast(floor(sum(OUTBOUND_DURATION)) as int), 0) as OUTBOUND_SECONDS,
        count(INBOUND_DURATION) as INBOUND_CALLS,
        count(OUTBOUND_DURATION) as OUTBOUND_CALLS,
        MIN(CALL_DATE) as EARLIEST_CALL_DATE,
        MAX(CALL_DATE) as LATEST_CALL_DATE
    FROM
        CALLS
    GROUP BY
        EMPLOYEE_ID
)
SELECT
    EMPLOYEE_ID,
    format_string(
        '%02d:%02d:%02d',
        INBOUND_SECONDS div 3600,
        (INBOUND_SECONDS % 3600) div 60,
        INBOUND_SECONDS % 60
    ) as INBOUND_TIME,
    format_string(
        '%02d:%02d:%02d',
        OUTBOUND_SECONDS div 3600,
        (OUTBOUND_SECONDS % 3600) div 60,
        OUTBOUND_SECONDS % 60
    ) as OUTBOUND_TIME,
    INBOUND_CALLS,
    OUTBOUND_CALLS,
    EARLIEST_CALL_DATE,
    LATEST_CALL_DATE
FROM
    TOTALS
"""

(
    spark
    .sql(query)
    .limit(5)
    .toPandas()
)

Unnamed: 0,EMPLOYEE_ID,INBOUND_TIME,OUTBOUND_TIME,INBOUND_CALLS,OUTBOUND_CALLS,EARLIEST_CALL_DATE,LATEST_CALL_DATE
0,12611481,00:14:21,00:00:00,1,0,2022-06-08 09:40:22,2022-06-08 09:40:22
1,26118045,00:00:00,00:11:50,0,3,2022-06-08 10:50:57,2022-06-08 11:33:18
2,46530851,00:00:00,00:26:24,0,22,2022-06-08 09:35:41,2022-06-08 12:28:13
3,32302927,00:00:00,00:19:24,0,20,2022-06-08 09:48:25,2022-06-08 13:47:48
4,22196308,00:00:00,00:01:13,0,2,2022-06-08 12:30:31,2022-06-08 12:30:58


#### SQL Server Pivot Table

```sql
WITH
STATEMENTS AS(
    SELECT
        REFERENCE_ID,
        EMAIL_ID,
        FORMAT(STATEMENT_DATE, 'hhtt') AS BUCKET
    FROM
        STATEMENTS_RECIEVED
    WHERE
        STATEMENT_DATE BETWEEN
            CAST(GETDATE() - 1 AS DATE)
            AND CAST(GETDATE() AS DATE)
)
SELECT *
FROM
    STATEMENTS
PIVOT (
    COUNT(EMAIL_ID)
    FOR BUCKET IN (
        [12AM],
        [01AM],
        [02AM],
        [03AM],
        [04AM],
        [05AM],
        [06AM],
        [07AM],
        [08AM],
        [09AM],
        [10AM],
        [11AM],
        [12PM],
        [01PM],
        [02PM],
        [03PM],
        [04PM],
        [05PM],
        [06PM],
        [07PM],
        [08PM],
        [09PM],
        [10PM],
        [11PM]
    )
) AS PIVOT_TABLE
```

In [138]:
# Print the column names and types of the EMPLOYEES view.
employeesView = spark.read.table('EMPLOYEES')
employeesView.printSchema()

root
 |-- EMPLOYEE_ID: integer (nullable = true)
 |-- EMPLOYEE_NAME: string (nullable = true)
 |-- ROLE: string (nullable = true)
 |-- EMAIL: string (nullable = true)
 |-- MANAGER_NAME: string (nullable = true)
 |-- TEAM: string (nullable = true)



In [151]:
# `desc` is needed by the orderBy below; importing it in this cell keeps the
# cell runnable when the notebook is executed from top to bottom.
from pyspark.sql.functions import desc

# Activities per employee during the last day, pivoted into one column per
# hour-of-day bucket. The inner query labels each activity with its hour in
# 'hha' form (for example '09AM'); each sum(CASE ...) counts the rows of one
# bucket. The result is joined to EMPLOYEES for name, team and role.
query = """
WITH
ACTIVITIES AS (
    SELECT
        EMPLOYEE_ID,
        count(*) as EMPLOYEE_ACTIVITIES,
        sum(CASE BUCKET WHEN '12AM' THEN 1 ELSE 0 END) as 12AM,
        sum(CASE BUCKET WHEN '01AM' THEN 1 ELSE 0 END) as 01AM,
        sum(CASE BUCKET WHEN '02AM' THEN 1 ELSE 0 END) as 02AM,
        sum(CASE BUCKET WHEN '03AM' THEN 1 ELSE 0 END) as 03AM,
        sum(CASE BUCKET WHEN '04AM' THEN 1 ELSE 0 END) as 04AM,
        sum(CASE BUCKET WHEN '05AM' THEN 1 ELSE 0 END) as 05AM,
        sum(CASE BUCKET WHEN '06AM' THEN 1 ELSE 0 END) as 06AM,
        sum(CASE BUCKET WHEN '07AM' THEN 1 ELSE 0 END) as 07AM,
        sum(CASE BUCKET WHEN '08AM' THEN 1 ELSE 0 END) as 08AM,
        sum(CASE BUCKET WHEN '09AM' THEN 1 ELSE 0 END) as 09AM,
        sum(CASE BUCKET WHEN '10AM' THEN 1 ELSE 0 END) as 10AM,
        sum(CASE BUCKET WHEN '11AM' THEN 1 ELSE 0 END) as 11AM,
        sum(CASE BUCKET WHEN '12PM' THEN 1 ELSE 0 END) as 12PM,
        sum(CASE BUCKET WHEN '01PM' THEN 1 ELSE 0 END) as 01PM,
        sum(CASE BUCKET WHEN '02PM' THEN 1 ELSE 0 END) as 02PM,
        sum(CASE BUCKET WHEN '03PM' THEN 1 ELSE 0 END) as 03PM,
        sum(CASE BUCKET WHEN '04PM' THEN 1 ELSE 0 END) as 04PM,
        sum(CASE BUCKET WHEN '05PM' THEN 1 ELSE 0 END) as 05PM,
        sum(CASE BUCKET WHEN '06PM' THEN 1 ELSE 0 END) as 06PM,
        sum(CASE BUCKET WHEN '07PM' THEN 1 ELSE 0 END) as 07PM,
        sum(CASE BUCKET WHEN '08PM' THEN 1 ELSE 0 END) as 08PM,
        sum(CASE BUCKET WHEN '09PM' THEN 1 ELSE 0 END) as 09PM,
        sum(CASE BUCKET WHEN '10PM' THEN 1 ELSE 0 END) as 10PM,
        sum(CASE BUCKET WHEN '11PM' THEN 1 ELSE 0 END) as 11PM
    FROM (
        SELECT
            EMPLOYEE_ID,
            date_format(ACTIVITY_DATE, 'hha') as BUCKET
        FROM
            STATEMENT_ACTIVITIES
        WHERE
            ACTIVITY_DATE BETWEEN
                current_timestamp() - interval '1 day'
                AND current_timestamp()
    )
    GROUP BY
        EMPLOYEE_ID
)
SELECT
    EMPLOYEE_NAME,
    TEAM,
    ROLE,
    ACTIVITIES.*
FROM
    EMPLOYEES
        INNER JOIN
            ACTIVITIES ON
                EMPLOYEES.EMPLOYEE_ID = ACTIVITIES.EMPLOYEE_ID
"""

# Show the five rows that sort first by team, then by activity count descending.
(
    spark
    .sql(query)
    .orderBy('TEAM', desc('EMPLOYEE_ACTIVITIES'))
    .limit(5)
    .toPandas()
)

Unnamed: 0,EMPLOYEE_NAME,TEAM,ROLE,EMPLOYEE_ID,EMPLOYEE_ACTIVITIES,12AM,01AM,02AM,03AM,04AM,05AM,06AM,07AM,08AM,09AM,10AM,11AM,12PM,01PM,02PM,03PM,04PM,05PM,06PM,07PM,08PM,09PM,10PM,11PM
0,OnBase Processing Service,,Service Account,31377661,514,0,0,0,0,0,0,0,0,0,13,135,122,107,78,57,2,0,0,0,0,0,0,0,0
1,Tim Kornoelje,,Research Analyst,8404677,382,0,0,0,0,0,0,0,0,0,0,152,225,5,0,0,0,0,0,0,0,0,0,0,0
2,Jeanette Grassmid,,Project Audit,12611472,63,0,0,0,0,0,0,0,0,0,43,20,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Jessica Rinehart,,Statement Admin,8398642,43,0,0,0,0,0,0,0,0,0,36,2,2,3,0,0,0,0,0,0,0,0,0,0,0
4,Becky Williams,,WNC Auditor,26528878,38,0,0,0,0,0,0,0,0,0,3,7,10,8,7,3,0,0,0,0,0,0,0,0,0


In [157]:
# Check how the 'hha' pattern renders a sample timestamp.
query = "SELECT date_format(to_timestamp('2022-06-08 09:57:35'), 'hha') as formatted"

(
    spark
    .sql(query)
    .show()
)

+---------+
|formatted|
+---------+
|     09AM|
+---------+



In [76]:
# Number of emails received per reference during the last two days
# (first five rows only).
query = """
WITH
EMAILS AS (
    SELECT
        REFERENCE_ID,
        count(EMAIL_DATE) as EMAILS_RECEIVED
    FROM
        STATEMENT_EMAILS_RECIEVED
    WHERE
        EMAIL_DATE BETWEEN
            current_timestamp() - interval '2 day'
            AND current_timestamp()
    GROUP BY
        REFERENCE_ID
)
SELECT *
FROM
    EMAILS
"""

spark.sql(query).limit(5).toPandas()

Unnamed: 0,REFERENCE_ID,EMAILS_RECEIVED
0,1315271,1
1,1238795,1
2,1333508,1
3,1231046,1
4,1256281,1


In [85]:
# Activities from the last three days whose reference also received an email
# within the same three-day window, counted per ACTIVITY value.
query = """
    SELECT
        *
    FROM
        STATEMENT_ACTIVITIES
    WHERE
        ACTIVITY_DATE BETWEEN
            current_timestamp() - interval '3 day'
            AND current_timestamp()
        AND EXISTS (
            SELECT 1
            FROM
                STATEMENT_EMAILS_RECIEVED
            WHERE
                STATEMENT_ACTIVITIES.REFERENCE_ID = STATEMENT_EMAILS_RECIEVED.REFERENCE_ID
                AND EMAIL_DATE BETWEEN
                    current_timestamp() - interval '3 day'
                    AND current_timestamp()
        )
"""

(
    spark
    .sql(query)
    .groupBy('ACTIVITY')
    .count()
    .show()
)

+--------+-----+
|ACTIVITY|count|
+--------+-----+
|    NOTE|   48|
|    CALL|   10|
|    null|  378|
|RESPONSE|    9|
|   EMAIL|   20|
+--------+-----+



In [105]:
%%time

# Distinct (EMPLOYEE_ID, EMAILS_RECEIVED) pairs obtained by joining all
# activities to the per-reference email counts of the last two days; the cell
# reports how many such pairs exist.
query = """
WITH
EMAILS AS (
    SELECT
        REFERENCE_ID,
        count(*) as EMAILS_RECEIVED
    FROM
        STATEMENT_EMAILS_RECIEVED
    WHERE
        EMAIL_DATE BETWEEN
            current_timestamp() - interval '2 day'
            AND current_timestamp()
    GROUP BY
        REFERENCE_ID
)
SELECT DISTINCT
    EMPLOYEE_ID,
    EMAILS_RECEIVED
FROM
    STATEMENT_ACTIVITIES
        INNER JOIN
            EMAILS ON
                STATEMENT_ACTIVITIES.REFERENCE_ID = EMAILS.REFERENCE_ID
"""

(
    spark
    .sql(query)
    .count()
)

CPU times: user 712 µs, sys: 179 µs, total: 891 µs
Wall time: 1.33 s


52

In [125]:
%%time
from pyspark.sql.functions import desc

# Emails received per employee over the last two days, attributed through the
# activities table. ACTIVITY_RANK = 1 keeps only the most recent activity of
# each (employee, reference) pair, so a reference's emails are counted once
# per employee who worked on it.
query = """
WITH
ACTIVITIES AS (
    SELECT
        *,
        row_number()
            OVER (
                PARTITION BY
                    EMPLOYEE_ID,
                    REFERENCE_ID
                ORDER BY
                    ACTIVITY_DATE DESC
            ) AS ACTIVITY_RANK
    FROM
        STATEMENT_ACTIVITIES
    WHERE
        ACTIVITY_DATE BETWEEN
            current_timestamp() - interval '2 day'
            AND current_timestamp()
)
SELECT
    EMPLOYEE_ID,
    count(EMAIL_DATE) as EMAILS_RECEIVED
FROM
    ACTIVITIES
        INNER JOIN
            STATEMENT_EMAILS_RECIEVED ON
                ACTIVITIES.REFERENCE_ID = STATEMENT_EMAILS_RECIEVED.REFERENCE_ID
                AND ACTIVITY_RANK = 1
                AND EMAIL_DATE BETWEEN
                    current_timestamp() - interval '2 day'
                    AND current_timestamp()
GROUP BY
    EMPLOYEE_ID
"""

(
    spark
    .sql(query)
    .toPandas()
)

CPU times: user 8.87 ms, sys: 2.82 ms, total: 11.7 ms
Wall time: 1.56 s


Unnamed: 0,EMPLOYEE_ID,EMAILS_RECEIVED
0,31377661,4
1,32237633,2
2,21987402,5
3,19616967,1
4,15699251,1


In [167]:
# Statements received per employee over the last two days. As with the email
# count, only the most recent activity of each (employee, reference) pair
# (ACTIVITY_RANK = 1) is joined to the statements of that reference.
query = """
WITH
ACTIVITIES AS (
    SELECT
        *,
        row_number()
            OVER (
                PARTITION BY
                    EMPLOYEE_ID,
                    REFERENCE_ID
                ORDER BY
                    ACTIVITY_DATE DESC
            ) AS ACTIVITY_RANK
    FROM
        STATEMENT_ACTIVITIES
    WHERE
        ACTIVITY_DATE BETWEEN
            current_timestamp() - interval '2 day'
            AND current_timestamp()
)
SELECT
    EMPLOYEES.EMPLOYEE_NAME,
    EMPLOYEES.EMPLOYEE_ID,
    count(STATEMENT_DATE) as STATEMENTS_RECEIVED
FROM
    EMPLOYEES
        INNER JOIN
            ACTIVITIES ON
                EMPLOYEES.EMPLOYEE_ID = ACTIVITIES.EMPLOYEE_ID
        INNER JOIN
            STATEMENTS_RECIEVED ON
                ACTIVITIES.REFERENCE_ID = STATEMENTS_RECIEVED.REFERENCE_ID
                AND ACTIVITY_RANK = 1
                AND STATEMENT_DATE BETWEEN
                    current_timestamp() - interval '2 day'
                    AND current_timestamp()
GROUP BY
    EMPLOYEES.EMPLOYEE_NAME,
    EMPLOYEES.EMPLOYEE_ID
"""

# Five employees with the highest statement counts.
(
    spark
    .sql(query)
    .orderBy(desc('STATEMENTS_RECEIVED'))
    .limit(5)
    .toPandas()
)

Unnamed: 0,EMPLOYEE_NAME,EMPLOYEE_ID,STATEMENTS_RECEIVED
0,OnBase Processing Service,31377661,1132
1,Beth Herriman,32237633,77
2,Nathan Nagelkerk,21987402,24
3,Joe Vig,19616967,11
4,Briana Louck,39716530,7


In [157]:
# Print the column names and types of the STATEMENTS_RECIEVED view.
statementsView = spark.read.table('STATEMENTS_RECIEVED')
statementsView.printSchema()

root
 |-- REFERENCE_ID: integer (nullable = true)
 |-- EMAIL_ID: long (nullable = true)
 |-- STATEMENT_DATE: timestamp (nullable = true)



In [134]:
from pyspark.sql.functions import approx_count_distinct

# Approximate number of distinct employees with activity on each reference.
employeesPerReference = (
    spark.read.table('STATEMENT_ACTIVITIES')
    .groupBy('REFERENCE_ID')
    .agg(approx_count_distinct('EMPLOYEE_ID').alias('count'))
)

# Show the five references with the highest counts among those above one.
employeesPerReference.where('count > 1').orderBy(desc('count')).show(5)

+------------+-----+
|REFERENCE_ID|count|
+------------+-----+
|     1203246|   21|
|     1251522|   18|
|     1203151|   18|
|     1255895|   17|
|     1280624|   16|
+------------+-----+
only showing top 5 rows



In [133]:
# spark.read.table('STATEMENT_ACTIVITIES').where('REFERENCE_ID = 1049740').toPandas()

# query = """
# SELECT
#     EMPLOYEE_NAME,
#     STATEMENT_ACTIVITIES.*
# FROM
#     EMPLOYEES
#         INNER JOIN
#             STATEMENT_ACTIVITIES ON
#                 EMPLOYEES.EMPLOYEE_ID = STATEMENT_ACTIVITIES.EMPLOYEE_ID
#                 AND REFERENCE_ID = 1049740
# ORDER BY
#     ACTIVITY_DATE
# """

# spark.sql(query).toPandas()

In [96]:
# All activity rows of the last five days for reference 1239332.
query = """
SELECT
    *
FROM
    STATEMENT_ACTIVITIES
WHERE
    ACTIVITY_DATE BETWEEN
        current_timestamp() - interval '5 day'
            AND current_timestamp()
    AND REFERENCE_ID = 1239332
"""

(
    spark
    .sql(query)
    .toPandas()
)

Unnamed: 0,ACTIVITY_ID,REFERENCE_ID,CONTACT_ID,REQUEST_ID,EMPLOYEE_ID,ACTIVITY_DATE,ACTIVITY,JOB_NUMBER,JOB_NAME,VENDOR_NAME,REFERENCE_NUMBER,OUTCOME
0,48464036,1239332,,46684790,32237633,2022-06-08 07:11:47,,3285,Community Health Systems - 3285,RTI SURGICAL INC,1239332,


In [60]:
# Preview five rows of the STATEMENT_REQUESTS view.
query = """
SELECT
    *
FROM
    STATEMENT_REQUESTS
LIMIT 5
"""

(
    spark
    .sql(query)
    .toPandas()
)

Unnamed: 0,REQUEST_ID,EMPLOYEE_ID,REQUEST_DATE,REQUEST_STATUS,REQUEST_METHOD,REQUEST_TYPE,LAST_ACTIVITY_DATE,LAST_RECEIVED_DATE,WILL_NOT_COMPLY
0,25422373,8398642,2020-05-01,Fully Received,MassEmail,,2020-05-01,2020-05-04,False
1,25422392,8398642,2020-05-01,Fully Received,MassEmail,,2020-05-01,,
2,25422433,8398642,2020-05-01,Fully Received,MassEmail,,2020-05-01,2020-05-04,
3,25422445,8398642,2020-05-01,Fully Received,MassEmail,,2020-05-01,2020-05-04,
4,25422454,8398642,2020-05-01,Partial Receipt,MassEmail,,2020-05-01,2020-05-04,


In [63]:
# query = """
# SELECT
#     EMPLOYEE_ID,
#     count(*) as REQUESTED_EMAILS
# FROM
#     STATEMENT_REQUESTS
# WHERE
#     REQUEST_DATE BETWEEN
#         current_timestamp() - interval '2 day'
#         AND current_timestamp()
# GROUP BY
#     EMPLOYEE_ID
# """

# spark.sql(query).toPandas()

In [172]:
# Activities of type 'EMAIL' per employee during the last day, joined to
# EMPLOYEES for the name and sorted by count (descending), then by name.
query = """
WITH
EMAILS AS (
    SELECT
        EMPLOYEE_ID,
        count(*) as EMAILS_SENT
    FROM
        STATEMENT_ACTIVITIES
    WHERE
        ACTIVITY = 'EMAIL'
        AND ACTIVITY_DATE BETWEEN
            current_timestamp() - interval '1 day'
            AND current_timestamp()
    GROUP BY
        EMPLOYEE_ID
)
SELECT
    EMPLOYEE_NAME,
    EMAILS.*
FROM
    EMPLOYEES
        INNER JOIN
            EMAILS ON
                EMPLOYEES.EMPLOYEE_ID = EMAILS.EMPLOYEE_ID
ORDER BY
    EMAILS_SENT DESC,
    EMPLOYEE_NAME
"""

spark.sql(query).toPandas()

Unnamed: 0,EMPLOYEE_NAME,EMPLOYEE_ID,EMAILS_SENT
0,Jessica Caggiano,46530851,38
1,Becky Williams,26528878,30
2,Jessica Brown,47284022,28
3,Jamie Spurlock,29280988,27
4,Leah Kuester,15699263,11
5,Chenoa Marklevitz,22196321,9
6,Annie Lewis,33107513,5
7,Jealisa Boyd,16808936,3
8,Steven Williams,26118045,3
9,Ashley Tran,47514957,2


In [178]:
# Number of distinct references each employee had activity on during the last day.
query = """
SELECT
    EMPLOYEE_ID,
    count(DISTINCT REFERENCE_ID) as REFERENCES
FROM
    STATEMENT_ACTIVITIES
WHERE
    ACTIVITY_DATE BETWEEN
        current_timestamp() - interval '1 day'
        AND current_timestamp()
GROUP BY
    EMPLOYEE_ID
"""

# Five employees with the most distinct references.
(
    spark
    .sql(query)
    .orderBy(desc('REFERENCES'))
    .limit(5)
    .toPandas()
)

Unnamed: 0,EMPLOYEE_ID,REFERENCES
0,8404677,382
1,31377661,359
2,46530851,58
3,12611472,46
4,8406307,43


In [173]:
# query = """
# SELECT *
# FROM
#     STATEMENT_ACTIVITIES
# WHERE
#     EXISTS (
#         SELECT 1
#         FROM
#             STATEMENT_REQUESTS
#         WHERE
#             STATEMENT_ACTIVITIES.REQUEST_ID = STATEMENT_REQUESTS.REQUEST_ID
#     )
#     AND ACTIVITY_DATE BETWEEN
#         current_timestamp() - interval '1 day'
#             AND current_timestamp()
# """

# (
#     spark
#     .sql(query)
#     .limit(5)
#     .toPandas()
# )

In [174]:
# from pyspark.sql.functions import col

# (
#     spark
#     .read
#     .parquet('/tmp/statements/bronze/StatementRequestActivityRecords')
#     .where(col('CreatedDate').between('2022-06-08 07:00:00', '2022-06-10 07:00:00'))
#     .where(col('ActivityType') == 'Emailed Vendor')
#     .join(
#         spark
#         .read
#         .parquet('/tmp/statements/bronze/Employees')
#         .withColumn('ActivityUser', col('FullName')),
#         on='ActivityUser',
#         how='left_semi'
#     )
#     .groupBy('ActivityUser')
#     .count()
#     .toPandas()
# )

In [168]:
# (
#     spark
#     .read
#     .parquet('/tmp/statements/bronze/Employees')
#     .limit(5)
#     .toPandas()
# )

In [75]:
# spark.sql('SELECT * FROM EMPLOYEES LIMIT 5').toPandas()

In [None]:
# Stop the Spark session once the notebook is finished.
spark.stop()