In [1]:
from pyspark.sql import SparkSession

# Configure a local Spark session that uses every available core
# ('local[*]'), then start it -- or reuse one that is already running.
builder = SparkSession.builder.master('local[*]').appName('PySparkShell')
spark = builder.getOrCreate()

In [2]:
import sys
import os

# Make modules under /dbfs/cdr/library importable. The membership check keeps
# re-runs of this cell from appending the same entry again and again.
if '/dbfs/cdr/library' not in sys.path:
    sys.path.append('/dbfs/cdr/library')

from pandas import set_option
# Show every column when a pandas frame is displayed (None means unlimited).
# The canonical key is 'display.max_columns'; the previous spelling
# 'display.max.columns' only resolved because pandas interprets the key as a
# regular expression, in which '.' happens to match the underscore.
set_option('display.max_columns', None)

# TODO: remove for Databricks
# Binds the name `dbutils` at notebook scope; `resolve()` below calls
# `dbutils.fs.ls('/')` through it.
# NOTE(review): presumably Databricks supplies `dbutils` itself, which is why
# this import is slated for removal there -- confirm.
# `FileInfo` is imported but not referenced in the cells visible here.
from dbutils import DBUtils, FileInfo
dbutils = DBUtils()

In [3]:
from typing import Dict, List, Optional

# from sparkFunctions import create_tmp_parquet

def resolve(path: str) -> Optional[str]:
    """Prepend the active file-system prefix to ``path``.

    The prefix is the first five characters of the path of the first entry
    that ``dbutils.fs.ls('/')`` returns -- per the original note, ``/dbfs``
    when running locally and ``dbfs:`` on Databricks.

    Args:
        path: Location relative to the file-system root, e.g. ``/automation``.

    Returns:
        The prefixed path, or ``None`` when anything goes wrong (the error is
        printed rather than raised).
    """
    try:
        first_entry = dbutils.fs.ls('/')[0]
        return first_entry.path[:5] + path
    except Exception as exc:
        # Best effort: report the problem and fall through to None.
        print(exc)
    return None

In [4]:
# Root directory (before the file-system prefix is applied) for all inputs.
FOLDER = '/automation'

# One sub-directory per data source, both directly under FOLDER.
ZOOM_FOLDER, BASE_FOLDER = (
    os.path.join(FOLDER, subdir) for subdir in ('ZoomData', 'OnBase')
)

In [31]:
# Call-log parquet under the ZoomData folder: build the logical path, add the
# file-system prefix, then read it into a Spark DataFrame.
path = os.path.join(ZOOM_FOLDER, 'formattedAccountCallLogs.parquet')
calls_location = resolve(path)
calls = spark.read.parquet(calls_location)

In [7]:
# Statement-request activity records under the OnBase folder: build the
# logical path, add the file-system prefix, then read it into a DataFrame.
path = os.path.join(BASE_FOLDER, 'rm_DVStatementRequestActivityRecords.parquet')
activities_location = resolve(path)
activities = spark.read.parquet(activities_location)

In [8]:
# Statement requests under the OnBase folder: build the logical path, add the
# file-system prefix, then read it into a DataFrame.
path = os.path.join(BASE_FOLDER, 'rm_DVStatementRequests.parquet')
requests_location = resolve(path)
requests = spark.read.parquet(requests_location)

In [32]:
# Vendor contacts under the OnBase folder: build the logical path, add the
# file-system prefix, then read it into a DataFrame.
path = os.path.join(BASE_FOLDER, 'rm_DVVendorContacts.parquet')
vendors_location = resolve(path)
vendors = spark.read.parquet(vendors_location)

### Transform

In [33]:
from pyspark.sql.functions import regexp_extract, regexp_replace, trim

# Author's notes on the raw numbers:
#   - outgoing calls begin with +1
#   - also use 1 for 1-800 numbers
#   - ones not found in VENDOR table
#
# The pattern, piece by piece:
#   ^           anchor at the start of the string
#   (\+1|\*)*   group 1: any run of literal "+1" or literal "*" prefixes
#               (+ and * are escaped because they are regex operators)
#   ([0-9]+)    group 2: the digits that follow -- the part that is kept
pattern = r'^(\+1|\*)*([0-9]+)'

# Build both column expressions up front; index 2 selects the digit group.
called_number = regexp_extract('callee_number', pattern, 2)
received_number = regexp_extract('caller_number', pattern, 2)

calls = (
    calls
    .withColumn('CALLED_NUMBER', called_number)
    .withColumn('RECEIVED_NUMBER', received_number)
)

In [62]:
# Normalise vendor phone numbers:
#   1. drop every character of `Phone` that is not a digit
#   2. keep at most the first 10 of the remaining digits
# NOTE(review): if `Phone` can carry a leading country code (e.g. an 11-digit
# "1..." number), step 2 would keep the "1" and lose the final digit --
# confirm against the source data.
digits_only = regexp_replace('Phone', '[^0-9]', '')
vendors = vendors.withColumn(
    'PHONE_NUMBER',
    regexp_extract(digits_only, r'^(\d{0,10})', 1),
)

In [63]:
# Expose each DataFrame to Spark SQL under an upper-case view name so the
# queries below can refer to them directly.
calls.createOrReplaceTempView('CALLS')
vendors.createOrReplaceTempView('VENDORS')
activities.createOrReplaceTempView('ACTIVITIES')
requests.createOrReplaceTempView('REQUESTS')

In [64]:
# Vendor rows matched to outbound calls: one output row per (vendor, call)
# pair whose normalised numbers are equal.
#
# The `PHONE_NUMBER <> ''` guard matters: regexp_extract returns an empty
# string when no number could be parsed, and a vendor `Phone` without digits
# also normalises to ''. Without the guard those empty keys would join to
# each other and inflate the result with meaningless matches.
query = """
SELECT
    VENDORS.*
FROM
    VENDORS
        INNER JOIN
            CALLS ON
                VENDORS.PHONE_NUMBER = CALLS.CALLED_NUMBER
                AND VENDORS.PHONE_NUMBER <> ''
                AND trim(CALLS.DIRECTION) = 'outbound'
"""

outbound = spark.sql(query)
# Number of matched (vendor, call) pairs.
outbound.count()

551487

In [65]:
# Vendor rows matched to inbound calls: one output row per (vendor, call)
# pair where the vendor's normalised number equals RECEIVED_NUMBER (which is
# extracted from the call's caller_number).
#
# The `PHONE_NUMBER <> ''` guard matters: regexp_extract returns an empty
# string when no number could be parsed, and a vendor `Phone` without digits
# also normalises to ''. Without the guard those empty keys would join to
# each other and inflate the result with meaningless matches.
query = """
SELECT
    VENDORS.*
FROM
    VENDORS
        INNER JOIN
            CALLS ON
                VENDORS.PHONE_NUMBER = CALLS.RECEIVED_NUMBER
                AND VENDORS.PHONE_NUMBER <> ''
                AND trim(CALLS.DIRECTION) = 'inbound'
"""

inbound = spark.sql(query)
# Number of matched (vendor, call) pairs.
inbound.count()

11819

In [None]:
# Shut down the Spark session created in the first cell; cells that use
# `spark` cannot be re-run after this without recreating the session.
spark.stop()