In [0]:
from pyspark.sql import Column, DataFrame
from pyspark.sql.functions import col, from_utc_timestamp, lit, trim, udf, when
from typing import Optional

import phonenumbers


# returns phone number in format (987) 654-3210
@udf(returnType='string')
def parsePhoneNumber(number: str, region: str='US') -> Optional[str]:
    """Format a raw phone number in the national format.

    Args:
        number: raw phone number text; may be null or empty.
        region: default region handed to `phonenumbers.parse`. Defaults to 'US'.

    Returns:
        The number formatted with `PhoneNumberFormat.NATIONAL`, or None when
        the input is missing or cannot be parsed/formatted.
    """
    # nulls and empty strings can never parse; skip the parser entirely
    if not number:
        return None

    try:
        phoneNumber = phonenumbers.parse(number, region)
        return phonenumbers.format_number(phoneNumber, phonenumbers.PhoneNumberFormat.NATIONAL)
    except Exception:
        # best-effort cleansing: an unparseable value becomes null. Unlike a bare
        # `except:`, this still lets KeyboardInterrupt / SystemExit propagate.
        return None


# keeps the value where it matches pattern
def matchPattern(colName: str, pattern: str) -> Column:
    """Build a column expression that keeps `colName` only where it matches.

    Args:
        colName: name of the column to test.
        pattern: regular expression handed to `Column.rlike`.

    Returns:
        A `when(...)` expression with no `otherwise` branch, yielding the
        column's value on rows where `rlike(pattern)` holds.
    """
    return when(col(colName).rlike(pattern), col(colName))


def toTimezone(dataFrame: DataFrame, tz: str) -> DataFrame:
    """Apply `from_utc_timestamp` to every timestamp column of a DataFrame.

    Args:
        dataFrame: input DataFrame; columns whose dtype is not 'timestamp'
            are left untouched.
        tz: time zone passed as the second argument of `from_utc_timestamp`.

    Returns:
        A DataFrame in which each 'timestamp' column has been replaced, under
        the same name, by `from_utc_timestamp(column, tz)`.
    """
    # decide up front which columns need converting
    timestampColumns = [name for name, dtype in dataFrame.dtypes if dtype == 'timestamp']

    converted = dataFrame
    for name in timestampColumns:
        converted = converted.withColumn(name, from_utc_timestamp(name, tz))

    return converted

In [0]:
from pyspark.sql.types import ArrayType, StructType
from typing import List

# flattens nested schema
def flatten(schema: StructType, prefix: str=None) -> List[str]:
    """List the dotted paths of the leaf fields of a (possibly nested) schema.

    Args:
        schema: struct whose fields are walked.
        prefix: dotted path of the parent field, prepended to every name;
            None (or empty) at the top level.

    Returns:
        Field paths in schema order. Struct fields, and arrays whose element
        type is a struct, are expanded recursively; every other field
        contributes its own path.
    """
    paths = []

    for field in schema.fields:
        path = '.'.join((prefix, field.name)) if prefix else field.name

        # look through a single array level to reach the element type
        inner = field.dataType
        if isinstance(inner, ArrayType):
            inner = inner.elementType

        if isinstance(inner, StructType):
            # nested struct: descend, carrying the current path as prefix
            paths.extend(flatten(inner, prefix=path))
        else:
            paths.append(path)

    return paths

In [0]:
from pyspark.sql import Column, DataFrame
from pyspark.sql.functions import col
from typing import List


def concatenateColumns(columns: List[str], split: str='.', join: str='_') -> List[Column]:
    """Turn dotted column paths into aliased column expressions.

    Args:
        columns: column paths, e.g. as produced by flattening a schema.
        split: separator used to break a path into segments.
        join: separator used to glue segments back together in the alias.

    Returns:
        One `col(path).alias(alias)` per input path, in input order. Paths
        with more than two segments drop the first segment and join the
        rest with `join`; shorter paths are aliased to their last segment.
    """
    def aliasFor(path: str) -> str:
        parts = path.split(split)
        return join.join(parts[1:]) if len(parts) > 2 else parts[-1]

    return [col(path).alias(aliasFor(path)) for path in columns]

In [0]:
# Schema string handed to `spark.read.schema(...)` when loading the call-log JSON.
# Top-level fields are written as `name type`; fields inside `struct<...>` are
# written as `name: type`. Each element of `call_logs` is one call record,
# with a nested `owner` struct.
schema = """
next_page_token string,
page_size int,
total_records int,
from date,
to date,
call_logs array<
    struct<
        id: string,
        user_id: string,
        call_type: string,
        caller_number: string,
        caller_number_type: int,
        caller_name: string,
        caller_number_source: string,
        callee_number: string,
        caller_location: string,
        callee_number_type: int,
        callee_number_source: string,
        callee_location: string,
        callee_name: string,
        direction: string,
        duration: int,
        result: string,
        waiting_time: int,
        date_time: timestamp,
        path: string,
        has_recording: boolean,
        charge: string,
        voice_mail_id: string,
        has_voicemail: boolean,
        rate: string,
        call_id: string,
        owner: struct<
            type: string,
            id: string,
            name: string,
            extension_number: string
        >,
        caller_did_number: string,
        caller_country_code: string,
        caller_country_iso_code: string,
        callee_did_number: string,
        callee_country_code: string,
        callee_country_iso_code: string,
        answer_start_time: timestamp,
        call_end_time: timestamp
    >
>
"""

In [0]:
from pyspark.sql.functions import explode

path = '/mnt/smlake/bronze/ZoomData/AccountCallLogs.json'

# read the JSON with the explicit schema, then emit one row per call_logs element
table = spark.read.schema(schema).json(path)
table = table.select(explode('call_logs').alias('call_logs'))

# flatten the nested struct into aliased top-level columns
columns = concatenateColumns(flatten(table.schema))

# convert every timestamp column with tz='America/New_York'
table = toTimezone(table.select(*columns), tz='America/New_York')

In [0]:
from pyspark.sql.functions import regexp_replace

pattern = r'\(\d+\) \d+-\d+' # (987) 654-3210

path = '/tmp/statements/bronze/AccountCallLogs'

cleanedCallLogs = table

# format both phone numbers, then keep only values matching the expected pattern
for numberColumn in ('caller_number', 'callee_number'):
    cleanedCallLogs = (
        cleanedCallLogs
        .withColumn(numberColumn, parsePhoneNumber(numberColumn))
        .withColumn(numberColumn, matchPattern(numberColumn, pattern))
    )

# collapse runs of whitespace in the caller location to a single space
cleanedCallLogs = cleanedCallLogs.withColumn(
    'caller_location', regexp_replace('caller_location', r'\s+', ' ')
)

# strip everything except digits and dots, then cast to double
for amountColumn in ('charge', 'rate'):
    cleanedCallLogs = cleanedCallLogs.withColumn(
        amountColumn, regexp_replace(amountColumn, r'[^0-9\.]', '').cast('double')
    )

cleanedCallLogs.write.mode('overwrite').parquet(path)

# read back what was just written to show the persisted result
display(spark.read.parquet(path))

id,user_id,call_type,caller_number,caller_number_type,caller_name,caller_number_source,callee_number,caller_location,callee_number_type,callee_number_source,callee_location,callee_name,direction,duration,result,waiting_time,date_time,path,has_recording,charge,voice_mail_id,has_voicemail,rate,call_id,owner_type,owner_id,owner_name,owner_extension_number,caller_did_number,caller_country_code,caller_country_iso_code,callee_did_number,callee_country_code,callee_country_iso_code,answer_start_time,call_end_time
9b2d48a3-f806-4189-8fd3-abc93f470445,nsUm-nz6SRGL9lk9uWIuJw,voip,(248) 894-3135,2,Bush Robert,internal,,,1,,,Briana Louck,inbound,0,No Answer,,2022-05-24T22:18:52.000+0000,pstn,False,,,False,,7101492763386200677,user,nsUm-nz6SRGL9lk9uWIuJw,Briana Louck,976,,1.0,US,16163302976.0,1.0,US,,
9095638e-9554-4998-816a-9958ad536014,RZUHmMmLQk6YuJrW1ZJoAg,voip,(913) 730-6112,2,,internal,,Kansas City KS,1,,,Ashley Tran,inbound,0,No Answer,,2022-05-24T20:12:20.000+0000,pstn,False,,,False,,7101460160289511193,user,RZUHmMmLQk6YuJrW1ZJoAg,Ashley Tran,997,,1.0,US,16163151997.0,1.0,US,,
9cb9174d-d79c-422e-890b-3b73088d9b0e,YHX2Gnk_Qma0aolduXnpYg,voip,(470) 333-1603,2,Debt Relief,internal,,,1,,,Jonathan Kenyon,inbound,0,No Answer,,2022-05-24T19:57:53.000+0000,pstn,False,,,False,,7101456436545510556,user,YHX2Gnk_Qma0aolduXnpYg,Jonathan Kenyon,263,,1.0,US,16165888263.0,1.0,US,,
f75be38d-9d06-404f-a8bd-4cb2962eac42,,voip,(760) 227-8347,2,,internal,,Desert Ctr CA,1,,,Main Auto Receptionist,inbound,0,No Answer,,2022-05-24T19:40:57.000+0000,autoReceptionist,False,,,False,,7101452072868301930,autoReceptionist,fK99-uaJRnG8yW-b9B4-9g,Main Auto Receptionist,300,,1.0,US,16162576300.0,1.0,US,,
5e59c0b7-b826-464b-8287-c1b60bf116c0,,voip,(949) 735-3093,2,MILLER JOHN,internal,,,1,,,California Auto Attendant,inbound,0,No Answer,,2022-05-24T19:23:03.000+0000,autoReceptionist,False,,,False,,7101447460063866147,autoReceptionist,My6kj6OiSnC4_hrAeRcSqg,California Auto Attendant,484,,1.0,US,19497256491.0,1.0,US,,
5595155c-687b-4393-b49b-619c8a6af960,lEVgVQJnRxeWdxG8Ru3ufA,voip,(616) 253-2063,2,6162532063,internal,,,1,,,Alyssa Neiser,inbound,0,No Answer,,2022-05-24T18:57:37.000+0000,pstn,False,,,False,,7101440905953253393,user,lEVgVQJnRxeWdxG8Ru3ufA,Alyssa Neiser,479,,1.0,US,16167291479.0,1.0,US,,2022-05-24T18:57:37.000+0000
cd215f99-daed-4f49-8f7f-796cc3b9ec98,,voip,(949) 627-9197,2,9496279197,internal,,,1,,,California Auto Attendant,inbound,0,No Answer,,2022-05-24T18:54:57.000+0000,autoReceptionist,False,,,False,,7101440218758552239,autoReceptionist,My6kj6OiSnC4_hrAeRcSqg,California Auto Attendant,484,,1.0,US,19497250612.0,1.0,US,,
6753f766-19c0-4f14-8054-71e4e79f44fe,o_7qAnE9TjaFGMbVdyGPmQ,voip,(216) 259-1574,2,Nat Tax,internal,,,1,,,Derek Myers,inbound,0,No Answer,,2022-05-24T18:37:32.000+0000,pstn,False,,,False,,7101435730515543689,user,o_7qAnE9TjaFGMbVdyGPmQ,Derek Myers,730,,1.0,US,16163272730.0,1.0,US,,
5572daad-3284-41cd-94fd-5031ee3b69b6,ND-3BmKYQLKzvKUecfAHZQ,voip,(949) 598-0300,2,Inceptus Med Ll,internal,,,1,,,Toni Engle,inbound,0,No Answer,,2022-05-24T18:37:30.000+0000,pstn,False,,,False,,7101435717632761578,user,ND-3BmKYQLKzvKUecfAHZQ,Toni Engle,371,,1.0,US,16163712443.0,1.0,US,,2022-05-24T18:37:30.000+0000
b7db85b6-45dc-475c-a606-97539591f122,Ijx28YcITViF9JxxxOwOLg,voip,(470) 339-7193,2,Debt Relief,internal,,,1,,,Alan Sibley,inbound,0,No Answer,,2022-05-24T18:23:46.000+0000,pstn,False,,,False,,7101432178570263372,user,Ijx28YcITViF9JxxxOwOLg,Alan Sibley,265,,1.0,US,16165888265.0,1.0,US,,
