In [1]:
from pyspark.sql import SparkSession
from pandas import set_option

set_option('display.max.columns', None)

spark = SparkSession.builder.getOrCreate()

spark.conf.set('spark.sql.session.timeZone', 'UTC')

In [2]:
from pyspark.sql import Column
from pyspark.sql.functions import col, udf, when
from typing import Optional

import phonenumbers


# returns phone number in format (987) 654-3210
@udf(returnType='string')
def parsePhoneNumber(number: str, region: str='US') -> Optional[str]:
    try:
        phoneNumber = phonenumbers.parse(number, region)
        return phonenumbers.format_number(phoneNumber, phonenumbers.PhoneNumberFormat.NATIONAL)
    except:
        return None


# retains value when value matches pattern
def matchPattern(colName: str, pattern: str) -> Column:
    column = col(colName) 
    return when(column.rlike(pattern), column)

In [3]:
schema = """
next_page_token string,
page_size int,
total_records int,
from date,
to date,
call_logs array<
    struct<
        id: string,
        user_id: string,
        call_type: string,
        caller_number: string,
        caller_number_type: int,
        caller_name: string,
        caller_number_source: string,
        callee_number: string,
        caller_location: string,
        callee_number_type: int,
        callee_number_source: string,
        callee_location: string,
        callee_name: string,
        direction: string,
        duration: int,
        result: string,
        waiting_time: int,
        date_time: timestamp,
        path: string,
        has_recording: boolean,
        charge: string,
        voice_mail_id: string,
        has_voicemail: boolean,
        rate: string,
        call_id: string,
        owner: struct<
            type: string,
            id: string,
            name: string,
            extension_number: string
        >,
        caller_did_number: string,
        caller_country_code: string,
        caller_country_iso_code: string,
        callee_did_number: string,
        callee_country_code: string,
        callee_country_iso_code: string,
        answer_start_time: timestamp,
        call_end_time: timestamp
    >
>
"""

In [16]:
selectColumns = [
    'caller_name',
    'caller_number',
    'callee_name',
    'callee_number',
    'direction',
    'duration',
    'date_time',
    'result',
    'call_id',
    # 'owner.*',
    # 'owner',
    'call_type',
    'caller_number_type',
    'callee_number_type',
    'path',
    'has_recording',
    'has_voicemail'
]

In [4]:
from pyspark.sql.types import ArrayType, StructType
from typing import List

def flatten(schema: StructType, prefix: str=None) -> List[str]:
    fields = list()

    for field in schema.fields:
        # check for Struct in each column
        name = prefix + '.' + field.name if prefix else field.name

        dtype = field.dataType

        # check if column is Array
        if isinstance(dtype, ArrayType):
            dtype = dtype.elementType

        # check if column is Struct
        if isinstance(dtype, StructType):
            # use nested dot `.` property
            # use function recursion
            fields += flatten(dtype, prefix=name)
        else:
            fields.append(name)

    return fields

In [5]:
from pyspark.sql.functions import explode

table = (
    spark
    .read
    .json(
        path='/tmp/phone-calls/AccountCallLogs.json',
        schema=schema,
        timestampFormat='yyyy-MM-ddTHH:mm:ssZ',
        mode='FAILFAST'
    )
    .withColumn('call_logs', explode('call_logs'))
)

columns = flatten(table.schema)

table.select(*columns).limit(5).toPandas()

Unnamed: 0,next_page_token,page_size,total_records,from,to,id,user_id,call_type,caller_number,caller_number_type,caller_name,caller_number_source,callee_number,caller_location,callee_number_type,callee_number_source,callee_location,callee_name,direction,duration,result,waiting_time,date_time,path,has_recording,charge,voice_mail_id,has_voicemail,rate,call_id,type,id.1,name,extension_number,caller_did_number,caller_country_code,caller_country_iso_code,callee_did_number,callee_country_code,callee_country_iso_code,answer_start_time,call_end_time
0,MNI3JNNEPEyFrnxeuneeLaoRqOrnm1XNOQ2,300,2530,2022-05-09,2022-05-12,34bfc3eb-7180-4c83-b61b-345d084f72b1,d9JiQ8qqTcCh6bfVo2YIYw,pstn,255,1,Nicole Thompson,,18287377552,,2,internal,"Newland, NC",,outbound,0,Call Cancel,,2022-05-11 23:36:35,pstn,False,,,False,,7096626840318811841,user,d9JiQ8qqTcCh6bfVo2YIYw,Nicole Thompson,255,16167290255.0,1,US,18287377552,1,US,NaT,2022-05-11 23:38:41
1,MNI3JNNEPEyFrnxeuneeLaoRqOrnm1XNOQ2,300,2530,2022-05-09,2022-05-12,d3d79bc4-22cd-4936-9889-e10f815f45ba,,voip,16162081843,2,Amerisave,internal,300,,1,,,Main Auto Receptionist,inbound,0,No Answer,,2022-05-11 23:28:25,autoReceptionist,False,,,False,,7096624731490955678,autoReceptionist,fK99-uaJRnG8yW-b9B4-9g,Main Auto Receptionist,300,,1,US,16162576300,1,US,NaT,NaT
2,MNI3JNNEPEyFrnxeuneeLaoRqOrnm1XNOQ2,300,2530,2022-05-09,2022-05-12,2b8ca4c8-7adc-436a-a2b5-17b0aea69a20,,voip,15165658083,2,,internal,300,Garden City NY,1,,,Main Auto Receptionist,inbound,0,No Answer,,2022-05-11 23:21:14,autoReceptionist,False,,,False,,7096622880360005894,autoReceptionist,fK99-uaJRnG8yW-b9B4-9g,Main Auto Receptionist,300,,1,US,16162576300,1,US,NaT,NaT
3,MNI3JNNEPEyFrnxeuneeLaoRqOrnm1XNOQ2,300,2530,2022-05-09,2022-05-12,fbc302fc-4082-4cb5-bb06-f99b603f3392,1avZmyzLRzu2AInJp75T1Q,voip,12312656202,2,2312656202,internal,226,,1,,,Jessica Osborn,inbound,0,No Answer,,2022-05-11 23:00:02,pstn,False,,,False,,7096617421447184140,user,1avZmyzLRzu2AInJp75T1Q,Jessica Osborn,226,,1,US,16162572026,1,US,NaT,NaT
4,MNI3JNNEPEyFrnxeuneeLaoRqOrnm1XNOQ2,300,2530,2022-05-09,2022-05-12,a0d06ee4-4472-40f6-a793-2edadca307f6,bkasEu6zRz2YKurlHXnIqw,voip,17022280222,2,RR Partners,internal,208,,1,,,Jessica Caggiano,inbound,0,No Answer,,2022-05-11 22:47:33,pstn,False,,,False,,7096614200270916926,user,bkasEu6zRz2YKurlHXnIqw,Jessica Caggiano,208,,1,US,16162572008,1,US,NaT,NaT


In [7]:
# [column for column in table.select(*columns).columns if column in selectColumns]

# [column for column in table.select(*columns).columns if column not in selectColumns]

In [6]:
from pyspark.sql.functions import col

aliases = list()

for column in columns:
    tokens = column.split('.')
    if len(tokens) > 2:
        alias = '_'.join(tokens[1:])
    else:
        alias = tokens[-1]
    
    aliases.append(col(column).alias(alias))
    
table.select(*aliases).limit(5).toPandas()

Unnamed: 0,next_page_token,page_size,total_records,from,to,id,user_id,call_type,caller_number,caller_number_type,caller_name,caller_number_source,callee_number,caller_location,callee_number_type,callee_number_source,callee_location,callee_name,direction,duration,result,waiting_time,date_time,path,has_recording,charge,voice_mail_id,has_voicemail,rate,call_id,owner_type,owner_id,owner_name,owner_extension_number,caller_did_number,caller_country_code,caller_country_iso_code,callee_did_number,callee_country_code,callee_country_iso_code,answer_start_time,call_end_time
0,MNI3JNNEPEyFrnxeuneeLaoRqOrnm1XNOQ2,300,2530,2022-05-09,2022-05-12,34bfc3eb-7180-4c83-b61b-345d084f72b1,d9JiQ8qqTcCh6bfVo2YIYw,pstn,255,1,Nicole Thompson,,18287377552,,2,internal,"Newland, NC",,outbound,0,Call Cancel,,2022-05-11 23:36:35,pstn,False,,,False,,7096626840318811841,user,d9JiQ8qqTcCh6bfVo2YIYw,Nicole Thompson,255,16167290255.0,1,US,18287377552,1,US,NaT,2022-05-11 23:38:41
1,MNI3JNNEPEyFrnxeuneeLaoRqOrnm1XNOQ2,300,2530,2022-05-09,2022-05-12,d3d79bc4-22cd-4936-9889-e10f815f45ba,,voip,16162081843,2,Amerisave,internal,300,,1,,,Main Auto Receptionist,inbound,0,No Answer,,2022-05-11 23:28:25,autoReceptionist,False,,,False,,7096624731490955678,autoReceptionist,fK99-uaJRnG8yW-b9B4-9g,Main Auto Receptionist,300,,1,US,16162576300,1,US,NaT,NaT
2,MNI3JNNEPEyFrnxeuneeLaoRqOrnm1XNOQ2,300,2530,2022-05-09,2022-05-12,2b8ca4c8-7adc-436a-a2b5-17b0aea69a20,,voip,15165658083,2,,internal,300,Garden City NY,1,,,Main Auto Receptionist,inbound,0,No Answer,,2022-05-11 23:21:14,autoReceptionist,False,,,False,,7096622880360005894,autoReceptionist,fK99-uaJRnG8yW-b9B4-9g,Main Auto Receptionist,300,,1,US,16162576300,1,US,NaT,NaT
3,MNI3JNNEPEyFrnxeuneeLaoRqOrnm1XNOQ2,300,2530,2022-05-09,2022-05-12,fbc302fc-4082-4cb5-bb06-f99b603f3392,1avZmyzLRzu2AInJp75T1Q,voip,12312656202,2,2312656202,internal,226,,1,,,Jessica Osborn,inbound,0,No Answer,,2022-05-11 23:00:02,pstn,False,,,False,,7096617421447184140,user,1avZmyzLRzu2AInJp75T1Q,Jessica Osborn,226,,1,US,16162572026,1,US,NaT,NaT
4,MNI3JNNEPEyFrnxeuneeLaoRqOrnm1XNOQ2,300,2530,2022-05-09,2022-05-12,a0d06ee4-4472-40f6-a793-2edadca307f6,bkasEu6zRz2YKurlHXnIqw,voip,17022280222,2,RR Partners,internal,208,,1,,,Jessica Caggiano,inbound,0,No Answer,,2022-05-11 22:47:33,pstn,False,,,False,,7096614200270916926,user,bkasEu6zRz2YKurlHXnIqw,Jessica Caggiano,208,,1,US,16162572008,1,US,NaT,NaT


In [7]:
from pyspark.sql import DataFrame
from pyspark.sql.functions import from_utc_timestamp, lit, trim, when

def truncateStrings(dataFrame: DataFrame) -> DataFrame:
    for colName, dtype in dataFrame.dtypes:
        if dtype == 'string':
            column = trim(colName)
            case = when(column == '', lit(None).cast('string')).otherwise(column)
            dataFrame = dataFrame.withColumn(colName, case)

    return dataFrame


def toTimezone(dataFrame: DataFrame, tz: str) -> DataFrame:
    for colName, dtype in dataFrame.dtypes:
        if dtype == 'timestamp':
            dataFrame = dataFrame.withColumn(colName, from_utc_timestamp(colName, tz))
    
    return dataFrame

In [8]:
%%time
from pyspark.sql.functions import regexp_replace

table = (
    spark
    .read
    .json(
        path='/tmp/phone-calls/AccountCallLogs.json',
        schema=schema,
        timestampFormat='yyyy-MM-ddTHH:mm:ssZ',
        mode='FAILFAST'
    )
    .withColumn('call_logs', explode('call_logs'))
    .select(*aliases)
)

table = truncateStrings(table)
# table = toTimezone(table, 'America/New_York')

pattern = r'\(\d+\) \d+-\d+' # (987) 654-3210


(
    table
    .withColumn('caller_number', parsePhoneNumber('caller_number'))
    .withColumn('caller_number', matchPattern('caller_number', pattern))

    .withColumn('callee_number', parsePhoneNumber('callee_number'))
    .withColumn('callee_number', matchPattern('callee_number', pattern))

    .withColumn('caller_location', regexp_replace('caller_location', r'\s+', ' '))
    .withColumn('charge', regexp_replace('charge', r'[^0-9\.]', '').cast('double'))
    .withColumn('rate', regexp_replace('rate', r'[^0-9\.]', '').cast('double'))
    .write
    .mode('overwrite')
    .parquet('/tmp/phone-calls/AccountCallLogs')
)


table = spark.read.parquet('/tmp/phone-calls/AccountCallLogs')
table.limit(5).toPandas()

CPU times: user 151 ms, sys: 25.6 ms, total: 177 ms
Wall time: 6.92 s


Unnamed: 0,next_page_token,page_size,total_records,from,to,id,user_id,call_type,caller_number,caller_number_type,caller_name,caller_number_source,callee_number,caller_location,callee_number_type,callee_number_source,callee_location,callee_name,direction,duration,result,waiting_time,date_time,path,has_recording,charge,voice_mail_id,has_voicemail,rate,call_id,owner_type,owner_id,owner_name,owner_extension_number,caller_did_number,caller_country_code,caller_country_iso_code,callee_did_number,callee_country_code,callee_country_iso_code,answer_start_time,call_end_time
0,MNI3JNNEPEyFrnxeuneeLaoRqOrnm1XNOQ2,300,2530,2022-05-09,2022-05-12,34bfc3eb-7180-4c83-b61b-345d084f72b1,d9JiQ8qqTcCh6bfVo2YIYw,pstn,,1,Nicole Thompson,,(828) 737-7552,,2,internal,"Newland, NC",,outbound,0,Call Cancel,,2022-05-11 23:36:35,pstn,False,,,False,,7096626840318811841,user,d9JiQ8qqTcCh6bfVo2YIYw,Nicole Thompson,255,16167290255.0,1,US,18287377552,1,US,NaT,2022-05-11 23:38:41
1,MNI3JNNEPEyFrnxeuneeLaoRqOrnm1XNOQ2,300,2530,2022-05-09,2022-05-12,d3d79bc4-22cd-4936-9889-e10f815f45ba,,voip,(616) 208-1843,2,Amerisave,internal,,,1,,,Main Auto Receptionist,inbound,0,No Answer,,2022-05-11 23:28:25,autoReceptionist,False,,,False,,7096624731490955678,autoReceptionist,fK99-uaJRnG8yW-b9B4-9g,Main Auto Receptionist,300,,1,US,16162576300,1,US,NaT,NaT
2,MNI3JNNEPEyFrnxeuneeLaoRqOrnm1XNOQ2,300,2530,2022-05-09,2022-05-12,2b8ca4c8-7adc-436a-a2b5-17b0aea69a20,,voip,(516) 565-8083,2,,internal,,Garden City NY,1,,,Main Auto Receptionist,inbound,0,No Answer,,2022-05-11 23:21:14,autoReceptionist,False,,,False,,7096622880360005894,autoReceptionist,fK99-uaJRnG8yW-b9B4-9g,Main Auto Receptionist,300,,1,US,16162576300,1,US,NaT,NaT
3,MNI3JNNEPEyFrnxeuneeLaoRqOrnm1XNOQ2,300,2530,2022-05-09,2022-05-12,fbc302fc-4082-4cb5-bb06-f99b603f3392,1avZmyzLRzu2AInJp75T1Q,voip,(231) 265-6202,2,2312656202,internal,,,1,,,Jessica Osborn,inbound,0,No Answer,,2022-05-11 23:00:02,pstn,False,,,False,,7096617421447184140,user,1avZmyzLRzu2AInJp75T1Q,Jessica Osborn,226,,1,US,16162572026,1,US,NaT,NaT
4,MNI3JNNEPEyFrnxeuneeLaoRqOrnm1XNOQ2,300,2530,2022-05-09,2022-05-12,a0d06ee4-4472-40f6-a793-2edadca307f6,bkasEu6zRz2YKurlHXnIqw,voip,(702) 228-0222,2,RR Partners,internal,,,1,,,Jessica Caggiano,inbound,0,No Answer,,2022-05-11 22:47:33,pstn,False,,,False,,7096614200270916926,user,bkasEu6zRz2YKurlHXnIqw,Jessica Caggiano,208,,1,US,16162572008,1,US,NaT,NaT


In [13]:
spark.conf.set('spark.sql.session.timeZone', 'America/New_York')
table.limit(5).toPandas()

Unnamed: 0,next_page_token,page_size,total_records,from,to,id,user_id,call_type,caller_number,caller_number_type,caller_name,caller_number_source,callee_number,caller_location,callee_number_type,callee_number_source,callee_location,callee_name,direction,duration,result,waiting_time,date_time,path,has_recording,charge,voice_mail_id,has_voicemail,rate,call_id,owner_type,owner_id,owner_name,owner_extension_number,caller_did_number,caller_country_code,caller_country_iso_code,callee_did_number,callee_country_code,callee_country_iso_code,answer_start_time,call_end_time
0,MNI3JNNEPEyFrnxeuneeLaoRqOrnm1XNOQ2,300,2530,2022-05-09,2022-05-12,34bfc3eb-7180-4c83-b61b-345d084f72b1,d9JiQ8qqTcCh6bfVo2YIYw,pstn,255,1,Nicole Thompson,,18287377552,,2,internal,"Newland, NC",,outbound,0,Call Cancel,,2022-05-11 15:36:35,pstn,False,,,False,,7096626840318811841,user,d9JiQ8qqTcCh6bfVo2YIYw,Nicole Thompson,255,16167290255.0,1,US,18287377552,1,US,NaT,2022-05-11 15:38:41
1,MNI3JNNEPEyFrnxeuneeLaoRqOrnm1XNOQ2,300,2530,2022-05-09,2022-05-12,d3d79bc4-22cd-4936-9889-e10f815f45ba,,voip,16162081843,2,Amerisave,internal,300,,1,,,Main Auto Receptionist,inbound,0,No Answer,,2022-05-11 15:28:25,autoReceptionist,False,,,False,,7096624731490955678,autoReceptionist,fK99-uaJRnG8yW-b9B4-9g,Main Auto Receptionist,300,,1,US,16162576300,1,US,NaT,NaT
2,MNI3JNNEPEyFrnxeuneeLaoRqOrnm1XNOQ2,300,2530,2022-05-09,2022-05-12,2b8ca4c8-7adc-436a-a2b5-17b0aea69a20,,voip,15165658083,2,,internal,300,Garden City NY,1,,,Main Auto Receptionist,inbound,0,No Answer,,2022-05-11 15:21:14,autoReceptionist,False,,,False,,7096622880360005894,autoReceptionist,fK99-uaJRnG8yW-b9B4-9g,Main Auto Receptionist,300,,1,US,16162576300,1,US,NaT,NaT
3,MNI3JNNEPEyFrnxeuneeLaoRqOrnm1XNOQ2,300,2530,2022-05-09,2022-05-12,fbc302fc-4082-4cb5-bb06-f99b603f3392,1avZmyzLRzu2AInJp75T1Q,voip,12312656202,2,2312656202,internal,226,,1,,,Jessica Osborn,inbound,0,No Answer,,2022-05-11 15:00:02,pstn,False,,,False,,7096617421447184140,user,1avZmyzLRzu2AInJp75T1Q,Jessica Osborn,226,,1,US,16162572026,1,US,NaT,NaT
4,MNI3JNNEPEyFrnxeuneeLaoRqOrnm1XNOQ2,300,2530,2022-05-09,2022-05-12,a0d06ee4-4472-40f6-a793-2edadca307f6,bkasEu6zRz2YKurlHXnIqw,voip,17022280222,2,RR Partners,internal,208,,1,,,Jessica Caggiano,inbound,0,No Answer,,2022-05-11 14:47:33,pstn,False,,,False,,7096614200270916926,user,bkasEu6zRz2YKurlHXnIqw,Jessica Caggiano,208,,1,US,16162572008,1,US,NaT,NaT


In [11]:
# inbound -> caller; clients call us
# outbound -> callee

table.where(col('direction') == 'outbound').limit(5).toPandas()

Unnamed: 0,next_page_token,page_size,total_records,from,to,id,user_id,call_type,caller_number,caller_number_type,caller_name,caller_number_source,callee_number,caller_location,callee_number_type,callee_number_source,callee_location,callee_name,direction,duration,result,waiting_time,date_time,path,has_recording,charge,voice_mail_id,has_voicemail,rate,call_id,owner_type,owner_id,owner_name,owner_extension_number,caller_did_number,caller_country_code,caller_country_iso_code,callee_did_number,callee_country_code,callee_country_iso_code,answer_start_time,call_end_time
0,MNI3JNNEPEyFrnxeuneeLaoRqOrnm1XNOQ2,300,2530,2022-05-09,2022-05-12,34bfc3eb-7180-4c83-b61b-345d084f72b1,d9JiQ8qqTcCh6bfVo2YIYw,pstn,255,1,Nicole Thompson,,18287377552,,2,internal,"Newland, NC",,outbound,0,Call Cancel,,2022-05-11 19:36:35,pstn,False,,,False,,7096626840318811841,user,d9JiQ8qqTcCh6bfVo2YIYw,Nicole Thompson,255,16167290255,1,US,18287377552,1,US,NaT,2022-05-11 19:38:41
1,MNI3JNNEPEyFrnxeuneeLaoRqOrnm1XNOQ2,300,2530,2022-05-09,2022-05-12,44c99581-eb3c-49cf-8778-188f64b533a4,GkO02-yQQ9yqSCG1SasY4w,pstn,353,1,Carla Roark,,18004173747,,2,internal,United States,,outbound,12,Call connected,,2022-05-11 18:09:01,pstn,False,,,False,,7096604274560694085,user,GkO02-yQQ9yqSCG1SasY4w,Carla Roark,353,16162576353,1,US,18004173747,1,US,NaT,2022-05-11 18:09:16
2,MNI3JNNEPEyFrnxeuneeLaoRqOrnm1XNOQ2,300,2530,2022-05-09,2022-05-12,2fefde76-13da-4e6d-b2ce-1ff0447a93b5,GkO02-yQQ9yqSCG1SasY4w,pstn,353,1,Carla Roark,,16022880031,,2,internal,"Phoenix, AZ",,outbound,102,Call connected,,2022-05-11 18:05:58,pstn,False,,,False,,7096603488581639478,user,GkO02-yQQ9yqSCG1SasY4w,Carla Roark,353,16162576353,1,US,16022880031,1,US,NaT,2022-05-11 18:08:18
3,MNI3JNNEPEyFrnxeuneeLaoRqOrnm1XNOQ2,300,2530,2022-05-09,2022-05-12,121c3da2-fef4-4f9e-a37a-03435197a3b5,GkO02-yQQ9yqSCG1SasY4w,pstn,353,1,Carla Roark,,16022880060,,2,internal,"Phoenix, AZ",,outbound,8,Call connected,,2022-05-11 18:05:20,pstn,False,,,False,,7096603325372865252,user,GkO02-yQQ9yqSCG1SasY4w,Carla Roark,353,16162576353,1,US,16022880060,1,US,NaT,2022-05-11 18:05:40
4,MNI3JNNEPEyFrnxeuneeLaoRqOrnm1XNOQ2,300,2530,2022-05-09,2022-05-12,0be7574d-82d7-40b6-bf86-b8aa4268d9fe,-6jgSVmuQxmukeufp3KDbA,pstn,239,1,Cierra Kilpatrick,,18004311055,,2,internal,United States,,outbound,8,Call connected,,2022-05-11 18:04:48,pstn,False,,,False,,7096603187933966035,user,-6jgSVmuQxmukeufp3KDbA,Cierra Kilpatrick,239,16162572039,1,US,18004311055,1,US,NaT,2022-05-11 18:04:58


In [127]:
# Main Auto Receptionist

(
    table
    .where(col('direction') == 'inbound')
    .groupBy('caller_name')
    # .groupBy('caller_number')
    # .groupBy('callee_name')
    .count()
    .orderBy(desc('count'))
    .show(20, truncate=False)
)

+---------------+-----+
|caller_name    |count|
+---------------+-----+
|null           |76   |
|Riverside Metho|14   |
|LeAnne Hoekstra|7    |
|Oracle America |6    |
|Jessica Sheldon|6    |
|Amerisave      |5    |
|Christine Koehl|5    |
|Matechresources|4    |
|Ashley Micallef|4    |
|Amy Mccurry    |4    |
|Atrium Health  |3    |
|World Wide Unos|3    |
|Amerisource Hlt|3    |
|Priyanka Agarwa|3    |
|Portfoliorecov |3    |
|Quva Pharma Inc|3    |
|Cision         |3    |
|ROBERT HALF    |3    |
|Weltman Weinber|3    |
|Doubletree Hote|3    |
+---------------+-----+
only showing top 20 rows



In [148]:
# SpendMend: (616) 257-6300
(
    table
    .where(col('direction') == 'inbound')
    .where(col('caller_name').rlike('Atrium'))
    .toPandas()
)

Unnamed: 0,next_page_token,page_size,total_records,from,to,id,user_id,call_type,caller_number,caller_number_type,caller_name,caller_number_source,callee_number,caller_location,callee_number_type,callee_number_source,callee_location,callee_name,direction,duration,result,waiting_time,date_time,path,has_recording,charge,voice_mail_id,has_voicemail,rate,call_id,owner_type,owner_id,owner_name,owner_extension_number,caller_did_number,caller_country_code,caller_country_iso_code,callee_did_number,callee_country_code,callee_country_iso_code,answer_start_time,call_end_time
0,MNI3JNNEPEyFrnxeuneeLaoRqOrnm1XNOQ2,300,2530,2022-05-09,2022-05-12,21f58b3d-a91f-4c0e-a972-af7b5deb4637,M0-l8UhdTfKRy1wFngyzAA,voip,17044466161,2,Atrium Health,internal,308,,1,,,Garett Zorb,inbound,5,Call connected,,2022-05-11 14:28:24,pstn,False,,,False,,7096547422070191160,user,M0-l8UhdTfKRy1wFngyzAA,Garett Zorb,308,,1,US,16162576393,1,US,NaT,NaT
1,7ee6QDbVf6qsru4hbU6EaeKpiCbXSm283E4,300,2530,2022-05-09,2022-05-12,cb5290e6-c8b6-4ecd-a8d1-f48fcae51e20,M0-l8UhdTfKRy1wFngyzAA,voip,17044466161,2,Atrium Health,internal,308,,1,,,Garett Zorb,inbound,13,Call connected,,2022-05-11 09:38:57,pstn,False,,,False,,7096472827087601902,user,M0-l8UhdTfKRy1wFngyzAA,Garett Zorb,308,,1,US,16162576393,1,US,NaT,NaT
2,8Mx2nBLlk8mE3GC0quRLfrWaiWW8Tl4eGn7,300,2530,2022-05-09,2022-05-12,f5c958b1-14ac-47c0-90d0-78fca8d198c4,M0-l8UhdTfKRy1wFngyzAA,voip,17044466161,2,Atrium Health,internal,308,,1,,,Garett Zorb,inbound,6,Call connected,,2022-05-10 09:42:49,pstn,False,,,False,,7096102742631175025,user,M0-l8UhdTfKRy1wFngyzAA,Garett Zorb,308,,1,US,16162576393,1,US,NaT,NaT


In [140]:
# all incomming calls voip

(
    table
    .where(col('direction') == 'inbound')
    .groupBy('call_type')
    .count()
    .show()
)

+---------+-----+
|call_type|count|
+---------+-----+
|     voip|  329|
+---------+-----+



In [157]:
(
    spark
    .read
    .parquet('/tmp/phone-calls/VendorContacts')
    .where(col('CustomerName').rlike('Atrium'))
    .where(col('Phone').rlike('^704'))
    .limit(5)
    .toPandas()
)

Unnamed: 0,CustVendorGroupNo,CustVendorNo,ContactType,TypePreferredContact,LastName,FirstName,FullName,Email,Phone,Fax,Title,Note,ExternalID,BSAPVCID,CID,CustomerName,CVObjectID,CustObjectID,ObjectID,CreatedDate,CreatedBy,BSAPVendorObjectID
0,445231,1177863,Statement,0,,Karon Hauch,Karon Hauch,,7043653993,,,,5506557,,15375,Atrium Health,19462061,16164706,29597096,2020-12-10 16:16:11.103,SVC_WORKFLOW,29597096
1,445191,1215553,Statement,1,,,,,7044642448,7048640104.0,,,5506390,,15375,Atrium Health,19461347,16164706,29597103,2020-12-10 16:16:13.520,SVC_WORKFLOW,29597103
2,445208,1234905,Statement,0,,Johanna Sanders - Office Mgr.,Johanna Sanders - Office Mgr.,johanna@picnictableproductions.com,7048707306,,,,5505962,,15375,Atrium Health,19461623,16164706,29597104,2020-12-10 16:16:14.090,SVC_WORKFLOW,29597104
3,445204,261120,Statement,0,,,,,7045883713,,,,5506790,,15375,Atrium Health,19461548,16164706,29597105,2020-12-10 16:16:15.150,SVC_WORKFLOW,29597105
4,445217,1046360,Statement,0,,,,jswaringen@medusind.com,7049844637,,,,5506808,,15375,Atrium Health,19461740,16164706,29597106,2020-12-10 16:16:15.370,SVC_WORKFLOW,29597106


In [102]:
table.where('caller_name <=> owner_name').count()

253

In [None]:
table.where('')

In [96]:
from pyspark.sql.functions import length

(
    table
    # .withColumn('_length', length('caller_number'))
    .withColumn('_length', length('callee_number'))
    .groupBy('_length')
    .count()
    .orderBy(desc('count'))
    .show()
)

+-------+-----+
|_length|count|
+-------+-----+
|     12| 2170|
|      3|  349|
|     11|    6|
|     13|    4|
|     18|    1|
+-------+-----+



### Cardinality

In [None]:
[
    'call_type', # pstn, voip, international
    'caller_number_source', # null, international
    'caller_location', # null, ...
    'callee_number_source', # international, null
    'callee_location', # United States, null, ...
    'direction', # outbound, inbound
    'result', # Call connected, No Answer, Call Cancel, Rejected, Call failed
    'path', # pstn, autoReceptionist, extension, callQueue
    'owner_type', # user, autoReceptionist, callQueue
    'caller_country_code', # 1, ...
    'caller_country_iso_code', # US, null, GB, 001, CN, CR
    'callee_country_code', # 1, ...
    'callee_country_iso_code', # US, null, CA
]

In [94]:
from pyspark.sql.functions import desc

# table.where('answer_start_time is not null').select('answer_start_time').show()

# table.where('charge is not null').select('charge').show()

table.groupBy('callee_country_iso_code').count().orderBy(desc('count')).show()

+-----------------------+-----+
|callee_country_iso_code|count|
+-----------------------+-----+
|                     US| 2471|
|                   null|   47|
|                     CA|    6|
|                     IN|    2|
|                     PK|    1|
|                     CR|    1|
|                     VI|    1|
|                    001|    1|
+-----------------------+-----+



### Data Types

In [46]:
data = list()

for colName, dtype in table.select(*aliases).dtypes:
    if dtype != 'string':
        data.append((colName, dtype))
        
data

[('page_size', 'int'),
 ('total_records', 'int'),
 ('from', 'date'),
 ('to', 'date'),
 ('caller_number_type', 'bigint'),
 ('callee_number_type', 'bigint'),
 ('duration', 'bigint'),
 ('waiting_time', 'bigint'),
 ('date_time', 'timestamp'),
 ('has_recording', 'boolean'),
 ('has_voicemail', 'boolean'),
 ('owner_extension_number', 'bigint'),
 ('call_end_time', 'timestamp')]

In [47]:
(
    spark
    .read
    .json(
        path='/tmp/phone-calls/AccountCallLogs.json',
        primitivesAsString=True
    )
    .withColumn('call_logs', explode('call_logs'))
    .select(*aliases)
    .write
    .mode('overwrite')
    .parquet('/tmp/AccountCallLogs')
)

_table = spark.read.parquet('/tmp/AccountCallLogs')
_table.limit(5).toPandas()

Unnamed: 0,next_page_token,page_size,total_records,from,to,id,user_id,call_type,caller_number,caller_number_type,caller_name,caller_number_source,callee_number,caller_location,callee_number_type,callee_number_source,callee_location,callee_name,direction,duration,result,waiting_time,date_time,path,has_recording,charge,voice_mail_id,has_voicemail,rate,call_id,owner_type,owner_id,owner_name,owner_extension_number,caller_did_number,caller_country_code,caller_country_iso_code,callee_did_number,callee_country_code,callee_country_iso_code,answer_start_time,call_end_time
0,MNI3JNNEPEyFrnxeuneeLaoRqOrnm1XNOQ2,300,2530,2022-05-09,2022-05-12,34bfc3eb-7180-4c83-b61b-345d084f72b1,d9JiQ8qqTcCh6bfVo2YIYw,pstn,255,1,Nicole Thompson,,18287377552,,2,internal,"Newland, NC",,outbound,0,Call Cancel,,2022-05-11T23:36:35Z,pstn,False,,,False,,7096626840318811841,user,d9JiQ8qqTcCh6bfVo2YIYw,Nicole Thompson,255,16167290255.0,1,US,18287377552,1,US,,2022-05-11T23:38:41Z
1,MNI3JNNEPEyFrnxeuneeLaoRqOrnm1XNOQ2,300,2530,2022-05-09,2022-05-12,d3d79bc4-22cd-4936-9889-e10f815f45ba,,voip,16162081843,2,Amerisave,internal,300,,1,,,Main Auto Receptionist,inbound,0,No Answer,,2022-05-11T23:28:25Z,autoReceptionist,False,,,False,,7096624731490955678,autoReceptionist,fK99-uaJRnG8yW-b9B4-9g,Main Auto Receptionist,300,,1,US,16162576300,1,US,,
2,MNI3JNNEPEyFrnxeuneeLaoRqOrnm1XNOQ2,300,2530,2022-05-09,2022-05-12,2b8ca4c8-7adc-436a-a2b5-17b0aea69a20,,voip,15165658083,2,,internal,300,Garden City NY,1,,,Main Auto Receptionist,inbound,0,No Answer,,2022-05-11T23:21:14Z,autoReceptionist,False,,,False,,7096622880360005894,autoReceptionist,fK99-uaJRnG8yW-b9B4-9g,Main Auto Receptionist,300,,1,US,16162576300,1,US,,
3,MNI3JNNEPEyFrnxeuneeLaoRqOrnm1XNOQ2,300,2530,2022-05-09,2022-05-12,fbc302fc-4082-4cb5-bb06-f99b603f3392,1avZmyzLRzu2AInJp75T1Q,voip,12312656202,2,2312656202,internal,226,,1,,,Jessica Osborn,inbound,0,No Answer,,2022-05-11T23:00:02Z,pstn,False,,,False,,7096617421447184140,user,1avZmyzLRzu2AInJp75T1Q,Jessica Osborn,226,,1,US,16162572026,1,US,,
4,MNI3JNNEPEyFrnxeuneeLaoRqOrnm1XNOQ2,300,2530,2022-05-09,2022-05-12,a0d06ee4-4472-40f6-a793-2edadca307f6,bkasEu6zRz2YKurlHXnIqw,voip,17022280222,2,RR Partners,internal,208,,1,,,Jessica Caggiano,inbound,0,No Answer,,2022-05-11T22:47:33Z,pstn,False,,,False,,7096614200270916926,user,bkasEu6zRz2YKurlHXnIqw,Jessica Caggiano,208,,1,US,16162572008,1,US,,


In [None]:
[
    ('page_size', 'int'),
    ('total_records', 'int'),
    ('from', 'date'),
    ('to', 'date'),
    ('caller_number_type', 'bigint'),
    ('callee_number_type', 'bigint'),
    ('duration', 'bigint'),
    ('waiting_time', 'bigint'),
    ('date_time', 'timestamp'),
    ('has_recording', 'boolean'),
    ('has_voicemail', 'boolean'),
    ('owner_extension_number', 'bigint'),
    ('call_end_time', 'timestamp')
]

In [49]:
bigint = [
    'caller_number_type',
    'callee_number_type',
    'duration',
    'waiting_time',
    
    'owner_extension_number'
]

_table.select(*bigint).limit(5).toPandas()

Unnamed: 0,caller_number_type,callee_number_type,duration,waiting_time,owner_extension_number
0,1,2,0,,255
1,2,1,0,,300
2,2,1,0,,300
3,2,1,0,,226
4,2,1,0,,208


In [50]:
_table.where(col('owner_extension_number').startswith('0')).count()

0

In [56]:
for column in bigint:
    # maximum = _table.selectExpr(f'max(cast({column} as bigint))').first()[0]
    maximum = _table.where(col(column).rlike('[^0-9]')).count()
    print(column, maximum)

caller_number_type 0
callee_number_type 0
duration 0
waiting_time 0
owner_extension_number 0


In [48]:
(
    _table
    .groupBy('has_recording', 'has_voicemail')
    .count()
    .show()
)

+-------------+-------------+-----+
|has_recording|has_voicemail|count|
+-------------+-------------+-----+
|        false|         true|    1|
|        false|        false| 2504|
|        false|         null|   25|
+-------------+-------------+-----+



In [2]:
spark.stop()