In [1]:
from pyspark.sql import SparkSession
from pandas import set_option

set_option('display.max.columns', None)

spark = SparkSession.builder.getOrCreate()

In [67]:
schema = """
next_page_token string,
page_size int,
total_records int,
from date,
to date,
call_logs array<
    struct<
        id: string,
        user_id: string,
        call_type: string,
        caller_number: string,
        caller_number_type: int,
        caller_name: string,
        caller_number_source: string,
        callee_number: string,
        caller_location: string,
        callee_number_type: int,
        callee_number_source: string,
        callee_location: string,
        callee_name: string,
        direction: string,
        duration: int,
        result: string,
        waiting_time: int,
        date_time: timestamp,
        path: string,
        has_recording: boolean,
        charge: string,
        voice_mail_id: string,
        has_voicemail: boolean,
        rate: string,
        call_id: string,
        owner: struct<
            type: string,
            id: string,
            name: string,
            extension_number: string
        >,
        caller_did_number: string,
        caller_country_code: string,
        caller_country_iso_code: string,
        callee_did_number: string,
        callee_country_code: string,
        callee_country_iso_code: string,
        answer_start_time: timestamp,
        call_end_time: timestamp
    >
>
"""

In [24]:
from pyspark.sql.types import ArrayType, StructType
from typing import List

def flatten(schema: StructType, prefix: str=None) -> List[str]:
    fields = list()

    for field in schema.fields:
        # check for Struct in each column
        name = prefix + '.' + field.name if prefix else field.name

        dtype = field.dataType

        # check if column is Array
        if isinstance(dtype, ArrayType):
            dtype = dtype.elementType

        # check if column is Struct
        if isinstance(dtype, StructType):
            # use nested dot `.` property
            # use function recursion
            fields += flatten(dtype, prefix=name)
        else:
            fields.append(name)

    return fields

In [58]:
from pyspark.sql.functions import explode

table = (
    spark
    .read
    .json(
        path='/tmp/phone-calls/AccountCallLogs.json',
        schema=schema,
        timestampFormat='yyyy-MM-ddTHH:mm:ssZ',
        mode='FAILFAST'
    )
    .withColumn('call_logs', explode('call_logs'))
)

In [32]:
columns = flatten(table.schema)

table.select(*columns).limit(5).toPandas()

Unnamed: 0,next_page_token,page_size,total_records,from,to,id,user_id,call_type,caller_number,caller_number_type,caller_name,caller_number_source,callee_number,caller_location,callee_number_type,callee_number_source,callee_location,callee_name,direction,duration,result,waiting_time,date_time,path,has_recording,charge,voice_mail_id,has_voicemail,rate,call_id,type,id.1,name,extension_number,caller_did_number,caller_country_code,caller_country_iso_code,callee_did_number,callee_country_code,callee_country_iso_code,answer_start_time,call_end_time
0,MNI3JNNEPEyFrnxeuneeLaoRqOrnm1XNOQ2,300,2530,2022-05-09,2022-05-12,34bfc3eb-7180-4c83-b61b-345d084f72b1,d9JiQ8qqTcCh6bfVo2YIYw,pstn,255,1,Nicole Thompson,,18287377552,,2,internal,"Newland, NC",,outbound,0,Call Cancel,,2022-05-11 19:36:35,pstn,False,,,False,,7096626840318811841,user,d9JiQ8qqTcCh6bfVo2YIYw,Nicole Thompson,255,16167290255.0,1,US,18287377552,1,US,,2022-05-11 19:38:41
1,MNI3JNNEPEyFrnxeuneeLaoRqOrnm1XNOQ2,300,2530,2022-05-09,2022-05-12,d3d79bc4-22cd-4936-9889-e10f815f45ba,,voip,16162081843,2,Amerisave,internal,300,,1,,,Main Auto Receptionist,inbound,0,No Answer,,2022-05-11 19:28:25,autoReceptionist,False,,,False,,7096624731490955678,autoReceptionist,fK99-uaJRnG8yW-b9B4-9g,Main Auto Receptionist,300,,1,US,16162576300,1,US,,NaT
2,MNI3JNNEPEyFrnxeuneeLaoRqOrnm1XNOQ2,300,2530,2022-05-09,2022-05-12,2b8ca4c8-7adc-436a-a2b5-17b0aea69a20,,voip,15165658083,2,,internal,300,Garden City NY,1,,,Main Auto Receptionist,inbound,0,No Answer,,2022-05-11 19:21:14,autoReceptionist,False,,,False,,7096622880360005894,autoReceptionist,fK99-uaJRnG8yW-b9B4-9g,Main Auto Receptionist,300,,1,US,16162576300,1,US,,NaT
3,MNI3JNNEPEyFrnxeuneeLaoRqOrnm1XNOQ2,300,2530,2022-05-09,2022-05-12,fbc302fc-4082-4cb5-bb06-f99b603f3392,1avZmyzLRzu2AInJp75T1Q,voip,12312656202,2,2312656202,internal,226,,1,,,Jessica Osborn,inbound,0,No Answer,,2022-05-11 19:00:02,pstn,False,,,False,,7096617421447184140,user,1avZmyzLRzu2AInJp75T1Q,Jessica Osborn,226,,1,US,16162572026,1,US,,NaT
4,MNI3JNNEPEyFrnxeuneeLaoRqOrnm1XNOQ2,300,2530,2022-05-09,2022-05-12,a0d06ee4-4472-40f6-a793-2edadca307f6,bkasEu6zRz2YKurlHXnIqw,voip,17022280222,2,RR Partners,internal,208,,1,,,Jessica Caggiano,inbound,0,No Answer,,2022-05-11 18:47:33,pstn,False,,,False,,7096614200270916926,user,bkasEu6zRz2YKurlHXnIqw,Jessica Caggiano,208,,1,US,16162572008,1,US,,NaT


In [38]:
from pyspark.sql.functions import col

aliases = list()

for column in columns:
    tokens = column.split('.')
    if len(tokens) > 2:
        alias = '_'.join(tokens[1:])
    else:
        alias = tokens[-1]
    
    aliases.append(col(column).alias(alias))

In [59]:
table.select(*aliases).limit(5).toPandas()

Unnamed: 0,next_page_token,page_size,total_records,from,to,id,user_id,call_type,caller_number,caller_number_type,caller_name,caller_number_source,callee_number,caller_location,callee_number_type,callee_number_source,callee_location,callee_name,direction,duration,result,waiting_time,date_time,path,has_recording,charge,voice_mail_id,has_voicemail,rate,call_id,owner_type,owner_id,owner_name,owner_extension_number,caller_did_number,caller_country_code,caller_country_iso_code,callee_did_number,callee_country_code,callee_country_iso_code,answer_start_time,call_end_time
0,MNI3JNNEPEyFrnxeuneeLaoRqOrnm1XNOQ2,300,2530,2022-05-09,2022-05-12,34bfc3eb-7180-4c83-b61b-345d084f72b1,d9JiQ8qqTcCh6bfVo2YIYw,pstn,255,1,Nicole Thompson,,18287377552,,2,internal,"Newland, NC",,outbound,0,Call Cancel,,2022-05-11 19:36:35,pstn,False,,,False,,7096626840318811841,user,d9JiQ8qqTcCh6bfVo2YIYw,Nicole Thompson,255,16167290255.0,1,US,18287377552,1,US,,2022-05-11 19:38:41
1,MNI3JNNEPEyFrnxeuneeLaoRqOrnm1XNOQ2,300,2530,2022-05-09,2022-05-12,d3d79bc4-22cd-4936-9889-e10f815f45ba,,voip,16162081843,2,Amerisave,internal,300,,1,,,Main Auto Receptionist,inbound,0,No Answer,,2022-05-11 19:28:25,autoReceptionist,False,,,False,,7096624731490955678,autoReceptionist,fK99-uaJRnG8yW-b9B4-9g,Main Auto Receptionist,300,,1,US,16162576300,1,US,,NaT
2,MNI3JNNEPEyFrnxeuneeLaoRqOrnm1XNOQ2,300,2530,2022-05-09,2022-05-12,2b8ca4c8-7adc-436a-a2b5-17b0aea69a20,,voip,15165658083,2,,internal,300,Garden City NY,1,,,Main Auto Receptionist,inbound,0,No Answer,,2022-05-11 19:21:14,autoReceptionist,False,,,False,,7096622880360005894,autoReceptionist,fK99-uaJRnG8yW-b9B4-9g,Main Auto Receptionist,300,,1,US,16162576300,1,US,,NaT
3,MNI3JNNEPEyFrnxeuneeLaoRqOrnm1XNOQ2,300,2530,2022-05-09,2022-05-12,fbc302fc-4082-4cb5-bb06-f99b603f3392,1avZmyzLRzu2AInJp75T1Q,voip,12312656202,2,2312656202,internal,226,,1,,,Jessica Osborn,inbound,0,No Answer,,2022-05-11 19:00:02,pstn,False,,,False,,7096617421447184140,user,1avZmyzLRzu2AInJp75T1Q,Jessica Osborn,226,,1,US,16162572026,1,US,,NaT
4,MNI3JNNEPEyFrnxeuneeLaoRqOrnm1XNOQ2,300,2530,2022-05-09,2022-05-12,a0d06ee4-4472-40f6-a793-2edadca307f6,bkasEu6zRz2YKurlHXnIqw,voip,17022280222,2,RR Partners,internal,208,,1,,,Jessica Caggiano,inbound,0,No Answer,,2022-05-11 18:47:33,pstn,False,,,False,,7096614200270916926,user,bkasEu6zRz2YKurlHXnIqw,Jessica Caggiano,208,,1,US,16162572008,1,US,,NaT


In [61]:
from pyspark.sql import DataFrame
from pyspark.sql.functions import lit, trim, when

def truncateStrings(dataFrame: DataFrame) -> DataFrame:
    for colName, dtype in dataFrame.dtypes:
        if dtype == 'string':
            column = trim(colName)
            case = when(column == '', lit(None).cast('string')).otherwise(column)
            dataFrame = dataFrame.withColumn(colName, case)

    return dataFrame

In [81]:
from pyspark.sql.functions import regexp_replace

table = (
    spark
    .read
    .json(
        path='/tmp/phone-calls/AccountCallLogs.json',
        schema=schema,
        timestampFormat='yyyy-MM-ddTHH:mm:ssZ',
        mode='FAILFAST'
    )
    .withColumn('call_logs', explode('call_logs'))
    .select(*aliases)
)


(
    truncateStrings(table)
    .withColumn('caller_location', regexp_replace('caller_location', r'\s+', ' '))
    .withColumn('charge', regexp_replace('charge', r'[^0-9\.]', '').cast('double'))
    .withColumn('rate', regexp_replace('rate', r'[^0-9\.]', '').cast('double'))
    .write
    .mode('overwrite')
    .parquet('/tmp/AccountCallLogs')
)


table = spark.read.parquet('/tmp/AccountCallLogs')
table.limit(5).toPandas()

Unnamed: 0,next_page_token,page_size,total_records,from,to,id,user_id,call_type,caller_number,caller_number_type,caller_name,caller_number_source,callee_number,caller_location,callee_number_type,callee_number_source,callee_location,callee_name,direction,duration,result,waiting_time,date_time,path,has_recording,charge,voice_mail_id,has_voicemail,rate,call_id,owner_type,owner_id,owner_name,owner_extension_number,caller_did_number,caller_country_code,caller_country_iso_code,callee_did_number,callee_country_code,callee_country_iso_code,answer_start_time,call_end_time
0,MNI3JNNEPEyFrnxeuneeLaoRqOrnm1XNOQ2,300,2530,2022-05-09,2022-05-12,34bfc3eb-7180-4c83-b61b-345d084f72b1,d9JiQ8qqTcCh6bfVo2YIYw,pstn,255,1,Nicole Thompson,,18287377552,,2,internal,"Newland, NC",,outbound,0,Call Cancel,,2022-05-11 19:36:35,pstn,False,,,False,,7096626840318811841,user,d9JiQ8qqTcCh6bfVo2YIYw,Nicole Thompson,255,16167290255.0,1,US,18287377552,1,US,NaT,2022-05-11 19:38:41
1,MNI3JNNEPEyFrnxeuneeLaoRqOrnm1XNOQ2,300,2530,2022-05-09,2022-05-12,d3d79bc4-22cd-4936-9889-e10f815f45ba,,voip,16162081843,2,Amerisave,internal,300,,1,,,Main Auto Receptionist,inbound,0,No Answer,,2022-05-11 19:28:25,autoReceptionist,False,,,False,,7096624731490955678,autoReceptionist,fK99-uaJRnG8yW-b9B4-9g,Main Auto Receptionist,300,,1,US,16162576300,1,US,NaT,NaT
2,MNI3JNNEPEyFrnxeuneeLaoRqOrnm1XNOQ2,300,2530,2022-05-09,2022-05-12,2b8ca4c8-7adc-436a-a2b5-17b0aea69a20,,voip,15165658083,2,,internal,300,Garden City NY,1,,,Main Auto Receptionist,inbound,0,No Answer,,2022-05-11 19:21:14,autoReceptionist,False,,,False,,7096622880360005894,autoReceptionist,fK99-uaJRnG8yW-b9B4-9g,Main Auto Receptionist,300,,1,US,16162576300,1,US,NaT,NaT
3,MNI3JNNEPEyFrnxeuneeLaoRqOrnm1XNOQ2,300,2530,2022-05-09,2022-05-12,fbc302fc-4082-4cb5-bb06-f99b603f3392,1avZmyzLRzu2AInJp75T1Q,voip,12312656202,2,2312656202,internal,226,,1,,,Jessica Osborn,inbound,0,No Answer,,2022-05-11 19:00:02,pstn,False,,,False,,7096617421447184140,user,1avZmyzLRzu2AInJp75T1Q,Jessica Osborn,226,,1,US,16162572026,1,US,NaT,NaT
4,MNI3JNNEPEyFrnxeuneeLaoRqOrnm1XNOQ2,300,2530,2022-05-09,2022-05-12,a0d06ee4-4472-40f6-a793-2edadca307f6,bkasEu6zRz2YKurlHXnIqw,voip,17022280222,2,RR Partners,internal,208,,1,,,Jessica Caggiano,inbound,0,No Answer,,2022-05-11 18:47:33,pstn,False,,,False,,7096614200270916926,user,bkasEu6zRz2YKurlHXnIqw,Jessica Caggiano,208,,1,US,16162572008,1,US,NaT,NaT


### Cardinality

In [None]:
[
    'call_type', # pstn, voip, international
    'caller_number_source', # null, international
    'caller_location', # null, ...
    'callee_number_source', # international, null
    'callee_location', # United States, null, ...
    'direction', # outbound, inbound
    'result', # Call connected, No Answer, Call Cancel, Rejected, Call failed
    'path', # pstn, autoReceptionist, extension, callQueue
    'owner_type', # user, autoReceptionist, callQueue
    'caller_country_code', # 1, ...
    'caller_country_iso_code', # US, null, GB, 001, CN, CR
    'callee_country_code', # 1, ...
    'callee_country_iso_code', # US, null, CA
]

In [94]:
from pyspark.sql.functions import desc

# table.where('answer_start_time is not null').select('answer_start_time').show()

# table.where('charge is not null').select('charge').show()

table.groupBy('callee_country_iso_code').count().orderBy(desc('count')).show()

+-----------------------+-----+
|callee_country_iso_code|count|
+-----------------------+-----+
|                     US| 2471|
|                   null|   47|
|                     CA|    6|
|                     IN|    2|
|                     PK|    1|
|                     CR|    1|
|                     VI|    1|
|                    001|    1|
+-----------------------+-----+



### Data Types

In [46]:
data = list()

for colName, dtype in table.select(*aliases).dtypes:
    if dtype != 'string':
        data.append((colName, dtype))
        
data

[('page_size', 'int'),
 ('total_records', 'int'),
 ('from', 'date'),
 ('to', 'date'),
 ('caller_number_type', 'bigint'),
 ('callee_number_type', 'bigint'),
 ('duration', 'bigint'),
 ('waiting_time', 'bigint'),
 ('date_time', 'timestamp'),
 ('has_recording', 'boolean'),
 ('has_voicemail', 'boolean'),
 ('owner_extension_number', 'bigint'),
 ('call_end_time', 'timestamp')]

In [47]:
(
    spark
    .read
    .json(
        path='/tmp/phone-calls/AccountCallLogs.json',
        primitivesAsString=True
    )
    .withColumn('call_logs', explode('call_logs'))
    .select(*aliases)
    .write
    .mode('overwrite')
    .parquet('/tmp/AccountCallLogs')
)

_table = spark.read.parquet('/tmp/AccountCallLogs')
_table.limit(5).toPandas()

Unnamed: 0,next_page_token,page_size,total_records,from,to,id,user_id,call_type,caller_number,caller_number_type,caller_name,caller_number_source,callee_number,caller_location,callee_number_type,callee_number_source,callee_location,callee_name,direction,duration,result,waiting_time,date_time,path,has_recording,charge,voice_mail_id,has_voicemail,rate,call_id,owner_type,owner_id,owner_name,owner_extension_number,caller_did_number,caller_country_code,caller_country_iso_code,callee_did_number,callee_country_code,callee_country_iso_code,answer_start_time,call_end_time
0,MNI3JNNEPEyFrnxeuneeLaoRqOrnm1XNOQ2,300,2530,2022-05-09,2022-05-12,34bfc3eb-7180-4c83-b61b-345d084f72b1,d9JiQ8qqTcCh6bfVo2YIYw,pstn,255,1,Nicole Thompson,,18287377552,,2,internal,"Newland, NC",,outbound,0,Call Cancel,,2022-05-11T23:36:35Z,pstn,False,,,False,,7096626840318811841,user,d9JiQ8qqTcCh6bfVo2YIYw,Nicole Thompson,255,16167290255.0,1,US,18287377552,1,US,,2022-05-11T23:38:41Z
1,MNI3JNNEPEyFrnxeuneeLaoRqOrnm1XNOQ2,300,2530,2022-05-09,2022-05-12,d3d79bc4-22cd-4936-9889-e10f815f45ba,,voip,16162081843,2,Amerisave,internal,300,,1,,,Main Auto Receptionist,inbound,0,No Answer,,2022-05-11T23:28:25Z,autoReceptionist,False,,,False,,7096624731490955678,autoReceptionist,fK99-uaJRnG8yW-b9B4-9g,Main Auto Receptionist,300,,1,US,16162576300,1,US,,
2,MNI3JNNEPEyFrnxeuneeLaoRqOrnm1XNOQ2,300,2530,2022-05-09,2022-05-12,2b8ca4c8-7adc-436a-a2b5-17b0aea69a20,,voip,15165658083,2,,internal,300,Garden City NY,1,,,Main Auto Receptionist,inbound,0,No Answer,,2022-05-11T23:21:14Z,autoReceptionist,False,,,False,,7096622880360005894,autoReceptionist,fK99-uaJRnG8yW-b9B4-9g,Main Auto Receptionist,300,,1,US,16162576300,1,US,,
3,MNI3JNNEPEyFrnxeuneeLaoRqOrnm1XNOQ2,300,2530,2022-05-09,2022-05-12,fbc302fc-4082-4cb5-bb06-f99b603f3392,1avZmyzLRzu2AInJp75T1Q,voip,12312656202,2,2312656202,internal,226,,1,,,Jessica Osborn,inbound,0,No Answer,,2022-05-11T23:00:02Z,pstn,False,,,False,,7096617421447184140,user,1avZmyzLRzu2AInJp75T1Q,Jessica Osborn,226,,1,US,16162572026,1,US,,
4,MNI3JNNEPEyFrnxeuneeLaoRqOrnm1XNOQ2,300,2530,2022-05-09,2022-05-12,a0d06ee4-4472-40f6-a793-2edadca307f6,bkasEu6zRz2YKurlHXnIqw,voip,17022280222,2,RR Partners,internal,208,,1,,,Jessica Caggiano,inbound,0,No Answer,,2022-05-11T22:47:33Z,pstn,False,,,False,,7096614200270916926,user,bkasEu6zRz2YKurlHXnIqw,Jessica Caggiano,208,,1,US,16162572008,1,US,,


In [None]:
[
    ('page_size', 'int'),
    ('total_records', 'int'),
    ('from', 'date'),
    ('to', 'date'),
    ('caller_number_type', 'bigint'),
    ('callee_number_type', 'bigint'),
    ('duration', 'bigint'),
    ('waiting_time', 'bigint'),
    ('date_time', 'timestamp'),
    ('has_recording', 'boolean'),
    ('has_voicemail', 'boolean'),
    ('owner_extension_number', 'bigint'),
    ('call_end_time', 'timestamp')
]

In [49]:
bigint = [
    'caller_number_type',
    'callee_number_type',
    'duration',
    'waiting_time',
    
    'owner_extension_number'
]

_table.select(*bigint).limit(5).toPandas()

Unnamed: 0,caller_number_type,callee_number_type,duration,waiting_time,owner_extension_number
0,1,2,0,,255
1,2,1,0,,300
2,2,1,0,,300
3,2,1,0,,226
4,2,1,0,,208


In [50]:
_table.where(col('owner_extension_number').startswith('0')).count()

0

In [56]:
for column in bigint:
    # maximum = _table.selectExpr(f'max(cast({column} as bigint))').first()[0]
    maximum = _table.where(col(column).rlike('[^0-9]')).count()
    print(column, maximum)

caller_number_type 0
callee_number_type 0
duration 0
waiting_time 0
owner_extension_number 0


In [48]:
(
    _table
    .groupBy('has_recording', 'has_voicemail')
    .count()
    .show()
)

+-------------+-------------+-----+
|has_recording|has_voicemail|count|
+-------------+-------------+-----+
|        false|         true|    1|
|        false|        false| 2504|
|        false|         null|   25|
+-------------+-------------+-----+

