In [1]:
from pyspark.sql import SparkSession

# Start a Spark session (or reuse the one already running) on the local
# master, registered under the application name 'PySparkShell'.
spark = SparkSession.builder.master('local[*]').appName('PySparkShell').getOrCreate()

In [2]:
import sys
import os

# Extend the module search path so modules stored under this directory can be
# imported. NOTE(review): presumably this is where `sparkFunctions` (imported
# further down in this cell) lives -- confirm.
sys.path.append('/dbfs/cdr/library')

from pandas import set_option
# Remove the cap on how many columns pandas renders (None = no limit).
# The exact option name is spelled out: pandas treats the key as a regular
# expression, so the previous 'display.max.columns' only resolved because each
# '.' happened to match the underscore, and it would break as soon as a second
# option matched the same pattern.
set_option('display.max_columns', None)

from sparkFunctions import create_tmp_parquet

# TODO: remove for Databricks
# NOTE(review): this builds a `dbutils` object from a local `dbutils` module;
# presumably a Databricks notebook already provides `dbutils`, which is why
# this import and instance are marked for removal there -- confirm.
from dbutils import DBUtils, FileInfo
dbutils = DBUtils()

In [3]:
# Root folder that the input locations below are derived from.
FOLDER = '/automation'
# Sub-folder of FOLDER that is listed for the Zoom JSON export.
ZOOM_FOLDER = os.path.join(FOLDER, 'ZoomData')

In [6]:
files = dbutils.fs.ls(ZOOM_FOLDER)

# Pick the first listed entry whose name ends in '.json'. When there is none,
# fail with a message that names the folder instead of a bare IndexError.
file = next((entry for entry in files if entry.name.endswith('.json')), None)
if file is None:
    raise FileNotFoundError(f"No '.json' file found in {ZOOM_FOLDER}")
file

FileInfo(path='/dbfs/automation/ZoomData/AccountCallLogs.json', name='AccountCallLogs.json', size=296142)

In [221]:
# Exploratory read of the selected JSON file without an explicit schema, so
# Spark infers the column types itself.
table = (
    spark
    .read
    .json(path=file.path, primitivesAsString=False)
)

In [222]:
# Preview: convert at most five rows to a pandas DataFrame for display.
table.limit(5).toPandas()

Unnamed: 0,call_logs,from,next_page_token,page_size,to,total_records
0,"[(None, None, 7013348025745800791, voip, None,...",2021-09-01,RTnPbKa5l4hV5z2ECxpLrgJpClCEBq4dDD2,300,2021-09-29,16671
1,"[(None, None, 7013072752701961432, voip, None,...",2021-09-01,rnbcgO6YYkLDunmmmgOBoHSP0ZmBQGDjQe3,300,2021-09-29,16671
2,"[(None, 2021-09-28T17:16:28Z, 7013034591916506...",2021-09-01,AhIFvYLdOJzBIXB30WD0NQR8HdRomKYBLy4,300,2021-09-29,16671
3,"[(None, None, 7012984770287400321, voip, None,...",2021-09-01,kXFA5x05dGl6AjPISTCEqoWnZ4180anGTI5,300,2021-09-29,16671
4,"[(None, 2021-09-27T19:36:17Z, 7012699240869921...",2021-09-01,2FcTzhwlYi69hfRje1exbKBTR3m3rfU1V66,300,2021-09-29,16671


In [129]:
# from pyspark.sql.functions import explode

# (
#     calls
#     .select(explode('call_logs').alias('data'), 'from')
#     .select('data.*', 'from')
#     .select('owner')
#     .limit(5)
#     .show(truncate=False)
# )

In [18]:
from pyspark.sql.types import ArrayType, StructType
from typing import List

def flatten(schema: StructType, prefix: str=None) -> List[str]:
    """Return the dotted paths of every leaf field in a (possibly nested) schema.

    Struct fields are walked recursively. An array field is looked through
    one level: if its element type is a struct, that struct's fields are
    walked as if the array were the struct itself; otherwise the array field
    is reported as a leaf.

    Args:
        schema: The struct type whose fields are listed.
        prefix: Dotted path of the parent field; falsy for the top level.

    Returns:
        Leaf paths in schema order, e.g. ``['a', 'b.c', 'b.d']``.
    """
    paths = []

    for item in schema.fields:
        # Qualify the field with its parent path, when there is one.
        if prefix:
            path = '.'.join((prefix, item.name))
        else:
            path = item.name

        # Look through a single array level to its element type.
        inner = item.dataType
        if isinstance(inner, ArrayType):
            inner = inner.elementType

        # Anything that is not a struct is a leaf: record it and move on.
        if not isinstance(inner, StructType):
            paths.append(path)
            continue

        # Struct: recurse, carrying the current path as the new prefix.
        paths.extend(flatten(inner, prefix=path))

    return paths

In [88]:
from pyspark.sql.types import ArrayType, StructField, StructType
from pyspark.sql.types import BooleanType, DateType, IntegerType, LongType, StringType, TimestampType

# Nested `owner` object carried by every call-log entry.
owner = (
    StructType()
    .add('type', StringType())
    .add('id', StringType())
    .add('name', StringType())
    .add('extension_number', IntegerType())
)


# One entry of the `call_logs` array.
call = (
    StructType()
    .add('id', StringType())
    .add('user_id', StringType())
    .add('call_type', StringType())
    .add('caller_number', StringType())
    .add('caller_number_type', IntegerType())
    .add('caller_location', StringType())
    .add('caller_name', StringType())
    .add('callee_number', StringType())
    .add('callee_number_type', IntegerType())
    .add('callee_location', StringType())
    .add('callee_name', StringType())
    .add('direction', StringType())
    .add('duration', IntegerType())
    .add('result', StringType())
    .add('date_time', TimestampType())
    .add('path', StringType())
    .add('voice_mail_id', StringType())
    .add('charge', StringType())
    .add('rate', StringType())
    .add('has_recording', BooleanType())
    .add('has_voicemail', BooleanType())
    .add('call_id', StringType())
    .add('owner', owner)  # Struct
    .add('call_end_time', TimestampType())
    .add('answer_start_time', TimestampType())
)


# Top-level record: paging metadata plus the array of call entries.
schema = (
    StructType()
    .add('next_page_token', StringType())
    .add('page_size', IntegerType())
    .add('total_records', IntegerType())
    .add('from', DateType())
    .add('to', DateType())
    .add('call_logs', ArrayType(call))  # Array
)

In [214]:
# `explode` is imported in this cell because the only other import of it in
# the notebook sits inside a commented-out cell, so a fresh kernel would
# otherwise hit a NameError here.
from pyspark.sql.functions import explode

# Read the JSON file against the explicit schema and keep only the call
# entries, one row per element of `call_logs`, in a single struct column
# named `data`.
table = (
    spark
    .read
    .json(
        path=file.path,
        schema=schema,
        # Literal text in a datetime pattern must be single-quoted, and the
        # 'X' family (not 'Z') is the zone-offset letter that accepts a
        # literal "Z" suffix such as 2021-09-28T17:16:28Z.
        timestampFormat="yyyy-MM-dd'T'HH:mm:ssXXX",
        # Abort the read on the first malformed record.
        mode='FAILFAST'
    )
    .select(explode('call_logs').alias('data'))
)

In [215]:
%%time
from pyspark.sql.functions import col, from_utc_timestamp

columns = flatten(table.schema)

names = list()
for column in columns:
    name = '_'.join(column.split('.')[1:])
    names.append(col(column).alias(name))


table = table.select(*names)

timestamps = [column for column, dtype in table.dtypes if dtype == 'timestamp']

for column in timestamps:
    table = table.withColumn(column, from_utc_timestamp(column, tz='America/New_York'))

table = table.dropDuplicates()
    
table = create_tmp_parquet(spark, table, '/tmp/CALL_LOGS')

table.limit(5).toPandas()

CPU times: user 79.6 ms, sys: 9.88 ms, total: 89.5 ms
Wall time: 7.09 s


Unnamed: 0,id,user_id,call_type,caller_number,caller_number_type,caller_location,caller_name,callee_number,callee_number_type,callee_location,callee_name,direction,duration,result,date_time,path,voice_mail_id,charge,rate,has_recording,has_voicemail,call_id,owner_type,owner_id,owner_name,owner_extension_number,call_end_time,answer_start_time
0,6cfad167-208d-46a0-9d4e-cca22dbc74aa,DF7rTs8aTlGRXfiib1Qk2g,pstn,872,1,,Brandon Rider,19894226798,2,,Keller Tracy,outbound,746,Call connected,2021-09-29 04:46:44,pstn,,$0.41340000000000005,$0.0318,False,False,7013336296189126681,user,DF7rTs8aTlGRXfiib1Qk2g,Brandon Rider,872,2021-09-29 04:59:18,NaT
1,f2046509-771a-4626-9359-0bd084d74c0d,,voip,18006787986,2,,Wells Fargo,901,1,,Main Auto Receptionist,inbound,0,No Answer,2021-09-29 04:21:20,autoReceptionist,,,,False,False,7013329746355664535,autoReceptionist,fK99-uaJRnG8yW-b9B4-9g,Main Auto Receptionist,901,NaT,NaT
2,f2be829e-22b2-4361-aea0-16910e6e7614,-6jgSVmuQxmukeufp3KDbA,pstn,950,1,,Cierra Kilpatrick,18558122239,2,Toll Free Call,,outbound,77,Recorded,2021-09-28 09:38:48,pstn,,,,,False,7013040476021640737,user,-6jgSVmuQxmukeufp3KDbA,Cierra Kilpatrick,950,2021-09-28 09:40:08,NaT
3,93fddf8d-2975-4b1c-b47e-12f363c2ac40,mV6MtwTWSmSpTCGD4ZhAIQ,pstn,474,1,,Yodit Kahssai,18669417199,2,,Amn Healthcare,outbound,7,Call connected,2021-09-28 07:26:31,pstn,,,,False,False,7013006386866222626,user,mV6MtwTWSmSpTCGD4ZhAIQ,Yodit Kahssai,474,2021-09-28 07:26:41,NaT
4,61f119b7-7105-4315-bf0f-3dce9d9b56c6,r0Kgyt9FS3eqIo6KhEVIvg,pstn,852,1,,Xavier Baron,18002773921,2,Toll Free Call,,outbound,0,Call Cancel,2021-09-28 07:20:56,pstn,,,,False,False,7013004948052122821,user,r0Kgyt9FS3eqIo6KhEVIvg,Xavier Baron,852,2021-09-28 07:21:49,NaT


In [None]:
# Shut down the Spark session created in the first cell.
spark.stop()