In [64]:
import random
from pyspark import SparkContext
from pyspark.sql import SQLContext

import pyspark.sql.types as T
import pyspark.sql.functions as F
from pyspark.sql import Window

import json
import numpy as np
import pandas as pd
from datetime import datetime
import re

sc = SparkContext.getOrCreate()
spark = SQLContext(sc)
spark.setConf("spark.sql.debug.maxToStringFields", 10000)

In [65]:
def calc_data(data):
    test3 = data

    tg_det_layer_1 = test3.filter(
        (test3["SRC"].isin(["CHA", "PSA", "ROAM", "POST"]))
        & (test3["CALL_TYPE_CODE"].isin(["V"]))
        & ((test3["COUNTED_CDR_IND"] == "1") | (test3["CHARGE_AMT"] != 0))
        & (test3["CONNECTION_TYPE_KEY"] == "3")
        & (test3["CALL_DIRECTION_IND"] == "2")
        & (test3["ACTUAL_CALL_DURATION_SEC"] > "0")
        & (test3.CALLING_NUM.like("7%"))
        & (test3["TERM_PARENT_OPERATOR_CODE"] == "130")
        & (test3["ORIG_PARENT_OPERATOR_CODE"] == "130")
    )

    tg_det_layer_2 = test3.filter(
        (test3["SRC"].isin(["CHA", "PSA", "ROAM", "POST"]))
        & (test3["CALL_TYPE_CODE"].isin(["V"]))
        & ((test3["COUNTED_CDR_IND"] == "1") | (test3["CHARGE_AMT"] != 0))
        & (test3["CONNECTION_TYPE_KEY"] == "3")
        & (test3["CALL_DIRECTION_IND"] == "1")
        & (test3["ACTUAL_CALL_DURATION_SEC"] > "0")
        & (test3.CALLED_NUM.like("7%"))
        & (test3["ORIG_PARENT_OPERATOR_CODE"] == "130")
        & (test3["TERM_PARENT_OPERATOR_CODE"] == "130")
    )

    tg_det_layer_1 = tg_det_layer_1.select(
        F.col("CALL_START_TIME").cast("date"),
        "NUMBER_ID",
        "SYSTEM_ID",
        F.col("CALLED_NUM").alias("CONTACT"),
        "ACTUAL_CALL_DURATION_SEC",
        F.lit(2).alias("type"),
    )
    tg_det_layer_2 = tg_det_layer_2.select(
        F.col("CALL_START_TIME").cast("date"),
        "NUMBER_ID",
        "SYSTEM_ID",
        F.col("CALLING_NUM").alias("CONTACT"),
        "ACTUAL_CALL_DURATION_SEC",
        F.lit(1).alias("type"),
    )
    tg_det_layer = tg_det_layer_1.union(tg_det_layer_2)

    tg_det_layer = tg_det_layer.filter((F.length('CONTACT') == 10) | (F.length('CONTACT') == 11))
    tg_det_layer = tg_det_layer.withColumn('CONTACT', F.when(F.length('CONTACT') == 11, tg_det_layer.CONTACT.substr(2, 10)).otherwise(tg_det_layer['CONTACT']))
    tg_det_layer = tg_det_layer.filter(tg_det_layer['CONTACT'].startswith('7'))

    tg_det_layer.registerTempTable("det_trafic")

    sql_agg_num = """
                select trunc(CALL_START_TIME,'MM') as TIME_KEY,
            NUMBER_ID,
                    SYSTEM_ID,
                    CONTACT,
                    count(CONTACT) as count,
                    sum(ACTUAL_CALL_DURATION_SEC) as sum_durat,
                    sum(CASE
                            WHEN type = 1 THEN 1
                            ELSE 0
                        END) as count_in,
                    sum(CASE
                            WHEN type = 2 THEN 1
                            ELSE 0
                        END) as count_out,
                    sum(CASE
                            WHEN type = 1 THEN ACTUAL_CALL_DURATION_SEC
                            ELSE 0
                        END) as sum_durat_in,
                    sum(CASE
                            WHEN type = 2 THEN ACTUAL_CALL_DURATION_SEC
                            ELSE 0
                        END) as sum_durat_out,
                    (sum(CASE
                                WHEN type = 1 THEN ACTUAL_CALL_DURATION_SEC
                                ELSE 0
                            END)/sum(CASE
                                        WHEN type = 1 THEN 1
                                        ELSE 0
                                    END)) as avg_durat_in,
                    (sum(CASE
                                WHEN type = 2 THEN ACTUAL_CALL_DURATION_SEC
                                ELSE 0
                            END)/sum(CASE
                                        WHEN type = 2 THEN 1
                                        ELSE 0
                                    END)) as avg_durat_out
                from det_trafic
                group by trunc(CALL_START_TIME,'MM'),
                NUMBER_ID,
                        SYSTEM_ID,
                        CONTACT
                order by NUMBER_ID,
                        SYSTEM_ID,
                        count DESC
                """

    sql_agg_count = spark.sql(sql_agg_num)
    sql_agg_count.registerTempTable("num_agg_count")

    sql_agg_r = """
                select * ,
                    ROW_NUMBER() OVER (PARTITION BY TIME_KEY, NUMBER_ID, SYSTEM_ID ORDER BY NUMBER_ID, SYSTEM_ID, count DESC, sum_durat DESC) AS RowNum
                        from num_agg_count
                """

    sql_agg_rank = spark.sql(sql_agg_r)
    sql_agg_rank.registerTempTable("num_agg_rank")

    sql_agg_r_10 = """
                select *
                        from num_agg_rank
                        where RowNum < 31
                """

    sql_agg_rank_10 = spark.sql(sql_agg_r_10)

    
    return sql_agg_rank_10.explain("formatted")

In [66]:
data = spark.read.csv("../data/second_dataset.csv", header=True)

In [67]:
calc_data(data)

== Physical Plan ==
AdaptiveSparkPlan (17)
+- Filter (16)
   +- Window (15)
      +- Sort (14)
         +- Exchange (13)
            +- Sort (12)
               +- Exchange (11)
                  +- HashAggregate (10)
                     +- Exchange (9)
                        +- HashAggregate (8)
                           +- Union (7)
                              :- Project (3)
                              :  +- Filter (2)
                              :     +- Scan csv  (1)
                              +- Project (6)
                                 +- Filter (5)
                                    +- Scan csv  (4)


(1) Scan csv 
Output [14]: [SRC#3872, CALL_START_TIME#3873, SYSTEM_ID#3874, NUMBER_ID#3875, CALL_DIRECTION_IND#3879, COUNTED_CDR_IND#3884, CALL_TYPE_CODE#3887, CONNECTION_TYPE_KEY#3888, ORIG_PARENT_OPERATOR_CODE#3895, TERM_PARENT_OPERATOR_CODE#3897, CALLED_NUM#3902, CALLING_NUM#3903, ACTUAL_CALL_DURATION_SEC#3909, CHARGE_AMT#3918]
Batched: false
Location: InMemoryFi