# prepare data

In [1]:
import os
import csv
import uuid
import random
import json

from pyspark import SparkConf, SparkContext

default_nes = [ 'ne_{}'.format(str(uuid.uuid4())) for x in range(10) ]
default_users = [ 'user_{}'.format(str(uuid.uuid4())) for x in range(10)]

fields = [ ['f1','f2'],['f3','f4'],['f5','f6'],['f7','f8']]
    
xdrs = ['xdr1','xdr2','xdr3','xdr4']

all_data = {}

for xdr, field in zip(xdrs,fields):
    xdr_records = []
    for i in range(100):
        row = {}
        row['ne'] = random.choice(default_nes)
        row['user'] = random.choice(default_users)
        row[field[0]] = random.randrange(1,100)
        row[field[1]] = random.randrange(1,100)
        xdr_records.append(row)
    all_data[xdr]= xdr_records
    
for xdr in xdrs:
    with open(f'{xdr}.json','w') as f:
        f.write(json.dumps(all_data[xdr], indent=2))

# initialize spark

In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").appName("zeropadding").getOrCreate()

# load data into xdr df

In [3]:
dfs = []
for xdr in xdrs:
    df = spark.read.format('json').option("multiLine", True).load(f'{xdr}.json')
    dfs.append(df)

In [4]:
dfs[0].show(5)

+---+---+--------------------+--------------------+
| f1| f2|                  ne|                user|
+---+---+--------------------+--------------------+
|  3| 44|ne_a64170bb-555f-...|user_63c7dd82-268...|
|  5| 22|ne_a4c9d604-2650-...|user_1bcecde6-47d...|
| 32|  9|ne_a64170bb-555f-...|user_ab81dfdf-0c0...|
| 71|  7|ne_0ca66591-4041-...|user_9e9dc071-d8f...|
| 99| 64|ne_31d83705-ca32-...|user_6b8148e8-c96...|
+---+---+--------------------+--------------------+
only showing top 5 rows



# zero padding

In [5]:
dfa = dfs[0].join(dfs[1], ['ne','user'], how='outer') \
    .join(dfs[2], ['ne','user'], how='outer') \
    .join(dfs[3], ['ne','user'], how='outer') \
    .na.fill(0)

In [6]:
dfa.show()

+--------------------+--------------------+---+---+---+---+---+---+---+---+
|                  ne|                user| f1| f2| f3| f4| f5| f6| f7| f8|
+--------------------+--------------------+---+---+---+---+---+---+---+---+
|ne_a4c9d604-2650-...|user_288d7cb2-604...|  5|  1| 74| 27|  0|  0| 44| 77|
|ne_1adbb543-a359-...|user_ab81dfdf-0c0...| 60| 90| 40| 83| 74| 34| 59| 94|
|ne_1adbb543-a359-...|user_ab81dfdf-0c0...| 60| 90| 40| 83|  4| 59| 59| 94|
|ne_1adbb543-a359-...|user_ab81dfdf-0c0...| 60| 90| 59| 50| 74| 34| 59| 94|
|ne_1adbb543-a359-...|user_ab81dfdf-0c0...| 60| 90| 59| 50|  4| 59| 59| 94|
|ne_1adbb543-a359-...|user_ab81dfdf-0c0...| 60| 90| 25| 63| 74| 34| 59| 94|
|ne_1adbb543-a359-...|user_ab81dfdf-0c0...| 60| 90| 25| 63|  4| 59| 59| 94|
|ne_1adbb543-a359-...|user_ab81dfdf-0c0...| 80| 33| 40| 83| 74| 34| 59| 94|
|ne_1adbb543-a359-...|user_ab81dfdf-0c0...| 80| 33| 40| 83|  4| 59| 59| 94|
|ne_1adbb543-a359-...|user_ab81dfdf-0c0...| 80| 33| 59| 50| 74| 34| 59| 94|
|ne_1adbb543

In [7]:
dfa.groupBy(['ne','user']).sum().show()

+--------------------+--------------------+-------+-------+-------+-------+-------+-------+-------+-------+
|                  ne|                user|sum(f1)|sum(f2)|sum(f3)|sum(f4)|sum(f5)|sum(f6)|sum(f7)|sum(f8)|
+--------------------+--------------------+-------+-------+-------+-------+-------+-------+-------+-------+
|ne_a4c9d604-2650-...|user_288d7cb2-604...|      5|      1|     74|     27|      0|      0|     44|     77|
|ne_1adbb543-a359-...|user_ab81dfdf-0c0...|    840|    738|    496|    784|    468|    558|    708|   1128|
|ne_0b58c438-ed1f-...|user_6b60b63f-9b8...|    164|     14|      0|      0|      0|      0|      0|      0|
|ne_31d83705-ca32-...|user_a0615eb4-c2c...|      0|      0|    201|    213|      0|      0|    145|     99|
|ne_3727e4b6-e12d-...|user_3b1e4817-25d...|    187|    134|      0|      0|    156|    204|    264|    159|
|ne_0cd3bc29-f84a-...|user_3b1e4817-25d...|     26|     19|     54|     62|      0|      0|     80|     41|
|ne_a64170bb-555f-...|user_6