# prepare data

In [1]:
import os
import csv
import uuid
import random
import json

from pyspark import SparkConf, SparkContext

default_nes = [ 'ne_{}'.format(str(uuid.uuid4())) for x in range(10) ]
default_users = [ 'user_{}'.format(str(uuid.uuid4())) for x in range(10)]

fields = [ ['f1','f2'],['f3','f4'],['f5','f6'],['f7','f8']]
    
xdrs = ['xdr1','xdr2','xdr3','xdr4']

all_data = {}

for xdr, field in zip(xdrs,fields):
    xdr_records = []
    for i in range(100):
        row = {}
        row['ne'] = random.choice(default_nes)
        row['user'] = random.choice(default_users)
        row[field[0]] = random.randrange(0,100)
        row[field[1]] = random.randrange(0,100)
        xdr_records.append(row)
    all_data[xdr]= xdr_records
    
for xdr in xdrs:
    with open(f'{xdr}.json','w') as f:
        f.write(json.dumps(all_data[xdr], indent=2))

# initialize spark

In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").appName("Word Count").getOrCreate()

# load data into xdr df

In [10]:
dfs = []
for xdr in xdrs:
    df = spark.read.format('json').option("multiLine", True).load(f'{xdr}.json')
    dfs.append(df)

In [22]:
dfs[0].show(5)

+---+---+--------------------+--------------------+
| f1| f2|                  ne|                user|
+---+---+--------------------+--------------------+
| 67| 76|ne_c735e1f0-f3ab-...|user_4f9640db-f0e...|
| 35| 36|ne_f04458d4-6962-...|user_dfa6d6cb-311...|
| 71| 39|ne_dc685152-7327-...|user_a417346c-fc6...|
| 32| 86|ne_c735e1f0-f3ab-...|user_7c89f2a0-bde...|
| 68| 81|ne_544cab6d-9d79-...|user_91d1a3a8-63d...|
+---+---+--------------------+--------------------+
only showing top 5 rows



# zero padding

In [39]:
dfa = dfs[0].join(dfs[1], ['ne','user'], how='outer') \
    .join(dfs[2], ['ne','user'], how='outer') \
    .join(dfs[3], ['ne','user'], how='outer') \
    .na.fill(0)

In [40]:
dfa.show()

+--------------------+--------------------+---+---+---+---+---+---+---+---+
|                  ne|                user| f1| f2| f3| f4| f5| f6| f7| f8|
+--------------------+--------------------+---+---+---+---+---+---+---+---+
|ne_f04458d4-6962-...|user_4f9640db-f0e...|  0|  0| 46| 40| 45| 35| 63| 51|
|ne_f04458d4-6962-...|user_4f9640db-f0e...|  0|  0| 55| 64| 45| 35| 63| 51|
|ne_08f2f7de-03a8-...|user_3a0ee37d-f76...|  0|  0|  0|  0|  0|  0| 68| 51|
|ne_c735e1f0-f3ab-...|user_91d1a3a8-63d...| 11| 59| 38| 61| 83| 95| 43| 73|
|ne_c735e1f0-f3ab-...|user_91d1a3a8-63d...| 11| 59| 38| 61| 23| 75| 43| 73|
|ne_544cab6d-9d79-...|user_cddac42f-014...|  0|  0| 50| 88|  0|  0| 88|  3|
|ne_544cab6d-9d79-...|user_cddac42f-014...|  0|  0| 50| 88|  0|  0|  8|  3|
|ne_544cab6d-9d79-...|user_cddac42f-014...|  0|  0| 64| 11|  0|  0| 88|  3|
|ne_544cab6d-9d79-...|user_cddac42f-014...|  0|  0| 64| 11|  0|  0|  8|  3|
|ne_544cab6d-9d79-...|user_4f9640db-f0e...| 31|  5| 50| 74| 96| 49|  0|  0|
|ne_544cab6d

In [41]:
dfa.groupBy(['ne','user']).sum().show()

+--------------------+--------------------+-------+-------+-------+-------+-------+-------+-------+-------+
|                  ne|                user|sum(f1)|sum(f2)|sum(f3)|sum(f4)|sum(f5)|sum(f6)|sum(f7)|sum(f8)|
+--------------------+--------------------+-------+-------+-------+-------+-------+-------+-------+-------+
|ne_f04458d4-6962-...|user_4f9640db-f0e...|      0|      0|    101|    104|     90|     70|    126|    102|
|ne_08f2f7de-03a8-...|user_3a0ee37d-f76...|      0|      0|      0|      0|      0|      0|     68|     51|
|ne_c735e1f0-f3ab-...|user_91d1a3a8-63d...|     22|    118|     76|    122|    106|    170|     86|    146|
|ne_544cab6d-9d79-...|user_cddac42f-014...|      0|      0|    228|    198|      0|      0|    192|     12|
|ne_544cab6d-9d79-...|user_4f9640db-f0e...|    107|     79|    150|    222|    288|    147|      0|      0|
|ne_3461c6de-e98a-...|user_17b406a5-c6c...|      0|      0|     46|    142|     68|     24|      0|      0|
|ne_544cab6d-9d79-...|user_f