# Aggregating and summarizing data into useful reports
## Loading the data

In [None]:
from pyspark import SparkConf, SparkContext
# Reuse the active SparkContext when one already exists. Constructing a second
# SparkContext in the same process (for example when this cell is re-run)
# raises a ValueError. Note that an already-running context keeps its own
# settings; the configuration below only applies when a new one is created.
sc = SparkContext.getOrCreate(
    SparkConf().setMaster('local').setAppName('BigData PySpark')
)


## Calculating averages with map and reduce

In [None]:
# Build an RDD of text lines from the gzip-compressed data file in the
# current working directory.
raw_data = sc.textFile(name="./kddcup.data.gz")
# Display the RDD object itself; no records are fetched here.
raw_data

In [None]:
# Split every raw text line into its comma-separated fields.
csv = raw_data.map(lambda line: line.split(","))
# Keep only the rows whose field at index 41 equals "normal.".
normal_data = csv.filter(lambda fields: fields[41] == "normal.")
# Parse the first field of each remaining row as an integer.
duration = normal_data.map(lambda fields: int(fields[0]))
# Sum all parsed values with a pairwise reduction.
total_duration = duration.reduce(lambda running, value: running + value)
total_duration

In [None]:
# Average = summed durations divided by the number of "normal." rows.
# count() triggers a separate pass over normal_data.
total_duration / normal_data.count()

## Faster average computation with aggregate

In [None]:
# Compute the sum and the element count together with a single aggregate call.
# The accumulator is a (running_sum, running_count) pair.
duration_count = duration.aggregate(
    zeroValue=(0, 0),
    # Fold one value into an accumulator pair.
    seqOp=lambda acc, value: (acc[0] + value, acc[1] + 1),
    # Merge two accumulator pairs element-wise.
    combOp=lambda left, right: (left[0] + right[0], left[1] + right[1]),
)
# Average = total sum / total count.
duration_count[0] / duration_count[1]

## Building pivot tables with key-value paired data points

In [None]:
# Pair every parsed row with the value of its field at index 41, producing
# (key, full_row) tuples.
kv = csv.keyBy(lambda fields: fields[41])
# Peek at the first pair to check the structure.
kv.take(1)

In [None]:
# For every key (field 41), add up the first field of each row as a float.
kv_duration = (
    csv
    .map(lambda fields: (fields[41], float(fields[0])))
    .reduceByKey(lambda left, right: left + right)
)
# Bring the per-key totals back to the driver as a list of (key, total) pairs.
kv_duration.collect()

In [None]:
# Count how many (key, row) pairs exist for each key; the counts are returned
# to the driver as a dictionary keyed by the field-41 value.
kv.countByKey()