## INFOH515 Pyspark code
## Author: Gianluca Bontempi
## PySpark implementation of the maximum computation from the INFOH515 slides "Map-reduce analytics"

In [10]:
import numpy as np
import pwd
import getpass
import os
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, sum

# Build (or reuse) a SparkSession for this notebook and keep a handle on its
# SparkContext, which is the entry point for the RDD API.
spark = (
    SparkSession.builder
    .appName('s.com')
    .getOrCreate()
)
sc = spark.sparkContext

In [24]:

# Seed NumPy's global generator so the sample matrix is reproducible.
np.random.seed(1225)

n = 3   # number of features (columns of X)
N = 10  # number of samples (rows of X)

# Draw an N x n matrix of standard-normal values.
X = np.random.normal(loc=0, scale=1, size=(N, n))

# Maxima computed directly with NumPy: one per row, then over the whole matrix.
print("rwo max=", X.max(axis=1))

print("global max=", X.max())

rwo max= [1.84505492 0.34615021 0.67023779 1.22773916 0.6629516  0.77881069
 0.89400835 2.91200029 1.89004016 1.87065612]
global max= 2.912000292377562


## Map-reduce without key

In [25]:
# Distribute the rows of X as an RDD, then map every row to its maximum.
rddX = sc.parallelize(X)
rddXmax = rddX.map(lambda row: row.max())
rddXmax.take(5)  # preview the first five row maxima


[1.8450549182066196,
 0.3461502146150345,
 0.6702377922137547,
 1.2277391634174657,
 0.6629515977263967]

In [26]:
# Fold the per-row maxima into a single global maximum, two values at a time.
rddXmax.reduce(lambda left, right: np.maximum(left, right))

2.912000292377562

## Map-reduce with key and reduce


In [35]:
# Map every row to a (key, value) pair: the constant key 1 and the row maximum.
rddX = sc.parallelize(X)
rddXmax = rddX.map(lambda row: (1, row.max()))
rddXmax.take(5)  # preview the first five pairs



[(1, 1.8450549182066196),
 (1, 0.3461502146150345),
 (1, 0.6702377922137547),
 (1, 1.2277391634174657),
 (1, 0.6629515977263967)]

In [37]:
# Plain reduce() sees whole (key, value) pairs, so the combiner keeps the key of
# the first pair and takes the larger of the two values.
rddXmax.reduce(lambda a, b: (a[0], np.maximum(a[1], b[1])))

(1, 2.912000292377562)

## Map-reduce with key and reduceByKey

In [38]:
# reduceByKey() hands only the values to the combiner, grouped per key; with the
# single key 1 this yields one (key, max) pair.
(
    rddXmax
    .reduceByKey(lambda a, b: np.maximum(a, b))
    .collect()
)

[(1, 2.912000292377562)]

## MapValues

In [33]:
# Create an RDD of key-value pairs: (row_index, row_values).
rddX2 = sc.parallelize(list(enumerate(X)))
# mapValues() transforms only the value of each pair; the row index key is kept.
# X has only 10 rows, so collect() is safe here and returns every pair, which is
# what the recorded output of this cell shows (take(5) would return just five).
rddX2.mapValues(lambda row: np.max(row)).collect()

[(0, 1.8450549182066196),
 (1, 0.3461502146150345),
 (2, 0.6702377922137547),
 (3, 1.2277391634174657),
 (4, 0.6629515977263967),
 (5, 0.7788106909114473),
 (6, 0.89400835058669),
 (7, 2.912000292377562),
 (8, 1.8900401640390256),
 (9, 1.870656121573813)]

In [66]:
# Shut Spark down through the session that was created with
# SparkSession.builder: SparkSession.stop() stops the underlying SparkContext
# (`sc`) as well, so no separate sc.stop() is needed.
spark.stop()