## INFOH515 Pyspark code
## Author: Gianluca Bontempi
## Pyspark implementation of the matrix multiplication example in the INFOH515 slides "Map-reduce analytics" 


In [2]:
import numpy as np
import pwd
import getpass
import os
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, sum
# Login name of the current user, obtained in two ways
# (neither value is used by the cells shown below).
userName = getpass.getuser()
appName = pwd.getpwuid(os.getuid()).pw_name  # field 0 of the passwd entry


# Obtain a SparkSession (an already running one is reused) and its SparkContext
spark = (
    SparkSession.builder
    .appName('s.com')
    .getOrCreate()
)
sc = spark.sparkContext

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/17 16:15:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/03/17 16:15:51 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/03/17 16:15:51 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
25/03/17 16:15:51 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
25/03/17 16:15:51 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.


In [12]:
# Display the Spark version reported by the active SparkContext
sc.version

'3.5.5'

In [13]:
def mapstep(x):
    """Map step of the map-reduce matrix product M x N.

    Parameters
    ----------
    x : sequence of str
        One parsed input row: ``x[0]`` is the matrix name (``'M'`` or ``'N'``),
        ``x[1]`` the 1-based row number and ``x[2:]`` the entries of that row.

    Returns
    -------
    list of tuple
        Pairs ``((i, k), (j, matname, value))`` where ``(i, k)`` is the cell of
        the product that the entry contributes to and ``j`` is the position of
        the entry along the summation index. A row whose name is neither
        ``'M'`` nor ``'N'`` yields an empty list.

    Notes
    -----
    The dimensions are read from the module-level names ``MC`` and ``NC``
    (for rows of M) and ``NC`` and ``MR`` (for rows of N) at call time, so
    they must be defined before the function is executed.
    """
    matname = x[0]
    pairs = []
    if matname == 'M':
        # Entry M[i, j] is needed by every cell (i, k) of the product.
        i = int(x[1])
        for j in range(1, MC + 1):
            value = int(x[1 + j])
            # extend() appends in place; the former `L = L + [...]` copied
            # the whole accumulated list on every iteration.
            pairs.extend(((i, k), (j, matname, value)) for k in range(1, NC + 1))
    elif matname == 'N':
        # Entry N[j, k] is needed by every cell (i, k) of the product.
        j = int(x[1])
        for k in range(1, NC + 1):
            value = int(x[1 + k])
            pairs.extend(((i, k), (j, matname, value)) for i in range(1, MR + 1))
    return pairs

In [14]:
def redstep(x):
    """Reduce step: dot product of one row of M with one column of N.

    Parameters
    ----------
    x : sequence
        Flat concatenation of ``(j, matname, value)`` triples, i.e.
        ``(j1, name1, v1, j2, name2, v2, ...)``. ``j`` is the 1-based position
        along the summation index and ``matname`` is ``'M'`` or ``'N'``.

    Returns
    -------
    numpy.float64
        Sum over ``j`` of the M value times the N value at position ``j``.
        Positions for which no value was received count as zero.

    Notes
    -----
    The vector length is read from the module-level name ``MC`` at call time.
    """
    vM = np.zeros(MC)
    vN = np.zeros(MC)
    # Walk the flat sequence one triple (j, matname, value) at a time.
    for pos in range(0, len(x), 3):
        matname = x[pos + 1]
        if matname == 'M':
            vM[x[pos] - 1] = x[pos + 2]
        elif matname == 'N':
            vN[x[pos] - 1] = x[pos + 2]
    return np.sum(vM * vN)

In [15]:
# Read the text file "matrix" and split every line into its comma-separated fields
matr = sc.textFile("matrix").map(lambda line: line.split(","))

### Note
The first element of each row designates the matrix (`M` or `N`) the row belongs to.
The second element is the row number (starting from 1).
The remaining elements are the entries of that row.

In [16]:
# Bring the parsed rows back to the driver to inspect the input format
matr.collect()

[['M', '1', '1', '2'],
 ['M', '2', '3', '4'],
 ['M', '3', '5', '6'],
 ['N', '1', '-1', '-2', '-3', '-4'],
 ['N', '2', '-5', '-6', '-7', '-8']]

In [17]:
# Select the rows of each matrix once and reuse the filtered RDDs below
m_rows = matr.filter(lambda row: row[0] == 'M')
n_rows = matr.filter(lambda row: row[0] == 'N')

# Each parsed row is [name, row number, entries...]; hence the "- 2" for columns
MR = m_rows.count()               # number of rows of M
NR = n_rows.count()               # number of rows of N
MC = len(m_rows.take(1)[0]) - 2   # number of columns of M
NC = len(n_rows.take(1)[0]) - 2   # number of columns of N

# The product M (MR x MC) times N (NR x NC) is only defined when MC == NR.
# Failing here is clearer than an index error (or a silently zero-padded
# result) inside the reduce step later on.
if MC != NR:
    raise ValueError(
        "Incompatible dimensions: M has %d columns but N has %d rows" % (MC, NR)
    )

In [18]:
# Preview the key/value pairs emitted by the map step for every input row
matr.flatMap(mapstep).collect()

[((1, 1), (1, 'M', 1)),
 ((1, 2), (1, 'M', 1)),
 ((1, 3), (1, 'M', 1)),
 ((1, 4), (1, 'M', 1)),
 ((1, 1), (2, 'M', 2)),
 ((1, 2), (2, 'M', 2)),
 ((1, 3), (2, 'M', 2)),
 ((1, 4), (2, 'M', 2)),
 ((2, 1), (1, 'M', 3)),
 ((2, 2), (1, 'M', 3)),
 ((2, 3), (1, 'M', 3)),
 ((2, 4), (1, 'M', 3)),
 ((2, 1), (2, 'M', 4)),
 ((2, 2), (2, 'M', 4)),
 ((2, 3), (2, 'M', 4)),
 ((2, 4), (2, 'M', 4)),
 ((3, 1), (1, 'M', 5)),
 ((3, 2), (1, 'M', 5)),
 ((3, 3), (1, 'M', 5)),
 ((3, 4), (1, 'M', 5)),
 ((3, 1), (2, 'M', 6)),
 ((3, 2), (2, 'M', 6)),
 ((3, 3), (2, 'M', 6)),
 ((3, 4), (2, 'M', 6)),
 ((1, 1), (1, 'N', -1)),
 ((2, 1), (1, 'N', -1)),
 ((3, 1), (1, 'N', -1)),
 ((1, 2), (1, 'N', -2)),
 ((2, 2), (1, 'N', -2)),
 ((3, 2), (1, 'N', -2)),
 ((1, 3), (1, 'N', -3)),
 ((2, 3), (1, 'N', -3)),
 ((3, 3), (1, 'N', -3)),
 ((1, 4), (1, 'N', -4)),
 ((2, 4), (1, 'N', -4)),
 ((3, 4), (1, 'N', -4)),
 ((1, 1), (2, 'N', -5)),
 ((2, 1), (2, 'N', -5)),
 ((3, 1), (2, 'N', -5)),
 ((1, 2), (2, 'N', -6)),
 ((2, 2), (2, 'N', -6)),

In [19]:
# 1. map:    emit ((i, k), (j, matname, value)) for every matrix entry
# 2. reduce: concatenate the value tuples sharing the same output cell (i, k)
# 3. map:    turn each concatenated tuple into the dot product for that cell
res = (
    matr
    .flatMap(mapstep)
    .reduceByKey(lambda left, right: left + right)
    .map(lambda cell: (cell[0], redstep(cell[1])))
)

In [20]:
# Trigger the computation and fetch ((i, k), value) for every cell of the product
res.collect()

[((1, 1), -11.0),
 ((1, 3), -17.0),
 ((2, 2), -30.0),
 ((2, 4), -44.0),
 ((3, 1), -35.0),
 ((3, 3), -57.0),
 ((1, 2), -14.0),
 ((1, 4), -20.0),
 ((2, 1), -23.0),
 ((2, 3), -37.0),
 ((3, 2), -46.0),
 ((3, 4), -68.0)]

## Numpy validation of the matrix multiplication result

In [29]:
# The two matrices of the "matrix" input file, written as dense NumPy arrays
M = np.array([
    [1, 2],
    [3, 4],
    [5, 6],
])
N = np.array([
    [-1, -2, -3, -4],
    [-5, -6, -7, -8],
])
# Reference product to compare with the map-reduce result
M @ N

array([[-11, -14, -17, -20],
       [-23, -30, -37, -44],
       [-35, -46, -57, -68]])

## Other example

In [4]:
n = 3   # number of features
N = 10  # number of samples

# N x n matrix of independent standard-normal draws
X = np.random.normal(loc=0, scale=1, size=N * n).reshape(N, n)

# Flatten X row by row into (row_idx, col_idx, value) triples
matrix_elements = [
    (i, j, value)
    for i, row in enumerate(X)
    for j, value in enumerate(row)
]

# Create RDD of ((row_idx, 'M', col_idx), matrix_value)
rddM = sc.parallelize([((i, 'M', j), value) for i, j, value in matrix_elements])
rddM.collect()

                                                                                

[((0, 'M', 0), -1.0267839112213166),
 ((0, 'M', 1), 0.40418962183654544),
 ((0, 'M', 2), 1.4344291682978654),
 ((1, 'M', 0), -1.547741638766201),
 ((1, 'M', 1), 0.07064114955114449),
 ((1, 'M', 2), -0.03870959999210331),
 ((2, 'M', 0), -0.8583914637266914),
 ((2, 'M', 1), 1.5356852794323717),
 ((2, 'M', 2), 1.1564067847240136),
 ((3, 'M', 0), 0.3818106444300763),
 ((3, 'M', 1), -0.045567115163714526),
 ((3, 'M', 2), 0.5830084208757632),
 ((4, 'M', 0), 1.130297091469962),
 ((4, 'M', 1), 0.1807045435287953),
 ((4, 'M', 2), 0.045113895004528115),
 ((5, 'M', 0), -2.159430038630071),
 ((5, 'M', 1), 1.5104154378663908),
 ((5, 'M', 2), -0.18363625160040054),
 ((6, 'M', 0), -1.3883667518513452),
 ((6, 'M', 1), -0.6068355558190871),
 ((6, 'M', 2), -0.4427459168542967),
 ((7, 'M', 0), 0.5659373287489623),
 ((7, 'M', 1), 1.3812789886432706),
 ((7, 'M', 2), -0.2667236976570106),
 ((8, 'M', 0), 0.2608451855219726),
 ((8, 'M', 1), -0.39543096492910185),
 ((8, 'M', 2), -1.2777287226365144),
 ((9, 'M'