In [1]:
import numpy as np
from pyspark import SparkContext

# Start a Spark context on the local master; "pyspark" is the application name.
sc = SparkContext(master="local", appName="pyspark")

Implementing Matrix Multiplication using Spark RDD's **transformation** and **action** operations

Initialize two random matrices with shapes (2, 3) and (3, 2):

In [2]:
# Two small random operands with compatible inner dimensions:
# A is 2x3 and B is 3x2 (A is drawn first, then B).
A, B = np.random.rand(2, 3), np.random.rand(3, 2)

In [3]:
# Show A (2 rows, 3 columns).
A

array([[ 0.39156228,  0.72451013,  0.88690269],
       [ 0.89446377,  0.63683496,  0.35296409]])

In [4]:
# Show B (3 rows, 2 columns).
B

array([[ 0.58602112,  0.32946682],
       [ 0.26470845,  0.18467109],
       [ 0.72489533,  0.740035  ]])

Their product is

In [5]:
# Reference product computed directly with NumPy.
A.dot(B)

array([[ 1.06415933,  0.91914188],
       [ 0.94861227,  0.67350692]])

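Turn them into Spark RDDs. A is transposed first so that both RDDs are keyed by the shared inner index (A's column index, B's row index):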

In [6]:
# Key both operands by the shared inner index k so they can be joined later.
# A is transposed first, so the outer enumerate() yields (k, column k of A);
# the inner enumerate() tags each entry with its row index i: (k, (i, A[i, k])).
rddA = sc.parallelize(list(enumerate(A.T))).flatMapValues(
    lambda column: list(enumerate(column)))

# B is enumerated by row, so k is already the outer index: (k, (j, B[k, j])).
rddB = sc.parallelize(list(enumerate(B))).flatMapValues(
    lambda row: list(enumerate(row)))

This is what they look like:

In [7]:
# Each element is (k, (i, A[i, k])): keyed by A's column index k.
rddA.collect()

[(0, (0, 0.3915622771711913)),
 (0, (1, 0.89446376665148264)),
 (1, (0, 0.72451013433657874)),
 (1, (1, 0.63683496124382855)),
 (2, (0, 0.88690268501991776)),
 (2, (1, 0.35296408961292414))]

In [8]:
# Each element is (k, (j, B[k, j])): keyed by B's row index k.
rddB.collect()

[(0, (0, 0.5860211224614541)),
 (0, (1, 0.32946681974700209)),
 (1, (0, 0.26470844611501321)),
 (1, (1, 0.18467109202589937)),
 (2, (0, 0.72489532702875747)),
 (2, (1, 0.74003499929126915))]

Join them on A's column index (which is B's row index), multiply the paired entries, and sum the products for each output cell (i, j) — i.e. take the dot product of each row of A with each column of B:

In [9]:
def _partial_product(joined):
    """Map one joined record to ((i, j), A[i, k] * B[k, j]).

    `joined` has the form (k, ((i, a_ik), (j, b_kj))), as produced by
    rddA.join(rddB).
    """
    _, ((i, a_ik), (j, b_kj)) = joined
    return (i, j), a_ik * b_kj


# Pair up entries that share the inner index k, multiply them, then sum the
# partial products belonging to each output cell (i, j).
C = (rddA.join(rddB)
         .map(_partial_product)
         .reduceByKey(lambda total, term: total + term))

C.collect()

[((0, 0), 1.0641593289364362),
 ((1, 1), 0.6735069201173014),
 ((0, 1), 0.9191418837643931),
 ((1, 0), 0.94861227272613879)]

Finally, clean up the result **C**:

In [10]:
# Reshape the ((i, j), value) records into one entry per output row:
# key by row index i and carry (j, value) so the columns can be ordered.
C = C.map(lambda x: (x[0][0], (x[0][1], x[1]))).groupByKey()
# Order each row's cells by column index j, then keep only the values.
# A comprehension is used instead of zip(*x)[1], which fails on Python 3
# because zip() returns a non-subscriptable iterator there.
C = C.mapValues(lambda cells: [value for _, value in sorted(cells, key=lambda cell: cell[0])])

# Collect the rows in row-index order and stack them into a 2-D NumPy array.
# NOTE: this rebinds C from an RDD to an ndarray, so re-run the cell that
# builds the RDD before re-running this one.
C = np.array(C.sortByKey().map(lambda x: np.array(x[1])).collect())

In [11]:
# C is now a NumPy array assembled from the Spark result.
C

array([[ 1.06415933,  0.91914188],
       [ 0.94861227,  0.67350692]])

which equals A.dot(B):

In [12]:
# NumPy reference product, for comparison with C above.
A.dot(B)

array([[ 1.06415933,  0.91914188],
       [ 0.94861227,  0.67350692]])

<img src="MatrixMulSparkRDD1.png">

<img src="MatrixMulSparkRDD2.png">

<img src="MatrixMulSparkRDD3.png">

<img src="MatrixMulSparkRDD4.png">