- Python 3.6
- findspark==1.4.2
- pyspark==3.0.0
- scipy == 1.5.1
- numpy == 1.19.0

In [4]:
import numpy as np
from random import random
from math import log2
from scipy.io import loadmat

In [5]:
import findspark
findspark.init()

In [6]:
import sys
from operator import add

In [7]:
import pyspark

In [8]:
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf

### Data-Files Generation

Dataset Reference:
> https://github.com/jhh37/lrmf-datasets

In [9]:
def generate_data(savepath, loadpath):
    """Build the three row-matrix text files used by the benchmarks.

    Loads the variable ``'M'`` from the MATLAB file at ``loadpath``, replaces
    NaN entries with 0, scales the values by the largest absolute entry,
    transposes the result and writes it (two decimals per value) as:

    * ``jester2_row.txt``    - the transposed matrix,
    * ``jester2_row_b1.txt`` - that matrix stacked twice along axis 0,
    * ``jester2_row_b2.txt`` - that matrix stacked three times along axis 0.

    Args:
        savepath: Prefix prepended verbatim to every output file name, so a
            directory must end with a path separator.
        loadpath: Path of the ``.mat`` file holding the variable ``'M'``.
    """
    # Float copy: the in-place division below would fail on integer data.
    matrix = np.array(loadmat(loadpath)['M'], dtype=float)
    matrix[np.isnan(matrix)] = 0

    # Scale by the *magnitude* of the extreme entry. Dividing by the signed
    # value would flip every sign whenever the extreme entry is negative.
    norm_factor = np.abs(matrix).max()
    if norm_factor > 0:
        # An all-zero matrix is left untouched instead of becoming NaN.
        matrix /= norm_factor

    matrix = matrix.T
    double_matrix = np.concatenate((matrix, matrix), axis=0)
    triple_mat = np.concatenate((matrix, matrix, matrix), axis=0)
    np.savetxt(savepath + 'jester2_row.txt', matrix, fmt='%.2f')
    np.savetxt(savepath + 'jester2_row_b1.txt', double_matrix, fmt='%.2f')
    np.savetxt(savepath + 'jester2_row_b2.txt', triple_mat, fmt='%.2f')

In [10]:
%%time
# `os` is not imported by any earlier cell, so import it where it is used.
import os

savepath='./data-files/'
loadpath='./data-files/Jester_2.mat'
# Create the output directory if needed; exist_ok avoids the check-then-create
# race of a separate isdir() test and also creates missing parent directories.
os.makedirs(savepath, exist_ok=True)
generate_data(savepath,loadpath)

Wall time: 4.8 s


[Naive Method](#1.-Naive-Method)

[Sampling Method](#2.-Sampling-Method-(DIMSUM))

#### Note: Run the cells in order, because later cells reuse names defined in earlier ones

 > RDD (Resilient Distributed Dataset) - a collection of elements partitioned across the nodes of a cluster so that it can be processed in parallel

## 1. Naive Method

#### with numpy

In [11]:
def map_func(ar):
    """Return the product of a row, viewed as a column, with itself.

    ``ar`` is materialised into a NumPy array. For a 1-D row of length n the
    (n, 1) column broadcast against the row yields an n x n array whose
    (i, j) entry is ``ar[i] * ar[j]``.
    """
    row = np.array(list(ar))
    column = np.reshape(row, (-1, 1))
    return np.multiply(column, row)

In [12]:
def fold_func(a,b,n):
    """Return the element-wise sum of ``a`` and ``b`` as a NumPy array.

    Both operands are converted with ``np.array`` first. ``n`` is accepted
    but not used.
    """
    left = np.array(a)
    right = np.array(b)
    return np.add(left, right)

In [13]:
def calc_ATA_naive(mat):
    """Sum ``map_func(row)`` over every row of ``mat``.

    Each element of ``mat`` is mapped through ``map_func`` and the results
    are folded together with ``operator.add``, starting from 0.
    """
    outer_products = mat.map(map_func)
    return outer_products.fold(0, add)

#### without numpy

In [14]:
def map_wo_numpy(ar,n):
    """Return the n x n table of pairwise products of the first n values.

    Entry ``[i][j]`` of the returned list of lists is ``ar[i] * ar[j]``.
    ``ar`` may be any iterable; it is materialised into a list first.
    """
    values = list(ar)
    return [[values[i] * values[j] for j in range(n)] for i in range(n)]

In [15]:
def fold_wo_numpy(a,b):
    """Add square matrix ``b`` into ``a`` entry by entry and return the sum.

    The outer lists are shallow-copied, so the row lists of ``a`` are updated
    in place and shared with the returned matrix; ``b`` is only read. The
    matrix size is taken from the number of rows of ``a``.
    """
    acc = list(a)
    other = list(b)
    size = len(acc)
    for r in range(size):
        acc_row = acc[r]
        other_row = other[r]
        for c in range(size):
            acc_row[c] += other_row[c]
    return acc

In [16]:
def calc_ATA_naive_wo_numpy(mat):
    """Sum ``map_wo_numpy(row, n)`` over every row of ``mat`` with plain lists.

    ``n`` is the length of the first row. The per-row n x n tables are folded
    with ``fold_wo_numpy``, starting from an n x n matrix of zeros.
    """
    n = len(mat.first())
    zero_matrix = [[0] * n for _ in range(n)]
    outer_products = mat.map(lambda row: map_wo_numpy(row, n))
    return outer_products.fold(zero_matrix, fold_wo_numpy)

In [17]:
# Create (or reuse) the notebook's SparkSession: master "local[*]",
# application name "MatATA".
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("MatATA")
    .getOrCreate()
)

In [18]:
sc = spark.sparkContext

In [19]:
# Preprocessing
def splitter(l):
    """Parse one text line of whitespace-separated numbers into floats.

    Splitting on any run of whitespace (rather than on single spaces) means
    repeated spaces, tabs and surrounding whitespace no longer produce empty
    tokens that ``float`` rejects. A blank line yields an empty list.

    Args:
        l: One line of the data file.

    Returns:
        The values of the line as a list of floats.

    Raises:
        ValueError: If a token is not a valid float literal.
    """
    return [float(token) for token in l.split()]

In [20]:
mat_rdd = sc.textFile('./data-files/jester2_row.txt').map(lambda l: splitter(l))

Takes ~ **3 seconds**

In [21]:
%%time
res = calc_ATA_naive(mat_rdd)

Wall time: 2.5 s


Takes **~30 seconds**

In [32]:
%%time
res = calc_ATA_naive_wo_numpy(mat_rdd)

Wall time: 26.4 s


#### On the larger dataset

In [34]:
mat_rdd = sc.textFile('./data-files/jester2_row_b1.txt').map(lambda l: splitter(l))

Takes **~ 50 seconds**

In [35]:
%%time
res = calc_ATA_naive_wo_numpy(mat_rdd)

Wall time: 49.7 s


#### On an even larger dataset

In [46]:
mat_rdd = sc.textFile('./data-files/jester2_row_b2.txt').map(lambda l: splitter(l))

Takes ~ **3m 4sec**

In [47]:
%%time
res = calc_ATA_naive_wo_numpy(mat_rdd)

Wall time: 3min 4s


### Using mapPartition

#### References for using mapPartitions
> https://stackoverflow.com/questions/44222307/spark-rdd-default-number-of-partitions

> https://github.com/mahmoudparsian/pyspark-tutorial/tree/master/tutorial/map-partitions

> https://medium.com/parrot-prediction/partitioning-in-apache-spark-8134ad840b0

#### Here I've implemented the `without numpy` version only

In [147]:
def map_part(ar,n):
    """Yield one n x n matrix: the sum of pairwise products over all rows.

    For every row ``a`` in the iterable ``ar``, ``a[i] * a[j]`` is added to
    entry ``[i][j]``. The single accumulated matrix is yielded once all rows
    are consumed; with no rows it is all zeros.
    """
    acc = [[0] * n for _ in range(n)]
    for row in ar:
        for i in range(n):
            acc_row = acc[i]
            left = row[i]
            for j in range(n):
                acc_row[j] += left * row[j]

    yield acc

In [148]:
def calc_map_part_naive_wo_numpy(mat):
    """Sum the per-partition matrices produced by ``map_part``.

    ``n`` is the length of the first row. ``map_part`` is applied to each
    partition and the resulting n x n matrices are folded with
    ``fold_wo_numpy``, starting from an n x n matrix of zeros.
    """
    n = len(mat.first())
    partials = mat.mapPartitions(lambda rows: map_part(rows, n))
    zero_matrix = [[0] * n for _ in range(n)]
    return partials.fold(zero_matrix, fold_wo_numpy)

> My machine has 12 cores in total (physical + logical), so the RDD below is split into 12 partitions

In [130]:
mat_rdd = sc.textFile('./data-files/jester2_row.txt', 12).map(lambda l: splitter(l))

In [131]:
mat_rdd.getNumPartitions()

12

With 12 partitions: ~ **23 s**

In [149]:
%%time
res = calc_map_part_naive_wo_numpy(mat_rdd)

Wall time: 22.8 s


## 2. Sampling Method (DIMSUM)

- mapPartitions
- fold

**Caching RDDs** in Spark: caching is a mechanism for speeding up applications that access the same RDD multiple times. An RDD that is neither cached nor checkpointed is re-evaluated each time an action is invoked on it. There are two calls for caching an RDD: `cache()` and `persist(level: StorageLevel)`. The difference between them is that `cache()` keeps the RDD in memory, whereas `persist(level)` can keep it in memory, on disk, or in off-heap memory, according to the storage level specified by `level`.

- A good reference: https://stackoverflow.com/questions/28981359/why-do-we-need-to-call-cache-or-persist-on-a-rdd

#### with numpy

In [21]:
def multiplier(ar):
    """Return the element-wise square of ``ar`` as a NumPy array."""
    values = np.array(ar)
    return np.multiply(values, values)

In [25]:
# pre-computing the magnitudes of each column-vector
def cols_magnitude(mat):
    """Return, per column, the sum of squared entries over all rows of ``mat``.

    Every row is squared element-wise by ``multiplier`` and the resulting
    NumPy arrays are folded together with ``operator.add``, starting from 0.
    """
    squared_rows = mat.map(multiplier)
    # operator.add is enough here because the operands are NumPy arrays.
    return squared_rows.fold(0, add)

#### without numpy

In [23]:
def multiplier_wo_numpy(ar):
    """Return a list with the square of every value in ``ar``."""
    squares = []
    for value in ar:
        squares.append(value * value)
    return squares

In [22]:
def add_arr(a,b):
    """Return the element-wise sum of ``a`` and ``b`` as a list.

    The result is as long as the shorter of the two inputs.
    """
    totals = []
    for left, right in zip(a, b):
        totals.append(left + right)
    return totals

In [26]:
# pre-computing the magnitudes of each column-vector
def cols_magnitude_wo_numpy(mat):
    """Return, per column, the sum of squared entries using plain lists.

    ``n`` is the length of the first row. Every row is squared by
    ``multiplier_wo_numpy`` and the lists are folded with ``add_arr``,
    starting from a list of n zeros.
    """
    n = len(mat.first())
    zero_row = [0] * n
    squared_rows = mat.map(multiplier_wo_numpy)
    return squared_rows.fold(zero_row, add_arr)

In [None]:
mat_rdd = sc.textFile('./data-files/jester2_row.txt').map(lambda l: splitter(l))

In [30]:
%%time
col_mags_numpy = cols_magnitude(mat_rdd)

Wall time: 1.46 s


In [31]:
%%time
col_mags_wo_numpy = cols_magnitude_wo_numpy(mat_rdd)

Wall time: 1.95 s


In [36]:
def dimsum_map_part(ar,n,gamma,mag_cols):
    """Yield one n x n matrix of sampled off-diagonal products for a partition.

    For every row ``a`` in ``ar`` and every column ``i``, the column is kept
    with probability ``gamma / mag_cols[i]`` (always, when that ratio is at
    least 1). If kept, ``a[i] * a[j]`` is added to entries ``[i][j]`` and
    ``[j][i]`` for every ``j > i``. The diagonal is never written here.

    Args:
        ar: Iterable of rows, each indexable up to ``n - 1``.
        n: Number of columns.
        gamma: Sampling parameter (numerator of the keep probability).
        mag_cols: Per-column magnitudes (denominator of the keep probability).

    Yields:
        The single accumulated n x n list of lists.
    """
    # The keep probability depends only on the column, so compute it once per
    # partition instead of once per row. None marks a zero-magnitude column:
    # dividing by it used to raise ZeroDivisionError, and skipping it is
    # lossless when the magnitude is the column's sum of squares (every entry
    # is then 0, so every product would be 0 too).
    keep_prob = [
        gamma / mag_cols[i] if mag_cols[i] != 0 else None
        for i in range(n)
    ]
    l = [[0] * n for _ in range(n)]
    for a in ar:
        for i in range(n):
            prob = keep_prob[i]
            if prob is None or random() >= prob:
                continue
            for j in range(i + 1, n):
                temp = a[i] * a[j]
                # Mirror the update so the matrix stays symmetric.
                l[i][j] += temp
                l[j][i] += temp
    yield l

In [40]:
dimsum_fold = fold_wo_numpy

In [60]:
def dimsum_wo_numpy(mat, mag_cols, gamma, n_cols):
    """Estimate the n_cols x n_cols product matrix by column sampling.

    ``dimsum_map_part`` is run on every partition of ``mat`` and the partial
    matrices are folded with ``dimsum_fold``. The sampling is then undone:
    the diagonal is set to ``mag_cols[i]``, and row ``i`` of the upper
    triangle is multiplied by ``mag_cols[i] / gamma`` whenever column ``i``
    was sampled with probability below 1. The upper triangle is finally
    mirrored into the lower one.

    Args:
        mat: RDD of rows.
        mag_cols: Per-column magnitudes (indexable, length >= ``n_cols``).
            Assumed non-negative, as produced by summing squares.
        gamma: Sampling parameter, must be at least 1.
        n_cols: Number of columns.

    Returns:
        The estimated matrix as a list of lists.

    Raises:
        ValueError: If ``gamma`` is below 1. This used to return an error
            string, which callers could mistake for a result.
    """
    # gamma generally 2*log2(n)
    if gamma < 1:
        raise ValueError("gamma must be >= 1, got {!r}".format(gamma))

    n = n_cols
    zeroVal = [[0 for i in range(n)] for j in range(n)]
    res = mat\
        .mapPartitions(lambda x: dimsum_map_part(x, n, gamma, mag_cols))\
        .fold(zeroVal, dimsum_fold)

    # Now we restore what we normalized using gamma and col_mags
    for i in range(n):
        mag = mag_cols[i]
        res[i][i] = mag
        # Column i was sub-sampled only if gamma / mag < 1, i.e. mag > gamma.
        # Comparing this way avoids dividing by a zero magnitude.
        scale = mag / gamma if mag > gamma else None
        for j in range(i + 1, n):
            if scale is not None:
                res[i][j] *= scale
            res[j][i] = res[i][j]
    return res

In [43]:
n_cols = len(mat_rdd.first())

In [48]:
gamma = 2*log2(n_cols)

Takes only ~ **2 sec**

In [61]:
%%time
res = dimsum_wo_numpy(mat_rdd, col_mags_wo_numpy, gamma, n_cols)

Wall time: 1.53 s


#### On the larger dataset

In [69]:
mat_rdd = sc.textFile('./data-files/jester2_row_b1.txt', 12).map(lambda l: splitter(l))

In [71]:
n_cols = len(mat_rdd.first())

In [72]:
gamma = 2*log2(n_cols)

In [70]:
%%time
col_mags_wo_numpy = cols_magnitude_wo_numpy(mat_rdd)

Wall time: 16.8 s


In [73]:
%%time
res = dimsum_wo_numpy(mat_rdd, col_mags_wo_numpy, gamma, n_cols)

Wall time: 15.5 s


 #### On an even larger dataset

In [74]:
mat_rdd = sc.textFile('./data-files/jester2_row_b2.txt', 12).map(lambda l: splitter(l))

In [75]:
n_cols = len(mat_rdd.first())

In [76]:
gamma = 2*log2(n_cols)

In [77]:
%%time
col_mags_wo_numpy = cols_magnitude_wo_numpy(mat_rdd)

Wall time: 17.1 s


In [78]:
%%time
res = dimsum_wo_numpy(mat_rdd, col_mags_wo_numpy, gamma, n_cols)

Wall time: 15.6 s


In [65]:
spark.stop()