## INFOH515 PySpark code
## Author: Gianluca Bontempi
## PySpark implementation of the correlation and matrix-product examples in the INFOH515 slides "Map-reduce analytics"

In [1]:
import numpy as np
import pwd
import getpass
import os
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, sum
from pyspark.mllib.tree import RandomForest, RandomForestModel
from sklearn import linear_model
from sklearn import datasets
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier


# Obtain the SparkSession for this notebook (an existing one is reused,
# otherwise a new one is created) and keep a handle on its SparkContext,
# from which the RDDs in the cells below are built.
spark = (
    SparkSession.builder
    .appName('s.com')
    .getOrCreate()
)
sc = spark.sparkContext

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/17 17:02:35 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/03/17 17:02:36 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/03/17 17:02:36 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [2]:

# Synthetic data for the examples. The legacy global NumPy RNG is seeded, so
# the generated values depend on the exact order of the np.random calls below.
np.random.seed(1225)
sdn=np.random.uniform(0.15,0.25)  # not referenced in the cells shown here, but it consumes one draw from the seeded stream
n=10# number of features (columns of X, rows of Y)
N=20 # number of samples (rows of X)
p=5  # number of columns of Y
mean=np.zeros(n)
meanY=np.zeros(p)
X=np.random.multivariate_normal(mean, np.identity(n), N)  # N x n, zero mean, identity covariance
Y=np.random.multivariate_normal(meanY, np.identity(p), n)  # n x p; Y is drawn again later in the notebook, before rddY is built

In [3]:
# Distribute X over the cluster; the functions below treat each RDD element as one row (sample) of X.
rddX=sc.parallelize(X)


In [6]:
def corrsplit(x):
    """Emit one record per unordered pair of features of a single sample.

    Parameters
    ----------
    x : sequence
        One row of the data set (the values of all features for one sample).

    Returns
    -------
    list
        Key-value records ``((i, j), (x[i], x[j]))`` for every ``i < j``, in
        lexicographic order of ``(i, j)``. Empty when ``x`` has fewer than two
        elements.
    """
    n = len(x)
    # A comprehension builds the list in a single pass; the previous
    # ``L = L + [...]`` copied the whole list for every pair appended.
    # ``range`` also yields plain Python ints, so the keys are ordinary tuples
    # of ints rather than tuples of NumPy scalars.
    return [((i, j), (x[i], x[j])) for i in range(n) for j in range(i + 1, n)]

def corcomp(x):
    """Correlation coefficient of the value pairs grouped under one key.

    ``x`` is a ``(key, pairs)`` record in which ``pairs`` is an iterable of
    two-component items: the first components form one series and the second
    components the other. The key itself is not used. The result is the
    off-diagonal entry of the matrix returned by ``np.corrcoef``.
    """
    pairs = list(x[1])
    first_series = list(map(lambda pair: pair[0], pairs))
    second_series = list(map(lambda pair: pair[1], pairs))
    corr_matrix = np.corrcoef(first_series, second_series)
    return corr_matrix[0, 1]

In [18]:
# Map:    emit ((i, j), (x_i, x_j)) for every feature pair of every sample.
# Group:  gather, for each (i, j), the value pairs of all samples.
# Map:    reduce each group to its correlation coefficient. The record is
#         emitted as a (key, value) tuple, the shape pair-RDD operations such
#         as sortByKey work on (the previous version emitted a two-element list).
rddCor = (
    rddX.flatMap(corrsplit)
    .groupByKey()
    .map(lambda kv: (kv[0], corcomp(kv)))
    .sortByKey()
)
rddCor.take(9)

[((0, 1), 0.061721884015570924),
 ((0, 2), -0.35742296416076724),
 ((0, 3), 0.13456538786517377),
 ((0, 4), 0.1915174429870057),
 ((0, 5), 0.14800657328727776),
 ((0, 6), 0.2645638418249283),
 ((0, 7), 0.06120226010846439),
 ((0, 8), 0.22306203346834316),
 ((0, 9), -0.28135686974426843)]

### Verification of the result by comparison with an in-memory computation

In [17]:
# Check the result: correlate feature 0 with every other feature directly in
# memory and print the values in the same (0, i) order as the RDD output.
for i in np.arange(1, n):
    pair_corr = np.corrcoef(X[:, 0], X[:, i])
    print("(0,", i, "),", pair_corr[0, 1])

(0, 1 ), 0.061721884015570924
(0, 2 ), -0.35742296416076724
(0, 3 ), 0.13456538786517377
(0, 4 ), 0.1915174429870057
(0, 5 ), 0.14800657328727776
(0, 6 ), 0.2645638418249283
(0, 7 ), 0.06120226010846439
(0, 8 ), 0.22306203346834316
(0, 9 ), -0.28135686974426843


## Computation of the matrix product $XY$

In [None]:

# p and meanY repeat the values already set in the data-generation cell.
p=5

meanY=np.zeros(p)

# Y is drawn again from the global NumPy RNG, replacing the Y created in the
# data-generation cell; when the cells run top to bottom, this n x p draw is
# the matrix used by rddY and by the in-memory check at the end.
Y=np.random.multivariate_normal(meanY, np.identity(p), n)

# Distribute Y; the functions below treat each RDD element as one row of Y.
rddY=sc.parallelize(Y)


In [27]:
def splitX(x):
    """Explode one indexed row of X into per-entry records keyed by column.

    Parameters
    ----------
    x : tuple
        ``(i, row)`` where ``i`` is the row index and ``row`` is the sequence
        of values of that row.

    Returns
    -------
    list
        Records ``(j, (i, row[j]))`` for every column index ``j``, in column
        order. Empty for an empty row.
    """
    row_index, values = x[0], x[1]
    # enumerate yields plain Python int column indices, the same key type that
    # zipWithIndex gives the rows of Y, so both sides of the later groupWith
    # use identical key types. The comprehension also avoids the quadratic
    # ``L = L + [...]`` list rebuilding of the previous version.
    return [(j, (row_index, value)) for j, value in enumerate(values)]

def splitY(y):
    """Explode one indexed row of Y into per-entry records keyed by row.

    Parameters
    ----------
    y : tuple
        ``(j, row)`` where ``j`` is the row index and ``row`` is the sequence
        of values of that row.

    Returns
    -------
    list
        Records ``(j, (k, row[k]))`` for every column index ``k``, in column
        order. Empty for an empty row.
    """
    row_index, values = y[0], y[1]
    # Single-pass comprehension instead of repeated ``L = L + [...]`` copies;
    # enumerate yields plain Python int column indices.
    return [(row_index, (k, value)) for k, value in enumerate(values)]
    


# Key every entry of X by its column index and every entry of Y by its row
# index, so that entries sharing the inner index of the product meet under the
# same key in the groupWith below.
# The former `X1.collect()` and `Y1.first()` statements were dropped: they were
# not the last expression of the cell, so their results were never displayed,
# yet each launched a Spark job and collect() copied all of X1 to the driver.
X1 = (
    rddX.zipWithIndex()
    .map(lambda row_idx: (row_idx[1], row_idx[0]))  # (row, i) -> (i, row)
    .flatMap(splitX)
    .sortByKey()
)
Y1 = (
    rddY.zipWithIndex()
    .map(lambda row_idx: (row_idx[1], row_idx[0]))  # (row, j) -> (j, row)
    .flatMap(splitY)
)
def combine(x):
    """Form the partial products contributed by one shared key.

    Parameters
    ----------
    x : tuple
        ``(key, (x_entries, y_entries))``: ``x_entries`` iterates over
        ``(i, x_value)`` records and ``y_entries`` over ``(k, y_value)``
        records grouped under the same key. The key itself is not used.

    Returns
    -------
    list
        ``((i, k), x_value * y_value)`` for every combination of an X entry
        with a Y entry, with the X entries varying slowest. Empty if either
        side is empty.
    """
    # Materialise both sides because y_entries is traversed once per X entry.
    x_entries = list(x[1][0])
    y_entries = list(x[1][1])
    # Single-pass comprehension; the previous ``L = L + [...]`` re-copied the
    # accumulated list for every product.
    return [
        ((x_entry[0], y_entry[0]), x_entry[1] * y_entry[1])
        for x_entry in x_entries
        for y_entry in y_entries
    ]

## X1 holds (j,(i,x[i,j])) and Y1 holds (j,(k,y[j,k]))
## groupWith gathers, for each j, the entries of X and of Y keyed by j
## combine turns each group into ((i,k), x[i,j]*y[j,k])
## reduceByKey sums the products belonging to each (i,k)
(
    X1.groupWith(Y1)
    .flatMap(lambda grouped: combine(grouped))
    .reduceByKey(lambda acc, term: acc + term)
    .sortByKey()
    .take(10)
)

                                                                                

[((0, 0), -1.8426689425175622),
 ((0, 1), -1.0177298985236614),
 ((0, 2), -1.7227970320364898),
 ((0, 3), -0.27668620350378625),
 ((0, 4), -1.1221315070031475),
 ((1, 0), -1.6453847403345523),
 ((1, 1), 0.15123482549006195),
 ((1, 2), -0.3799336826977642),
 ((1, 3), 1.112248214027636),
 ((1, 4), 0.11255788462816119)]

### Verification of the result by comparison with an in-memory computation

In [41]:
# In-memory reference product: entry [i, k] is the value to compare with the
# record keyed (i, k) in the RDD result above.
X.dot(Y)

array([[-1.84266894, -1.0177299 , -1.72279703, -0.2766862 , -1.12213151],
       [-1.64538474,  0.15123483, -0.37993368,  1.11224821,  0.11255788],
       [ 2.61158727, -4.79120644,  3.41997853,  1.63344031, -7.70307135],
       [-0.62008517,  1.10180426,  0.21125909,  1.31295986,  5.47441629],
       [ 5.03583958, -3.89114357, -4.33563491,  1.97492233,  2.95800165],
       [ 3.95091408, -3.44837428,  4.98728794, -4.09511504, -5.22500791],
       [ 1.72643668, -3.78699367, -3.88245399,  4.19031368, -1.28915313],
       [ 2.37086386,  0.42593202, -0.94177944,  2.07741272,  3.17182894],
       [ 1.44214637,  2.33556705, -3.42683677, -4.90994847,  0.97654087],
       [ 3.31005729,  0.20398676,  6.54961344, -3.1342704 ,  1.79568593],
       [-4.8523059 ,  2.91073198,  1.21234343,  0.48011586,  1.36079966],
       [ 2.01794444,  0.34007773,  2.88162247,  0.70352363,  0.31269358],
       [ 1.30273713,  0.86475189,  3.07815784, -2.46123637,  0.75800103],
       [-1.71776693,  1.54305482, -2.2