## INFOH515 Pyspark code
## Author: Gianluca Bontempi
## Pyspark implementation of the bagging algorithm in the INFOH515 slides "Map-reduce analytics" 

In [1]:
import numpy as np
import pwd
import getpass
import os
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, sum
from pyspark.mllib.tree import RandomForest, RandomForestModel
from sklearn import linear_model
from sklearn import datasets
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier


# create an instance of SparkSession
spark=SparkSession.builder.appName('s.com').getOrCreate()
sc=spark.sparkContext

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/17 16:56:35 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [10]:

np.random.seed(1225)   


Ntr=1500
Nts=100
N=Ntr+Nts
n=10
npartitions=5


X= np.random.normal(loc=0, scale=1, size=N * n).reshape(N, n)
Y=2+(X[:,0]**2)-3*X[:,7]+np.random.normal(loc=0, scale=0.1, size=N )
Y=Y.reshape(N, 1)


Xtr=X[:Ntr,]
Ytr=Y[:Ntr,]
Xts=X[Ntr:,]
Yts=Y[Ntr:,]
Ytr.shape=(Ntr,1)
YX=np.hstack((Ytr,Xtr))
vYts=np.var(Yts)
YXrdd=sc.parallelize(YX,npartitions)

YXrdd.take(2)


[array([ 5.61584486,  1.40410217,  1.84505492,  1.27428404, -1.15801998,
        -0.77114336,  0.34615021, -1.58390158, -0.47663391,  0.67023779,
        -0.6931159 ]),
 array([ 3.93318275, -1.2784718 ,  1.22773916,  0.6629516 ,  0.26926281,
         0.30634199, -0.15450905,  0.77881069, -0.09776944,  0.39046338,
        -0.55891276])]

The dataset is split into npartitions and the same learning algorithm is applied to each partition.
The test error of the learning algorithm trained on the first partition is compared with the test error of the averaging approach.

In [11]:

def rddCreateModels(iterator,mod):

    data=np.array(list(iterator))
    print(data.shape)
    X = data[:, 1:]
    Y=data[:, 0]
    m=mod.fit(X,Y)  
    return [m]

def rddUseModel(iterator,Xts):
    rfit=list(iterator)[0]
    #return [pow(Yts-rfit.predict(Xts),2)]
    return [rfit.predict(Xts)]

def rddApplyMean(D,axis=0):
    if (axis==0): # column
        N=D.count()
        return(D.reduce(lambda x,y:x+y)/N)

    if (axis==1): #row
        return(rddArr(D.map(lambda x:mean(x))))
    
nT=1000
mD=15
ncores=3

mregr0= RandomForestRegressor(n_estimators=nT,max_depth=mD,n_jobs=ncores)
M=YXrdd.mapPartitions(lambda x: rddCreateModels(x,mregr0))
M.count()


(300, 11)
(300, 11)
(300, 11)
(300, 11)
(300, 11)>                                                          (0 + 5) / 5]
                                                                                

5

## Prediction using only the first partition as training set

In [12]:
Yhat1=M.mapPartitions(lambda x: rddUseModel(x,Xts)).first()
errhat=Yts.reshape(Nts,1)-Yhat1.reshape(Nts,1)

NMSE1=np.mean(pow(errhat,2))/vYts

print("NMSE1=", NMSE1)

(300, 11)
                                                                                

0.05293836183324284

## Prediction by averaging over all the models (one per partition)

In [13]:
Yhats=rddApplyMean(M.mapPartitions(lambda x: rddUseModel(x,Xts)))
errhat=Yts.reshape(Nts,1)-Yhats.reshape(Nts,1)
NMSEav=np.mean(pow(errhat,2))/vYts

print("NMSEav=", NMSEav)

(300, 11)
(300, 11)
(300, 11)
(300, 11)
(300, 11)
(300, 11)                                                                       
(300, 11)
(300, 11)
(300, 11)
(300, 11)
                                                                                

0.042068564587990194