## INFOH515 Pyspark code
## Author: Gianluca Bontempi
## Pyspark implementation of the model selection algorithm in the INFOH515 slides "Map-reduce analytics" 

In [30]:
import numpy as np
import pwd
import getpass
import os
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, sum
from pyspark.mllib.tree import RandomForest, RandomForestModel
from sklearn import linear_model
from sklearn import datasets
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor

# Obtain the notebook-wide SparkSession (an already running session is reused)
# and keep a handle on its SparkContext for the RDD API used below.
spark = (
    SparkSession.builder
    .appName('s.com')
    .getOrCreate()
)
sc = spark.sparkContext

In [40]:

# Seed NumPy's global generator so that the synthetic data set is reproducible.
np.random.seed(1225)

# Sizes: training samples, test samples, total samples, input features,
# and number of RDD partitions.
Ntr = 1500
Nts = 100
N = Ntr + Nts
n = 5
npartitions = 5

# Inputs are standard normal draws; the target depends only on the first and
# on the last feature, plus Gaussian noise with standard deviation 0.1.
X = np.random.normal(loc=0, scale=1, size=(N, n))
Y = (
    2 + (X[:, 0] ** 2) - 3 * X[:, n - 1]
    + np.random.normal(loc=0, scale=0.1, size=N)
).reshape(N, 1)

# The first Ntr rows form the training set, the remaining Nts rows the test set.
Xtr, Xts = X[:Ntr], X[Ntr:]
Ytr, Yts = Y[:Ntr].reshape(Ntr, 1), Y[Ntr:]

# Each row of YX is [target, feature_1, ..., feature_n]; the training rows are
# distributed over npartitions partitions.
YX = np.hstack((Ytr, Xtr))
vYts = np.var(Yts)  # variance of the test targets, used below to normalise the MSE
YXrdd = sc.parallelize(YX, npartitions)

YXrdd.take(2)


[array([ 6.32529461,  1.40410217,  1.84505492,  1.27428404, -1.15801998,
        -0.77114336]),
 array([ 4.32475901,  0.34615021, -1.58390158, -0.47663391,  0.67023779,
        -0.6931159 ])]

The training set is split into `npartitions` partitions, and every candidate learning algorithm is fitted on each partition.
For each candidate, the test-set predictions of its per-partition models are averaged, and the normalized mean squared error (NMSE) of the averaged prediction is used to compare the candidates.

In [41]:

def rddCreateModels(iterator,modlist):
    """Fit every estimator of ``modlist`` on the rows of one RDD partition.

    Meant to be passed to ``RDD.mapPartitions``.

    Parameters
    ----------
    iterator : iterable of 1-D array-likes
        Rows of the partition. Column 0 is used as the target, all the
        remaining columns as input features.
    modlist : list
        Objects exposing ``fit(X, Y)``.

    Returns
    -------
    list
        The value returned by each ``fit`` call, in ``modlist`` order.
        An empty partition yields an empty list.
    """
    data = np.array(list(iterator))
    # An empty partition gives a 0-sized 1-D array, which cannot be sliced
    # by column; emit nothing instead of failing the whole Spark job.
    if data.size == 0:
        return []
    X = data[:, 1:]
    Y = data[:, 0]
    return [mod.fit(X, Y) for mod in modlist]
def rddUseModel(iterator,Xts):
    """Predict ``Xts`` with the first fitted model found in a partition.

    NOTE(review): a second ``rddUseModel`` is defined further down in this
    notebook; once that cell has run, it replaces this definition.

    Parameters
    ----------
    iterator : iterable
        Fitted models of the partition, each exposing ``predict(X)``.
    Xts : array-like
        Inputs handed to ``predict``.

    Returns
    -------
    list
        A one-element list holding the prediction of the first model, or an
        empty list when the partition holds no model.
    """
    models = list(iterator)
    # Guard against empty partitions, where indexing [0] would raise IndexError.
    if not models:
        return []
    return [models[0].predict(Xts)]

def rddApplyMean(D,axis=0):
    """Mean of the elements of an RDD, element-wise or per element.

    Parameters
    ----------
    D : RDD
        RDD whose elements support ``+`` and division by an integer
        (e.g. NumPy arrays of equal shape).
    axis : int, default 0
        ``0``: average across the elements of the RDD ("column" mean).
        ``1``: mean of each element taken on its own ("row" mean).

    Returns
    -------
    For ``axis=0`` the sum of all elements divided by their number.
    For ``axis=1`` a NumPy array with one mean per element, collected on the
    driver.

    Raises
    ------
    ValueError
        If ``axis`` is neither 0 nor 1.

    NOTE(review): the ``axis=1`` branch previously called ``rddArr`` and
    ``mean``, neither of which is defined in this notebook. It is assumed
    that the result was meant to be collected into a NumPy array -- TODO
    confirm.
    """
    if axis == 0:
        # Accumulate sum and count in a single action, so that the lineage
        # of D is evaluated once rather than once for count() and once for
        # reduce().
        total, count = D.map(lambda x: (x, 1)).reduce(
            lambda a, b: (a[0] + b[0], a[1] + b[1]))
        return total / count

    if axis == 1:
        return np.array(D.map(lambda x: np.mean(x)).collect())

    raise ValueError("axis must be 0 or 1, got %r" % (axis,))
    

# Passed as n_jobs to the random forest; -1 asks scikit-learn for all cores.
ncores=-1

# Base number of trees: the random forest below is built with 2*nT estimators.
# NOTE(review): nT was not defined anywhere in this notebook, so this cell
# raised a NameError on a fresh kernel. 50 is an assumed value -- TODO confirm
# the setting that produced the NMSE figures reported further down.
nT=50

# Candidate regressors compared by the model selection procedure.
mregr0= DecisionTreeRegressor(max_depth=15)
mregr1= RandomForestRegressor(n_estimators=2*nT,max_depth=10,n_jobs=ncores)
mregr2= AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),
                          n_estimators=100)
mregr3= KNeighborsRegressor(n_neighbors=3)

# Fit every candidate on every partition. The resulting RDD of fitted models
# is marked for caching so that later actions can reuse the models instead of
# refitting them each time the lineage is evaluated.
M2=YXrdd.mapPartitions(lambda x: rddCreateModels(x,[mregr0,mregr1,mregr2,mregr3])).cache()



## Prediction of the candidate models

In [42]:
def rddUseModel(iterator,Xts):
    """Predict ``Xts`` with every fitted model of a partition.

    Parameters
    ----------
    iterator : iterable
        Fitted models, each exposing ``predict(X)``.
    Xts : array-like
        Inputs handed to every ``predict`` call.

    Returns
    -------
    list of tuple
        ``(position, prediction)`` pairs, where ``position`` is the 0-based
        rank of the model inside the partition.
    """
    models = list(iterator)
    return [(position, model.predict(Xts)) for position, model in enumerate(models)]

# Predict the test set once with every fitted model. The (position, prediction)
# pairs are cached so that the four averages below reuse them instead of
# rebuilding the prediction RDD (and whatever precedes it) for each candidate.
predrdd=M2.mapPartitions(lambda x: rddUseModel(x,Xts)).cache()

# Position k inside a partition identifies the k-th candidate of the model
# list used to build M2 (four candidates). For each candidate, average the
# predictions of its per-partition models. k is bound as a default argument
# so that each filter keeps its own value.
Yhat1,Yhat2,Yhat3,Yhat4=[
    rddApplyMean(predrdd.filter(lambda x,k=k: x[0]==k).map(lambda x: x[1]))
    for k in range(4)
]

# Test residuals of the averaged predictions.
errhat1=Yts.ravel()-Yhat1.ravel()
errhat2=Yts.ravel()-Yhat2.ravel()
errhat3=Yts.ravel()-Yhat3.ravel()
errhat4=Yts.ravel()-Yhat4.ravel()

# Normalised MSE: mean squared residual divided by the test-target variance.
for k,err in enumerate((errhat1,errhat2,errhat3,errhat4),start=1):
    print("NMSE model %d=" % k, np.mean(pow(err,2))/vYts)


                                                                                

NMSE model 1= 0.017948071981141556
NMSE model 2= 0.026728533521305078
NMSE model 3= 0.06749214982627325
NMSE model 4= 0.09283371549791214
