## INFOH515 Pyspark code
## Author: Gianluca Bontempi
## Pyspark implementation of forward feature selection

In [38]:
import getpass
import os
import pwd

import numpy as np
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, sum

import sklearn.metrics as sm
from sklearn import datasets
from sklearn import linear_model
from sklearn import model_selection
from sklearn.metrics import mean_squared_error, explained_variance_score
from sklearn.utils import shuffle

# Name of the account running the notebook, obtained in two ways: through
# `getpass` and through the password-database entry of the process's uid.
# Neither value is used by the cells visible below; the Spark application name
# is the fixed string passed to `appName(...)`.
# NOTE: `getpass` must be imported in the import block at the top of this cell,
# otherwise this line raises NameError on a fresh kernel.
userName = getpass.getuser()
appName = pwd.getpwuid(os.getuid())[0]


# Create (or reuse) the SparkSession and keep a handle on its SparkContext,
# which the RDD-based cells below rely on.
spark = SparkSession.builder.appName('s.com').getOrCreate()
sc = spark.sparkContext

## Dataset generation


In [88]:


n = 10    # number of features
N = 5000  # number of samples
SEED = 0  # fixed seed so that a full re-run regenerates exactly the same dataset

relf = [1, 2, 3, n - 1]  # indices of the relevant features
nrel = len(relf)         # number of relevant features

# Seed NumPy's global generator before any draw: without it every run produces
# different data, and therefore different errors in the cells below.
np.random.seed(SEED)

# Inputs: N x n matrix of independent standard-normal values.
X = np.random.normal(loc=0, scale=1, size=(N, n))

# Target: sum of the relevant columns plus Gaussian noise (standard deviation 0.1),
# kept as an N x 1 column so that it can be appended to X.
Y = X[:, relf].sum(axis=1, keepdims=True)
Y = Y + np.random.normal(loc=0, scale=0.1, size=(N, 1))

# Each row of XY is one sample: the n inputs followed by the target in last position.
XY = np.concatenate((X, Y), axis=1)
assert XY.shape == (N, n + 1), "unexpected dataset shape"

# Distribute the samples; every RDD element is one row of XY.
dataset = sc.parallelize(XY)


## Memory-resident least-squares implementation

In [90]:
def error(X,y):
    """Return the training mean squared error of a least-squares linear model.

    A scikit-learn ``LinearRegression`` is fitted on ``(X, y)`` and then asked to
    predict the very same ``X``; the returned value is the mean of the squared
    differences between ``y`` and those predictions.
    """
    model = linear_model.LinearRegression()
    model.fit(X, y)

    # Residuals on the data the model was trained on.
    residuals = y - model.predict(X)
    return np.mean(residuals ** 2.0)

## Map function: x -> (col, [x[col], x[selected], x[-1]])

In [91]:
def mapf(x):
    """Emit one ``(feature index, values)`` pair per not-yet-selected feature.

    ``x`` is one sample whose last entry is the target; it must accept a list of
    positions as index (in this notebook it is a row of the NumPy matrix ``XY``).
    For each position ``i`` before the last one that does not appear in the
    broadcast list ``bselected.value``, the emitted value is ``x`` restricted to
    ``[i] + bselected.value + [last position]``: the candidate feature, the
    already selected features, then the target.
    """
    target_pos = len(x) - 1
    # The map has access to the broadcast list of selected features.
    chosen = bselected.value
    return [
        (cand, x[[cand] + chosen + [target_pos]])
        for cand in range(target_pos)
        if cand not in chosen
    ]

## Map outcome at the first step of forward selection

In [92]:
# Nothing has been selected yet, so the list starts out empty.
selected = []

# Broadcast the list of already selected features; mapf reads it on the workers.
bselected = sc.broadcast(selected)

# For every feature not yet selected, gather the rows emitted by mapf under that
# feature's key and stack them into a single array (inputs first, target last).
D = (
    dataset
    .flatMap(mapf)
    .reduceByKey(lambda stacked, row: np.vstack((stacked, row)))
)
D.collect()

[(0,
  array([[ 0.62217615, -0.29790529],
         [ 0.18264289, -2.39621927],
         [-1.06972959, -3.09837015],
         ...,
         [-0.27838575,  2.1313603 ],
         [-0.64583693, -1.5195167 ],
         [-0.28389813,  2.14365314]])),
 (1,
  array([[-1.2285351 , -0.29790529],
         [-0.26289747, -2.39621927],
         [-0.7619549 , -3.09837015],
         ...,
         [-0.44766073,  2.1313603 ],
         [-0.39661999, -1.5195167 ],
         [-0.51123439,  2.14365314]])),
 (2,
  array([[ 0.02305261, -0.29790529],
         [-0.80130621, -2.39621927],
         [ 0.28742134, -3.09837015],
         ...,
         [ 1.23514176,  2.1313603 ],
         [-1.81940532, -1.5195167 ],
         [ 0.66705138,  2.14365314]])),
 (3,
  array([[ 0.04260902, -0.29790529],
         [-1.24882258, -2.39621927],
         [-1.84107168, -3.09837015],
         ...,
         [ 0.1594543 ,  2.1313603 ],
         [ 0.47385405, -1.5195167 ],
         [ 0.86671215,  2.14365314]])),
 (4,
  array([[ 1.760598

In [93]:
# Training error of a least-squares fit for each candidate feature.
# D was reduced by key when it was built, so it already holds one stacked array
# per feature; reducing by key a second time here would only repeat work.
# In each array the last column is the target y and the preceding columns are the
# inputs handed to `error` (a single column at this first step).
D.map(lambda kv: (kv[0], error(kv[1][:, :-1], kv[1][:, -1]))).collect()

[(0, 4.059404191093759),
 (1, 3.0546615465392186),
 (2, 3.0507307206717473),
 (3, 3.0098937215906507),
 (4, 4.062630349169577),
 (5, 4.062710120202222),
 (6, 4.062842540144609),
 (7, 4.061593466573133),
 (8, 4.061778139609421),
 (9, 3.008477039921329)]

### Map outcome at the second step of forward selection

In [94]:
# Suppose that the first selected variable is number 1.
selected = [1]

# Broadcast the updated list of selected features so that mapf sees it.
bselected = sc.broadcast(selected)

# For every remaining feature, each stacked row holds the candidate column, the
# already selected column(s), and finally the target.
D = (
    dataset
    .flatMap(mapf)
    .reduceByKey(lambda stacked, row: np.vstack((stacked, row)))
)
D.collect()

[(0,
  array([[ 0.62217615, -1.2285351 , -0.29790529],
         [ 0.18264289, -0.26289747, -2.39621927],
         [-1.06972959, -0.7619549 , -3.09837015],
         ...,
         [-0.27838575, -0.44766073,  2.1313603 ],
         [-0.64583693, -0.39661999, -1.5195167 ],
         [-0.28389813, -0.51123439,  2.14365314]])),
 (2,
  array([[ 0.02305261, -1.2285351 , -0.29790529],
         [-0.80130621, -0.26289747, -2.39621927],
         [ 0.28742134, -0.7619549 , -3.09837015],
         ...,
         [ 1.23514176, -0.44766073,  2.1313603 ],
         [-1.81940532, -0.39661999, -1.5195167 ],
         [ 0.66705138, -0.51123439,  2.14365314]])),
 (3,
  array([[ 0.04260902, -1.2285351 , -0.29790529],
         [-1.24882258, -0.26289747, -2.39621927],
         [-1.84107168, -0.7619549 , -3.09837015],
         ...,
         [ 0.1594543 , -0.44766073,  2.1313603 ],
         [ 0.47385405, -0.39661999, -1.5195167 ],
         [ 0.86671215, -0.51123439,  2.14365314]])),
 (4,
  array([[ 1.76059874, -1.228

## Forward selection

In [95]:
selected = []
# Broadcast the (initially empty) list of selected features; mapf reads it on the workers.
bselected = sc.broadcast(selected)


# One feature is added per iteration; the number of iterations equals the number
# of relevant features used to generate the data.
for j in range(len(relf)):
    D = dataset.flatMap(lambda x: mapf(x))
    # One (feature, training error) pair per candidate: the rows of each feature
    # are stacked, then a least-squares model is fitted on [candidate + selected].
    ERR = (
        D.reduceByKey(lambda a, b: np.vstack((a, b)))
        .map(lambda x: (x[0], error(x[1][:, :-1], x[1][:, -1])))
        .collect()
    )

    # Keep the candidate with the smallest error (the first one wins on ties).
    # Starting from +infinity instead of a fixed threshold means a candidate is
    # found whatever the scale of the errors; a fixed threshold would silently
    # fall back to feature 0 whenever every error exceeded it.
    bestfs = None
    bestErr = float("inf")
    for feature, err in ERR:
        if err < bestErr:
            bestErr = err
            bestfs = feature

    if bestfs is None:
        # No candidate left (or no comparable error): stop instead of appending
        # a feature that was never evaluated.
        break

    print(bestErr)
    selected = selected + [bestfs]
    # Re-broadcast so that the next pass excludes the newly selected feature.
    bselected = sc.broadcast(selected)
    print(selected)

3.008477039921329
[9]
2.006207474318531
[9, 1]
0.9951824710994884
[9, 1, 3]
0.010072479233388379
[9, 1, 3, 2]


In [97]:
# Side-by-side check: features used to build Y versus features picked by the search.
relevant_sorted = np.sort(relf)
selected_sorted = np.sort(selected)
print("Relevant=", relevant_sorted, "Selected=", selected_sorted)

Relevant= [1 2 3 9] Selected= [1 2 3 9]
