## INFOH515 Pyspark code
## Author: Gianluca Bontempi
## Pyspark implementation of forward feature selection

In [4]:
import numpy as np
import pwd

import os
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, sum


from sklearn import linear_model
import sklearn.metrics as sm

from sklearn.metrics import mean_squared_error, explained_variance_score
from sklearn.utils import shuffle
from sklearn import datasets
from sklearn import model_selection



# Start (or reuse) the Spark session of this notebook and keep a handle on its
# SparkContext, which the RDD-based cells below rely on.
spark = (
    SparkSession.builder
    .appName('s.com')
    .getOrCreate()
)
sc = spark.sparkContext

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/14 17:12:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/03/14 17:12:59 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


## Dataset generation


In [5]:


SEED = 42  # fixed seed: Restart & Run All regenerates exactly the same dataset
rng = np.random.default_rng(SEED)

n=10 # number of features
N=5000 # number of samples

relf=[1,2,3,n-1]  ## indices of the relevant features
nrel= len(relf)  ## number of relevant features

# Inputs: N samples of n independent standard-normal features.
X = rng.normal(loc=0, scale=1, size=(N, n))

# Target: sum of the relevant columns plus Gaussian noise (standard deviation 0.1),
# kept as an (N, 1) column so that it can be appended to X.
Y = X[:, relf].sum(axis=1, keepdims=True) + rng.normal(loc=0, scale=0.1, size=(N, 1))

# Each row of XY is one sample: the n feature values followed by the target.
XY = np.concatenate((X,Y), axis=1)
# Distribute the rows as an RDD (one numpy row per record).
dataset=sc.parallelize(XY)


## Memory-resident least-squares implementation

In [6]:
def error(X,y):
    """Return the training mean squared error of a least-squares fit of y on X.

    A scikit-learn ``LinearRegression`` is fitted on (X, y) and then evaluated
    on the very same samples, so the returned value is the in-sample error.
    """
    fitted = linear_model.LinearRegression().fit(X, y)
    residuals = y - fitted.predict(X)
    return np.mean(residuals ** 2.0)

## Map function: x -> (col, [x[col], x[selected], x[-1]])

In [7]:
def mapf(x):
    """Expand one sample into (candidate feature, values) pairs.

    x is a single row whose last entry is the target. For every feature index
    that is not in the broadcast selection ``bselected.value``, one pair is
    emitted: the key is the candidate index and the value holds, in this order,
    the candidate's value, the values of the already selected features and the
    target.
    """
    # The map has access to the broadcast set of already selected features.
    chosen = bselected.value
    target = len(x) - 1
    # With an empty selection only the candidate and the target are kept.
    tail = chosen + [target] if len(chosen) > 0 else [target]
    return [(col, x[[col] + tail]) for col in range(target) if col not in chosen]

## Map outcome at the first step of forward selection

In [8]:
# First step: no feature has been selected yet.
selected=[]
bselected = sc.broadcast(selected)  # mapf reads the current selection from this broadcast

# For every candidate feature, stack per sample its value and the target,
# so that each key ends up with both the input and the output of its model.
D = (
    dataset
    .flatMap(mapf)
    .reduceByKey(lambda a,b: np.vstack((a,b)))
)
D.collect()

                                                                                

[(0,
  array([[-0.16752656,  0.27600833],
         [ 0.79286448,  1.12030563],
         [-0.57164512,  0.92643661],
         ...,
         [ 0.18651379,  0.2022259 ],
         [-0.55820718,  2.29584986],
         [ 1.33007707, -0.00266216]])),
 (1,
  array([[ 0.81656261,  0.27600833],
         [ 0.30634826,  1.12030563],
         [ 0.45346242,  0.92643661],
         ...,
         [-0.46662729,  0.2022259 ],
         [-0.6131301 ,  2.29584986],
         [ 0.87580367, -0.00266216]])),
 (2,
  array([[-0.6955622 ,  0.27600833],
         [-1.05745239,  1.12030563],
         [-1.34450672,  0.92643661],
         ...,
         [-0.68215537,  0.2022259 ],
         [ 1.83916175,  2.29584986],
         [ 0.16386823, -0.00266216]])),
 (3,
  array([[ 0.6412275 ,  0.27600833],
         [ 1.45289494,  1.12030563],
         [ 0.98463324,  0.92643661],
         ...,
         [ 1.57342108,  0.2022259 ],
         [ 0.89621061,  2.29584986],
         [-2.22792634, -0.00266216]])),
 (4,
  array([[-0.357882

In [9]:
# Training error of the least-squares model of every candidate feature.
# In each stacked matrix the last column is the target y and the remaining
# columns (a single one at this first step) are the inputs X given to error().
# D is already grouped by key, so the reduceByKey leaves its content unchanged.
(
    D.reduceByKey(lambda a,b: np.vstack((a,b)))
    .map(lambda kv: (kv[0], error(kv[1][:,:-1], kv[1][:,-1])))
    .collect()
)

                                                                                

[(0, 4.022878565550321),
 (1, 3.0011607533144233),
 (2, 3.0286937665383356),
 (3, 2.9988915428515988),
 (4, 4.022604555143927),
 (5, 4.022876286587886),
 (6, 4.022884832816399),
 (7, 4.022800934931382),
 (8, 4.021801260260533),
 (9, 3.0164124836593116)]

### Map outcome at the second step of forward selection

In [10]:
# Second step: suppose that the first selected variable is number 1.
selected=[1]
bselected = sc.broadcast(selected)  # broadcast the set of already selected features

# For every non-selected feature, stack per sample its value, the values of the
# selected features and the target (both the inputs and the output of the model).
D = (
    dataset
    .flatMap(mapf)
    .reduceByKey(lambda a,b: np.vstack((a,b)))
)
D.collect()

[(0,
  array([[-0.16752656,  0.81656261,  0.27600833],
         [ 0.79286448,  0.30634826,  1.12030563],
         [-0.57164512,  0.45346242,  0.92643661],
         ...,
         [ 0.18651379, -0.46662729,  0.2022259 ],
         [-0.55820718, -0.6131301 ,  2.29584986],
         [ 1.33007707,  0.87580367, -0.00266216]])),
 (2,
  array([[-0.6955622 ,  0.81656261,  0.27600833],
         [-1.05745239,  0.30634826,  1.12030563],
         [-1.34450672,  0.45346242,  0.92643661],
         ...,
         [-0.68215537, -0.46662729,  0.2022259 ],
         [ 1.83916175, -0.6131301 ,  2.29584986],
         [ 0.16386823,  0.87580367, -0.00266216]])),
 (3,
  array([[ 0.6412275 ,  0.81656261,  0.27600833],
         [ 1.45289494,  0.30634826,  1.12030563],
         [ 0.98463324,  0.45346242,  0.92643661],
         ...,
         [ 1.57342108, -0.46662729,  0.2022259 ],
         [ 0.89621061, -0.6131301 ,  2.29584986],
         [-2.22792634,  0.87580367, -0.00266216]])),
 (4,
  array([[-0.35788217,  0.816

## Forward selection

In [11]:
# Greedy forward selection: at every step, add the not-yet-selected feature whose
# least-squares model (candidate + already selected features) has the lowest
# training error. One feature is added per step, for len(relf) steps.
selected=[]
bselected = sc.broadcast(selected)
# it broadcasts the set of already selected features (read by mapf)


for j in range(len(relf)):
    D=dataset.flatMap(lambda x: mapf(x))
    # ERR is a list of (candidate feature index, training error) pairs
    ERR=D.reduceByKey(lambda a,b: np.vstack((a,b))).map(lambda x: (x[0],error(x[1][:,:-1],x[1][:,-1]))).collect()
    # Start from +inf rather than an arbitrary bound: with a fixed threshold,
    # errors that are all above it would leave the default index 0 selected.
    bestfs=None
    bestErr=float("inf")
    for fs, err in ERR:
        if err<bestErr:
            bestErr=err
            bestfs=fs
    if bestfs is None:
        # no candidate left (or no finite error): stop instead of adding a bogus feature
        break
    print(bestErr)
    selected=selected+[bestfs]
    # re-broadcast so that the next step sees the updated selection
    bselected = sc.broadcast(selected)
    print(selected)

2.9988915428515988
[3]
1.9718258612497392
[3, 1]
0.9837110312756228
[3, 1, 9]
0.009649939943486672
[3, 1, 9, 2]


## Check of the quality of the selected features

In [104]:
# Sanity check: print the sorted indices of the relevant features next to the
# sorted indices picked by forward selection, so the two can be compared.
print("Relevant=", np.sort(relf), "Selected=",np.sort(selected))

Relevant= [1 2 3 9] Selected= [1 2 3 9]
