In [1]:
from operator import add

from pyspark.mllib.linalg.distributed import RowMatrix
from pyspark.ml.feature import RFormula
from pyspark.mllib.stat import Statistics
from pyspark.sql import Row
from pyspark.sql.types import *

from feature_selection.univariate import SelectKBest

# Functions

# Load test data

In [2]:
# Read the raw white-wine CSV and normalise each line: drop the double
# quotes and turn the ';' field separator into ','.
wwine_rdd = (
    sc.textFile("wine-data/winequality-white.csv")
    .map(lambda row: row.replace('"', '').replace(';', ','))
)

In [3]:
# Get header: the first line of the file holds the column names.
header_str = wwine_rdd.first()
# Build a concrete list (not a lazy `map` iterator) because `header` is used
# more than once: here to build the Row class and later as the DataFrame
# schema. Spaces are replaced so the names are valid column identifiers.
header = [name.replace(' ', '_') for name in header_str.split(',')]
Wine = Row(*header)


In [4]:
# Create collection of rows: skip the header line, split every remaining
# line on ',' and convert each field to float. A list comprehension is used
# so that every row is a concrete list of floats on Python 2 and Python 3
# alike (on Python 3 `map(float, ...)` would yield a lazy `map` object).
data = (
    wwine_rdd
    .filter(lambda line: line != header_str)
    .map(lambda line: [float(value) for value in line.split(',')])
)

# Create DataFrame, using the cleaned header names as column names.
df = sqlContext.createDataFrame(data, schema=header)
df.printSchema()

root
 |-- fixed_acidity: double (nullable = true)
 |-- volatile_acidity: double (nullable = true)
 |-- citric_acid: double (nullable = true)
 |-- residual_sugar: double (nullable = true)
 |-- chlorides: double (nullable = true)
 |-- free_sulfur_dioxide: double (nullable = true)
 |-- total_sulfur_dioxide: double (nullable = true)
 |-- density: double (nullable = true)
 |-- pH: double (nullable = true)
 |-- sulphates: double (nullable = true)
 |-- alcohol: double (nullable = true)
 |-- quality: double (nullable = true)



# Univariate selection

In [5]:
# Columns offered to the selector: every column of `df` except the target.
feature_cols = [
    'fixed_acidity', 'volatile_acidity', 'citric_acid', 'residual_sugar',
    'chlorides', 'free_sulfur_dioxide', 'total_sulfur_dioxide', 'density',
    'pH', 'sulphates', 'alcohol',
]
# Column the features are scored against.
target_col = 'quality'

# Project selector configured with k=3 and the 'fscore' method; `transform`
# returns the reduced DataFrame.
kb = SelectKBest(k=3, method='fscore')
out_df = kb.transform(df, featureCols=feature_cols, targetCol=target_col)

In [22]:
# Preview the first five rows of the selected columns.
out_df.show(5)

+-------+-------+----------------+
|alcohol|density|volatile_acidity|
+-------+-------+----------------+
|    8.8|  1.001|            0.27|
|    9.5|  0.994|             0.3|
|   10.1| 0.9951|            0.28|
|    9.9| 0.9956|            0.23|
|    9.9| 0.9956|            0.23|
+-------+-------+----------------+
only showing top 5 rows



### sklearn check

In [10]:
# Cross-check with scikit-learn.
# sklearn's selector is imported under an alias so that it does not rebind
# the project's `SelectKBest` (from feature_selection.univariate); otherwise
# re-running the univariate-selection cell after this one would construct
# the sklearn class instead.
from sklearn.feature_selection import SelectKBest as SkSelectKBest, f_classif

# Bring the Spark DataFrame to pandas; all columns but the last are used as
# features and the last column as the target.
dp = df.toPandas()
p_kb = SkSelectKBest(score_func=f_classif, k=3)
p_kb.fit(dp[dp.columns[:-1]], dp[dp.columns[-1]])

SelectKBest(k=3, score_func=<function f_classif at 0x7f12632e2848>)

In [11]:
# Apply the fitted sklearn selector to every column except the last one.
p_kb.transform(dp.iloc[:, :-1])

array([[  0.27   ,   1.001  ,   8.8    ],
       [  0.3    ,   0.994  ,   9.5    ],
       [  0.28   ,   0.9951 ,  10.1    ],
       ..., 
       [  0.24   ,   0.99254,   9.4    ],
       [  0.29   ,   0.98869,  12.8    ],
       [  0.21   ,   0.98941,  11.8    ]])

In [21]:
# Check that both implementations produce the same scores.
# Scores are rounded to the nearest integer, as before, and compared as
# sorted lists: the check stays independent of the order in which each
# implementation reports its scores but, unlike a set comparison, it also
# notices a differing number of scores or differing duplicates.
sk_scores = sorted(round(score) for score in p_kb.scores_.tolist())
kb_scores = sorted(round(score) for score in kb.scores_)
assert sk_scores == kb_scores, (
    "rounded scores differ: sklearn=%s, kb=%s" % (sk_scores, kb_scores)
)