In [1]:
# Echo `sc` so the cell output confirms a SparkContext is already bound in this kernel.
# NOTE(review): `sc` is not created anywhere in this notebook — presumably injected by the PySpark kernel/shell; confirm.
sc

<pyspark.context.SparkContext at 0x7f4ead9c5790>

In [6]:
# Remove any leftover *.lck files under metastore_db/ before the SQLContext is created.
# NOTE(review): presumably these are stale metastore lock files from an earlier session — confirm.
!rm -rf metastore_db/*.lck
from pyspark.sql import SQLContext
# Wrap the existing SparkContext in a SQLContext.
# NOTE(review): this is likely also what makes RDD.toDF() usable in later cells — confirm.
sqlc = SQLContext(sc)

## Feature Vectors

### Vector Assembler

In [7]:
from collections import namedtuple

# Lightweight record type; its field names become the DataFrame column names.
Customer = namedtuple('Customer', 'churn sessions revenue recency')

# Five hand-written sample customers, distributed as an RDD and then
# converted to a DataFrame with one column per Customer field.
customers = sc.parallelize([
    Customer(churn=1, sessions=20, revenue=61.24, recency=103),
    Customer(churn=1, sessions=8, revenue=80.64, recency=23),
    Customer(churn=0, sessions=4, revenue=100.94, recency=42),
    Customer(churn=0, sessions=8, revenue=99.48, recency=26),
    Customer(churn=1, sessions=17, revenue=120.56, recency=47),
]).toDF()

In [8]:
from pyspark.ml.feature import VectorAssembler

In [9]:
# Pack the three numeric predictor columns into one vector column named "features".
assembler = VectorAssembler(
    inputCols=["sessions", "revenue", "recency"],
    outputCol="features"
)
# The result keeps every original column (including churn) and adds "features".
dfWithFeatures = assembler.transform(customers)

In [10]:
# Print the assembled DataFrame to inspect the new "features" vector column.
dfWithFeatures.show()

+-----+--------+-------+-------+------------------+
|churn|sessions|revenue|recency|          features|
+-----+--------+-------+-------+------------------+
|    1|      20|  61.24|    103|[20.0,61.24,103.0]|
|    1|       8|  80.64|     23|  [8.0,80.64,23.0]|
|    0|       4| 100.94|     42| [4.0,100.94,42.0]|
|    0|       8|  99.48|     26|  [8.0,99.48,26.0]|
|    1|      17| 120.56|     47|[17.0,120.56,47.0]|
+-----+--------+-------+-------+------------------+



### Vector Slicer

In [11]:
from pyspark.ml.feature import VectorSlicer
# Slicer that reads the "features" vector and writes its selection to "some_features".
# The indices to keep are not chosen here; they are set before transforming.
slicer = VectorSlicer(inputCol="features", outputCol="some_features")

In [12]:
# Keep only vector positions 0 and 1 (sessions and revenue, per the assembler's input order).
# setIndices configures `slicer` in place, so the setting persists for later uses.
slicer.setIndices([0, 1])
slicer.transform(dfWithFeatures).show()

+-----+--------+-------+-------+------------------+-------------+
|churn|sessions|revenue|recency|          features|some_features|
+-----+--------+-------+-------+------------------+-------------+
|    1|      20|  61.24|    103|[20.0,61.24,103.0]| [20.0,61.24]|
|    1|       8|  80.64|     23|  [8.0,80.64,23.0]|  [8.0,80.64]|
|    0|       4| 100.94|     42| [4.0,100.94,42.0]| [4.0,100.94]|
|    0|       8|  99.48|     26|  [8.0,99.48,26.0]|  [8.0,99.48]|
|    1|      17| 120.56|     47|[17.0,120.56,47.0]|[17.0,120.56]|
+-----+--------+-------+-------+------------------+-------------+

