In [1]:
# Echo the SparkContext so the notebook shows which context is in use.
# `sc` is never created in this notebook — presumably it is injected by the
# PySpark kernel/shell at startup (confirm for your environment).
sc

<pyspark.context.SparkContext at 0x7feee903f7d0>

In [2]:
# Shell escape: delete any *.lck files under metastore_db/ before starting.
# NOTE(review): presumably these are stale metastore lock files left behind by
# an earlier session — confirm that no other live session is using them.
!rm -rf metastore_db/*.lck
from pyspark.sql import SQLContext
# SQL entry point built on the existing SparkContext; used below to create
# DataFrames and read files.
sqlc = SQLContext(sc)

## Feature Vectors

### Vector Assembler

In [3]:
from collections import namedtuple

# Record type for the toy churn data set; the field names become the
# DataFrame column names when the RDD is converted with toDF().
Customer = namedtuple('Customer', ['churn', 'sessions', 'revenue', 'recency'])

# Five hand-written customers: churn flag, session count, revenue, recency.
customer_rows = [
    Customer(churn=1, sessions=20, revenue=61.24, recency=103),
    Customer(churn=1, sessions=8, revenue=80.64, recency=23),
    Customer(churn=0, sessions=4, revenue=100.94, recency=42),
    Customer(churn=0, sessions=8, revenue=99.48, recency=26),
    Customer(churn=1, sessions=17, revenue=120.56, recency=47),
]

# Distribute the rows as an RDD and let Spark infer the schema from the tuples.
customers = sc.parallelize(customer_rows).toDF()

In [4]:
from pyspark.ml.feature import VectorAssembler

In [5]:
# Combine the three numeric columns into a single vector column "features".
assembler = VectorAssembler(
    inputCols=["sessions", "revenue", "recency"],
    outputCol="features",
)
# The input columns are kept; "features" is appended to each row.
dfWithFeatures = assembler.transform(customers)

In [6]:
# Print the customers together with the assembled "features" vector column.
dfWithFeatures.show()

+-----+--------+-------+-------+------------------+
|churn|sessions|revenue|recency|          features|
+-----+--------+-------+-------+------------------+
|    1|      20|  61.24|    103|[20.0,61.24,103.0]|
|    1|       8|  80.64|     23|  [8.0,80.64,23.0]|
|    0|       4| 100.94|     42| [4.0,100.94,42.0]|
|    0|       8|  99.48|     26|  [8.0,99.48,26.0]|
|    1|      17| 120.56|     47|[17.0,120.56,47.0]|
+-----+--------+-------+-------+------------------+



### Vector Slicer

In [7]:
from pyspark.ml.feature import VectorSlicer
# Slicer that reads the "features" vector and writes the selected entries to
# "some_features"; the indices to keep are set in a later cell.
slicer = VectorSlicer(inputCol="features", outputCol="some_features")

In [8]:
# Keep vector positions 0 and 1 (sessions and revenue, per the assembler's
# input order), then print the result.
slicer.setIndices([0, 1])
slicer.transform(dfWithFeatures).show()

+-----+--------+-------+-------+------------------+-------------+
|churn|sessions|revenue|recency|          features|some_features|
+-----+--------+-------+-------+------------------+-------------+
|    1|      20|  61.24|    103|[20.0,61.24,103.0]| [20.0,61.24]|
|    1|       8|  80.64|     23|  [8.0,80.64,23.0]|  [8.0,80.64]|
|    0|       4| 100.94|     42| [4.0,100.94,42.0]| [4.0,100.94]|
|    0|       8|  99.48|     26|  [8.0,99.48,26.0]|  [8.0,99.48]|
|    1|      17| 120.56|     47|[17.0,120.56,47.0]|[17.0,120.56]|
+-----+--------+-------+-------+------------------+-------------+



In [9]:
# Echo the SparkContext again (same object as at the top of the notebook).
# NOTE(review): repeats the first cell — looks like a leftover from joining
# several notebooks; `sc` itself is presumably provided by the kernel.
sc

<pyspark.context.SparkContext at 0x7feee903f7d0>

In [10]:
# Shell escape: delete any *.lck files under metastore_db/.
# NOTE(review): this cell repeats the setup cell near the top of the notebook
# (same rm, same import, same SQLContext construction).
!rm -rf metastore_db/*.lck

from pyspark.sql import SQLContext
# Rebinds `sqlc` to a freshly constructed SQLContext over the same SparkContext.
sqlc = SQLContext(sc)

## Categorical Features

In [11]:
# Six (id, nationality) pairs: US appears three times, FR twice, UK once.
nationality_rows = [(0, "US"), (1, "UK"), (2, "FR"), (3, "US"), (4, "US"), (5, "FR")]
# Column types are inferred from the tuples; the names are supplied explicitly.
df = sqlc.createDataFrame(nationality_rows, ["id", "nationality"])

### String Indexer

In [12]:
from pyspark.ml.feature import StringIndexer

# Estimator that maps each distinct "nationality" string to a numeric index
# written to "nIndex".
indexer = StringIndexer(inputCol="nationality", outputCol="nIndex")

In [13]:
# Learn the string -> index mapping from df ...
indexerModel = indexer.fit(df)
# ... and apply it to the same rows, adding the "nIndex" column.
indexed = indexerModel.transform(df)

In [14]:
# Print the rows with the added "nIndex" column. In the output below the most
# frequent value (US) is index 0.0 and the rarest (UK) is 2.0.
indexed.show()

+---+-----------+------+
| id|nationality|nIndex|
+---+-----------+------+
|  0|         US|   0.0|
|  1|         UK|   2.0|
|  2|         FR|   1.0|
|  3|         US|   0.0|
|  4|         US|   0.0|
|  5|         FR|   1.0|
+---+-----------+------+



### IndexToString

In [15]:
from pyspark.ml.feature import IndexToString

# Reverse mapping: numeric "predictedIndex" -> string "predictedNationality".
# No labels are passed here, so the strings have to come from the input column
# itself (presumably the label metadata StringIndexer attached — verify).
converter = IndexToString(inputCol="predictedIndex", outputCol="predictedNationality")

In [16]:
# Stand-in for model output: reuse the indexed column under the name
# "predictedIndex".
# NOTE(review): the converter in this notebook is given no explicit labels, so
# this presumably relies on the alias keeping the column metadata written by
# StringIndexer — confirm before replacing this with real predictions.
predictions = indexed.selectExpr("nIndex as predictedIndex")

In [17]:
# Map each predictedIndex back to its nationality string and print the result.
converter.transform(predictions).show()

+--------------+--------------------+
|predictedIndex|predictedNationality|
+--------------+--------------------+
|           0.0|                  US|
|           2.0|                  UK|
|           1.0|                  FR|
|           0.0|                  US|
|           0.0|                  US|
|           1.0|                  FR|
+--------------+--------------------+



### OneHotEncoder

In [18]:
from pyspark.ml.feature import OneHotEncoder

# One-hot encoder from the numeric "nIndex" column to the vector column
# "nVector", with default settings (no dropLast override).
encoder = OneHotEncoder(inputCol="nIndex", outputCol="nVector")

In [19]:
# Apply the encoder directly to the indexed rows (no fit step).
# NOTE(review): OneHotEncoder is an Estimator on Spark 3.x and would need
# encoder.fit(indexed).transform(indexed) there — confirm the target version.
encoded = encoder.transform(indexed)

In [20]:
# Print the encoded rows. In the output below the vectors have size 2 and the
# highest index (2.0, UK) is encoded as the all-zero vector.
encoded.show()

+---+-----------+------+-------------+
| id|nationality|nIndex|      nVector|
+---+-----------+------+-------------+
|  0|         US|   0.0|(2,[0],[1.0])|
|  1|         UK|   2.0|    (2,[],[])|
|  2|         FR|   1.0|(2,[1],[1.0])|
|  3|         US|   0.0|(2,[0],[1.0])|
|  4|         US|   0.0|(2,[0],[1.0])|
|  5|         FR|   1.0|(2,[1],[1.0])|
+---+-----------+------+-------------+



In [21]:
# Same encoder as before, but with dropLast disabled so every category index
# gets its own slot in the output vector.
encoder = OneHotEncoder(inputCol="nIndex", outputCol="nVector", dropLast=False)

In [22]:
# Re-encode the indexed rows with the dropLast=False encoder, rebinding
# `encoded`.
# NOTE(review): on Spark 3.x OneHotEncoder is an Estimator and needs fit()
# before transform() — confirm the target version.
encoded = encoder.transform(indexed)

In [23]:
# Print the re-encoded rows. In the output below the vectors have size 3 and
# index 2.0 (UK) now has its own non-zero entry.
encoded.show()

+---+-----------+------+-------------+
| id|nationality|nIndex|      nVector|
+---+-----------+------+-------------+
|  0|         US|   0.0|(3,[0],[1.0])|
|  1|         UK|   2.0|(3,[2],[1.0])|
|  2|         FR|   1.0|(3,[1],[1.0])|
|  3|         US|   0.0|(3,[0],[1.0])|
|  4|         US|   0.0|(3,[0],[1.0])|
|  5|         FR|   1.0|(3,[1],[1.0])|
+---+-----------+------+-------------+



In [24]:
# Echo the SparkContext (same object as at the top of the notebook).
# NOTE(review): repeated setup cell; `sc` is presumably provided by the kernel.
sc

<pyspark.context.SparkContext at 0x7feee903f7d0>

In [25]:
# Shell escape: delete any *.lck files under metastore_db/.
# NOTE(review): repeats the setup cell near the top of the notebook.
!rm -rf metastore_db/*.lck

from pyspark.sql import SQLContext
# Rebinds `sqlc` to a freshly constructed SQLContext over the same SparkContext.
sqlc = SQLContext(sc)

## PCA

In [26]:
!wget https://s3.eu-central-1.amazonaws.com/dsr-data/UScrime/UScrime2-colsLotsOfNAremoved.csv

crimes = sqlc.read.format("com.databricks.spark.csv") \
            .option("delimiter", ",") \
            .option("header", "true") \
            .option("inferSchema", "true") \
            .load("UScrime2-colsLotsOfNAremoved.csv")

--2017-01-24 12:39:53--  https://s3.eu-central-1.amazonaws.com/dsr-data/UScrime/UScrime2-colsLotsOfNAremoved.csv
Resolving s3.eu-central-1.amazonaws.com (s3.eu-central-1.amazonaws.com)... 54.231.193.41
Connecting to s3.eu-central-1.amazonaws.com (s3.eu-central-1.amazonaws.com)|54.231.193.41|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 971758 (949K) [application/octet-stream]
Saving to: ‘UScrime2-colsLotsOfNAremoved.csv’


2017-01-24 12:39:53 (1,91 MB/s) - ‘UScrime2-colsLotsOfNAremoved.csv’ saved [971758/971758]



In [27]:
# Remove the two columns that are not fed into the feature vector.
# Dropping one name at a time gives the same result as the chained drop calls.
for unwanted_column in ("OtherPerCap", "community"):
    crimes = crimes.drop(unwanted_column)

In [28]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors, Vector

In [29]:
# Assemble every remaining column of `crimes` into one "features" vector.
# Rebinds `assembler`, replacing the customer-data assembler defined earlier.
assembler = VectorAssembler(inputCols=crimes.columns, outputCol="features")

In [30]:
# Build the vector column, then keep only that column for PCA.
assembledCrimes = assembler.transform(crimes)
featuresDF = assembledCrimes.select("features")

In [31]:
from pyspark.ml.feature import PCA

# Reduce the assembled feature vectors to their first 10 principal components,
# written to a new "pca" column.
pca = PCA(k=10, inputCol="features", outputCol="pca")

# Fit the projection on the feature vectors ...
model = pca.fit(featuresDF)

# ... and apply it to the same rows.
pc = model.transform(featuresDF)

# Preview three rows as a pandas frame. limit() runs before toPandas() so only
# those rows are collected to the driver, instead of materialising the whole
# DataFrame locally and slicing it afterwards.
pc.limit(3).toPandas()

Unnamed: 0,features,pca
0,"[0.19, 0.33, 0.02, 0.9, 0.12, 0.17, 0.34, 0.47...","[1.2138889197, 0.564567759337, -0.022284837106..."
1,"[0.0, 0.16, 0.12, 0.74, 0.45, 0.07, 0.26, 0.59...","[0.627985190195, 1.16689414866, -0.51416430664..."
2,"[0.0, 0.42, 0.49, 0.56, 0.17, 0.04, 0.39, 0.47...","[0.234349043189, 0.348070144228, 0.54876884160..."


In [32]:
# Echo the SparkContext (same object as at the top of the notebook).
# NOTE(review): repeated setup cell; `sc` is presumably provided by the kernel.
sc

<pyspark.context.SparkContext at 0x7feee903f7d0>

In [33]:
# Shell escape: delete any *.lck files under metastore_db/.
# NOTE(review): repeats the setup cell near the top of the notebook.
!rm -rf metastore_db/*.lck

from pyspark.sql import SQLContext
# Rebinds `sqlc` to a freshly constructed SQLContext over the same SparkContext.
sqlc = SQLContext(sc)

## R Formula

In [34]:
# Re-read the crime CSV from the working directory (the file must already have
# been downloaded). This rebinds `crimes` to the full, un-dropped column set.
csv_reader = (
    sqlc.read
    .format("com.databricks.spark.csv")
    .option("delimiter", ",")
    .option("header", "true")
    .option("inferSchema", "true")
)
crimes = csv_reader.load("UScrime2-colsLotsOfNAremoved.csv")

In [35]:
from pyspark.ml.feature import RFormula

# R-style model formula: ViolentCrimesPerPop is the response, the three
# right-hand-side columns are the predictors. The predictors are written to
# "features" and the response to "label".
formula = RFormula(
    formula=" ViolentCrimesPerPop ~ householdsize + racepctblack + racePctWhite ",
    featuresCol="features",
    labelCol="label",
)

In [36]:
# Fit the formula against the crime data ...
formulaModel = formula.fit(crimes)
# ... then add the "features" and "label" columns to the same rows.
output = formulaModel.transform(crimes)

In [37]:
# Show only the two columns produced by the formula, first three rows.
formula_preview = output.select("features", "label")
formula_preview.show(3)

+----------------+-----+
|        features|label|
+----------------+-----+
| [0.33,0.02,0.9]|  0.2|
|[0.16,0.12,0.74]| 0.67|
|[0.42,0.49,0.56]| 0.43|
+----------------+-----+
only showing top 3 rows

