In [1]:
# Display the active SparkContext to confirm Spark is reachable.
# NOTE(review): `sc` is not created anywhere in this notebook; presumably it is
# injected by the PySpark kernel/launcher — confirm before running elsewhere.
sc

<pyspark.context.SparkContext at 0x7fa56283e790>

In [5]:
# Delete everything matching metastore_db/*.lck (relative to the working directory).
# NOTE(review): presumably stale lock files left by an earlier session that would
# otherwise block the metastore — confirm.
!rm -rf metastore_db/*.lck

from pyspark.sql import SQLContext
# Wrap the existing SparkContext in an SQLContext; it is used below to build DataFrames.
sqlc = SQLContext(sc)

## Categorical Features

In [6]:
# Toy data set: one (id, nationality) pair per row; "US" appears three times,
# "FR" twice and "UK" once.
nationality_rows = [
    (0, "US"),
    (1, "UK"),
    (2, "FR"),
    (3, "US"),
    (4, "US"),
    (5, "FR"),
]
# Build the DataFrame from the tuples, then name the two columns.
df = sqlc.createDataFrame(nationality_rows).toDF("id", "nationality")

### StringIndexer

In [7]:
from pyspark.ml.feature import StringIndexer

# Configure the indexer: read the string column "nationality" and write the
# numeric label index to "nIndex".
indexer = StringIndexer(inputCol="nationality", outputCol="nIndex")

In [8]:
# Fitting learns the label -> index mapping from df; the fitted model then
# appends the "nIndex" column to the same data.
indexer_model = indexer.fit(df)
indexed = indexer_model.transform(df)

In [9]:
# In the output below US (3 rows) -> 0.0, FR (2 rows) -> 1.0, UK (1 row) -> 2.0:
# the indices follow descending label frequency.
indexed.show()

+---+-----------+------+
| id|nationality|nIndex|
+---+-----------+------+
|  0|         US|   0.0|
|  1|         UK|   2.0|
|  2|         FR|   1.0|
|  3|         US|   0.0|
|  4|         US|   0.0|
|  5|         FR|   1.0|
+---+-----------+------+



### IndexToString

In [12]:
from pyspark.ml.feature import IndexToString

# Reverse mapping: turn the numeric "predictedIndex" column back into the
# original string label, written to "predictedNationality".
converter = IndexToString(
    inputCol="predictedIndex",
    outputCol="predictedNationality",
)

In [14]:
# Stand-in for model predictions: keep only the index column, renamed to
# "predictedIndex". No labels are set on the converter, so it has to recover
# them from the column itself; the output of the next cell shows the original
# strings coming back, i.e. the alias keeps what StringIndexer attached.
predictions = indexed.selectExpr("nIndex as predictedIndex")

In [15]:
# Map every predicted index back to its nationality string and print the result.
decoded_predictions = converter.transform(predictions)
decoded_predictions.show()

+--------------+--------------------+
|predictedIndex|predictedNationality|
+--------------+--------------------+
|           0.0|                  US|
|           2.0|                  UK|
|           1.0|                  FR|
|           0.0|                  US|
|           0.0|                  US|
|           1.0|                  FR|
+--------------+--------------------+



### OneHotEncoder

In [19]:
from pyspark.ml.feature import OneHotEncoder

# One-hot encode the label index "nIndex" into the vector column "nVector",
# leaving every other option at its default.
encoder = OneHotEncoder(inputCol="nIndex", outputCol="nVector")

In [20]:
# Apply the encoder to the indexed data, adding the "nVector" column.
# On Spark < 3.0 OneHotEncoder is a plain Transformer and is used directly.
# From Spark 3.0 on it is an Estimator with no .transform(), so it has to be
# fitted first; the hasattr check keeps this cell working on both.
encoder_model = encoder.fit(indexed) if hasattr(encoder, "fit") else encoder
encoded = encoder_model.transform(indexed)

In [21]:
# With the default settings the vectors below have size 2 for 3 categories:
# the last index (UK, 2.0) is dropped and shows up as the all-zero vector (2,[],[]).
encoded.show()

+---+-----------+------+-------------+
| id|nationality|nIndex|      nVector|
+---+-----------+------+-------------+
|  0|         US|   0.0|(2,[0],[1.0])|
|  1|         UK|   2.0|    (2,[],[])|
|  2|         FR|   1.0|(2,[1],[1.0])|
|  3|         US|   0.0|(2,[0],[1.0])|
|  4|         US|   0.0|(2,[0],[1.0])|
|  5|         FR|   1.0|(2,[1],[1.0])|
+---+-----------+------+-------------+



In [22]:
# Same encoder as before, but keep a slot for the last category as well
# (dropLast disabled), so every category gets its own vector position.
encoder = OneHotEncoder(
    inputCol="nIndex",
    outputCol="nVector",
    dropLast=False,
)

In [23]:
# Re-encode the indexed data with the reconfigured encoder.
# On Spark < 3.0 OneHotEncoder is a plain Transformer and is used directly.
# From Spark 3.0 on it is an Estimator with no .transform(), so it has to be
# fitted first; the hasattr check keeps this cell working on both.
encoder_model = encoder.fit(indexed) if hasattr(encoder, "fit") else encoder
encoded = encoder_model.transform(indexed)

In [24]:
# With the last category kept, the vectors below have size 3 and UK (index 2.0)
# gets its own position: (3,[2],[1.0]).
encoded.show()

+---+-----------+------+-------------+
| id|nationality|nIndex|      nVector|
+---+-----------+------+-------------+
|  0|         US|   0.0|(3,[0],[1.0])|
|  1|         UK|   2.0|(3,[2],[1.0])|
|  2|         FR|   1.0|(3,[1],[1.0])|
|  3|         US|   0.0|(3,[0],[1.0])|
|  4|         US|   0.0|(3,[0],[1.0])|
|  5|         FR|   1.0|(3,[1],[1.0])|
+---+-----------+------+-------------+

