In [1]:
from pyspark.sql import SparkSession

# Create (or reuse) the notebook-wide SparkSession on a local master.
# NOTE(review): with master 'local' the executor-memory and cores.max settings
# may not take effect (they target cluster deployments) — confirm intent.
spark = (
    SparkSession.builder
    .master('local')
    .appName('stringIndexer')
    .config('spark.executor.memory', '5gb')
    .config("spark.cores.max", "6")
    .getOrCreate()
)

## StringIndexer

In [9]:
from pyspark.ml.feature import StringIndexer

# Small example frame: an integer id plus a string category column.
df = spark.createDataFrame(
    [
        (0, "a"),
        (1, "b"),
        (2, "c"),
        (3, "a"),
        (4, "a"),
        (5, "c"),
    ],
    schema=["id", "category"],
)

# Fit the indexer on the data to learn the label -> index mapping, then apply
# the fitted model to the same frame to append the "categoryIndex" column.
indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
indexed = indexer.fit(df).transform(df)

In [10]:
# Inspect the indexed result: its Python type, schema and rows.
# printSchema() and show() print their output themselves and return None, so
# they are called directly; wrapping them in display() only rendered an extra
# "None" after each of them.
display(type(indexed))
indexed.printSchema()
indexed.show()

pyspark.sql.dataframe.DataFrame

root
 |-- id: long (nullable = true)
 |-- category: string (nullable = true)
 |-- categoryIndex: double (nullable = false)



None

+---+--------+-------------+
| id|category|categoryIndex|
+---+--------+-------------+
|  0|       a|          0.0|
|  1|       b|          2.0|
|  2|       c|          1.0|
|  3|       a|          0.0|
|  4|       a|          0.0|
|  5|       c|          1.0|
+---+--------+-------------+



None

### Note
- `StringIndexer` 是一个 Estimator：需要先调用 `fit` 得到模型，再用该模型对数据做 `transform`。

## IndexToString

In [17]:
from pyspark.ml.feature import IndexToString

# Map the numeric "categoryIndex" column back to string labels. No explicit
# label list is supplied here; per the message printed below, the labels are
# taken from the column metadata.
converted = IndexToString(
    inputCol="categoryIndex",
    outputCol="originalCategory",
)
convertedDF = converted.transform(indexed)

print(
    "Transformed indexed column {} back to original string column {} using "
    "labels in metadata".format(
        converted.getInputCol(),
        converted.getOutputCol(),
    )
)

Transformed indexed column categoryIndex back to original string column originalCategory using labels in metadata


In [18]:
# Inspect the round-tripped result: its Python type, schema and rows.
# printSchema() prints the schema itself and returns None, so it is called
# directly; display(convertedDF.printSchema()) rendered a stray "None".
display(type(convertedDF))
convertedDF.printSchema()
convertedDF.show()

pyspark.sql.dataframe.DataFrame

root
 |-- id: long (nullable = true)
 |-- category: string (nullable = true)
 |-- categoryIndex: double (nullable = false)
 |-- originalCategory: string (nullable = true)



None

+---+--------+-------------+----------------+
| id|category|categoryIndex|originalCategory|
+---+--------+-------------+----------------+
|  0|       a|          0.0|               a|
|  1|       b|          2.0|               b|
|  2|       c|          1.0|               c|
|  3|       a|          0.0|               a|
|  4|       a|          0.0|               a|
|  5|       c|          1.0|               c|
+---+--------+-------------+----------------+

