In [1]:
## Build (or reuse) the SparkSession shared by every cell in this notebook
from pyspark.sql import SparkSession

# All settings are applied to the builder first; getOrCreate() then returns an
# already-running session if one exists, otherwise it starts a new one.
# NOTE(review): with master 'local' the executor-memory / cores.max settings
# may not take effect -- confirm against the intended deployment mode.
session_builder = (
    SparkSession.builder
    .master('local')
    .appName('muthootSample1')
    .config('spark.executor.memory', '5gb')
    .config("spark.cores.max", "6")
)
spark = session_builder.getOrCreate()

## Tokenization

In [29]:
## example
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

# Three sample sentences keyed by an integer id. The last one contains commas,
# which makes the difference between the two tokenizers visible later on.
sample_rows = [
    (0, "Hi I heard about Spark"),
    (1, "I wish Java could use case classes"),
    (2, "Logistic regression, models, are, neat"),
]
column_names = ["id", "sentence"]

sentenceDataFrame = spark.createDataFrame(sample_rows, column_names)
sentenceDataFrame.show()

+---+--------------------+
| id|            sentence|
+---+--------------------+
|  0|Hi I heard about ...|
|  1|I wish Java could...|
|  2|Logistic regressi...|
+---+--------------------+



In [21]:
# Create the Tokenizer: it reads the "sentence" column and writes the
# resulting array of tokens to the "words" column.
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
# transform() returns a new DataFrame with the extra "words" column
tokenized = tokenizer.transform(sentenceDataFrame)

display(type(tokenized))
# printSchema() prints the schema itself and returns None, so it must not be
# wrapped in display() -- doing so rendered a stray "None" in the cell output.
tokenized.printSchema()
tokenized.show(truncate=False)

pyspark.sql.dataframe.DataFrame

root
 |-- id: long (nullable = true)
 |-- sentence: string (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)



None

+---+--------------------------------------+--------------------------------------------+
|id |sentence                              |words                                       |
+---+--------------------------------------+--------------------------------------------+
|0  |Hi I heard about Spark                |[hi, i, heard, about, spark]                |
|1  |I wish Java could use case classes    |[i, wish, java, could, use, case, classes]  |
|2  |Logistic regression, models, are, neat|[logistic, regression,, models,, are,, neat]|
+---+--------------------------------------+--------------------------------------------+



#### Note
- 結果是 DataFrame
    - token 存放在 outputCol 欄位中,型別為 array(元素為 string)
- 自動將大寫轉成小寫
- 依據 whitespace 做切割,因此標點符號會留在 token 中(例如 `regression,`)

In [16]:
# Create a user defined function for counting
## plain Python function + declared Spark return type
def _count_tokens(words):
    """Return the number of tokens in `words`, or None when `words` is null."""
    # Spark hands SQL NULL to a Python UDF as None; len(None) would raise
    # TypeError and fail the whole job, so the null is propagated instead.
    if words is None:
        return None
    return len(words)


countTokens = udf(_count_tokens, IntegerType())

In [19]:
# Append the per-row token count as "tokens", then print all columns untruncated
tokenized_with_counts = tokenized.withColumn("tokens", countTokens(col("words")))
tokenized_with_counts.show(truncate=False)

+---+--------------------------------------+--------------------------------------------+------+
|id |sentence                              |words                                       |tokens|
+---+--------------------------------------+--------------------------------------------+------+
|0  |Hi I heard about Spark                |[hi, i, heard, about, spark]                |5     |
|1  |I wish Java could use case classes    |[i, wish, java, could, use, case, classes]  |7     |
|2  |Logistic regression, models, are, neat|[logistic, regression,, models,, are,, neat]|5     |
+---+--------------------------------------+--------------------------------------------+------+



### Tokenization with Regex

In [42]:
# Create the RegexTokenizer. The pattern "\\W" is the regex \W, i.e.
# [^a-zA-Z0-9_]: every non-word character is used as a split point.
regexTokenizer = RegexTokenizer(inputCol="sentence", outputCol="words", pattern="\\W")
# Tokenize the data with the pattern
regexTokenized = regexTokenizer.transform(sentenceDataFrame)

display(type(regexTokenized))
# printSchema() prints the schema itself and returns None, so it must not be
# wrapped in display() -- doing so rendered a stray "None" in the cell output.
regexTokenized.printSchema()
regexTokenized.show(truncate=False)

pyspark.sql.dataframe.DataFrame

root
 |-- id: long (nullable = true)
 |-- sentence: string (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)



None

+---+--------------------------------------+------------------------------------------+
|id |sentence                              |words                                     |
+---+--------------------------------------+------------------------------------------+
|0  |Hi I heard about Spark                |[hi, i, heard, about, spark]              |
|1  |I wish Java could use case classes    |[i, wish, java, could, use, case, classes]|
|2  |Logistic regression, models, are, neat|[logistic, regression, models, are, neat] |
+---+--------------------------------------+------------------------------------------+



In [43]:
# Append the per-row token count as "regTokens", then print all columns untruncated
regex_tokenized_with_counts = regexTokenized.withColumn("regTokens", countTokens(col("words")))
regex_tokenized_with_counts.show(truncate=False)

+---+--------------------------------------+------------------------------------------+---------+
|id |sentence                              |words                                     |regTokens|
+---+--------------------------------------+------------------------------------------+---------+
|0  |Hi I heard about Spark                |[hi, i, heard, about, spark]              |5        |
|1  |I wish Java could use case classes    |[i, wish, java, could, use, case, classes]|7        |
|2  |Logistic regression, models, are, neat|[logistic, regression, models, are, neat] |5        |
+---+--------------------------------------+------------------------------------------+---------+



#### Note
- 結果是 DataFrame
    - token 存放在 outputCol 欄位中,型別為 array(元素為 string)
- 自動將大寫轉成小寫
- 依據 regex pattern 做切割(此處 `\W` 會把標點符號與空白都當作分隔符)