In [1]:
import findspark
findspark.init('/home/danielf/spark-3.3.1-bin-hadoop3')
import pyspark
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.functions import size
from pyspark.sql.types import IntegerType
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import NGram

In [2]:
# Obtain the SparkSession used by every cell below (getOrCreate returns an
# already-running session if one exists).
# NOTE(review): the app name 'npl' is probably a transposition of 'nlp'; it is
# kept unchanged here.
spark = (
    SparkSession.builder
    .appName('npl')
    .getOrCreate()
)

23/03/06 09:46:58 WARN Utils: Your hostname, spark resolves to a loopback address: 127.0.1.1; using 10.0.2.15 instead (on interface enp0s3)
23/03/06 09:46:58 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/03/06 09:46:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/03/06 09:47:01 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/03/06 09:47:01 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
23/03/06 09:47:01 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
23/03/06 09:47:01 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.
23/03/06 09:47:01 WARN Utils: Service 'SparkUI' could not bind on port 4044. Attempting port 4045.


In [3]:
# Sample sentences for the tokenizer comparison: rows 0 and 1 are separated by
# spaces, row 2 by commas only.
# NOTE(review): 'cloud' in row 1 looks like a typo for 'could' (the word list
# built for the n-gram example spells it 'could'); left as-is because the
# recorded outputs below were produced from this text.
sen_df = spark.createDataFrame(
    [
        (0, 'Hi i heard about spark'),
        (1, 'I wish java cloud use case classes'),
        (2, 'Logistic,regression,models,are,neat'),
    ],
    schema=['id', 'sentence'],
)

In [4]:
# Preview the raw sentences; long values are shortened with '...' in this
# default view.
sen_df.show()

                                                                                

+---+--------------------+
| id|            sentence|
+---+--------------------+
|  0|Hi i heard about ...|
|  1|I wish java cloud...|
|  2|Logistic,regressi...|
+---+--------------------+



In [5]:
# Plain tokenizer: reads the 'sentence' column and writes the token list to a
# new 'words' column.
tokenizer = Tokenizer(
    inputCol='sentence',
    outputCol='words',
)

In [6]:
# Regex-based tokenizer over the same columns. The pattern matches any single
# non-word character, so the comma-separated sentence is split into individual
# words as well (see the token counts printed further down).
regex_tokenizer = RegexTokenizer(
    inputCol='sentence',
    outputCol='words',
    pattern='\\W',
)

In [7]:
def count_tokens(words):
    """Return an integer Column with the number of elements in an array column.

    Replaces the earlier Python UDF around ``len`` with Spark's built-in
    ``size`` function. The built-in is evaluated inside the JVM, so rows are
    not shipped to a Python worker just to be counted, and a null array no
    longer raises a ``TypeError`` from ``len(None)`` (Spark returns its
    configured size-of-null value instead).

    Parameters
    ----------
    words : Column or str
        The array column (or its name) whose elements should be counted.

    Returns
    -------
    Column
        Integer column expression, usable in ``withColumn`` / ``select``.
    """
    return size(words)

In [8]:
# Run the plain Tokenizer over the sample sentences; the result carries the
# original columns plus the new 'words' column.
tokenized = tokenizer.transform(sen_df)

In [9]:
# Preview the tokenized rows (tokens appear lower-cased in the output).
tokenized.show()

[Stage 2:>                                                          (0 + 1) / 1]                                                                                

+---+--------------------+--------------------+
| id|            sentence|               words|
+---+--------------------+--------------------+
|  0|Hi i heard about ...|[hi, i, heard, ab...|
|  1|I wish java cloud...|[i, wish, java, c...|
|  2|Logistic,regressi...|[logistic,regress...|
+---+--------------------+--------------------+



In [10]:
# Add a 'tokens' column holding the number of entries in 'words' and display
# it. The comma-separated sentence counts as a single token here because the
# plain Tokenizer did not split it.
(
    tokenized
    .withColumn('tokens', count_tokens(col('words')))
    .show()
)

                                                                                

+---+--------------------+--------------------+------+
| id|            sentence|               words|tokens|
+---+--------------------+--------------------+------+
|  0|Hi i heard about ...|[hi, i, heard, ab...|     5|
|  1|I wish java cloud...|[i, wish, java, c...|     7|
|  2|Logistic,regressi...|[logistic,regress...|     1|
+---+--------------------+--------------------+------+



In [11]:
# Same experiment with the regex tokenizer: the comma-separated sentence is
# now split into its individual words, so its token count changes.
rg_tokenized = regex_tokenizer.transform(sen_df)
(
    rg_tokenized
    .withColumn('tokens', count_tokens(col('words')))
    .show()
)

+---+--------------------+--------------------+------+
| id|            sentence|               words|tokens|
+---+--------------------+--------------------+------+
|  0|Hi i heard about ...|[hi, i, heard, ab...|     5|
|  1|I wish java cloud...|[i, wish, java, c...|     7|
|  2|Logistic,regressi...|[logistic, regres...|     5|
+---+--------------------+--------------------+------+



In [12]:
# Pre-tokenized sentences used as input for the stop-word removal example.
# NOTE(review): both rows carry id 0 (the second was presumably meant to be 1)
# and the token 'saw ' has a trailing space; both are kept unchanged because
# the recorded outputs below reflect this exact data.
sent_df = spark.createDataFrame(
    [
        (0, ['I', 'saw ', 'the', 'light']),
        (0, ['Mary', 'had', 'a', 'little', 'lamb']),
    ],
    schema=['id', 'tokens'],
)

In [13]:
# Preview the pre-tokenized input before stop words are removed.
sent_df.show()

+---+--------------------+
| id|              tokens|
+---+--------------------+
|  0|[I, saw , the, li...|
|  0|[Mary, had, a, li...|
+---+--------------------+



In [14]:
# Stop-word filter: reads the 'tokens' array and writes the surviving words to
# 'filtered'. No custom word list is passed, so the transformer's default list
# applies.
remover = StopWordsRemover(
    inputCol='tokens',
    outputCol='filtered',
)

In [15]:
# Apply the stop-word filter and show the original tokens next to the
# filtered result.
remover.transform(sent_df).show()

+---+--------------------+--------------------+
| id|              tokens|            filtered|
+---+--------------------+--------------------+
|  0|[I, saw , the, li...|       [saw , light]|
|  0|[Mary, had, a, li...|[Mary, little, lamb]|
+---+--------------------+--------------------+



In [25]:
# Pre-tokenized sentences (one list of words per row) used as input for the
# n-gram example.
word_df = spark.createDataFrame(
    [
        (0, ['Hi', 'i', 'heard', 'about', 'spark']),
        (1, ['I', 'wish', 'java', 'could', 'use', 'case', 'classes']),
        (2, ['Logistic', 'regression', 'models', 'are', 'neat']),
    ],
    schema=['id', 'words'],
)

In [26]:
# Bigram transformer (n=2): reads the 'words' array and writes the resulting
# n-grams to 'gram'.
ngram = NGram(
    n=2,
    inputCol='words',
    outputCol='gram',
)

In [27]:
# Apply the bigram transformer to word_df; the result adds the 'gram' column.
ng = ngram.transform(word_df)

In [30]:
# Show only the bigram column; truncate=False prints each list in full instead
# of the shortened default view.
ng.select('gram').show(truncate=False)

+------------------------------------------------------------------+
|gram                                                              |
+------------------------------------------------------------------+
|[Hi i, i heard, heard about, about spark]                         |
|[I wish, wish java, java could, could use, use case, case classes]|
|[Logistic regression, regression models, models are, are neat]    |
+------------------------------------------------------------------+



In [31]:
# Default (shortened) view of the whole frame: id, words and gram columns.
ng.show()

+---+--------------------+--------------------+
| id|               words|                gram|
+---+--------------------+--------------------+
|  0|[Hi, i, heard, ab...|[Hi i, i heard, h...|
|  1|[I, wish, java, c...|[I wish, wish jav...|
|  2|[Logistic, regres...|[Logistic regress...|
+---+--------------------+--------------------+

