In [21]:
import os

import findspark

# Point findspark at the Spark installation *before* pyspark is imported.
# Prefer the SPARK_HOME environment variable so the notebook runs on machines
# other than the author's; fall back to the original local install path when
# the variable is unset or empty, which preserves the previous behaviour.
findspark.init(os.environ.get('SPARK_HOME') or '/home/danielf/spark-3.3.1-bin-hadoop3')
import pyspark
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import NGram
from pyspark.ml.feature import HashingTF, IDF
from pyspark.ml.feature import CountVectorizer

In [4]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('npl2')
    .getOrCreate()
)

In [6]:
# Toy corpus: one (label, sentence) pair per row.
sen_df = spark.createDataFrame(
    data=[
        (0.0, 'Hi i heard about spark'),
        (0.0, 'I wish java cloud use case classes'),
        (1.0, 'Logistic regression models are neat'),
    ],
    schema=['label', 'sentence'],
)

In [7]:
# Tokenizer reads the raw text from 'sentence' and writes the token list to 'words'.
tokenizer = (
    Tokenizer()
    .setInputCol('sentence')
    .setOutputCol('words')
)

In [8]:
# Apply the tokenizer: adds the 'words' column alongside 'label' and 'sentence'.
words_data = tokenizer.transform(dataset=sen_df)

In [10]:
# Print the tokenised rows in full (no column truncation).
words_data.show(n=20, truncate=False)

+-----+-----------------------------------+------------------------------------------+
|label|sentence                           |words                                     |
+-----+-----------------------------------+------------------------------------------+
|0.0  |Hi i heard about spark             |[hi, i, heard, about, spark]              |
|0.0  |I wish java cloud use case classes |[i, wish, java, cloud, use, case, classes]|
|1.0  |Logistic regression models are neat|[logistic, regression, models, are, neat] |
+-----+-----------------------------------+------------------------------------------+



In [11]:
# Term-frequency stage: hashes each token in 'words' into a sparse count vector
# stored in 'raw_features'. numFeatures is left at the library default
# (the vectors printed further down have size 262144).
hashing_ft = (
    HashingTF()
    .setInputCol('words')
    .setOutputCol('raw_features')
)

In [12]:
# Compute raw term frequencies for every tokenised sentence.
featurized_data = hashing_ft.transform(dataset=words_data)

In [13]:
# Quick look at the hashed features; long cells are truncated (the default).
featurized_data.show(n=20, truncate=True)

+-----+--------------------+--------------------+--------------------+
|label|            sentence|               words|        raw_features|
+-----+--------------------+--------------------+--------------------+
|  0.0|Hi i heard about ...|[hi, i, heard, ab...|(262144,[18700,19...|
|  0.0|I wish java cloud...|[i, wish, java, c...|(262144,[19036,20...|
|  1.0|Logistic regressi...|[logistic, regres...|(262144,[46243,58...|
+-----+--------------------+--------------------+--------------------+



In [15]:
# Inverse-document-frequency estimator: rescales 'raw_features' into 'features'.
idf = (
    IDF()
    .setInputCol('raw_features')
    .setOutputCol('features')
)

In [16]:
# Fit the IDF estimator on the term-frequency vectors of the whole corpus.
idf_model = idf.fit(dataset=featurized_data)

                                                                                

In [17]:
# Apply the fitted IDF model to obtain the TF-IDF weighted 'features' column.
rescaled_data = idf_model.transform(dataset=featurized_data)

In [20]:
# Show only the label and the final TF-IDF vector, untruncated.
(
    rescaled_data
    .select('label', 'features')
    .show(n=20, truncate=False)
)

23/03/06 09:55:44 WARN DAGScheduler: Broadcasting large task binary with size 4.0 MiB


[Stage 9:>                                                          (0 + 1) / 1]                                                                                

23/03/06 09:55:45 WARN DAGScheduler: Broadcasting large task binary with size 4.0 MiB
+-----+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|label|features                                                                                                                                                                                       |
+-----+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|0.0  |(262144,[18700,19036,33808,66273,173558],[0.6931471805599453,0.28768207245178085,0.6931471805599453,0.6931471805599453,0.6931471805599453])                                                    |
|0.0  |(262144,[19036,20719,55551,98717,109547,137955,192310],[0.28768207245178085,0.6931471805599453,0.6931471805

In [28]:
# Two tiny pre-tokenised documents used for the CountVectorizer example.
df = spark.createDataFrame(
    data=[
        (0, "a b c".split(" ")),
        (1, "a b b c a".split(" ")),
    ],
    schema=['id', 'words'],
)
df.show(n=20, truncate=True)

+---+---------------+
| id|          words|
+---+---------------+
|  0|      [a, b, c]|
|  1|[a, b, b, c, a]|
+---+---------------+



In [29]:
# Count-based vectoriser over 'words' -> 'features'.
# vocabSize caps the vocabulary at 3 terms; minDF=2.0 is the minimum
# document-frequency threshold a term must meet to enter the vocabulary.
cv = (
    CountVectorizer()
    .setInputCol('words')
    .setOutputCol('features')
    .setVocabSize(3)
    .setMinDF(2.0)
)

In [30]:
# Learn the vocabulary from the example documents.
model = cv.fit(dataset=df)

In [31]:
# Encode each document as a sparse vector of term counts over the learned vocabulary.
results = model.transform(dataset=df)

In [32]:
# Display the count vectors in full.
results.show(n=20, truncate=False)

+---+---------------+-------------------------+
|id |words          |features                 |
+---+---------------+-------------------------+
|0  |[a, b, c]      |(3,[0,1,2],[1.0,1.0,1.0])|
|1  |[a, b, b, c, a]|(3,[0,1,2],[2.0,2.0,1.0])|
+---+---------------+-------------------------+

23/03/06 10:11:15 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 124971 ms exceeds timeout 120000 ms
23/03/06 10:11:15 WARN SparkContext: Killing executors is not supported by current scheduler.
