## Word counter using PySpark

In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [35]:
spark = SparkSession.builder.getOrCreate()
df = spark.read.text("./book-asset.txt")
df = df.filter(F.col("value") != "") # Remove empty rows

In [36]:
df.head(5)

[Row(value='The Project Gutenberg eBook of Frankenstein, by Mary Wollstonecraft (Godwin) Shelley'),
 Row(value='This eBook is for the use of anyone anywhere in the United States and'),
 Row(value='most other parts of the world at no cost and with almost no restrictions'),
 Row(value='whatsoever. You may copy it, give it away or re-use it under the terms'),
 Row(value='of the Project Gutenberg License included with this eBook or online at')]

In [29]:
word_counts = (
    df.withColumn("word", F.explode(F.split(F.col("value"), "\s+")))
    .withColumn("word", F.regexp_replace("word", "[^\w]", ""))
    .groupBy("word")
    .count()
    .sort("count", ascending=False)
)

In [30]:
word_counts.head(5)

                                                                                

[Row(word='the', count=4073),
 Row(word='and', count=2998),
 Row(word='I', count=2840),
 Row(word='of', count=2748),
 Row(word='to', count=2154)]

In [31]:
# Top 10
word_counts.show(10)



+----+-----+
|word|count|
+----+-----+
| the| 4073|
| and| 2998|
|   I| 2840|
|  of| 2748|
|  to| 2154|
|  my| 1632|
|   a| 1395|
|  in| 1137|
| was| 1020|
|that| 1015|
+----+-----+
only showing top 10 rows



                                                                                

In [32]:
# All words count
word_counts.agg(F.sum("count").alias("count_all_words")).show()



+---------------+
|count_all_words|
+---------------+
|          78214|
+---------------+



                                                                                

In [33]:
# Whale count
word_counts.filter(F.col("word").rlike("(?i)whale")).agg(
    F.sum("count").alias("whale_count")
).show()



+-----------+
|whale_count|
+-----------+
|          4|
+-----------+



                                                                                

In [34]:
# Unique count
print("Unique words: ", word_counts.count())



Unique words:  8016


                                                                                