In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as func


In [2]:
# Obtain the Spark session for the "WordCount" application; getOrCreate()
# hands back an already-running session when there is one.
spark = (
    SparkSession.builder
    .appName("WordCount")
    .getOrCreate()
)
spark


In [7]:
# Load the "Book" path as plain text; the result has a single string column
# named `value`. Preview the first five rows.
df = spark.read.text("Book")
df.show(n=5)


+--------------------+
|               value|
+--------------------+
|Self-Employment: ...|
|Achieving Financi...|
|       By Frank Kane|
|                    |
|                    |
+--------------------+
only showing top 5 rows



In [16]:
# Collapse every run of non-word characters (punctuation, whitespace, symbols)
# into one space so the text can be split on " " afterwards.
#
# Spark evaluates this pattern with Java's regex engine, where \W is ASCII-only
# by default: any non-ASCII letter (e.g. the "é" in "café") would count as a
# separator and cut the word in two. The embedded (?U) flag switches \w / \W to
# their Unicode-aware definitions so such letters stay inside their word.
df_clean = df.select(
    func.regexp_replace("value", r"(?U)\W+", " ").alias("text")
)
df_clean.show(5)


+--------------------+
|                text|
+--------------------+
|Self Employment B...|
|Achieving Financi...|
|       By Frank Kane|
|                    |
|                    |
+--------------------+
only showing top 5 rows



In [54]:
# Tokenise each line on single spaces, emit one row per token, and drop the
# empty tokens that split() produces (e.g. from blank lines).
df_split = (
    df_clean
    .select(func.explode(func.split(func.col("text"), " ")).alias("words"))
    .where(func.col("words") != "")
)
df_split.show(5)


+----------+
|     words|
+----------+
|      Self|
|Employment|
|  Building|
|        an|
|  Internet|
+----------+
only showing top 5 rows



In [60]:
# Normalise every token to upper case so that differently-cased spellings of
# the same word fall into one group when counted.
df_words = df_split.select(
    func.upper(func.col("words")).alias("words")
)
df_words.show(5)


+----------+
|     words|
+----------+
|      SELF|
|EMPLOYMENT|
|  BUILDING|
|        AN|
|  INTERNET|
+----------+
only showing top 5 rows



In [61]:
# Group by word, count the occurrences of each, and show the ten most
# frequent words (highest count first).
(
    df_words
    .groupBy("words")
    .count()
    .orderBy(func.desc("count"))
    .show(10)
)


+-----+-----+
|words|count|
+-----+-----+
|  YOU| 1878|
|   TO| 1828|
| YOUR| 1420|
|  THE| 1292|
|    A| 1191|
|   OF|  970|
|  AND|  934|
| THAT|  747|
|   IT|  649|
|   IN|  616|
+-----+-----+
only showing top 10 rows

