In [2]:
'''
You are working with a text file that contains sentences including punctuation and special characters. Your task is to implement a PySpark program that counts how many times each word appears in the file.

Ignore/Remove any non-alphanumeric characters such as . , ! ? ; : - _ @ # % etc.

The final output must be sorted by word in alphabetical order.

Input Schema & Example
Input File
The input will be with the following schema:

Column Name	Data Type
text	String
Example Input
Hello world! This is a test.
PySpark is-awesome; really awesome?
Words... should be counted—even if separated!
Repeat words, repeat-words; REPEAT words.
Output Schema, Example & Explanation
Output Table (df_result)
Column Name	Data Type
word	String
count	Integer
Example Output Table
Here is your table sorted alphabetically by word:

word	count
a	1
awesome	2
be	1
counted	1
is	1
repeat	3
should	1
test	1
this	1
words	4
💡 Explanation
The data is cleansed.
Words are grouped and counted.
The DataFrame is sorted by word in alphabetical order.
Starter Code
from pyspark.sql import SparkSession

# Step 1: Initialize Spark session
spark = SparkSession.builder.appName('Spark Playground').getOrCreate()

text = """
Hello world! Welcome to PySpark.
This platform-is amazing; truly amazing.
Data-engineering, data_engineering... DATA engineering?
Count every-word: every.word; every_word!
Special characters: @#$%,.^&*() should not break the logic.
Repeat repeat-words; REPEAT_words... repeat!
Spark Playground makes learning PySpark fun-really fun.
People test joins, windows, and null-handling daily.
Hands-on practice, real-world problems... genuine learning!
Type code, click run, see results-instantly.
"""

# Step 2: Convert multiline string to a DataFrame
df = spark.createDataFrame([(text,)], ["text"])

# Your transformation code here

# Final DataFrame should be stored in this variable
df_result = ...

# Show output
display(df_result)

'''
# Initialize Spark session
from pyspark.sql import SparkSession, functions as F
# getOrCreate() hands back the already-running session when there is one,
# otherwise it starts a new session under this application name.
spark = (
    SparkSession.builder
    .appName('Spark Playground')
    .getOrCreate()
)

# Sample input: ten lines mixing hyphens, underscores, repeated punctuation
# and inconsistent casing, so the cleaning rules below are all exercised.
text = """
Hello world! Welcome to PySpark.
This platform-is amazing; truly amazing.
Data-engineering, data_engineering... DATA engineering?
Count every-word: every.word; every_word!
Special characters: @#$%,.^&*() should not break the logic.
Repeat repeat-words; REPEAT_words... repeat!
Spark Playground makes learning PySpark fun-really fun.
People test joins, windows, and null-handling daily.
Hands-on practice, real-world problems... genuine learning!
Type code, click run, see results-instantly.
"""

# The whole multi-line string becomes a single row in a one-column
# ("text") DataFrame; splitting into words happens downstream.
df = spark.createDataFrame(data=[(text,)], schema=["text"])

# Word count: clean the text, split it into tokens, normalise case, then
# count and sort.
#
# Every run of characters outside [A-Za-z0-9] (punctuation, underscores,
# newlines, ...) is collapsed into one space, so "every-word", "every.word"
# and "every_word" all yield the two tokens "every" and "word". Digits are
# kept along with letters.
df_result = (
    df.withColumn("clean", F.regexp_replace(F.col("text"), r"[^A-Za-z0-9]+", " "))
    # Raw strings keep the backslash in \s+ intact for the regex engine.
    .withColumn("word", F.explode(F.split(F.col("clean"), r"\s+")))
    # A separator at the very start or end of the text leaves an empty token.
    .filter(F.col("word") != "")
    # Case-insensitive counting: "Repeat", "repeat" and "REPEAT" are one word.
    .withColumn("word", F.lower(F.col("word")))
    .groupBy("word")
    # count() produces a 64-bit long, but the required output schema declares
    # `count` as Integer, so cast explicitly.
    .agg(F.count("word").cast("int").alias("count"))
    # Alphabetical order by word, as the task requires.
    .orderBy("word")
)

# Display result. Note: show() prints only the first 20 rows by default.
df_result.show()

+-----------+-----+
|       word|count|
+-----------+-----+
|    amazing|    2|
|        and|    1|
|      break|    1|
| characters|    1|
|      click|    1|
|       code|    1|
|      count|    1|
|      daily|    1|
|       data|    3|
|engineering|    3|
|      every|    3|
|        fun|    2|
|    genuine|    1|
|   handling|    1|
|      hands|    1|
|      hello|    1|
|  instantly|    1|
|         is|    1|
|      joins|    1|
|   learning|    2|
+-----------+-----+
only showing top 20 rows
