***50 PySpark problems & Solutions***

# ***1. Remove duplicates from a dataset containing customer data***

In [1]:
from pyspark.sql import SparkSession

In [3]:
# Start (or reuse) the Spark session used by this section.
spark = (
    SparkSession.builder
    .appName("RemoveDuplicates")
    .getOrCreate()
)

*Dataframe Structure*

In [7]:
# Sample customers as (Name, Age) rows; ("Alice", 30) appears twice on purpose
# so there is a duplicate to remove.
data = [
    ("Alice", 30),
    ("Bob", 25),
    ("Alice", 30),
    ("Prakash", 25),
]
columns = ["Name", "Age"]


In [8]:
# Build a DataFrame from the sample rows, taking the column names from `columns`.
df = spark.createDataFrame(data, schema=columns)

***The code below removes duplicate rows from the DataFrame***

In [9]:
# Called without a column subset, dropDuplicates() compares entire rows,
# so the repeated ("Alice", 30) row collapses to a single row.
df_no_duplicates = df.dropDuplicates()
# Row order in the displayed result is not guaranteed.
df_no_duplicates.show()

+-------+---+
|   Name|Age|
+-------+---+
|    Bob| 25|
|  Alice| 30|
|Prakash| 25|
+-------+---+



# ***2. Calculate the average salary of employees***

In [11]:
# Import SparkSession, the entry point for DataFrame functionality
from pyspark.sql import SparkSession

In [12]:
# Request a session named "AverageSalary". getOrCreate() returns the session
# already running in this kernel if there is one, rather than starting another.
spark = (
    SparkSession.builder
    .appName("AverageSalary")
    .getOrCreate()
)

In [13]:
# Sample employees as (Name, Salary) rows, followed by the matching column names.
data = [
    ("Alice", 3000),
    ("Bob", 4000),
    ("Charlie", 5000),
]
columns = ["Name", "Salary"]

# Turn the rows into a DataFrame with those column names.
df = spark.createDataFrame(data, schema=columns)

In [14]:
# agg() reduces the frame to a single row holding avg(Salary);
# take that row and read its only field.
average_salary = df.agg({"Salary": "avg"}).first()[0]
print(f"Average Salary: {average_salary}")

Average Salary: 4000.0


# ***3. Join two datasets (employee and department details)***

***Employees DataFrame***

In [20]:
# Each employee row is (Name, DeptID); DeptID is the key shared with the department table.
emp_data = [
    ("Alice", 1),
    ("Bob", 2),
]
emp_columns = ["Name", "DeptID"]
emp_df = spark.createDataFrame(emp_data, schema=emp_columns)

***Department DataFrame***

In [21]:
# Each department row is (DeptID, DeptName).
dept_data = [
    (1, "HR"),
    (2, "Finance"),
]
dept_columns = ["DeptID", "DeptName"]
dept_df = spark.createDataFrame(dept_data, schema=dept_columns)

*Joining the two DataFrames*

In [22]:
# Inner join on the shared DeptID column. Passing the column by name
# keeps a single DeptID column in the result instead of one from each side.
joined_df = emp_df.join(dept_df, on="DeptID", how="inner")
joined_df.show()

+------+-----+--------+
|DeptID| Name|DeptName|
+------+-----+--------+
|     1|Alice|      HR|
|     2|  Bob| Finance|
+------+-----+--------+



# ***4. Filter records where a column exceeds a threshold***

In [23]:
# Keep only rows whose Salary is strictly greater than 4000
# (a salary of exactly 4000 does not pass the filter).
df.where(df.Salary > 4000).show()

+-------+------+
|   Name|Salary|
+-------+------+
|Charlie|  5000|
+-------+------+



# ***5. Handle missing values (dropping and imputing)***

In [24]:
# Same (Name, Salary) layout as before, but two salaries are missing (None).
data = [
    ("Alice", None),
    ("Bob", 3000),
    ("Charlie", None),
]
columns = ["Name", "Salary"]
df = spark.createDataFrame(data, schema=columns)

In [25]:
# Display the frame as loaded, before any missing-value handling.
df.show()

+-------+------+
|   Name|Salary|
+-------+------+
|  Alice|  NULL|
|    Bob|  3000|
|Charlie|  NULL|
+-------+------+



In [26]:
# Discard every row that contains a null in any column.
df_dropped = df.dropna()

In [27]:
# Display what is left once the rows with nulls have been dropped.
df_dropped.show()

+----+------+
|Name|Salary|
+----+------+
| Bob|  3000|
+----+------+



***Imputing a value for the missing (None) entries***

In [28]:
# Replace nulls in the Salary column with 4000; other columns are left untouched.
df_imputed = df.fillna({"Salary": 4000})

# Show the dropped-rows result and the imputed result back to back for comparison.
df_dropped.show()
df_imputed.show()

+----+------+
|Name|Salary|
+----+------+
| Bob|  3000|
+----+------+

+-------+------+
|   Name|Salary|
+-------+------+
|  Alice|  4000|
|    Bob|  3000|
|Charlie|  4000|
+-------+------+



# ***6. Count unique words in a large text file***

In [None]:
# Read the file as an RDD with one element per line of text.
# "path/to/textfile" is a placeholder: point it at a real file before running.
text_rdd = spark.sparkContext.textFile("path/to/textfile")

# Split every line on whitespace, pair each word with 1, then add up the 1s per word.
# The result holds one (word, occurrences) pair for each distinct word.
word_counts = (
    text_rdd
    .flatMap(lambda line: line.split())
    .map(lambda word: (word, 1))
    .reduceByKey(lambda a, b: a + b)
)

# One pair per distinct word, so the number of pairs is the number of unique words.
# Words are compared exactly as written: case and punctuation are not normalized.
word_counts.count()

# ***8. Repartition a DataFrame for improved parallelism***

In [29]:
# Redistribute the rows across 10 partitions. The data itself is unchanged,
# but the order in which rows are displayed may differ.
df_repartitioned = df.repartition(numPartitions=10)
df_repartitioned.show()

+-------+------+
|   Name|Salary|
+-------+------+
|    Bob|  3000|
|Charlie|  NULL|
|  Alice|  NULL|
+-------+------+



# ***9. Sliding window function for moving average***

In [43]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg
from pyspark.sql.window import Window
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, DateType,IntegerType
import datetime

In [44]:
# Obtain the Spark session for the window-function example
# (an already-running session is reused by getOrCreate()).
spark = (
    SparkSession.builder
    .appName("WindowFunctionExample")
    .getOrCreate()
)

In [45]:
# Seven consecutive daily observations as (date string, salary) pairs.
data = [
    ("2024-01-01", 50000),
    ("2024-01-02", 55000),
    ("2024-01-03", 60000),
    ("2024-01-04", 58000),
    ("2024-01-05", 62000),
    ("2024-01-06", 63000),
    ("2024-01-07", 61000),
]

# Explicit schema: Date is loaded as a plain string and Salary as an integer.
# Both columns are nullable.
schema = (
    StructType()
    .add("Date", StringType(), True)
    .add("Salary", IntegerType(), True)
)

In [46]:
# Materialize the sample rows using the explicit schema rather than type inference.
df = spark.createDataFrame(data, schema=schema)

In [47]:
from pyspark.sql.functions import avg, to_date
from pyspark.sql.window import Window

# Parse the "yyyy-MM-dd" strings into real dates so ordering is chronological.
df = df.withColumn("Date", to_date("Date", "yyyy-MM-dd"))

# Frame of up to 3 rows: the current row plus the 2 rows before it, ordered by Date.
# The frame is counted in rows, not calendar days; it spans 3 days here only
# because the sample dates are consecutive. The first two rows average over
# fewer than 3 values because fewer preceding rows exist.
window_spec = Window.orderBy("Date").rowsBetween(-2, Window.currentRow)

# Average Salary over that frame for every row.
moving_avg = avg("Salary").over(window_spec)
df_with_avg = df.withColumn("MovingAvg", moving_avg)

# Print the full values without truncating any column.
df_with_avg.show(truncate=False)


+----------+------+------------------+
|Date      |Salary|MovingAvg         |
+----------+------+------------------+
|2024-01-01|50000 |50000.0           |
|2024-01-02|55000 |52500.0           |
|2024-01-03|60000 |55000.0           |
|2024-01-04|58000 |57666.666666666664|
|2024-01-05|62000 |60000.0           |
|2024-01-06|63000 |61000.0           |
|2024-01-07|61000 |62000.0           |
+----------+------+------------------+

