In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count
from pyspark.sql.window import Window

# Get (or reuse) the SparkSession that backs this notebook.
spark = SparkSession.builder.getOrCreate()

# Small sample frame: col1 labels each row, col2 is the column inspected for
# repeated values (1 and 2 each appear twice, 3 appears once).
data = [("A", 1), ("B", 2), ("C", 1), ("D", 3), ("E", 2)]
df = spark.createDataFrame(data, ["col1", "col2"])

# One window partition per distinct col2 value.
window = Window.partitionBy("col2")

# Attach to every row the number of rows that share its col2 value.
# count("*") counts rows, whereas count("col2") counts only non-NULL values:
# rows in a NULL col2 partition would get 0 and never be reported as
# duplicates, even though dropDuplicates(["col2"]) would collapse them.
df_with_counts = df.withColumn("count", count("*").over(window))
df_with_counts.show()

+----+----+-----+
|col1|col2|count|
+----+----+-----+
|   A|   1|    2|
|   C|   1|    2|
|   B|   2|    2|
|   E|   2|    2|
|   D|   3|    1|
+----+----+-----+



In [3]:
# Keep only the rows whose col2 value is shared with at least one other row,
# i.e. whose per-partition "count" exceeds 1.
filtered_df = df_with_counts.where(col("count") > 1)

# Display the rows that survived the filter.
filtered_df.show()

+----+----+-----+
|col1|col2|count|
+----+----+-----+
|   A|   1|    2|
|   C|   1|    2|
|   B|   2|    2|
|   E|   2|    2|
+----+----+-----+



In [10]:
# Collapse the frame to a single row per distinct col2 value and display it.
# NOTE(review): no ordering is applied here, so which col1 is kept for each
# col2 value is presumably not guaranteed — confirm before relying on it.
df.drop_duplicates(["col2"]).show()

+----+----+
|col1|col2|
+----+----+
|   A|   1|
|   B|   2|
|   D|   3|
+----+----+



In [12]:
# Display the original sample frame again: the earlier cells assigned their
# results to new names (or only displayed them), so df itself still holds
# all five rows and only the col1/col2 columns.
df.show()

+----+----+
|col1|col2|
+----+----+
|   A|   1|
|   B|   2|
|   C|   1|
|   D|   3|
|   E|   2|
+----+----+

