# Dataframe Deep Dive (Part 4)

In [1]:
# Display the `spark` object (presumably the SparkSession pre-created by the notebook kernel; confirm for your environment)
spark

In [2]:
# Display the `sc` object (presumably the SparkContext pre-created by the notebook kernel; confirm for your environment)
sc

In [3]:
import pyspark.sql.functions as F
from pyspark.sql.types import *

## Column operations and Dropping duplicates

In this section we explore a few transformations on columns:

    - how to add a new column - withColumn()
    - how to rename an existing column - withColumnRenamed()
    - how to drop a column - drop() 
    - how to modify a column without any condition 
    - how to modify a column with a condition 
    - how to pass expression to select using `expr`
    - dropping duplicate rows
    

## Dataframe Transformations 

In [4]:
# Location of the order dataset that is read below as CSV.
data_set = 's3://fcc-spark-example/dataset/2023/myorder_data'

# Read the CSV: the header row supplies the column names and Spark infers
# each column's type from the data.
df = spark.read.load(data_set, format='csv', inferSchema=True, header=True)

                                                                                

In [5]:
# Preview the first five rows of the loaded data
df.show(n=5)

+---------+---------+--------------------+--------+---------------+---------+----------+--------------+----------+
|InvoiceNo|StockCode|         Description|Quantity|    InvoiceDate|UnitPrice|CustomerID|       Country|TotalPrice|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+----------+
|   536378|     null|PACK OF 60 DINOSA...|      24|01-12-2010 9.37|     0.55|     14688|United Kingdom|      13.2|
|   536378|     null|PACK OF 60 PINK P...|      24|01-12-2010 9.37|     0.55|     14688|United Kingdom|      13.2|
|   536378|    84991|60 TEATIME FAIRY ...|      24|01-12-2010 9.37|     0.55|     14688|United Kingdom|      13.2|
|   536378|   84519A|TOMATO CHARLIE+LO...|       6|01-12-2010 9.37|     2.95|     14688|United Kingdom|      17.7|
|   536378|   85183B|CHARLIE & LOLA WA...|      48|01-12-2010 9.37|     1.25|     14688|United Kingdom|      60.0|
+---------+---------+--------------------+--------+---------------+---------+---

In [6]:
# Print each column's name, data type and nullability
df.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Country: string (nullable = true)
 |-- TotalPrice: double (nullable = true)



### Drop a column

In [7]:
# Remove the "TotalPrice" column; the result is bound to a new name (df1) and `df` is not reassigned
df1 = df.drop('TotalPrice')

In [8]:
# Confirm that TotalPrice is gone
df1.show(n=5)

+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|    InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
|   536378|     null|PACK OF 60 DINOSA...|      24|01-12-2010 9.37|     0.55|     14688|United Kingdom|
|   536378|     null|PACK OF 60 PINK P...|      24|01-12-2010 9.37|     0.55|     14688|United Kingdom|
|   536378|    84991|60 TEATIME FAIRY ...|      24|01-12-2010 9.37|     0.55|     14688|United Kingdom|
|   536378|   84519A|TOMATO CHARLIE+LO...|       6|01-12-2010 9.37|     2.95|     14688|United Kingdom|
|   536378|   85183B|CHARLIE & LOLA WA...|      48|01-12-2010 9.37|     1.25|     14688|United Kingdom|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
only showing top 5 rows



### Create a new column

#### Using `withColumn`

In [9]:
# Add TotalPrice = Quantity * UnitPrice, rounded to 2 decimal places
df2 = df1.withColumn(
    'TotalPrice',
    F.round(df1['Quantity'] * df1['UnitPrice'], 2),
)
df2.show(n=5)

+---------+---------+--------------------+--------+---------------+---------+----------+--------------+----------+
|InvoiceNo|StockCode|         Description|Quantity|    InvoiceDate|UnitPrice|CustomerID|       Country|TotalPrice|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+----------+
|   536378|     null|PACK OF 60 DINOSA...|      24|01-12-2010 9.37|     0.55|     14688|United Kingdom|      13.2|
|   536378|     null|PACK OF 60 PINK P...|      24|01-12-2010 9.37|     0.55|     14688|United Kingdom|      13.2|
|   536378|    84991|60 TEATIME FAIRY ...|      24|01-12-2010 9.37|     0.55|     14688|United Kingdom|      13.2|
|   536378|   84519A|TOMATO CHARLIE+LO...|       6|01-12-2010 9.37|     2.95|     14688|United Kingdom|      17.7|
|   536378|   85183B|CHARLIE & LOLA WA...|      48|01-12-2010 9.37|     1.25|     14688|United Kingdom|      60.0|
+---------+---------+--------------------+--------+---------------+---------+---

#### Using `expression`

In [10]:
# '*' selects every column of df1
df1.select('*').show(n=5)

+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|    InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
|   536378|     null|PACK OF 60 DINOSA...|      24|01-12-2010 9.37|     0.55|     14688|United Kingdom|
|   536378|     null|PACK OF 60 PINK P...|      24|01-12-2010 9.37|     0.55|     14688|United Kingdom|
|   536378|    84991|60 TEATIME FAIRY ...|      24|01-12-2010 9.37|     0.55|     14688|United Kingdom|
|   536378|   84519A|TOMATO CHARLIE+LO...|       6|01-12-2010 9.37|     2.95|     14688|United Kingdom|
|   536378|   85183B|CHARLIE & LOLA WA...|      48|01-12-2010 9.37|     1.25|     14688|United Kingdom|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
only showing top 5 rows



In [11]:
# Select a subset of columns by name (select() also accepts the names as a single list)
df1.select(['StockCode', 'Quantity', 'CustomerID']).show(n=5)

+---------+--------+----------+
|StockCode|Quantity|CustomerID|
+---------+--------+----------+
|     null|      24|     14688|
|     null|      24|     14688|
|    84991|      24|     14688|
|   84519A|       6|     14688|
|   85183B|      48|     14688|
+---------+--------+----------+
only showing top 5 rows



In [12]:
## The commented-out line below throws an exception:
## we expect Spark to evaluate the string as an expression, but select() reads it as a column name.

# df1.select('*', 'Quantity * UnitPrice AS TotalPrice').show(5)

In [13]:
# Wrap the SQL snippet in F.expr() so that select() evaluates it as an expression
df1.select(
    '*',
    F.expr('round(Quantity * UnitPrice, 2) AS TotalPrice'),
).show(n=5)

+---------+---------+--------------------+--------+---------------+---------+----------+--------------+----------+
|InvoiceNo|StockCode|         Description|Quantity|    InvoiceDate|UnitPrice|CustomerID|       Country|TotalPrice|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+----------+
|   536378|     null|PACK OF 60 DINOSA...|      24|01-12-2010 9.37|     0.55|     14688|United Kingdom|      13.2|
|   536378|     null|PACK OF 60 PINK P...|      24|01-12-2010 9.37|     0.55|     14688|United Kingdom|      13.2|
|   536378|    84991|60 TEATIME FAIRY ...|      24|01-12-2010 9.37|     0.55|     14688|United Kingdom|      13.2|
|   536378|   84519A|TOMATO CHARLIE+LO...|       6|01-12-2010 9.37|     2.95|     14688|United Kingdom|      17.7|
|   536378|   85183B|CHARLIE & LOLA WA...|      48|01-12-2010 9.37|     1.25|     14688|United Kingdom|      60.0|
+---------+---------+--------------------+--------+---------------+---------+---

In [14]:
# selectExpr() takes SQL expression strings directly, so no F.expr() wrapper is needed
df1.selectExpr(
    '*',
    'round(Quantity * UnitPrice, 2) AS TotalPrice',
).show(n=5)

+---------+---------+--------------------+--------+---------------+---------+----------+--------------+----------+
|InvoiceNo|StockCode|         Description|Quantity|    InvoiceDate|UnitPrice|CustomerID|       Country|TotalPrice|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+----------+
|   536378|     null|PACK OF 60 DINOSA...|      24|01-12-2010 9.37|     0.55|     14688|United Kingdom|      13.2|
|   536378|     null|PACK OF 60 PINK P...|      24|01-12-2010 9.37|     0.55|     14688|United Kingdom|      13.2|
|   536378|    84991|60 TEATIME FAIRY ...|      24|01-12-2010 9.37|     0.55|     14688|United Kingdom|      13.2|
|   536378|   84519A|TOMATO CHARLIE+LO...|       6|01-12-2010 9.37|     2.95|     14688|United Kingdom|      17.7|
|   536378|   85183B|CHARLIE & LOLA WA...|      48|01-12-2010 9.37|     1.25|     14688|United Kingdom|      60.0|
+---------+---------+--------------------+--------+---------------+---------+---

### Modify a column `without` any condition

In [15]:
# Starting point: df1 before any column is modified
df1.show(n=5)

+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|    InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
|   536378|     null|PACK OF 60 DINOSA...|      24|01-12-2010 9.37|     0.55|     14688|United Kingdom|
|   536378|     null|PACK OF 60 PINK P...|      24|01-12-2010 9.37|     0.55|     14688|United Kingdom|
|   536378|    84991|60 TEATIME FAIRY ...|      24|01-12-2010 9.37|     0.55|     14688|United Kingdom|
|   536378|   84519A|TOMATO CHARLIE+LO...|       6|01-12-2010 9.37|     2.95|     14688|United Kingdom|
|   536378|   85183B|CHARLIE & LOLA WA...|      48|01-12-2010 9.37|     1.25|     14688|United Kingdom|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
only showing top 5 rows



#### Using `withColumn`

In [16]:
# Increase the unit price by 30%, rounded to 2 decimal places.
# The target column name is the existing 'UnitPrice' column.
df1_transformed = df1.withColumn(
    'UnitPrice',
    F.round(df1['UnitPrice'] * 1.3, 2),
)

In [17]:
# Check the increased prices
df1_transformed.show(n=5)

+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|    InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
|   536378|     null|PACK OF 60 DINOSA...|      24|01-12-2010 9.37|     0.72|     14688|United Kingdom|
|   536378|     null|PACK OF 60 PINK P...|      24|01-12-2010 9.37|     0.72|     14688|United Kingdom|
|   536378|    84991|60 TEATIME FAIRY ...|      24|01-12-2010 9.37|     0.72|     14688|United Kingdom|
|   536378|   84519A|TOMATO CHARLIE+LO...|       6|01-12-2010 9.37|     3.84|     14688|United Kingdom|
|   536378|   85183B|CHARLIE & LOLA WA...|      48|01-12-2010 9.37|     1.63|     14688|United Kingdom|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
only showing top 5 rows



#### Using `expression`

In [18]:
# selectExpr() variant: list every column in its original position and put the
# increased-price expression (aliased back to UnitPrice) where UnitPrice was
df1.selectExpr(
    'InvoiceNo',
    'StockCode',
    'Description',
    'Quantity',
    'InvoiceDate',
    'round(UnitPrice * 1.3, 2) AS UnitPrice',
    'CustomerID',
    'Country',
).show(n=5)

+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|    InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
|   536378|     null|PACK OF 60 DINOSA...|      24|01-12-2010 9.37|     0.72|     14688|United Kingdom|
|   536378|     null|PACK OF 60 PINK P...|      24|01-12-2010 9.37|     0.72|     14688|United Kingdom|
|   536378|    84991|60 TEATIME FAIRY ...|      24|01-12-2010 9.37|     0.72|     14688|United Kingdom|
|   536378|   84519A|TOMATO CHARLIE+LO...|       6|01-12-2010 9.37|     3.84|     14688|United Kingdom|
|   536378|   85183B|CHARLIE & LOLA WA...|      48|01-12-2010 9.37|     1.63|     14688|United Kingdom|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
only showing top 5 rows



### Modify a column `with` condition

In [19]:
# Count the rows per country
df1.groupBy('Country').count().show()



+------------------+------+
|           Country| count|
+------------------+------+
|European Community|    61|
|              EIRE|  8196|
|            Norway|  1086|
|         Hong Kong|   288|
|            Poland|   341|
|          Portugal|  1519|
|       Netherlands|  2371|
|           Germany|  9495|
|           Finland|   695|
|             Spain|  2533|
|            Brazil|    32|
|           Bahrain|    19|
|    United Kingdom|495371|
|            Greece|   146|
|       Unspecified|   446|
|            Cyprus|   622|
|            Canada|   151|
|             Japan|   358|
|           Belgium|  2069|
|           Denmark|   389|
+------------------+------+
only showing top 20 rows



                                                                                

Let's say we want to do this:
    
    - Increase the UnitPrice by 10% for UK 
    - Increase the UnitPrice by 20% for Norway 
    - Increase the UnitPrice by 30% for Singapore 
    - Leave the UnitPrice unchanged for all other countries 


In [20]:
# Starting point: df1 before the conditional price update
df1.show(n=5)

+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|    InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
|   536378|     null|PACK OF 60 DINOSA...|      24|01-12-2010 9.37|     0.55|     14688|United Kingdom|
|   536378|     null|PACK OF 60 PINK P...|      24|01-12-2010 9.37|     0.55|     14688|United Kingdom|
|   536378|    84991|60 TEATIME FAIRY ...|      24|01-12-2010 9.37|     0.55|     14688|United Kingdom|
|   536378|   84519A|TOMATO CHARLIE+LO...|       6|01-12-2010 9.37|     2.95|     14688|United Kingdom|
|   536378|   85183B|CHARLIE & LOLA WA...|      48|01-12-2010 9.37|     1.25|     14688|United Kingdom|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
only showing top 5 rows



In [21]:
# Method 1: conditional update written as a SQL CASE WHEN expression inside F.expr().
# UnitPrice is multiplied by 1.1 for 'United Kingdom', 1.2 for 'Norway' and 1.3 for
# 'Singapore'; any other country keeps its UnitPrice. The result is rounded to
# 2 decimal places and written back to the UnitPrice column.
df2 = df1.withColumn('UnitPrice', F.expr("ROUND(CASE WHEN Country = 'United Kingdom' THEN UnitPrice * 1.1 \
                                                     WHEN Country = 'Norway' THEN UnitPrice * 1.2  \
                                                     WHEN Country = 'Singapore' THEN UnitPrice * 1.3  \
                                                     ELSE UnitPrice END, 2)"))

df2.show(5)

+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|    InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
|   536378|     null|PACK OF 60 DINOSA...|      24|01-12-2010 9.37|     0.61|     14688|United Kingdom|
|   536378|     null|PACK OF 60 PINK P...|      24|01-12-2010 9.37|     0.61|     14688|United Kingdom|
|   536378|    84991|60 TEATIME FAIRY ...|      24|01-12-2010 9.37|     0.61|     14688|United Kingdom|
|   536378|   84519A|TOMATO CHARLIE+LO...|       6|01-12-2010 9.37|     3.25|     14688|United Kingdom|
|   536378|   85183B|CHARLIE & LOLA WA...|      48|01-12-2010 9.37|     1.38|     14688|United Kingdom|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
only showing top 5 rows



In [22]:
# Method 2: the same conditional update built with the Column API
# (F.when(...).when(...).otherwise(...)) instead of a SQL string
df2 = df1.withColumn(
    'UnitPrice',
    F.round(
        F.when(df1['Country'] == 'United Kingdom', df1['UnitPrice'] * 1.1)
         .when(df1['Country'] == 'Norway', df1['UnitPrice'] * 1.2)
         .when(df1['Country'] == 'Singapore', df1['UnitPrice'] * 1.3)
         .otherwise(df1['UnitPrice']),
        2,
    ),
)

df2.show(n=5)


+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|    InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
|   536378|     null|PACK OF 60 DINOSA...|      24|01-12-2010 9.37|     0.61|     14688|United Kingdom|
|   536378|     null|PACK OF 60 PINK P...|      24|01-12-2010 9.37|     0.61|     14688|United Kingdom|
|   536378|    84991|60 TEATIME FAIRY ...|      24|01-12-2010 9.37|     0.61|     14688|United Kingdom|
|   536378|   84519A|TOMATO CHARLIE+LO...|       6|01-12-2010 9.37|     3.25|     14688|United Kingdom|
|   536378|   85183B|CHARLIE & LOLA WA...|      48|01-12-2010 9.37|     1.38|     14688|United Kingdom|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
only showing top 5 rows



In [23]:
# Baseline for comparison: Norway rows in df1, which still hold the original prices
df1.where(F.col('Country') == 'Norway').show(n=5)

+---------+---------+--------------------+--------+----------------+---------+----------+-------+
|InvoiceNo|StockCode|         Description|Quantity|     InvoiceDate|UnitPrice|CustomerID|Country|
+---------+---------+--------------------+--------+----------------+---------+----------+-------+
|   536532|    84692|BOX OF 24 COCKTAI...|      50|01-12-2010 13.24|     0.42|     12433| Norway|
|   536532|    22444|GROW YOUR OWN PLA...|      96|01-12-2010 13.24|     1.06|     12433| Norway|
|   536532|    22899|CHILDREN'S APRON ...|       8|01-12-2010 13.24|      2.1|     12433| Norway|
|   536532|    21156|RETROSPOT CHILDRE...|       8|01-12-2010 13.24|     1.95|     12433| Norway|
|   536532|    22556|PLASTERS IN TIN C...|      24|01-12-2010 13.24|     1.65|     12433| Norway|
+---------+---------+--------------------+--------+----------------+---------+----------+-------+
only showing top 5 rows



In [24]:
# Verify the change: Norway rows in df2, where UnitPrice was multiplied by 1.2
df2.where(F.col('Country') == 'Norway').show(n=5)

+---------+---------+--------------------+--------+----------------+---------+----------+-------+
|InvoiceNo|StockCode|         Description|Quantity|     InvoiceDate|UnitPrice|CustomerID|Country|
+---------+---------+--------------------+--------+----------------+---------+----------+-------+
|   536532|    84692|BOX OF 24 COCKTAI...|      50|01-12-2010 13.24|      0.5|     12433| Norway|
|   536532|    22444|GROW YOUR OWN PLA...|      96|01-12-2010 13.24|     1.27|     12433| Norway|
|   536532|    22899|CHILDREN'S APRON ...|       8|01-12-2010 13.24|     2.52|     12433| Norway|
|   536532|    21156|RETROSPOT CHILDRE...|       8|01-12-2010 13.24|     2.34|     12433| Norway|
|   536532|    22556|PLASTERS IN TIN C...|      24|01-12-2010 13.24|     1.98|     12433| Norway|
+---------+---------+--------------------+--------+----------------+---------+----------+-------+
only showing top 5 rows



### Dropping duplicate records

In [25]:
# Schema of the sample DataFrame: every field is declared non-nullable
schema = StructType([
    StructField(field_name, field_type, nullable=False)
    for field_name, field_type in [
        ("InvoiceNo", StringType()),
        ("StockCode", StringType()),
        ("Description", StringType()),
        ("Quantity", IntegerType()),
        ("InvoiceDate", StringType()),
        ("UnitPrice", FloatType()),
        ("CustomerID", IntegerType()),
        ("Country", StringType()),
    ]
])

# Sample rows: the first two are exact duplicates, the last two differ only in InvoiceNo
data = [
    ("536532", "84692", "BOX OF 24 COCKTAIL PARASOLS", 50, "01-12-2010 13:24", 0.42, 12433, "Norway"),
    ("536532", "84692", "BOX OF 24 COCKTAIL PARASOLS", 50, "01-12-2010 13:24", 0.42, 12433, "Norway"),
    ("125555", "22899", "CHILDREN'S APRON DOLLY GIRL", 8, "01-12-2010 13:24", 2.1, 12433, "Norway"),
    ("111111", "22899", "CHILDREN'S APRON DOLLY GIRL", 8, "01-12-2010 13:24", 2.1, 12433, "Norway"),
]

# Build the sample DataFrame.
# NOTE: this rebinds `df`, which earlier in the notebook referred to the dataset read from S3.
df = spark.createDataFrame(data=data, schema=schema)

In [26]:
# Display the sample rows, including the duplicated first row
df.show()

+---------+---------+--------------------+--------+----------------+---------+----------+-------+
|InvoiceNo|StockCode|         Description|Quantity|     InvoiceDate|UnitPrice|CustomerID|Country|
+---------+---------+--------------------+--------+----------------+---------+----------+-------+
|   536532|    84692|BOX OF 24 COCKTAI...|      50|01-12-2010 13:24|     0.42|     12433| Norway|
|   536532|    84692|BOX OF 24 COCKTAI...|      50|01-12-2010 13:24|     0.42|     12433| Norway|
|   125555|    22899|CHILDREN'S APRON ...|       8|01-12-2010 13:24|      2.1|     12433| Norway|
|   111111|    22899|CHILDREN'S APRON ...|       8|01-12-2010 13:24|      2.1|     12433| Norway|
+---------+---------+--------------------+--------+----------------+---------+----------+-------+



#### Removing the duplicate rows 

In [27]:
# distinct() keeps one copy of each fully identical row
df.distinct().show()

+---------+---------+--------------------+--------+----------------+---------+----------+-------+
|InvoiceNo|StockCode|         Description|Quantity|     InvoiceDate|UnitPrice|CustomerID|Country|
+---------+---------+--------------------+--------+----------------+---------+----------+-------+
|   536532|    84692|BOX OF 24 COCKTAI...|      50|01-12-2010 13:24|     0.42|     12433| Norway|
|   125555|    22899|CHILDREN'S APRON ...|       8|01-12-2010 13:24|      2.1|     12433| Norway|
|   111111|    22899|CHILDREN'S APRON ...|       8|01-12-2010 13:24|      2.1|     12433| Norway|
+---------+---------+--------------------+--------+----------------+---------+----------+-------+



#### Removing duplicates based on a subset of columns

In [28]:
# select() + distinct() returns only the selected column, so the other columns
# are lost; this is not the result we want
df.select('InvoiceNo').distinct().show()

+---------+
|InvoiceNo|
+---------+
|   536532|
|   125555|
|   111111|
+---------+



In [29]:
df.dropDuplicates().show()   # With no subset given, this is exactly the same as df.distinct().show()

+---------+---------+--------------------+--------+----------------+---------+----------+-------+
|InvoiceNo|StockCode|         Description|Quantity|     InvoiceDate|UnitPrice|CustomerID|Country|
+---------+---------+--------------------+--------+----------------+---------+----------+-------+
|   536532|    84692|BOX OF 24 COCKTAI...|      50|01-12-2010 13:24|     0.42|     12433| Norway|
|   125555|    22899|CHILDREN'S APRON ...|       8|01-12-2010 13:24|      2.1|     12433| Norway|
|   111111|    22899|CHILDREN'S APRON ...|       8|01-12-2010 13:24|      2.1|     12433| Norway|
+---------+---------+--------------------+--------+----------------+---------+----------+-------+



In [30]:
# Re-display the sample rows for comparison with the dropDuplicates() results that follow
df.show()

+---------+---------+--------------------+--------+----------------+---------+----------+-------+
|InvoiceNo|StockCode|         Description|Quantity|     InvoiceDate|UnitPrice|CustomerID|Country|
+---------+---------+--------------------+--------+----------------+---------+----------+-------+
|   536532|    84692|BOX OF 24 COCKTAI...|      50|01-12-2010 13:24|     0.42|     12433| Norway|
|   536532|    84692|BOX OF 24 COCKTAI...|      50|01-12-2010 13:24|     0.42|     12433| Norway|
|   125555|    22899|CHILDREN'S APRON ...|       8|01-12-2010 13:24|      2.1|     12433| Norway|
|   111111|    22899|CHILDREN'S APRON ...|       8|01-12-2010 13:24|      2.1|     12433| Norway|
+---------+---------+--------------------+--------+----------------+---------+----------+-------+



In [31]:
# Dropping based on 1 column 
# Drop duplicates based on one column: keep a single row per InvoiceNo, with all columns retained
df.dropDuplicates(subset=['InvoiceNo']).show()

+---------+---------+--------------------+--------+----------------+---------+----------+-------+
|InvoiceNo|StockCode|         Description|Quantity|     InvoiceDate|UnitPrice|CustomerID|Country|
+---------+---------+--------------------+--------+----------------+---------+----------+-------+
|   111111|    22899|CHILDREN'S APRON ...|       8|01-12-2010 13:24|      2.1|     12433| Norway|
|   125555|    22899|CHILDREN'S APRON ...|       8|01-12-2010 13:24|      2.1|     12433| Norway|
|   536532|    84692|BOX OF 24 COCKTAI...|      50|01-12-2010 13:24|     0.42|     12433| Norway|
+---------+---------+--------------------+--------+----------------+---------+----------+-------+



In [32]:
# Dropping based on more than 1 column 
# Drop duplicates based on more than one column: compare every column except InvoiceNo
df.dropDuplicates(
    subset=[name for name in df.columns if name != 'InvoiceNo']
).show()

+---------+---------+--------------------+--------+----------------+---------+----------+-------+
|InvoiceNo|StockCode|         Description|Quantity|     InvoiceDate|UnitPrice|CustomerID|Country|
+---------+---------+--------------------+--------+----------------+---------+----------+-------+
|   125555|    22899|CHILDREN'S APRON ...|       8|01-12-2010 13:24|      2.1|     12433| Norway|
|   536532|    84692|BOX OF 24 COCKTAI...|      50|01-12-2010 13:24|     0.42|     12433| Norway|
+---------+---------+--------------------+--------+----------------+---------+----------+-------+

