In [8]:
from pyspark.sql import Row

from pyspark.sql import SparkSession
from pyspark.sql.functions import col 

# Get (or create) the Spark session used by every cell in this notebook.
spark = (
    SparkSession.builder
    .appName('SparkByExamples.com')
    .getOrCreate()
)

import datetime
# Five sample users. Users 4 and 5 are not customers: they have no courses,
# no amount_paid and no customer_from date.
users = [
    dict(zip(
        (
            "id", "first_name", "last_name", "email", "phone_numbers",
            "courses", "is_customer", "amount_paid", "customer_from",
            "last_updated_ts",
        ),
        record,
    ))
    for record in (
        (1, "Corrie", "Van den Oord", "cvandenoord0@etsy.com",
         Row(mobile="+1 234 567 8901", home="+1 234 567 8911"),
         [1, 2], True, 1000.55,
         datetime.date(2021, 1, 15),
         datetime.datetime(2021, 2, 10, 1, 15, 0)),
        # NOTE(review): the home number below has no leading '+', unlike every
        # other number in this list -- confirm whether that is intentional.
        (2, "Nikolaus", "Brewitt", "nbrewitt1@dailymail.co.uk",
         Row(mobile="+1 234 567 8923", home="1 234 567 8934"),
         [3], True, 900.0,
         datetime.date(2021, 2, 14),
         datetime.datetime(2021, 2, 18, 3, 33, 0)),
        (3, "Orelie", "Penney", "openney2@vistaprint.com",
         Row(mobile="+1 714 512 9752", home="+1 714 512 6601"),
         [2, 4], True, 850.55,
         datetime.date(2021, 1, 21),
         datetime.datetime(2021, 3, 15, 15, 16, 55)),
        (4, "Ashby", "Maddocks", "amaddocks3@home.pl",
         Row(mobile=None, home=None),
         [], False, None,
         None,
         datetime.datetime(2021, 4, 10, 17, 45, 30)),
        (5, "Kurt", "Rome", "krome4@shutterfly.com",
         Row(mobile="+1 817 934 7142", home=None),
         [], False, None,
         None,
         datetime.datetime(2021, 4, 2, 0, 55, 18)),
    )
]
     

import pandas as pd
     

# Switch off Arrow-based pandas conversion for this session.
# NOTE(review): presumably required because of the Row-valued phone_numbers
# column -- confirm.
spark.conf.set(
    'spark.sql.execution.arrow.pyspark.enabled',
    False,
)

# Build the Spark DataFrame by way of pandas. In the output below the missing
# amount_paid values are displayed as NaN rather than null.
users_df = spark.createDataFrame(
    data=pd.DataFrame(data=users),
)

# Display the DataFrame contents.
users_df.show()

+---+----------+------------+--------------------+--------------------+-------+-----------+-----------+-------------+-------------------+
| id|first_name|   last_name|               email|       phone_numbers|courses|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+------------+--------------------+--------------------+-------+-----------+-----------+-------------+-------------------+
|  1|    Corrie|Van den Oord|cvandenoord0@etsy...|{+1 234 567 8901,...| [1, 2]|       true|    1000.55|   2021-01-15|2021-02-10 01:15:00|
|  2|  Nikolaus|     Brewitt|nbrewitt1@dailyma...|{+1 234 567 8923,...|    [3]|       true|      900.0|   2021-02-14|2021-02-18 03:33:00|
|  3|    Orelie|      Penney|openney2@vistapri...|{+1 714 512 9752,...| [2, 4]|       true|     850.55|   2021-01-21|2021-03-15 15:16:55|
|  4|     Ashby|    Maddocks|  amaddocks3@home.pl|        {null, null}|     []|      false|        NaN|         null|2021-04-10 17:45:30|
|  5|      Kurt|        Rome|krome

In [9]:
# Show the signature and docstring of DataFrame.drop before using it.
help(users_df.drop)

Help on method drop in module pyspark.sql.dataframe:

drop(*cols) method of pyspark.sql.dataframe.DataFrame instance
    Returns a new :class:`DataFrame` that drops the specified column.
    This is a no-op if schema doesn't contain the given column name(s).
    
    .. versionadded:: 1.4.0
    
    Parameters
    ----------
    cols: str or :class:`Column`
        a name of the column, or the :class:`Column` to drop
    
    Examples
    --------
    >>> df.drop('age').collect()
    [Row(name='Alice'), Row(name='Bob')]
    
    >>> df.drop(df.age).collect()
    [Row(name='Alice'), Row(name='Bob')]
    
    >>> df.join(df2, df.name == df2.name, 'inner').drop(df.name).collect()
    [Row(age=5, height=85, name='Bob')]
    
    >>> df.join(df2, df.name == df2.name, 'inner').drop(df2.name).collect()
    [Row(age=5, name='Bob', height=85)]
    
    >>> df.join(df2, 'name', 'inner').drop('age', 'height').collect()
    [Row(name='Bob')]



In [24]:
# Drop a single column by name and inspect the resulting schema.
(
    users_df
    .drop("last_updated_ts")
    .printSchema()
)

root
 |-- id: long (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- email: string (nullable = true)
 |-- phone_numbers: struct (nullable = true)
 |    |-- mobile: string (nullable = true)
 |    |-- home: string (nullable = true)
 |-- courses: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- is_customer: boolean (nullable = true)
 |-- amount_paid: double (nullable = true)
 |-- customer_from: date (nullable = true)



In [22]:
# Drop a single column by passing a Column object instead of its name.
(
    users_df
    .drop(users_df["last_updated_ts"])
    .printSchema()
)

# Alternative form using the col() function:
# users_df.drop(col("last_updated_ts")).printSchema()

root
 |-- id: long (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- email: string (nullable = true)
 |-- phone_numbers: struct (nullable = true)
 |    |-- mobile: string (nullable = true)
 |    |-- home: string (nullable = true)
 |-- courses: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- is_customer: boolean (nullable = true)
 |-- amount_paid: double (nullable = true)
 |-- customer_from: date (nullable = true)



In [13]:
# drop() ignores a column that is not in the schema, so this call raises no
# error and the schema printed below still contains every original column.
(
    users_df
    .drop(col('user_id'))
    .printSchema()
)

root
 |-- id: long (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- email: string (nullable = true)
 |-- phone_numbers: struct (nullable = true)
 |    |-- mobile: string (nullable = true)
 |    |-- home: string (nullable = true)
 |-- courses: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- is_customer: boolean (nullable = true)
 |-- amount_paid: double (nullable = true)
 |-- customer_from: date (nullable = true)
 |-- last_updated_ts: timestamp (nullable = true)



## Dropping Multiple Columns

In [28]:
# Several columns are dropped at once by passing their names as separate
# string arguments.
(
    users_df
    .drop("first_name", "last_name")
    .show()
)

+---+--------------------+--------------------+-------+-----------+-----------+-------------+-------------------+
| id|               email|       phone_numbers|courses|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+--------------------+--------------------+-------+-----------+-----------+-------------+-------------------+
|  1|cvandenoord0@etsy...|{+1 234 567 8901,...| [1, 2]|       true|    1000.55|   2021-01-15|2021-02-10 01:15:00|
|  2|nbrewitt1@dailyma...|{+1 234 567 8923,...|    [3]|       true|      900.0|   2021-02-14|2021-02-18 03:33:00|
|  3|openney2@vistapri...|{+1 714 512 9752,...| [2, 4]|       true|     850.55|   2021-01-21|2021-03-15 15:16:55|
|  4|  amaddocks3@home.pl|        {null, null}|     []|      false|        NaN|         null|2021-04-10 17:45:30|
|  5|krome4@shutterfly...|{+1 817 934 7142,...|     []|      false|        NaN|         null|2021-04-02 00:55:18|
+---+--------------------+--------------------+-------+-----------+-----------+---------

In [20]:


# Passing several Column objects to drop() is rejected by the PySpark version
# used in this notebook: to drop multiple columns, pass their names as strings.
# The error is the point of this cell, so it is caught and printed instead of
# being left to abort a "Restart & Run All".
# NOTE(review): later PySpark releases may accept multiple Column objects, in
# which case the schema is simply printed -- confirm against the target version.
try:
    users_df.drop(col("last_updated_ts"), col("id")).printSchema()
except TypeError as exc:
    print("TypeError:", exc)


TypeError: each col in the param list should be a string

In [29]:
# A name that is not in the schema ('user_id' here) is ignored; the other
# names are still dropped.
(
    users_df
    .drop('user_id', 'first_name', 'last_name')
    .printSchema()
)
     

root
 |-- id: long (nullable = true)
 |-- email: string (nullable = true)
 |-- phone_numbers: struct (nullable = true)
 |    |-- mobile: string (nullable = true)
 |    |-- home: string (nullable = true)
 |-- courses: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- is_customer: boolean (nullable = true)
 |-- amount_paid: double (nullable = true)
 |-- customer_from: date (nullable = true)
 |-- last_updated_ts: timestamp (nullable = true)



In [32]:
# Columns treated as PII in this example. 'ssn_doesnot_exist' is not in the
# schema and is ignored by drop().
pii_columns = [
    'first_name',
    'last_name',
    'email',
    'phone_numbers',
    'ssn_doesnot_exist',
]

# drop() takes column names as separate arguments, so the list is unpacked.
users_df_nopii = users_df.drop(*pii_columns)

users_df_nopii.printSchema()

root
 |-- id: long (nullable = true)
 |-- courses: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- is_customer: boolean (nullable = true)
 |-- amount_paid: double (nullable = true)
 |-- customer_from: date (nullable = true)
 |-- last_updated_ts: timestamp (nullable = true)



## Dropping Duplicate Records

In [46]:

# Drop duplicates based on certain columns.

# We can use distinct, drop_duplicates or dropDuplicates for these scenarios.

import datetime
# Sample users with deliberate repeats: ids 3 and 4 each appear twice with
# identical values, and id 2 appears twice with a different amount_paid and
# last_updated_ts.
users = [
    dict(zip(
        (
            "id", "first_name", "last_name", "email",
            "is_customer", "amount_paid", "customer_from", "last_updated_ts",
        ),
        record,
    ))
    for record in (
        (1, "Corrie", "Van den Oord", "cvandenoord0@etsy.com", True, 1000.55,
         datetime.date(2021, 1, 15),
         datetime.datetime(2021, 2, 10, 1, 15, 0)),
        (2, "Nikolaus", "Brewitt", "nbrewitt1@dailymail.co.uk", True, 900.0,
         datetime.date(2021, 2, 14),
         datetime.datetime(2021, 2, 18, 3, 33, 0)),
        (3, "Orelie", "Penney", "openney2@vistaprint.com", True, 850.55,
         datetime.date(2021, 1, 21),
         datetime.datetime(2021, 3, 15, 15, 16, 55)),
        (3, "Orelie", "Penney", "openney2@vistaprint.com", True, 850.55,
         datetime.date(2021, 1, 21),
         datetime.datetime(2021, 3, 15, 15, 16, 55)),
        (4, "Ashby", "Maddocks", "amaddocks3@home.pl", False, None,
         None,
         datetime.datetime(2021, 4, 10, 17, 45, 30)),
        (4, "Ashby", "Maddocks", "amaddocks3@home.pl", False, None,
         None,
         datetime.datetime(2021, 4, 10, 17, 45, 30)),
        (5, "Kurt", "Rome", "krome4@shutterfly.com", False, None,
         None,
         datetime.datetime(2021, 4, 2, 0, 55, 18)),
        (2, "Nikolaus", "Brewitt", "nbrewitt1@dailymail.co.uk", True, 1050.0,
         datetime.date(2021, 2, 14),
         datetime.datetime(2021, 2, 25, 3, 33, 0)),
    )
]


import pandas as pd
# Load the sample records into Spark by way of a pandas DataFrame.
users_df = spark.createDataFrame(
    data=pd.DataFrame(data=users),
)

# Show the docstring of DataFrame.distinct.
help(users_df.distinct)

Help on method distinct in module pyspark.sql.dataframe:

distinct() method of pyspark.sql.dataframe.DataFrame instance
    Returns a new :class:`DataFrame` containing the distinct rows in this :class:`DataFrame`.
    
    .. versionadded:: 1.3.0
    
    Examples
    --------
    >>> df.distinct().count()
    2



In [47]:
# Rows that match in every column collapse into one. Both id=2 rows remain
# because their amount_paid and last_updated_ts values differ.
(
    users_df
    .distinct()
    .show()
)

+---+----------+------------+--------------------+-----------+-----------+-------------+-------------------+
| id|first_name|   last_name|               email|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+------------+--------------------+-----------+-----------+-------------+-------------------+
|  1|    Corrie|Van den Oord|cvandenoord0@etsy...|       true|    1000.55|   2021-01-15|2021-02-10 01:15:00|
|  2|  Nikolaus|     Brewitt|nbrewitt1@dailyma...|       true|      900.0|   2021-02-14|2021-02-18 03:33:00|
|  3|    Orelie|      Penney|openney2@vistapri...|       true|     850.55|   2021-01-21|2021-03-15 15:16:55|
|  4|     Ashby|    Maddocks|  amaddocks3@home.pl|      false|        NaN|         null|2021-04-10 17:45:30|
|  5|      Kurt|        Rome|krome4@shutterfly...|      false|        NaN|         null|2021-04-02 00:55:18|
|  2|  Nikolaus|     Brewitt|nbrewitt1@dailyma...|       true|     1050.0|   2021-02-14|2021-02-25 03:33:00|
+---+----------+---

In [48]:
# Number of distinct rows.
(
    users_df
    .distinct()
    .count()
)

6

In [49]:
# Show the help for drop_duplicates.
help(users_df.drop_duplicates)

# drop_duplicates is an alias of dropDuplicates (the help header below names dropDuplicates)

Help on method dropDuplicates in module pyspark.sql.dataframe:

dropDuplicates(subset=None) method of pyspark.sql.dataframe.DataFrame instance
    :func:`drop_duplicates` is an alias for :func:`dropDuplicates`.
    
    .. versionadded:: 1.4



In [50]:
# Show the full docstring of dropDuplicates, including its optional `subset` parameter.
help(users_df.dropDuplicates)

Help on method dropDuplicates in module pyspark.sql.dataframe:

dropDuplicates(subset=None) method of pyspark.sql.dataframe.DataFrame instance
    Return a new :class:`DataFrame` with duplicate rows removed,
    optionally only considering certain columns.
    
    For a static batch :class:`DataFrame`, it just drops duplicate rows. For a streaming
    :class:`DataFrame`, it will keep all data across triggers as intermediate state to drop
    duplicates rows. You can use :func:`withWatermark` to limit how late the duplicate data can
    be and system will accordingly limit the state. In addition, too late data older than
    watermark will be dropped to avoid any possibility of duplicates.
    
    :func:`drop_duplicates` is an alias for :func:`dropDuplicates`.
    
    .. versionadded:: 1.4.0
    
    Examples
    --------
    >>> from pyspark.sql import Row
    >>> df = sc.parallelize([ \
    ...     Row(name='Alice', age=5, height=80), \
    ...     Row(name='Alice', age=5, height=8

In [54]:


# Duplicates can be dropped based on selected columns only: pass the column
# names to dropDuplicates as a list.
# NOTE(review): which of the rows sharing an id is kept is presumably not
# guaranteed -- confirm before relying on it.
users_df.dropDuplicates(['id']).show()

# A bare string is rejected by the PySpark version used in this notebook,
# which expects `subset` to be a list of columns. The error is the point of
# this example, so it is caught and printed instead of being left to abort a
# "Restart & Run All".
# NOTE(review): later PySpark releases may accept a plain string, in which
# case the de-duplicated rows are simply shown -- confirm against the target
# version.
try:
    users_df.dropDuplicates('id').show()
except TypeError as exc:
    print("TypeError:", exc)



+---+----------+------------+--------------------+-----------+-----------+-------------+-------------------+
| id|first_name|   last_name|               email|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+------------+--------------------+-----------+-----------+-------------+-------------------+
|  1|    Corrie|Van den Oord|cvandenoord0@etsy...|       true|    1000.55|   2021-01-15|2021-02-10 01:15:00|
|  2|  Nikolaus|     Brewitt|nbrewitt1@dailyma...|       true|      900.0|   2021-02-14|2021-02-18 03:33:00|
|  3|    Orelie|      Penney|openney2@vistapri...|       true|     850.55|   2021-01-21|2021-03-15 15:16:55|
|  4|     Ashby|    Maddocks|  amaddocks3@home.pl|      false|        NaN|         null|2021-04-10 17:45:30|
|  5|      Kurt|        Rome|krome4@shutterfly...|      false|        NaN|         null|2021-04-02 00:55:18|
+---+----------+------------+--------------------+-----------+-----------+-------------+-------------------+



TypeError: Parameter 'subset' must be a list of columns

In [59]:
# Rows are de-duplicated on the (id, amount_paid) combination. Both id=2 rows
# remain because their amount_paid values differ.
(
    users_df
    .dropDuplicates(['id', 'amount_paid'])
    .show()
)

+---+----------+------------+--------------------+-----------+-----------+-------------+-------------------+
| id|first_name|   last_name|               email|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+------------+--------------------+-----------+-----------+-------------+-------------------+
|  1|    Corrie|Van den Oord|cvandenoord0@etsy...|       true|    1000.55|   2021-01-15|2021-02-10 01:15:00|
|  2|  Nikolaus|     Brewitt|nbrewitt1@dailyma...|       true|      900.0|   2021-02-14|2021-02-18 03:33:00|
|  2|  Nikolaus|     Brewitt|nbrewitt1@dailyma...|       true|     1050.0|   2021-02-14|2021-02-25 03:33:00|
|  3|    Orelie|      Penney|openney2@vistapri...|       true|     850.55|   2021-01-21|2021-03-15 15:16:55|
|  4|     Ashby|    Maddocks|  amaddocks3@home.pl|      false|        NaN|         null|2021-04-10 17:45:30|
|  5|      Kurt|        Rome|krome4@shutterfly...|      false|        NaN|         null|2021-04-02 00:55:18|
+---+----------+---

## Dropping Based on Null Values

In [60]:
# Dropping records based on null values.

# Drop records when all column values are null.

# Drop records when any column value is null.

# Drop records that have fewer than `thresh` non-null values.

# Drop records when any or all of the column values are null for a given subset of columns.

# We can use df.na.drop or df.dropna to handle records that contain null values.

In [61]:
import datetime
# Sample users for the null-handling examples. The list mixes complete
# records, exact repeats (ids 3 and 4), records in which every field is None,
# and records in which only some fields are set.
users = [
    dict(zip(
        (
            "id", "first_name", "last_name", "email",
            "is_customer", "amount_paid", "customer_from", "last_updated_ts",
        ),
        record,
    ))
    for record in (
        (1, "Corrie", "Van den Oord", "cvandenoord0@etsy.com", True, 1000.55,
         datetime.date(2021, 1, 15),
         datetime.datetime(2021, 2, 10, 1, 15, 0)),
        # Every field missing.
        (None, None, None, None, None, None,
         None,
         None),
        (2, "Nikolaus", "Brewitt", "nbrewitt1@dailymail.co.uk", True, 900.0,
         datetime.date(2021, 2, 14),
         datetime.datetime(2021, 2, 18, 3, 33, 0)),
        (3, "Orelie", "Penney", "openney2@vistaprint.com", True, 850.55,
         datetime.date(2021, 1, 21),
         datetime.datetime(2021, 3, 15, 15, 16, 55)),
        (3, "Orelie", "Penney", "openney2@vistaprint.com", True, 850.55,
         datetime.date(2021, 1, 21),
         datetime.datetime(2021, 3, 15, 15, 16, 55)),
        (4, "Ashby", "Maddocks", "amaddocks3@home.pl", False, None,
         None,
         datetime.datetime(2021, 4, 10, 17, 45, 30)),
        (4, "Ashby", "Maddocks", "amaddocks3@home.pl", False, None,
         None,
         datetime.datetime(2021, 4, 10, 17, 45, 30)),
        (5, "Kurt", "Rome", "krome4@shutterfly.com", False, None,
         None,
         datetime.datetime(2021, 4, 2, 0, 55, 18)),
        # Every field missing.
        (None, None, None, None, None, None,
         None,
         None),
        # Only the id is set.
        (5, None, None, None, None, None,
         None,
         None),
        # Only the email is set.
        (None, None, None, "nbrewitt1@dailymail.co.uk", None, None,
         None,
         None),
        # No id and no email, but name, is_customer and timestamp are set.
        (None, "Kurt", "Rome", None, False, None,
         None,
         datetime.datetime(2021, 4, 2, 0, 55, 18)),
        (2, "Nikolaus", "Brewitt", "nbrewitt1@dailymail.co.uk", True, 1050.0,
         datetime.date(2021, 2, 14),
         datetime.datetime(2021, 2, 25, 3, 33, 0)),
    )
]


import pandas as pd
# Load the records into Spark by way of a pandas DataFrame. The effect of the
# pandas round trip is visible in the output below: `id` is shown as a
# floating-point value (1.0, 2.0, ...) and the missing id / amount_paid values
# are displayed as NaN rather than null.
# NOTE(review): confirm that the null-dropping calls used later treat these
# NaN values as missing.
users_df = spark.createDataFrame(
    data=pd.DataFrame(data=users),
)

# Display the DataFrame contents.
users_df.show()
     

+---+----------+------------+--------------------+-----------+-----------+-------------+-------------------+
| id|first_name|   last_name|               email|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+------------+--------------------+-----------+-----------+-------------+-------------------+
|1.0|    Corrie|Van den Oord|cvandenoord0@etsy...|       true|    1000.55|   2021-01-15|2021-02-10 01:15:00|
|NaN|      null|        null|                null|       null|        NaN|         null|               null|
|2.0|  Nikolaus|     Brewitt|nbrewitt1@dailyma...|       true|      900.0|   2021-02-14|2021-02-18 03:33:00|
|3.0|    Orelie|      Penney|openney2@vistapri...|       true|     850.55|   2021-01-21|2021-03-15 15:16:55|
|3.0|    Orelie|      Penney|openney2@vistapri...|       true|     850.55|   2021-01-21|2021-03-15 15:16:55|
|4.0|     Ashby|    Maddocks|  amaddocks3@home.pl|      false|        NaN|         null|2021-04-10 17:45:30|
|4.0|     Ashby|   

In [63]:
# Show the signature and docstring of DataFrame.na.drop (how, thresh, subset).
help(users_df.na.drop)

# DataFrame.na.drop and DataFrame.dropna are aliases of each other

Help on method drop in module pyspark.sql.dataframe:

drop(how='any', thresh=None, subset=None) method of pyspark.sql.dataframe.DataFrameNaFunctions instance
    Returns a new :class:`DataFrame` omitting rows with null values.
    :func:`DataFrame.dropna` and :func:`DataFrameNaFunctions.drop` are aliases of each other.
    
    .. versionadded:: 1.3.1
    
    Parameters
    ----------
    how : str, optional
        'any' or 'all'.
        If 'any', drop a row if it contains any nulls.
        If 'all', drop a row only if all its values are null.
    thresh: int, optional
        default None
        If specified, drop rows that have less than `thresh` non-null values.
        This overwrites the `how` parameter.
    subset : str, tuple or list, optional
        optional list of column names to consider.
    
    Examples
    --------
    >>> df4.na.drop().show()
    +---+------+-----+
    |age|height| name|
    +---+------+-----+
    | 10|    80|Alice|
    +---+------+-----+



In [64]:
# Show the signature and docstring of DataFrame.dropna (how, thresh, subset).
help(users_df.dropna)

Help on method dropna in module pyspark.sql.dataframe:

dropna(how='any', thresh=None, subset=None) method of pyspark.sql.dataframe.DataFrame instance
    Returns a new :class:`DataFrame` omitting rows with null values.
    :func:`DataFrame.dropna` and :func:`DataFrameNaFunctions.drop` are aliases of each other.
    
    .. versionadded:: 1.3.1
    
    Parameters
    ----------
    how : str, optional
        'any' or 'all'.
        If 'any', drop a row if it contains any nulls.
        If 'all', drop a row only if all its values are null.
    thresh: int, optional
        default None
        If specified, drop rows that have less than `thresh` non-null values.
        This overwrites the `how` parameter.
    subset : str, tuple or list, optional
        optional list of column names to consider.
    
    Examples
    --------
    >>> df4.na.drop().show()
    +---+------+-----+
    |age|height| name|
    +---+------+-----+
    | 10|    80|Alice|
    +---+------+-----+



23/07/01 20:45:52 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 7183133 ms exceeds timeout 120000 ms
23/07/01 20:45:52 WARN SparkContext: Killing executors is not supported by current scheduler.
