In [2]:
from pyspark.sql import SparkSession
# Obtain the Spark session that later cells use to build DataFrames.
# getOrCreate() hands back an already-running session when one exists.
spark = (
    SparkSession.builder
    .appName("Read CSV example")
    .getOrCreate()
)

In [3]:
# Hand-written sample rows; each tuple has five positional values that are
# later mapped to (user_id, name, age, city, salary).
# The values appear to be deliberately messy so that the cells below have
# something to validate and clean.
raw_users = [
    ("U001", "Amit", "29", "Hyderabad", "50000"),
    # age written out in words instead of digits
    ("U002", "Neha", "Thirty Two", "Delhi", "62000"),
    # missing age, salary carries a "k" suffix
    ("U003", "Ravi", None, "Bangalore", "45k"),
    # salary is a Python int here, unlike the str values in the other rows
    ("U004", "Pooja", "28", "Mumbai", 58000),
    # missing name, salary is an empty string
    ("U005", None, "31", "Chennai", ""),
]

In [10]:
from pyspark.sql.types import(
    StructType,
    StructField,
    StringType,IntegerType,LongType
)
# Schema for the raw rows: every column is declared as a string, and only
# user_id is marked non-nullable. Fields are generated from (name, nullable)
# pairs, in column order.
raw_schema = StructType([
    StructField(column_name, StringType(), nullable=is_nullable)
    for column_name, is_nullable in (
        ("user_id", False),
        ("name", True),
        ("age", True),
        ("city", True),
        ("salary", True),
    )
])

In [12]:
# Build the raw DataFrame from the in-memory rows using the all-string schema,
# then print it for inspection.
df_raw = spark.createDataFrame(data=raw_users, schema=raw_schema)
df_raw.show()

+-------+-----+----------+---------+------+
|user_id| name|       age|     city|salary|
+-------+-----+----------+---------+------+
|   U001| Amit|        29|Hyderabad| 50000|
|   U002| Neha|Thirty Two|    Delhi| 62000|
|   U003| Ravi|      NULL|Bangalore|   45k|
|   U004|Pooja|        28|   Mumbai| 58000|
|   U005| NULL|        31|  Chennai|      |
+-------+-----+----------+---------+------+



In [15]:
from pyspark.sql.functions import expr, col
# Add integer versions of age and salary next to the original string columns.
# try_cast produces NULL for values it cannot convert (e.g. "Thirty Two", "45k",
# or an empty string) rather than failing the whole query.
df_check = df_raw.withColumn(
    "age_int", expr("try_cast(age as int)")
).withColumn(
    "salary_int", expr("try_cast(salary as int)")
)

# Keep rows where at least one integer column is NULL.
# Note: this also matches rows whose source value was already NULL, not only
# rows whose value failed to convert.
df_failed = df_check.where(
    col("age_int").isNull() | col("salary_int").isNull()
)
df_failed.show()

+-------+----+----------+---------+------+-------+----------+
|user_id|name|       age|     city|salary|age_int|salary_int|
+-------+----+----------+---------+------+-------+----------+
|   U002|Neha|Thirty Two|    Delhi| 62000|   NULL|     62000|
|   U003|Ravi|      NULL|Bangalore|   45k|   NULL|      NULL|
|   U005|NULL|        31|  Chennai|      |     31|      NULL|
+-------+----+----------+---------+------+-------+----------+



In [16]:
from pyspark.sql.functions import coalesce,lit
# Preview df_raw with NULL names replaced by the placeholder "UNKNOWN".
# The result is only displayed; df_raw itself is left untouched.
(
    df_raw
    .withColumn("name", coalesce(col("name"), lit("UNKNOWN")))
    .show()
)

+-------+-------+----------+---------+------+
|user_id|   name|       age|     city|salary|
+-------+-------+----------+---------+------+
|   U001|   Amit|        29|Hyderabad| 50000|
|   U002|   Neha|Thirty Two|    Delhi| 62000|
|   U003|   Ravi|      NULL|Bangalore|   45k|
|   U004|  Pooja|        28|   Mumbai| 58000|
|   U005|UNKNOWN|        31|  Chennai|      |
+-------+-------+----------+---------+------+



In [17]:
# Drop the rows whose age is NULL. Only missing ages are removed here:
# non-numeric ages such as "Thirty Two" are still present in the result.
df_clean = df_raw.where(col("age").isNotNull())
df_clean.show()

+-------+-----+----------+---------+------+
|user_id| name|       age|     city|salary|
+-------+-----+----------+---------+------+
|   U001| Amit|        29|Hyderabad| 50000|
|   U002| Neha|Thirty Two|    Delhi| 62000|
|   U004|Pooja|        28|   Mumbai| 58000|
|   U005| NULL|        31|  Chennai|      |
+-------+-----+----------+---------+------+

