In [1]:
from pyspark.sql import SparkSession
spark=SparkSession.builder\
    .appName("Day21_ex1")\
    .getOrCreate()

In [3]:
from pyspark.sql import types as T
sales_data = [
    ("TXN001","Delhi ","Laptop","Electronics","45000","2024-01-05","Completed"),
    ("TXN002","Mumbai","Mobile ","electronics","32000","05/01/2024","Completed"),
    ("TXN003","Bangalore","Tablet"," Electronics ","30000","2024/01/06","Completed"),
    ("TXN004","Delhi","Laptop","Electronics","","2024-01-07","Cancelled"),
    ("TXN005","Chennai","Mobile","Electronics","invalid","2024-01-08","Completed"),
    ("TXN006","Mumbai","Tablet","Electronics",None,"2024-01-08","Completed"),
    ("TXN007","Delhi","Laptop","electronics","45000","09-01-2024","Completed"),
    ("TXN008","Bangalore","Mobile","Electronics","28000","2024-01-09","Completed"),
    ("TXN009","Mumbai","Laptop","Electronics","55000","2024-01-10","Completed"),
    ("TXN009","Mumbai","Laptop","Electronics","55000","2024-01-10","Completed")
]
customer_data = [
("C001","Delhi","Premium"),
("C002","Mumbai","Standard"),
("C003","Bangalore","Premium"),
("C004","Chennai","Standard"),
("C005","Mumbai","Premium")
]
city_lookup = [
("Delhi","Tier-1"),
("Mumbai","Tier-1"),
("Bangalore","Tier-1"),
("Chennai","Tier-2")
]

sales_schema = T.StructType([
    T.StructField("txn_id",     T.StringType(), True),
    T.StructField("city",       T.StringType(), True),
    T.StructField("product",    T.StringType(), True),
    T.StructField("category",   T.StringType(), True),   # raw as-is (mixed case/spaces)
    T.StructField("amount",     T.StringType(), True),   # raw string (may be '', 'invalid', None)
    T.StructField("order_date", T.StringType(),True),   # mixed formats
    T.StructField("status",     T.StringType(), True),   # truncated/variants
])

customer_schema = T.StructType([
    T.StructField("customer_id", T.StringType(), True),
    T.StructField("city",        T.StringType(), True),
    T.StructField("segment",     T.StringType(), True),
])

lookup_schema = T.StructType([
    T.StructField("city", T.StringType(), False),
    T.StructField("tier", T.StringType(), False),
])

sales_df = spark.createDataFrame(sales_data, schema=sales_schema)
customers_df = spark.createDataFrame(customer_data, schema=customer_schema)
city_lookup_df = spark.createDataFrame(city_lookup, schema=lookup_schema)







In [11]:
from pyspark.sql.functions import col,to_date,coalesce
#clean_df=sales_df.withColumn("amount",col("amount").cast(T.IntegerType()))

clean_df=sales_df.withColumn("order_date",coalesce(to_date(col("order_date"),"dd/MM/yyyy"),to_date(col("order_date"),"yyyy-MM-dd")
  )
)
#clean_df.show()


In [13]:
from pyspark.sql.functions import col
invalid_df=clean_df.filter(col("amount").isNull() | col("order_date").isNull())

#invalid_df.show()

Phase 2

In [15]:

sales_df = sales_df.withColumn("city", F.upper(F.trim(F.col("city")))) \
                   .withColumn("product", F.upper(F.trim(F.col("product")))) \
                  .withColumn("category", F.upper(F.trim(F.col("category"))))
sales_df.show()
##5

+------+---------+-------+-----------+-------+----------+---------+
|txn_id|     city|product|   category| amount|order_date|   status|
+------+---------+-------+-----------+-------+----------+---------+
|TXN001|    DELHI| LAPTOP|ELECTRONICS|  45000|2024-01-05|Completed|
|TXN002|   MUMBAI| MOBILE|ELECTRONICS|  32000|05/01/2024|Completed|
|TXN003|BANGALORE| TABLET|ELECTRONICS|  30000|2024/01/06|Completed|
|TXN004|    DELHI| LAPTOP|ELECTRONICS|       |2024-01-07|Cancelled|
|TXN005|  CHENNAI| MOBILE|ELECTRONICS|invalid|2024-01-08|Completed|
|TXN006|   MUMBAI| TABLET|ELECTRONICS|   NULL|2024-01-08|Completed|
|TXN007|    DELHI| LAPTOP|ELECTRONICS|  45000|09-01-2024|Completed|
|TXN008|BANGALORE| MOBILE|ELECTRONICS|  28000|2024-01-09|Completed|
|TXN009|   MUMBAI| LAPTOP|ELECTRONICS|  55000|2024-01-10|Completed|
|TXN009|   MUMBAI| LAPTOP|ELECTRONICS|  55000|2024-01-10|Completed|
+------+---------+-------+-----------+-------+----------+---------+



In [16]:

sales_df = sales_df.withColumn("category", F.upper(F.trim(F.col("category"))))
sales_df.show()##6


+------+---------+-------+-----------+-------+----------+---------+
|txn_id|     city|product|   category| amount|order_date|   status|
+------+---------+-------+-----------+-------+----------+---------+
|TXN001|    DELHI| LAPTOP|ELECTRONICS|  45000|2024-01-05|Completed|
|TXN002|   MUMBAI| MOBILE|ELECTRONICS|  32000|05/01/2024|Completed|
|TXN003|BANGALORE| TABLET|ELECTRONICS|  30000|2024/01/06|Completed|
|TXN004|    DELHI| LAPTOP|ELECTRONICS|       |2024-01-07|Cancelled|
|TXN005|  CHENNAI| MOBILE|ELECTRONICS|invalid|2024-01-08|Completed|
|TXN006|   MUMBAI| TABLET|ELECTRONICS|   NULL|2024-01-08|Completed|
|TXN007|    DELHI| LAPTOP|ELECTRONICS|  45000|09-01-2024|Completed|
|TXN008|BANGALORE| MOBILE|ELECTRONICS|  28000|2024-01-09|Completed|
|TXN009|   MUMBAI| LAPTOP|ELECTRONICS|  55000|2024-01-10|Completed|
|TXN009|   MUMBAI| LAPTOP|ELECTRONICS|  55000|2024-01-10|Completed|
+------+---------+-------+-----------+-------+----------+---------+



In [19]:
from pyspark.sql import functions as F

sales_df = sales_df.withColumn("amount_int", F.regexp_extract(F.col("amount"), r"\d+", 0).cast("int")) \
                   .withColumn("amount_invalid", F.col("amount_int").isNull())
sales_df.show()


{"ts": "2025-12-23 05:41:29.350", "level": "ERROR", "logger": "DataFrameQueryContextLogger", "msg": "[CAST_INVALID_INPUT] The value '' of the type \"STRING\" cannot be cast to \"INT\" because it is malformed. Correct the value as per the syntax, or change its target type. Use `try_cast` to tolerate malformed input and return NULL instead. SQLSTATE: 22018", "context": {"file": "line 3 in cell [19]", "line": "", "fragment": "cast", "errorClass": "CAST_INVALID_INPUT"}, "exception": {"class": "Py4JJavaError", "msg": "An error occurred while calling o420.showString.\n: org.apache.spark.SparkNumberFormatException: [CAST_INVALID_INPUT] The value '' of the type \"STRING\" cannot be cast to \"INT\" because it is malformed. Correct the value as per the syntax, or change its target type. Use `try_cast` to tolerate malformed input and return NULL instead. SQLSTATE: 22018\n== DataFrame ==\n\"cast\" was called from\nline 3 in cell [19]\n\n\tat org.apache.spark.sql.errors.QueryExecutionErrors$.invali

NumberFormatException: [CAST_INVALID_INPUT] The value '' of the type "STRING" cannot be cast to "INT" because it is malformed. Correct the value as per the syntax, or change its target type. Use `try_cast` to tolerate malformed input and return NULL instead. SQLSTATE: 22018
== DataFrame ==
"cast" was called from
line 3 in cell [19]
