In [7]:
'''
You are provided with multiple CSV files containing customer information, stored in the /datasets/customers_data/ directory. Each file has the same schema but represents data for different time periods.

Your task is to load all CSV files together and include additional metadata columns that capture:

the full file path,
the file name, and
the file size for each record.
Input
File Path: /datasets/customers_data/*.csv

Schema:

customer_id (int)
name (string)
city (string)
age (int)
Example Input Table (from one CSV file):

customer_id	name	city	age
101	John	Delhi	29
102	Ravi	Pune	34
Example Files:

/datasets/customers_data/customers_2024_01.csv
/datasets/customers_data/customers_2024_02.csv
Output
Add metadata columns to each record indicating which file it came from and the fileâ€™s details.

Output Schema:

customer_id (int)
name (string)
city (string)
age (int)
file_path (string)
file_name (string)
file_size (long)
Example Output Table:

customer_id	name	city	age	file_path	file_name	file_size
101	John	Delhi	29	file:/datasets/customers_data/customers_2024_01.csv	customers_2024_01.csv	2048
102	Ravi	Pune	34	file:/datasets/customers_data/customers_2024_02.csv	customers_2024_02.csv	1024

'''

# Initialize Spark session
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

spark = SparkSession.builder.appName('Spark Playground').getOrCreate()

schema = StructType([
  StructField("customer_id", IntegerType(), True),
  StructField("name", StringType(), True),
  StructField("city", StringType(), True),
  StructField("age", IntegerType(), True),
])

df = (
  spark.read
  .option("header", True)
  .schema(schema)
  .csv("./customers_data/")
)

df_result = (
  df.withColumn("file_path", F.input_file_name())
  .withColumn("file_name",
              F.regexp_extract(F.input_file_name(), r"([^/]+$)", 1))
  .withColumn("file_size", F.input_file_block_length())
  .orderBy("customer_id")
)

# Display result.
df_result.show()

+-----------+-----+---------+---+--------------------+--------------------+---------+
|customer_id| name|     city|age|           file_path|           file_name|file_size|
+-----------+-----+---------+---+--------------------+--------------------+---------+
|        101| John|    Delhi| 29|file:///Users/nit...|customers_2024_01...|       83|
|        102| Ravi|     Pune| 34|file:///Users/nit...|customers_2024_01...|       83|
|        103|Alice|Bangalore| 27|file:///Users/nit...|customers_2024_01...|       83|
|        104|Maria|   Mumbai| 31|file:///Users/nit...|customers_2024_02...|       88|
|        105| Chen|Hyderabad| 36|file:///Users/nit...|customers_2024_02...|       88|
|        106|Arjun|  Chennai| 24|file:///Users/nit...|customers_2024_02...|       88|
|        107| Sara|  Kolkata| 28|file:///Users/nit...|customers_2024_03...|       87|
|        108|David|    Noida| 41|file:///Users/nit...|customers_2024_03...|       87|
|        109|Priya|Ahmedabad| 33|file:///Users/nit...|