**How to return list of all columns as StructField objects?**

| Code Part                                                    | Purpose                                    |
| ------------------------------------------------------------ | ------------------------------------------ |
| `df.schema`                                                  | Returns full schema object                 |
| `df.schema.fields`                                           | Returns **list of all columns** as **StructField** objects |
| `field.name`                                                 | Gets the **column name** from each **StructField** |
| `field.dataType`                                             | Column data type                           |
| `field.nullable`                                             | Whether nulls are allowed                  |
| `for field in df.schema.fields:`                             | Iterate over all schema fields             |
| `any(field.name == "colname" for field in df.schema.fields)` | Check if column exists (Returns **True** if any field matches) |
| `[field.name for field in df.schema.fields]`                 | Extract list of column names               |

In [0]:
# Load the company-level CSV: the first row supplies the column names and
# Spark infers each column's data type from the file contents.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/Volumes/@azureadb/pyspark/unionby/company_level.csv")
)

# Preview the first five rows.
display(df.limit(5))

start_date,product_url,category,default_group,source_target,cloud_flatform,session_id,session_name,status_name,status_type,sessions,product_id,load datetime
2025-08-25,shop.sony.bpl,mobile,wifi-network,google,azure / aws / gcc,9876543,first_visit,first_visit,Not Available,5,409516064,2025-09-02T19:10:35
2025-08-26,shop.sony.bpl,mobile,wifi-network,google,azure / aws / gcc,9876544,purchase,organic,Not Available,12,409516064,2025-09-02T19:10:36
2025-08-27,shop.sony.bpl,mobile,wifi-network,google,azure / aws / gcc,9876545,search,network,Not Available,16,409516064,2025-09-02T19:10:37
2025-08-28,shop.sony.bpl,mobile,wifi-network,google,azure / aws / gcc,9876546,search,scroll,Not Available,22,409516064,2025-09-02T19:10:38
2025-08-29,shop.sony.bpl,mobile,wifi-network,google,azure / aws / gcc,9876547,search,organic,Not Available,25,409516064,2025-09-02T19:10:39


In [0]:
# Print the DataFrame's schema as a tree: one line per column showing its
# name, data type, and nullable flag.
df.printSchema()

root
 |-- start_date: date (nullable = true)
 |-- product_url: string (nullable = true)
 |-- category: string (nullable = true)
 |-- default_group: string (nullable = true)
 |-- source_target: string (nullable = true)
 |-- cloud_flatform: string (nullable = true)
 |-- session_id: integer (nullable = true)
 |-- session_name: string (nullable = true)
 |-- status_name: string (nullable = true)
 |-- status_type: string (nullable = true)
 |-- sessions: integer (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- load datetime: string (nullable = true)



In [0]:
# df.schema.fields is a Python list holding one StructField per column;
# print it under a banner line.
print("\n=== All Fields in Schema ===", df.schema.fields, sep="\n")


=== All Fields in Schema ===
[StructField('start_date', DateType(), True), StructField('product_url', StringType(), True), StructField('category', StringType(), True), StructField('default_group', StringType(), True), StructField('source_target', StringType(), True), StructField('cloud_flatform', StringType(), True), StructField('session_id', IntegerType(), True), StructField('session_name', StringType(), True), StructField('status_name', StringType(), True), StructField('status_type', StringType(), True), StructField('sessions', IntegerType(), True), StructField('product_id', IntegerType(), True), StructField('load datetime', StringType(), True)]


In [0]:
# Iterate over the schema: each element of df.schema.fields is a single
# StructField, printed here on its own line.
for field in df.schema.fields:
    print(field)

StructField('start_date', DateType(), True)
StructField('product_url', StringType(), True)
StructField('category', StringType(), True)
StructField('default_group', StringType(), True)
StructField('source_target', StringType(), True)
StructField('cloud_flatform', StringType(), True)
StructField('session_id', IntegerType(), True)
StructField('session_name', StringType(), True)
StructField('status_name', StringType(), True)
StructField('status_type', StringType(), True)
StructField('sessions', IntegerType(), True)
StructField('product_id', IntegerType(), True)
StructField('load datetime', StringType(), True)


In [0]:
# Copy the schema's StructField objects into a new list. Despite the variable
# name, the list holds complete StructField objects, not just column names.
StructField_names = list(df.schema.fields)
print(StructField_names)

[StructField('start_date', DateType(), True), StructField('product_url', StringType(), True), StructField('category', StringType(), True), StructField('default_group', StringType(), True), StructField('source_target', StringType(), True), StructField('cloud_flatform', StringType(), True), StructField('session_id', IntegerType(), True), StructField('session_name', StringType(), True), StructField('status_name', StringType(), True), StructField('status_type', StringType(), True), StructField('sessions', IntegerType(), True), StructField('product_id', IntegerType(), True), StructField('load datetime', StringType(), True)]


In [0]:
# Look up a single StructField by column name and read its data type.
df.schema['start_date'].dataType

DateType()

In [0]:
# Report the three StructField attributes (name, dataType, nullable) for
# every column, followed by a divider line after each column.
print("\n=== Detailed Field Information ===")
for field in df.schema.fields:
    print(
        f"Column Name   : {field.name}",
        f"Data Type     : {field.dataType}",
        f"Nullable      : {field.nullable}",
        "-" * 35,
        sep="\n",
    )


=== Detailed Field Information ===
Column Name   : start_date
Data Type     : DateType()
Nullable      : True
-----------------------------------
Column Name   : product_url
Data Type     : StringType()
Nullable      : True
-----------------------------------
Column Name   : category
Data Type     : StringType()
Nullable      : True
-----------------------------------
Column Name   : default_group
Data Type     : StringType()
Nullable      : True
-----------------------------------
Column Name   : source_target
Data Type     : StringType()
Nullable      : True
-----------------------------------
Column Name   : cloud_flatform
Data Type     : StringType()
Nullable      : True
-----------------------------------
Column Name   : session_id
Data Type     : IntegerType()
Nullable      : True
-----------------------------------
Column Name   : session_name
Data Type     : StringType()
Nullable      : True
-----------------------------------
Column Name   : status_name
Data Type     : String

In [0]:
# Collect just the name attribute of every StructField, in schema order.
column_names = [schema_field.name for schema_field in df.schema.fields]
print()
print("Column Names:", column_names)


Column Names: ['start_date', 'product_url', 'category', 'default_group', 'source_target', 'cloud_flatform', 'session_id', 'session_name', 'status_name', 'status_type', 'sessions', 'product_id', 'load datetime']


In [0]:
# Collect the DataType object of every column, in schema order.
datatypes = [schema_field.dataType for schema_field in df.schema.fields]
print(datatypes)

[DateType(), StringType(), StringType(), StringType(), StringType(), StringType(), IntegerType(), StringType(), StringType(), StringType(), IntegerType(), IntegerType(), StringType()]


In [0]:
# Collect the nullable flag (True/False) of every column, in schema order.
fiels_nullable = [schema_field.nullable for schema_field in df.schema.fields]
print(fiels_nullable)

[True, True, True, True, True, True, True, True, True, True, True, True, True]


**1) Use df.schema.fields for Conditional Checks**

In [0]:
# Check whether specific columns exist using df.schema.fields
def has_column(frame, column_name):
    """Return True if the schema of `frame` has a field named `column_name`.

    Args:
        frame: Object exposing `schema.fields`, an iterable of fields that
            each have a `name` attribute (here, the Spark DataFrame `df`).
        column_name: Column name to look for; compared with `==`, so the
            match is exact and case-sensitive.

    Returns:
        bool: True if at least one field name equals `column_name`,
        otherwise False.
    """
    return any(field.name == column_name for field in frame.schema.fields)


# Emit a blank line first so the report is separated from earlier output.
print()

# Run the same existence check for every column of interest instead of
# repeating the if/else block once per column.
for column_name in ("department", "bonus", "category", "session_name", "product_id"):
    if has_column(df, column_name):
        print(f" Column '{column_name}' exists in DataFrame")
    else:
        print(f" Column '{column_name}' not found")


 Column 'department' not found
 Column 'bonus' not found
 Column 'category' exists in DataFrame
 Column 'session_name' exists in DataFrame
 Column 'product_id' exists in DataFrame


      (field.name == "category" for field in df.schema.fields)
- This generator expression loops through **each field** in **df.schema.fields** and checks:
  - “Is this field’s name **equal** to **category**?”
  - It produces a **sequence of Boolean values**, one per field, generated lazily — for example **False, True, False** when only one of the columns (like "category") matches.

      if any(field.name == "category" for field in df.schema.fields):
- The built-in **any()** function returns:
  - **True** → if **at least one** of the conditions is **True**.
  - **False** → if **none** of the conditions is **True**.

In [0]:
# Pair each column name with the string form of its data type.
column_details = [
    (schema_field.name, str(schema_field.dataType))
    for schema_field in df.schema.fields
]
print()
print("Column Details (ColumnName → DataType):", column_details)


Column Details (ColumnName → DataType): [('start_date', 'DateType()'), ('product_url', 'StringType()'), ('category', 'StringType()'), ('default_group', 'StringType()'), ('source_target', 'StringType()'), ('cloud_flatform', 'StringType()'), ('session_id', 'IntegerType()'), ('session_name', 'StringType()'), ('status_name', 'StringType()'), ('status_type', 'StringType()'), ('sessions', 'IntegerType()'), ('product_id', 'IntegerType()'), ('load datetime', 'StringType()')]


**2) Loop through Schema Fields and Display Details**

In [0]:
# A single row template drives both the header and the data rows, so the
# three columns (name, data type, nullable) share the same widths and line up.
ROW_TEMPLATE = "{:<20} {:<20} {:<15}"
header_line = ROW_TEMPLATE.format("Column Name", "Data Type", "Nullable")

# Size the horizontal rules to the actual row width rather than a fixed number.
rule_width = len(header_line)

print("DataFrame Schema Details:\n")
print("-" * rule_width)
print(header_line)
print("-" * rule_width)

# One row per StructField; dataType and nullable are converted to text so
# the left-aligned string format spec applies to them.
for field in df.schema.fields:
    print(ROW_TEMPLATE.format(field.name, str(field.dataType), str(field.nullable)))

# Closing rule drawn with "x" to mark the end of the table.
print("x" * rule_width)

DataFrame Schema Details:

--------------------------------------------------
Column Name            Data Type        Nullable  
--------------------------------------------------
start_date           DateType()           True           
product_url          StringType()         True           
category             StringType()         True           
default_group        StringType()         True           
source_target        StringType()         True           
cloud_flatform       StringType()         True           
session_id           IntegerType()        True           
session_name         StringType()         True           
status_name          StringType()         True           
status_type          StringType()         True           
sessions             IntegerType()        True           
product_id           IntegerType()        True           
load datetime        StringType()         True           
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx


**3) Access Specific Attributes**

In [0]:
# Take the StructField at position 0 of the schema and show its three
# attributes, one per line.
field = df.schema.fields[0]

print(
    f"Column Name  : {field.name}",
    f"Data Type    : {field.dataType}",
    f"Nullable     : {field.nullable}",
    sep="\n",
)

Column Name  : start_date
Data Type    : DateType()
Nullable     : True


**4) Extract Schema Information into a Pandas Table**

In [0]:
import pandas as pd

# One (name, type, nullable) tuple per column, with the DataType rendered as text.
schema_info = [
    (schema_field.name, str(schema_field.dataType), schema_field.nullable)
    for schema_field in df.schema.fields
]

# Turn the tuples into a pandas table with labelled columns.
schema_df = pd.DataFrame(
    data=schema_info,
    columns=["Column Name", "Data Type", "Nullable"],
)

# The bare name on the last line makes the notebook display the table.
schema_df

Unnamed: 0,Column Name,Data Type,Nullable
0,start_date,DateType(),True
1,product_url,StringType(),True
2,category,StringType(),True
3,default_group,StringType(),True
4,source_target,StringType(),True
5,cloud_flatform,StringType(),True
6,session_id,IntegerType(),True
7,session_name,StringType(),True
8,status_name,StringType(),True
9,status_type,StringType(),True
