#### concat()
- `concat()` is used to `concatenate multiple columns` into a `single column`.
- It `merges` **multiple columns or literals** into a `single output column`.
- If `any column` is `null`, the `entire result` becomes `null`. 

##### Syntax

     concat(*cols)

- `*cols : (string or Column)`
- `One or more column names or column expressions` to `concatenate`.
- **Returns** a `single string column` that `joins` all specified input `columns or string` expressions `without any separator`.

##### 1) Concatenate two string columns
- `Concatenate` columns `with / without` a `separator`.
  - concat() `does not` add `spaces` automatically.

In [0]:
# Sample employee data, one tuple per row: (ID, EmpId, first_name, last_name, City, Age).
# A missing City is represented by a real null (None).
data = [(1, 101, "Nitin", "Kumar", "Bangalore", 26),
        (2, 102, "Rakesh", "Rathod", None, 29),
        (3, 103, "Swapna", "Kumari", "Hyderabad", 32),
        (4, 104, "Pavan", "Rao", "Pune", 25),
        (5, 105, "Krishna", "Kishore", "Mumbai", 27),
        (6, 106, "Rahul", "Sen", "Nasik", 34),
        # City was the string "None", which Spark stores as ordinary text rather than a null.
        (7, 107, "Kaniska", "Shetty", None, 35),
        (8, 108, "Bishop", "Cotton", None, 24),
        (9, 109, "Sundar", "Das", None, 35),
        # NOTE(review): ID repeats 1 on this row -- confirm whether 10 was intended.
        (1, 110, "Rajesh", "Sharma", "Delhi", 30)
       ]

columns = ["ID", "EmpId", "first_name", "last_name", "City", "Age"]

# Create DataFrame
df_emp = spark.createDataFrame(data, columns)
display(df_emp)

EmpId,first_name,last_name,City,Age
101,Nitin,Kumar,Bangalore,26
102,Rakesh,Rathod,,29
103,Swapna,Kumari,Hyderabad,32
104,Pavan,Rao,Pune,25
105,Krishna,Kishore,Mumbai,27
106,Rahul,Sen,Nasik,34
107,Kaniska,Shetty,,35
108,Bishop,Cotton,,24
109,Sundar,Das,,35
110,Rajesh,Sharma,Delhi,30


In [0]:
from pyspark.sql.functions import concat, lit

In [0]:
from pyspark.sql import functions as F

# Prefix the numeric ID with the literal "000". The ID is cast explicitly so the
# result does not depend on Spark's implicit numeric-to-string coercion.
# display() returns None, so the DataFrame is bound to `src` first and rendered
# in a separate statement; chaining .display() left `src` set to None.
src = df_emp.withColumn('column_modified', F.concat(F.lit("000"), F.col('ID').cast("string"))) \
            .select("ID", "column_modified")
src.display()

In [0]:
# col() is used throughout this notebook but the import cell only brings in concat and lit.
from pyspark.sql.functions import col

# Build four variants of the full name to show that concat() inserts nothing
# on its own: every separator has to be passed explicitly through lit().
df_concat_str = (
    df_emp
    # No separator: the two values are joined back to back.
    .withColumn("full_name", concat(col("first_name"), col("last_name")))
    # Single space as separator.
    .withColumn("full_name_space", concat(col("first_name"), lit(" "), col("last_name")))
    # " , " as separator (space, comma, space).
    .withColumn("full_name_comma", concat(col("first_name"), lit(" , "), col("last_name")))
    # " / " as separator; the column name says "backslash" but the character is a forward slash.
    .withColumn("full_name_backslash", concat(col("first_name"), lit(" / "), col("last_name")))
    .select("first_name",
            "last_name",
            "full_name",
            "full_name_space",
            "full_name_comma",
            "full_name_backslash")
)

display(df_concat_str)

first_name,last_name,full_name,full_name_space,full_name_comma,full_name_backslash
Nitin,Kumar,NitinKumar,Nitin Kumar,"Nitin , Kumar",Nitin / Kumar
Rakesh,Rathod,RakeshRathod,Rakesh Rathod,"Rakesh , Rathod",Rakesh / Rathod
Swapna,Kumari,SwapnaKumari,Swapna Kumari,"Swapna , Kumari",Swapna / Kumari
Pavan,Rao,PavanRao,Pavan Rao,"Pavan , Rao",Pavan / Rao
Krishna,Kishore,KrishnaKishore,Krishna Kishore,"Krishna , Kishore",Krishna / Kishore
Rahul,Sen,RahulSen,Rahul Sen,"Rahul , Sen",Rahul / Sen
Kaniska,Shetty,KaniskaShetty,Kaniska Shetty,"Kaniska , Shetty",Kaniska / Shetty
Bishop,Cotton,BishopCotton,Bishop Cotton,"Bishop , Cotton",Bishop / Cotton
Sundar,Das,SundarDas,Sundar Das,"Sundar , Das",Sundar / Das
Rajesh,Sharma,RajeshSharma,Rajesh Sharma,"Rajesh , Sharma",Rajesh / Sharma


In [0]:
# Schema for the multi-part name example.
columns = ["firstname", "middlename", "lastname", "dob", "gender", "salary"]

# One tuple per person, in the order given by `columns`.
# Missing name parts are empty strings here, not nulls.
data = [
    ("Sunitha", "", "Smith", "1991-04-01", "M", 3000),
    ("Mahesh", "Rathod", "", "2000-05-19", "M", 4000),
    ("Roshan", "", "Kumar", "1978-09-05", "M", 4000),
    ("Maria", "Anne", "Jones", "1967-12-01", "F", 4000),
    ("Mrs.", "Suhash", "Gupta", "1980-02-17", "F", -1),
]

df_mlt = spark.createDataFrame(data, schema=columns)
display(df_mlt)

In [0]:
# Concatenate the three name columns into a single FullName column.
# The column references must come from df_mlt, the DataFrame being selected
# from; the previous `df.` prefix pointed at a name this notebook never defines.
df_concat_mlt = df_mlt.select(
    concat(df_mlt.firstname, df_mlt.middlename, df_mlt.lastname).alias("FullName"),
    "dob", "gender", "salary"
)
display(df_concat_mlt)

##### 2) Concatenate with a literal separator
- concat() `does not` add `spaces` automatically.

In [0]:
# Column layout for the event/sales sample rows.
columns = [
    "EmpId", "first_name", "last_name", "event_id", "make_id",
    "version_id", "item_version_id", "sales_id", "vehicle_id", "start_timestamp",
]

# One tuple per row, in the order given by `columns`; start_timestamp is a plain string.
# NOTE(review): the last three rows carry hours 25 and 29, which are not valid
# clock times -- confirm before parsing this column as a timestamp.
data = [
    (101, "Nitin", "Kumar", "IPL-001", 101, 1, 0, 10, 1001, "2025-01-01 10:00:00"),
    (102, "Rakesh", "Rathod", "IPL-002", 102, 2, 1, 11, 1002, "2025-02-15 15:30:00"),
    (103, "Swapna", "Kumari", "IPL-003", 103, 1, 2, 12, 1003, "2025-03-20 09:45:00"),
    (104, "Pavan", "Rao", "IPL-004", 104, 3, 0, 13, 1004, "2025-04-11 10:00:00"),
    (105, "Krishna", "Kishore", "IPL-005", 105, 4, 1, 14, 1005, "2021-09-25 18:39:00"),
    (106, "Rahul", "Sen", "IPL-006", 106, 6, 2, 15, 1006, "2024-06-29 02:45:00"),
    (107, "Kaniska", "Shetty", "IPL-007", 107, 9, 0, 16, 1007, "2022-08-30 19:45:00"),
    (108, "Bishop", "Cotton", "IPL-008", 108, 2, 1, 17, 1008, "2023-11-17 25:30:00"),
    (109, "Sundar", "Das", "IPL-009", 109, 7, 2, 18, 1009, "2024-12-19 29:45:00"),
    (110, "Rajesh", "Sharma", "IPL-010", 109, 7, 2, 18, 1009, "2024-12-19 29:45:00"),
]

# Build the DataFrame and render it.
df_concat = spark.createDataFrame(data=data, schema=columns)
display(df_concat)

EmpId,first_name,last_name,event_id,make_id,version_id,item_version_id,sales_id,vehicle_id,start_timestamp
101,Nitin,Kumar,IPL-001,101,1,0,10,1001,2025-01-01 10:00:00
102,Rakesh,Rathod,IPL-002,102,2,1,11,1002,2025-02-15 15:30:00
103,Swapna,Kumari,IPL-003,103,1,2,12,1003,2025-03-20 09:45:00
104,Pavan,Rao,IPL-004,104,3,0,13,1004,2025-04-11 10:00:00
105,Krishna,Kishore,IPL-005,105,4,1,14,1005,2021-09-25 18:39:00
106,Rahul,Sen,IPL-006,106,6,2,15,1006,2024-06-29 02:45:00
107,Kaniska,Shetty,IPL-007,107,9,0,16,1007,2022-08-30 19:45:00
108,Bishop,Cotton,IPL-008,108,2,1,17,1008,2023-11-17 25:30:00
109,Sundar,Das,IPL-009,109,7,2,18,1009,2024-12-19 29:45:00
110,Rajesh,Sharma,IPL-010,109,7,2,18,1009,2024-12-19 29:45:00


In [0]:
# Derive a surrogate_key by joining the identifying columns with "-".
key_columns = [
    "event_id",
    "make_id",
    "version_id",
    "item_version_id",
    "sales_id",
    "vehicle_id",
    "start_timestamp",
]

# Emit ("-", column) pairs for every key column, then drop the leading "-" so
# the separator only appears between consecutive columns.
key_parts = [part for name in key_columns for part in (lit("-"), col(name))][1:]

df_concat_lit = df_concat.select("*", concat(*key_parts).alias("surrogate_key"))

# Render all original columns plus the new key.
display(df_concat_lit)

EmpId,first_name,last_name,event_id,make_id,version_id,item_version_id,sales_id,vehicle_id,start_timestamp,surrogate_key
101,Nitin,Kumar,IPL-001,101,1,0,10,1001,2025-01-01 10:00:00,IPL-001-101-1-0-10-1001-2025-01-01 10:00:00
102,Rakesh,Rathod,IPL-002,102,2,1,11,1002,2025-02-15 15:30:00,IPL-002-102-2-1-11-1002-2025-02-15 15:30:00
103,Swapna,Kumari,IPL-003,103,1,2,12,1003,2025-03-20 09:45:00,IPL-003-103-1-2-12-1003-2025-03-20 09:45:00
104,Pavan,Rao,IPL-004,104,3,0,13,1004,2025-04-11 10:00:00,IPL-004-104-3-0-13-1004-2025-04-11 10:00:00
105,Krishna,Kishore,IPL-005,105,4,1,14,1005,2021-09-25 18:39:00,IPL-005-105-4-1-14-1005-2021-09-25 18:39:00
106,Rahul,Sen,IPL-006,106,6,2,15,1006,2024-06-29 02:45:00,IPL-006-106-6-2-15-1006-2024-06-29 02:45:00
107,Kaniska,Shetty,IPL-007,107,9,0,16,1007,2022-08-30 19:45:00,IPL-007-107-9-0-16-1007-2022-08-30 19:45:00
108,Bishop,Cotton,IPL-008,108,2,1,17,1008,2023-11-17 25:30:00,IPL-008-108-2-1-17-1008-2023-11-17 25:30:00
109,Sundar,Das,IPL-009,109,7,2,18,1009,2024-12-19 29:45:00,IPL-009-109-7-2-18-1009-2024-12-19 29:45:00
110,Rajesh,Sharma,IPL-010,109,7,2,18,1009,2024-12-19 29:45:00,IPL-010-109-7-2-18-1009-2024-12-19 29:45:00


##### 3) Concatenate string + numeric column
- `Numeric columns` should be `cast to string` explicitly. `concat()` casts them implicitly, but an explicit cast makes the intent clear.

In [0]:
# Product rows: (ID, product, price, quantity). price and quantity are integers.
data = [
    (1, "Laptop", 50000, 7), (2, "Mobile", 20000, 5),
    (3, "Tablet", 30000, 8), (4, "Headphones", 10000, 2),
    (5, "Monitor", 80000, 4), (6, "Keyboard", 5000, 3),
    (7, "Mouse", 3000, 6), (8, "Speaker", 7000, 8),
]

df_concat_str_int = spark.createDataFrame(
    data,
    schema=["ID", "product", "price", "quantity"],
)
display(df_concat_str_int)

ID,product,price,quantity
1,Laptop,50000,7
2,Mobile,20000,5
3,Tablet,30000,8
4,Headphones,10000,2
5,Monitor,80000,4
6,Keyboard,5000,3
7,Mouse,3000,6
8,Speaker,7000,8


In [0]:
# "<product>; <price>" -- the integer price is cast to string before joining.
price_label = concat(col("product"), lit("; "), col("price").cast("string"))

df3 = df_concat_str_int.withColumn("product_price", price_label)

display(df3)

ID,product,price,quantity,product_price
1,Laptop,50000,7,Laptop; 50000
2,Mobile,20000,5,Mobile; 20000
3,Tablet,30000,8,Tablet; 30000
4,Headphones,10000,2,Headphones; 10000
5,Monitor,80000,4,Monitor; 80000
6,Keyboard,5000,3,Keyboard; 5000
7,Mouse,3000,6,Mouse; 3000
8,Speaker,7000,8,Speaker; 7000


##### 4) Concatenate date / timestamp columns
- `Date or timestamp` values should be converted to string (`cast("string")` or `date_format()`) so that the output format is explicit.

In [0]:
# to_timestamp() is called below, so it is imported here together with date_format().
from pyspark.sql.functions import date_format, to_timestamp

# Add the same fixed load-date label to every row: the literal is parsed into a
# timestamp and then formatted back into a string before being concatenated.
# NOTE(review): the output pattern puts ":" before the milliseconds (ss:SSS);
# ISO 8601 uses "." there -- confirm the colon is intended.
df4 = df_concat.withColumn(
    "name_with_time",
    concat(
        lit("load_date"),
        lit(" | "),
        date_format(to_timestamp(lit("2026-01-01 10:00:00"), "yyyy-MM-dd HH:mm:ss"), "yyyy-MM-dd'T'HH:mm:ss:SSSXXX")
    )
)

df4.select("first_name", "start_timestamp", "name_with_time").display()

first_name,start_timestamp,name_with_time
Nitin,2025-01-01 10:00:00,load_date | 2026-01-01T10:00:00:000Z
Rakesh,2025-02-15 15:30:00,load_date | 2026-01-01T10:00:00:000Z
Swapna,2025-03-20 09:45:00,load_date | 2026-01-01T10:00:00:000Z
Pavan,2025-04-11 10:00:00,load_date | 2026-01-01T10:00:00:000Z
Krishna,2021-09-25 18:39:00,load_date | 2026-01-01T10:00:00:000Z
Rahul,2024-06-29 02:45:00,load_date | 2026-01-01T10:00:00:000Z
Kaniska,2022-08-30 19:45:00,load_date | 2026-01-01T10:00:00:000Z
Bishop,2023-11-17 25:30:00,load_date | 2026-01-01T10:00:00:000Z
Sundar,2024-12-19 29:45:00,load_date | 2026-01-01T10:00:00:000Z
Rajesh,2024-12-19 29:45:00,load_date | 2026-01-01T10:00:00:000Z


##### 5) Concatenate multiple columns (more than 2)

In [0]:
# first_name + "-" + last_name + "-ID", passed to concat() as four arguments.
# NOTE(review): "-ID" is a fixed literal suffix, not the EmpId column -- confirm that is intended.
combined_expr = concat(col("first_name"), lit("-"), col("last_name"), lit("-ID"))

df5 = df_concat.withColumn("combined", combined_expr)

display(df5)

EmpId,first_name,last_name,event_id,make_id,version_id,item_version_id,sales_id,vehicle_id,start_timestamp,combined
101,Nitin,Kumar,IPL-001,101,1,0,10,1001,2025-01-01 10:00:00,Nitin-Kumar-ID
102,Rakesh,Rathod,IPL-002,102,2,1,11,1002,2025-02-15 15:30:00,Rakesh-Rathod-ID
103,Swapna,Kumari,IPL-003,103,1,2,12,1003,2025-03-20 09:45:00,Swapna-Kumari-ID
104,Pavan,Rao,IPL-004,104,3,0,13,1004,2025-04-11 10:00:00,Pavan-Rao-ID
105,Krishna,Kishore,IPL-005,105,4,1,14,1005,2021-09-25 18:39:00,Krishna-Kishore-ID
106,Rahul,Sen,IPL-006,106,6,2,15,1006,2024-06-29 02:45:00,Rahul-Sen-ID
107,Kaniska,Shetty,IPL-007,107,9,0,16,1007,2022-08-30 19:45:00,Kaniska-Shetty-ID
108,Bishop,Cotton,IPL-008,108,2,1,17,1008,2023-11-17 25:30:00,Bishop-Cotton-ID
109,Sundar,Das,IPL-009,109,7,2,18,1009,2024-12-19 29:45:00,Sundar-Das-ID
110,Rajesh,Sharma,IPL-010,109,7,2,18,1009,2024-12-19 29:45:00,Rajesh-Sharma-ID


##### 6) Handling NULL values in concat()

- If any `column is NULL`, concat() returns `NULL`.

In [0]:
# Sample data
data = [(101, "Nitin", "Kumar", "Bangalore", 26),
        (102, "", "", None, 29),
        (103, "", "Kumari", "Hyderabad", 32),
        (104, "Pavan", "Rao", "Pune", 25),
        (105, "Krishna", "", "Mumbai", 27),
        (106, None, "Sen", "Nasik", 34),
        (107, "Kaniska", "Shetty", "", 35),
        (108, None, "Cotton", None, 24),
        (109, "Sundar", "Das", None, 35),
        (110, None, None, "Delhi", 30)
       ]

columns = ["EmpId", "first_name", "last_name", "City", "Age"]

# Create DataFrame
df_emp_null = spark.createDataFrame(data, columns)
display(df_emp_null)

EmpId,first_name,last_name,City,Age
101,Nitin,Kumar,Bangalore,26
102,,,,29
103,,Kumari,Hyderabad,32
104,Pavan,Rao,Pune,25
105,Krishna,,Mumbai,27
106,,Sen,Nasik,34
107,Kaniska,Shetty,,35
108,,Cotton,,24
109,Sundar,Das,,35
110,,,Delhi,30


In [0]:
# Plain concat(): a null first_name or last_name makes full_name null for that row.
name_with_space = concat(col("first_name"), lit(" "), col("last_name"))

(
    df_emp_null
    .withColumn("full_name", name_with_space)
    .select(["first_name", "last_name", "full_name"])
    .display()
)

first_name,last_name,full_name
Nitin,Kumar,Nitin Kumar
,,
,Kumari,Kumari
Pavan,Rao,Pavan Rao
Krishna,,Krishna
,Sen,
Kaniska,Shetty,Kaniska Shetty
,Cotton,
Sundar,Das,Sundar Das
,,


##### 7) Fix NULL issue using coalesce()

In [0]:
from pyspark.sql.functions import coalesce

# Null-safe operands: coalesce() substitutes an empty string when the column is null.
first_or_empty = coalesce(col("first_name"), lit(""))
last_or_empty = coalesce(col("last_name"), lit(""))

# full_name keeps the plain concat() result for comparison;
# full_name_null is built from the null-safe operands.
df_fixed = (
    df_emp_null
    .withColumn("full_name", concat(col("first_name"), lit(" "), col("last_name")))
    .withColumn("full_name_null", concat(first_or_empty, lit(" "), last_or_empty))
    .select("first_name", "last_name", "full_name", "full_name_null")
)

df_fixed.display()

first_name,last_name,full_name,full_name_null
Nitin,Kumar,Nitin Kumar,Nitin Kumar
,,,
,Kumari,Kumari,Kumari
Pavan,Rao,Pavan Rao,Pavan Rao
Krishna,,Krishna,Krishna
,Sen,,Sen
Kaniska,Shetty,Kaniska Shetty,Kaniska Shetty
,Cotton,,Cotton
Sundar,Das,Sundar Das,Sundar Das
,,,


##### concat() vs concat_ws()

| Feature      | `concat()`          | `concat_ws()`      |
| ------------ | ------------------- | ------------------ |
| Separator    | Manual (`lit(" ")`) | First argument, inserted between values |
| Handles NULL | ❌ returns NULL    | ✅ ignores NULL    |