**Key-Value Pairs**
- DataFrame with a column containing **JSON strings** representing **key-value pairs**.

#### **How to convert string type columns into map type?**

1) JSON Structure

2) Nested JSON Structure

3) Handling Null Values

4) Realtime Scenario

In [0]:
from pyspark.sql.types import IntegerType, StringType, StructType, StructField, MapType
from pyspark.sql.functions import from_json, col, coalesce, lit

#### **1) JSON Structure**

In [0]:
# Sample records: each one pairs a student name with a JSON object
# serialized as a plain string.
data = [
    ("Naresh", '{"Age": 25, "emp_id": 768954, "Exp": 5}'),
    ("Harish", '{"Age": 30, "emp_id": 768956, "Exp": 2}'),
    ("Prem", '{"Age": 28, "emp_id": 798954, "Exp": 8}'),
    ("Prabhav", '{"Age": 35, "emp_id": 788956, "Exp": 6}'),
    ("Hari", '{"Age": 21, "emp_id": 769954, "Exp": 9}'),
    ("Druv", '{"Age": 36, "emp_id": 768946, "Exp": 4}'),
]

# Column names for the two positions of each tuple.
schema = ["Student_Name", "Properties"]

# Build the source DataFrame. No conversion happens here: "Properties"
# still holds the raw JSON text.
df_json = spark.createDataFrame(data, schema=schema)

# Show the source DataFrame
display(df_json)

Student_Name,Properties
Naresh,"{""Age"": 25, ""emp_id"": 768954, ""Exp"": 5}"
Harish,"{""Age"": 30, ""emp_id"": 768956, ""Exp"": 2}"
Prem,"{""Age"": 28, ""emp_id"": 798954, ""Exp"": 8}"
Prabhav,"{""Age"": 35, ""emp_id"": 788956, ""Exp"": 6}"
Hari,"{""Age"": 21, ""emp_id"": 769954, ""Exp"": 9}"
Druv,"{""Age"": 36, ""emp_id"": 768946, ""Exp"": 4}"


In [0]:
# Target type for the parsed column: string keys mapped to integer values.
map_schema = MapType(keyType=StringType(), valueType=IntegerType())

# Parse the JSON text held in "Properties" and attach the result as a new
# "json_map" column next to the original string column.
properties_as_map = from_json(col("Properties"), map_schema)
df_json = df_json.withColumn("json_map", properties_as_map)

# Show the DataFrame with the parsed map column
display(df_json)

Student_Name,Properties,json_map
Naresh,"{""Age"": 25, ""emp_id"": 768954, ""Exp"": 5}","Map(Age -> 25, emp_id -> 768954, Exp -> 5)"
Harish,"{""Age"": 30, ""emp_id"": 768956, ""Exp"": 2}","Map(Age -> 30, emp_id -> 768956, Exp -> 2)"
Prem,"{""Age"": 28, ""emp_id"": 798954, ""Exp"": 8}","Map(Age -> 28, emp_id -> 798954, Exp -> 8)"
Prabhav,"{""Age"": 35, ""emp_id"": 788956, ""Exp"": 6}","Map(Age -> 35, emp_id -> 788956, Exp -> 6)"
Hari,"{""Age"": 21, ""emp_id"": 769954, ""Exp"": 9}","Map(Age -> 21, emp_id -> 769954, Exp -> 9)"
Druv,"{""Age"": 36, ""emp_id"": 768946, ""Exp"": 4}","Map(Age -> 36, emp_id -> 768946, Exp -> 4)"


In [0]:
# Look up individual keys of the parsed map and expose each one as its own
# top-level column.
parsed_props = df_json["json_map"]

df_col = (
    df_json
    .withColumn("age", parsed_props["Age"])
    .withColumn("Emp_ID", parsed_props["emp_id"])
    .withColumn("Exp", parsed_props["Exp"])
)

display(df_col)

Student_Name,Properties,json_map,age,Emp_ID,Exp
Naresh,"{""Age"": 25, ""emp_id"": 768954, ""Exp"": 5}","Map(Age -> 25, emp_id -> 768954, Exp -> 5)",25,768954,5
Harish,"{""Age"": 30, ""emp_id"": 768956, ""Exp"": 2}","Map(Age -> 30, emp_id -> 768956, Exp -> 2)",30,768956,2
Prem,"{""Age"": 28, ""emp_id"": 798954, ""Exp"": 8}","Map(Age -> 28, emp_id -> 798954, Exp -> 8)",28,798954,8
Prabhav,"{""Age"": 35, ""emp_id"": 788956, ""Exp"": 6}","Map(Age -> 35, emp_id -> 788956, Exp -> 6)",35,788956,6
Hari,"{""Age"": 21, ""emp_id"": 769954, ""Exp"": 9}","Map(Age -> 21, emp_id -> 769954, Exp -> 9)",21,769954,9
Druv,"{""Age"": 36, ""emp_id"": 768946, ""Exp"": 4}","Map(Age -> 36, emp_id -> 768946, Exp -> 4)",36,768946,4


In [0]:
# Same sample records as the first example, except that a few values are
# quoted (JSON strings instead of JSON numbers):
#   - Harish and Hari: "emp_id"
#   - Prabhav: "Exp"
data = [
    ("Naresh", '{"Age": 25, "emp_id": 768954, "Exp": 5}'),
    ("Harish", '{"Age": 30, "emp_id": "768956", "Exp": 2}'),
    ("Prem", '{"Age": 28, "emp_id": 798954, "Exp": 8}'),
    ("Prabhav", '{"Age": 35, "emp_id": 788956, "Exp": "6"}'),
    ("Hari", '{"Age": 21, "emp_id": "769954", "Exp": 9}'),
    ("Druv", '{"Age": 36, "emp_id": 768946, "Exp": 4}'),
]

# Column names for the two positions of each tuple.
schema = ["Student_Name", "Properties"]

# The map schema still declares integer values for every key.
map_schema = MapType(keyType=StringType(), valueType=IntegerType())

# Build the DataFrame and parse "Properties" into "json_map" in one chain.
# In the displayed output, "json_map" is empty for every row that contains
# a quoted value, while the all-numeric rows are parsed as before.
df_json1 = (
    spark.createDataFrame(data, schema=schema)
    .withColumn("json_map", from_json(col("Properties"), map_schema))
)

# Show the resulting DataFrame
display(df_json1)

Student_Name,Properties,json_map
Naresh,"{""Age"": 25, ""emp_id"": 768954, ""Exp"": 5}","Map(Age -> 25, emp_id -> 768954, Exp -> 5)"
Harish,"{""Age"": 30, ""emp_id"": ""768956"", ""Exp"": 2}",
Prem,"{""Age"": 28, ""emp_id"": 798954, ""Exp"": 8}","Map(Age -> 28, emp_id -> 798954, Exp -> 8)"
Prabhav,"{""Age"": 35, ""emp_id"": 788956, ""Exp"": ""6""}",
Hari,"{""Age"": 21, ""emp_id"": ""769954"", ""Exp"": 9}",
Druv,"{""Age"": 36, ""emp_id"": 768946, ""Exp"": 4}","Map(Age -> 36, emp_id -> 768946, Exp -> 4)"


#### **2) Nested JSON Structure**
- DataFrame with a column containing **JSON strings** representing **nested key-value pairs**.

In [0]:
# Sample records: an id plus a JSON string whose "map" key holds a nested
# object (Country / City / Level / Designation).
data = [
    ("1", '{"Name": "Hari", "map": {"Country": "India", "City": "Delhi", "Level": "Manager", "Designation": "DE"}}'),
    ("2", '{"Name": "Narahari", "map": {"Country": "India", "City": "Delhi", "Level": "Manager", "Designation": "DS"}}'),
    ("3", '{"Name": "Venu", "map": {"Country": "India", "City": "Delhi", "Level": "Manager", "Designation": "Engineer"}}'),
    ("4", '{"Name": "Giri", "map": {"Country": "India", "City": "Delhi", "Level": "Manager", "Designation": "Admin"}}'),
    ("5", '{"Name": "Sree", "map": {"Country": "India", "City": "Delhi", "Level": "Manager", "Designation":"Developer"}}'),
    ("6", '{"Name": "Anu", "map": {"Country": "India", "City": "Delhi", "Level": "Manager", "Designation": "Testing"}}'),
    ("7", '{"Name": "Devi", "map": {"Country": "India", "City": "Delhi", "Level": "Manager", "Designation": "Modeler"}}'),
    ("8", '{"Name": "Kedar", "map": {"Country": "India", "City": "Delhi", "Level": "Manager", "Designation": "Sales"}}'),
    ("9", '{"Name": "Smith", "map": {"Country": "India", "City": "Delhi", "Level": "Manager", "Designation": "Executive"}}'),
]

# Column names for the two positions of each tuple.
schema = ["id", "Profile"]

# Build the source DataFrame. "Profile" still holds the raw JSON text;
# parsing happens in a later cell.
df_nest_json = spark.createDataFrame(data, schema=schema)

# Show the source DataFrame
display(df_nest_json)

id,Profile
1,"{""Name"": ""Hari"", ""map"": {""Country"": ""India"", ""City"": ""Delhi"", ""Level"": ""Manager"", ""Designation"": ""DE""}}"
2,"{""Name"": ""Narahari"", ""map"": {""Country"": ""India"", ""City"": ""Delhi"", ""Level"": ""Manager"", ""Designation"": ""DS""}}"
3,"{""Name"": ""Venu"", ""map"": {""Country"": ""India"", ""City"": ""Delhi"", ""Level"": ""Manager"", ""Designation"": ""Engineer""}}"
4,"{""Name"": ""Giri"", ""map"": {""Country"": ""India"", ""City"": ""Delhi"", ""Level"": ""Manager"", ""Designation"": ""Admin""}}"
5,"{""Name"": ""Sree"", ""map"": {""Country"": ""India"", ""City"": ""Delhi"", ""Level"": ""Manager"", ""Designation"":""Developer""}}"
6,"{""Name"": ""Anu"", ""map"": {""Country"": ""India"", ""City"": ""Delhi"", ""Level"": ""Manager"", ""Designation"": ""Testing""}}"
7,"{""Name"": ""Devi"", ""map"": {""Country"": ""India"", ""City"": ""Delhi"", ""Level"": ""Manager"", ""Designation"": ""Modeler""}}"
8,"{""Name"": ""Kedar"", ""map"": {""Country"": ""India"", ""City"": ""Delhi"", ""Level"": ""Manager"", ""Designation"": ""Sales""}}"
9,"{""Name"": ""Smith"", ""map"": {""Country"": ""India"", ""City"": ""Delhi"", ""Level"": ""Manager"", ""Designation"": ""Executive""}}"


In [0]:
# Schema mirroring the nested JSON layout: a nullable string "Name" plus a
# nullable "map" struct made of four nullable string fields.
# Note: despite its name, this schema is a StructType, not a MapType.
nested_map_schema = (
    StructType()
    .add("Name", StringType(), True)
    .add(
        "map",
        StructType()
        .add("Country", StringType(), True)
        .add("City", StringType(), True)
        .add("Level", StringType(), True)
        .add("Designation", StringType(), True),
        True,
    )
)

# Parse the JSON text in "Profile" into a nested struct column.
df_nest_json = df_nest_json.withColumn(
    "json_nest_map",
    from_json(col("Profile"), nested_map_schema),
)

# Show the DataFrame with the parsed struct column
display(df_nest_json)

id,Profile,json_nest_map
1,"{""Name"": ""Hari"", ""map"": {""Country"": ""India"", ""City"": ""Delhi"", ""Level"": ""Manager"", ""Designation"": ""DE""}}","List(Hari, List(India, Delhi, Manager, DE))"
2,"{""Name"": ""Narahari"", ""map"": {""Country"": ""India"", ""City"": ""Delhi"", ""Level"": ""Manager"", ""Designation"": ""DS""}}","List(Narahari, List(India, Delhi, Manager, DS))"
3,"{""Name"": ""Venu"", ""map"": {""Country"": ""India"", ""City"": ""Delhi"", ""Level"": ""Manager"", ""Designation"": ""Engineer""}}","List(Venu, List(India, Delhi, Manager, Engineer))"
4,"{""Name"": ""Giri"", ""map"": {""Country"": ""India"", ""City"": ""Delhi"", ""Level"": ""Manager"", ""Designation"": ""Admin""}}","List(Giri, List(India, Delhi, Manager, Admin))"
5,"{""Name"": ""Sree"", ""map"": {""Country"": ""India"", ""City"": ""Delhi"", ""Level"": ""Manager"", ""Designation"":""Developer""}}","List(Sree, List(India, Delhi, Manager, Developer))"
6,"{""Name"": ""Anu"", ""map"": {""Country"": ""India"", ""City"": ""Delhi"", ""Level"": ""Manager"", ""Designation"": ""Testing""}}","List(Anu, List(India, Delhi, Manager, Testing))"
7,"{""Name"": ""Devi"", ""map"": {""Country"": ""India"", ""City"": ""Delhi"", ""Level"": ""Manager"", ""Designation"": ""Modeler""}}","List(Devi, List(India, Delhi, Manager, Modeler))"
8,"{""Name"": ""Kedar"", ""map"": {""Country"": ""India"", ""City"": ""Delhi"", ""Level"": ""Manager"", ""Designation"": ""Sales""}}","List(Kedar, List(India, Delhi, Manager, Sales))"
9,"{""Name"": ""Smith"", ""map"": {""Country"": ""India"", ""City"": ""Delhi"", ""Level"": ""Manager"", ""Designation"": ""Executive""}}","List(Smith, List(India, Delhi, Manager, Executive))"


In [0]:
# Flatten the parsed struct: "Name" comes from the outer level, the other
# four fields come from the inner "map" struct. The original "id" and
# "Profile" columns are dropped afterwards.
profile_struct = df_nest_json["json_nest_map"]
inner_struct = profile_struct["map"]

df_nest_json_col = (
    df_nest_json
    .withColumn("name", profile_struct["Name"])
    .withColumn("country", inner_struct["Country"])
    .withColumn("city", inner_struct["City"])
    .withColumn("level", inner_struct["Level"])
    .withColumn("designation", inner_struct["Designation"])
    .drop("id", "Profile")
)

display(df_nest_json_col)

json_nest_map,name,country,city,level,designation
"List(Hari, List(India, Delhi, Manager, DE))",Hari,India,Delhi,Manager,DE
"List(Narahari, List(India, Delhi, Manager, DS))",Narahari,India,Delhi,Manager,DS
"List(Venu, List(India, Delhi, Manager, Engineer))",Venu,India,Delhi,Manager,Engineer
"List(Giri, List(India, Delhi, Manager, Admin))",Giri,India,Delhi,Manager,Admin
"List(Sree, List(India, Delhi, Manager, Developer))",Sree,India,Delhi,Manager,Developer
"List(Anu, List(India, Delhi, Manager, Testing))",Anu,India,Delhi,Manager,Testing
"List(Devi, List(India, Delhi, Manager, Modeler))",Devi,India,Delhi,Manager,Modeler
"List(Kedar, List(India, Delhi, Manager, Sales))",Kedar,India,Delhi,Manager,Sales
"List(Smith, List(India, Delhi, Manager, Executive))",Smith,India,Delhi,Manager,Executive


#### **3) Handling Null Values**
- DataFrame with a column containing **JSON strings**, some of which might be **null**.

In [0]:
# Sample records: a name plus a JSON string. The last record ("Druv") has
# no profile at all (None).
data = [
    ("Naresh", '{"Country": "India", "City": "Delhi", "Level": "Manager"}'),
    ("Harish", '{"Country": "USA", "City": "New York", "Level": "SrManager"}'),
    ("Prem", '{"Country": "UK", "City": "London", "Level": "GM"}'),
    ("Prabhav", '{"Country": "Norway", "City": "Norths", "Level": "Executive"}'),
    ("Hari", '{"Country": "Sweden", "City": "Stockholm", "Level": "SrExecutive"}'),
    ("Druv", None),
]

# Column names for the two positions of each tuple.
schema = ["Name", "Profile"]

# Build the source DataFrame. "Profile" still holds the raw JSON text
# (or null); parsing happens in a later cell.
df_null = spark.createDataFrame(data, schema=schema)

# Show the source DataFrame
display(df_null)

Name,Profile
Naresh,"{""Country"": ""India"", ""City"": ""Delhi"", ""Level"": ""Manager""}"
Harish,"{""Country"": ""USA"", ""City"": ""New York"", ""Level"": ""SrManager""}"
Prem,"{""Country"": ""UK"", ""City"": ""London"", ""Level"": ""GM""}"
Prabhav,"{""Country"": ""Norway"", ""City"": ""Norths"", ""Level"": ""Executive""}"
Hari,"{""Country"": ""Sweden"", ""City"": ""Stockholm"", ""Level"": ""SrExecutive""}"
Druv,


In [0]:
# Target type for the parsed column: string keys mapped to string values.
map_schema = MapType(keyType=StringType(), valueType=StringType())

# Parse "Profile" as-is, without any special treatment of missing values.
# In the displayed output, the row with a null "Profile" also shows an
# empty "json_null".
profile_as_map = from_json(col("Profile"), map_schema)
df_null = df_null.withColumn("json_null", profile_as_map)

# Show the DataFrame with the parsed map column
display(df_null)

Name,Profile,json_null
Naresh,"{""Country"": ""India"", ""City"": ""Delhi"", ""Level"": ""Manager""}","Map(Country -> India, City -> Delhi, Level -> Manager)"
Harish,"{""Country"": ""USA"", ""City"": ""New York"", ""Level"": ""SrManager""}","Map(Country -> USA, City -> New York, Level -> SrManager)"
Prem,"{""Country"": ""UK"", ""City"": ""London"", ""Level"": ""GM""}","Map(Country -> UK, City -> London, Level -> GM)"
Prabhav,"{""Country"": ""Norway"", ""City"": ""Norths"", ""Level"": ""Executive""}","Map(Country -> Norway, City -> Norths, Level -> Executive)"
Hari,"{""Country"": ""Sweden"", ""City"": ""Stockholm"", ""Level"": ""SrExecutive""}","Map(Country -> Sweden, City -> Stockholm, Level -> SrExecutive)"
Druv,,


In [0]:
# Substitute an empty JSON object ('{}') for a null "Profile" before parsing,
# so that such rows show an empty map (Map()) in the displayed output
# instead of an empty cell.
profile_or_empty = coalesce(col("Profile"), lit('{}'))

df_null_lit = df_null.withColumn(
    "json_null",
    from_json(profile_or_empty, map_schema),
)

# Show the DataFrame with nulls replaced by empty maps
display(df_null_lit)

Name,Profile,json_null
Naresh,"{""Country"": ""India"", ""City"": ""Delhi"", ""Level"": ""Manager""}","Map(Country -> India, City -> Delhi, Level -> Manager)"
Harish,"{""Country"": ""USA"", ""City"": ""New York"", ""Level"": ""SrManager""}","Map(Country -> USA, City -> New York, Level -> SrManager)"
Prem,"{""Country"": ""UK"", ""City"": ""London"", ""Level"": ""GM""}","Map(Country -> UK, City -> London, Level -> GM)"
Prabhav,"{""Country"": ""Norway"", ""City"": ""Norths"", ""Level"": ""Executive""}","Map(Country -> Norway, City -> Norths, Level -> Executive)"
Hari,"{""Country"": ""Sweden"", ""City"": ""Stockholm"", ""Level"": ""SrExecutive""}","Map(Country -> Sweden, City -> Stockholm, Level -> SrExecutive)"
Druv,,Map()


#### **4) Realtime Scenario**
- **Source files:** CSV & AVRO schema
- **Requirement:** Convert string data type to map type

**AVRO Schema for Cust_Metadata**

     {
       "name": "Cust_Metadata",
       "type": [
         "null",
         {
           "type": "map",
           "values": "string"
         }
       ],
       "doc": "additional key value pair, e.g Cust_Subgroup.",
       "default": null
     }

**AVRO Schema for Price_Metadata**

     {
       "name": "Price_Metadata",
       "type": [
         "null",
         {
           "type": "map",
           "values": "string"
         }
       ],
       "doc": "additional key value pair, e.g Company_Name, Category, Location & Cust_Type.",
       "default": null
     }

**AVRO Schema for Additional_Metadata**

     {
       "name": "Additional_Metadata",
       "type": [
         "null",
         {
           "type": "map",
           "values": "string"
         }
       ],
       "doc": "additional key value pair, e.g Cust_Category.",
       "default": null
     }

In [0]:
# Load the scenario CSV: the first line is treated as the header and the
# column types are inferred from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/FileStore/tables/StringToMaptype-1.csv")
)

# Preview only the first 10 rows.
display(df.limit(10))

Company_Name,Cust_Id,Cust_Name,Category,Start_Date,Start_Cust_Date,End_Date,Updated_Date,Cust_Value,Cust_Type,Exchange,Location,Last_Date_UTC,Cust_Category,Index,Cust_Subgroup,Cust_Metadata,Price_Metadata,Additional_Metadata
Sony,20,Naresh,Standard,3-Feb-23,1730000000000.0,1730000000000.0,1730000000000.0,30,STD,EUR,IND,1720000000000.0,SETTL,True,Engineer,,,
Sony,21,kamal,Standard,6-Feb-23,1730000000000.0,1730000000000.0,1730000000000.0,25,STD,EUR,IND,1720000000000.0,TOI,False,Engineer,,,
Sony,22,kajal,Standard,9-Feb-23,1730000000000.0,1730000000000.0,1730000000000.0,28,STD,EUR,IND,1720000000000.0,TOI,False,Engineer,,,
Sony,23,kiran,Standard,3-Jan-24,1730000000000.0,1730000000000.0,1730000000000.0,31,STD,EUR,IND,1720000000000.0,TOI,False,Engineer,,,
Sony,24,sam,Standard,8-Jan-24,1730000000000.0,1730000000000.0,1730000000000.0,34,STD,EUR,IND,1720000000000.0,TOI,False,Engineer,,,
Sony,25,sourab,Standard,9-Jan-24,1730000000000.0,1740000000000.0,1730000000000.0,37,STD,EUR,IND,1720000000000.0,TOI,True,Engineer,,,
Sony,26,jai,Upper,3-Mar-23,1730000000000.0,1740000000000.0,1730000000000.0,40,STD,EUR,IND,1720000000000.0,TOI,True,Engineer,,,
BPL,27,sree,Upper,6-Mar-23,1730000000000.0,1730000000000.0,1730000000000.0,43,STD,EUR,IND,1720000000000.0,SETTL,True,Engineer,,,
BPL,28,sreenath,Upper,9-Mar-23,1730000000000.0,1740000000000.0,1730000000000.0,46,STD,EUR,IND,1720000000000.0,SETTL,True,Engineer,,,
BPL,29,kamaesh,Upper,3-Jan-25,1740000000000.0,1740000000000.0,1730000000000.0,49,STD,EUR,IND,1720000000000.0,SETTL,False,SrEngineer,,,


In [0]:
# Convert the three metadata columns from string type to map type.
# All of them share the same target type: string keys mapped to string values.
metadata_map_type = MapType(StringType(), StringType())

df_str_map = df
for metadata_col in ("Cust_Metadata", "Price_Metadata", "Additional_Metadata"):
    # Replace each column in place (same name) with its parsed map version.
    df_str_map = df_str_map.withColumn(
        metadata_col,
        from_json(col(metadata_col), metadata_map_type),
    )

# Preview only the first 10 rows.
display(df_str_map.limit(10))

Company_Name,Cust_Id,Cust_Name,Category,Start_Date,Start_Cust_Date,End_Date,Updated_Date,Cust_Value,Cust_Type,Exchange,Location,Last_Date_UTC,Cust_Category,Index,Cust_Subgroup,Cust_Metadata,Price_Metadata,Additional_Metadata
Sony,20,Naresh,Standard,3-Feb-23,1730000000000.0,1730000000000.0,1730000000000.0,30,STD,EUR,IND,1720000000000.0,SETTL,True,Engineer,,,
Sony,21,kamal,Standard,6-Feb-23,1730000000000.0,1730000000000.0,1730000000000.0,25,STD,EUR,IND,1720000000000.0,TOI,False,Engineer,,,
Sony,22,kajal,Standard,9-Feb-23,1730000000000.0,1730000000000.0,1730000000000.0,28,STD,EUR,IND,1720000000000.0,TOI,False,Engineer,,,
Sony,23,kiran,Standard,3-Jan-24,1730000000000.0,1730000000000.0,1730000000000.0,31,STD,EUR,IND,1720000000000.0,TOI,False,Engineer,,,
Sony,24,sam,Standard,8-Jan-24,1730000000000.0,1730000000000.0,1730000000000.0,34,STD,EUR,IND,1720000000000.0,TOI,False,Engineer,,,
Sony,25,sourab,Standard,9-Jan-24,1730000000000.0,1740000000000.0,1730000000000.0,37,STD,EUR,IND,1720000000000.0,TOI,True,Engineer,,,
Sony,26,jai,Upper,3-Mar-23,1730000000000.0,1740000000000.0,1730000000000.0,40,STD,EUR,IND,1720000000000.0,TOI,True,Engineer,,,
BPL,27,sree,Upper,6-Mar-23,1730000000000.0,1730000000000.0,1730000000000.0,43,STD,EUR,IND,1720000000000.0,SETTL,True,Engineer,,,
BPL,28,sreenath,Upper,9-Mar-23,1730000000000.0,1740000000000.0,1730000000000.0,46,STD,EUR,IND,1720000000000.0,SETTL,True,Engineer,,,
BPL,29,kamaesh,Upper,3-Jan-25,1740000000000.0,1740000000000.0,1730000000000.0,49,STD,EUR,IND,1720000000000.0,SETTL,False,SrEngineer,,,
