**Problem Statement**

- You have a dataset containing **employee information**, where each employee may have **multiple technology experience**  stored in a **single column as an array**. Write a Pyspark code to **transform** this dataset so that each experience for each employee appears on a **separate row**.

**Solution**

In [0]:
# Import necessary libraries
from pyspark.sql.functions import col, explode

In [0]:
# Sample data
data = [(1, "Jayesh", "Tendulkar", 101, ['SQL','Data Science','PySpark']),
        (2, "Rohit", "Sharma", 102, ['Data Analytics','ML','AI']),
        (3, "Sai", "Ramesh", 101, ['SSMS','Azure','AWS','DEVOPS']),
        (4, "Sreedhar", "Arava", 102, ['Database','Oracle','ADF']),
        (5, "Somesh", "yadav", 101, ['SQL','Data Science','GitHub','PANDAS']),
        (6, "Radhika", "Gupta", 102, ['DEVOPS','AWS','SSMS','Python'])
       ]

columns = ["emp_id", "first_name", "last_name", "dept_id", "Technology"]

# Create DataFrame
df = spark.createDataFrame(data, columns)
display(df)

# display data types
df.printSchema()

emp_id,first_name,last_name,dept_id,Technology
1,Jayesh,Tendulkar,101,"List(SQL, Data Science, PySpark)"
2,Rohit,Sharma,102,"List(Data Analytics, ML, AI)"
3,Sai,Ramesh,101,"List(SSMS, Azure, AWS, DEVOPS)"
4,Sreedhar,Arava,102,"List(Database, Oracle, ADF)"
5,Somesh,yadav,101,"List(SQL, Data Science, GitHub, PANDAS)"
6,Radhika,Gupta,102,"List(DEVOPS, AWS, SSMS, Python)"


root
 |-- emp_id: long (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- dept_id: long (nullable = true)
 |-- Technology: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [0]:
# Explode "Technology" column
exp_df = df.withColumn("Domain", explode(df.Technology))

# Display the result
display(exp_df)

emp_id,first_name,last_name,dept_id,Technology,Domain
1,Jayesh,Tendulkar,101,"List(SQL, Data Science, PySpark)",SQL
1,Jayesh,Tendulkar,101,"List(SQL, Data Science, PySpark)",Data Science
1,Jayesh,Tendulkar,101,"List(SQL, Data Science, PySpark)",PySpark
2,Rohit,Sharma,102,"List(Data Analytics, ML, AI)",Data Analytics
2,Rohit,Sharma,102,"List(Data Analytics, ML, AI)",ML
2,Rohit,Sharma,102,"List(Data Analytics, ML, AI)",AI
3,Sai,Ramesh,101,"List(SSMS, Azure, AWS, DEVOPS)",SSMS
3,Sai,Ramesh,101,"List(SSMS, Azure, AWS, DEVOPS)",Azure
3,Sai,Ramesh,101,"List(SSMS, Azure, AWS, DEVOPS)",AWS
3,Sai,Ramesh,101,"List(SSMS, Azure, AWS, DEVOPS)",DEVOPS


In [0]:
# Explode "Technology" column
exploded_df = df.withColumn("Domain", explode(df.Technology)).drop("Technology")

# Display the result
display(exploded_df)

# display data types
df.printSchema()

emp_id,first_name,last_name,dept_id,Domain
1,Jayesh,Tendulkar,101,SQL
1,Jayesh,Tendulkar,101,Data Science
1,Jayesh,Tendulkar,101,PySpark
2,Rohit,Sharma,102,Data Analytics
2,Rohit,Sharma,102,ML
2,Rohit,Sharma,102,AI
3,Sai,Ramesh,101,SSMS
3,Sai,Ramesh,101,Azure
3,Sai,Ramesh,101,AWS
3,Sai,Ramesh,101,DEVOPS


root
 |-- emp_id: long (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- dept_id: long (nullable = true)
 |-- Technology: array (nullable = true)
 |    |-- element: string (containsNull = true)

