#### **Question 01**

     We have multiple columns in data as below:
            column1   column2    column 3

     How to add new column `(column3)*2` in existing data set?
           output:
             column1   column2     column 3   new_column

In [0]:
from pyspark.sql.functions import col

# Sample data
data = [(1, 2, 3),
        (4, 5, 6),
        (7, 8, 9)]

# Define schema
columns = ["column1", "column2", "column3"]

# Create DataFrame
df_q3 = spark.createDataFrame(data, columns)
display(df_q3)

column1,column2,column3
1,2,3
4,5,6
7,8,9


In [0]:
# Add new column which is (column3) * 2
df_with_new_column = df_q3.withColumn("new_column", col("column3") * 2)

# Display the DataFrame
display(df_with_new_column)

column1,column2,column3,new_column
1,2,3,6
4,5,6,12
7,8,9,18


#### **Question 02**

      We have column as below order
            item_id       c1        c2
               1        [a,b,c]     d
      
      How to separate `c1` column as individual as below?
            item_id      c1        c2       c3        c4
               1         a         b        c         d

In [0]:
# Sample data
data = [
    (1, ["a", "b", "c"], "d")
]

# Define schema
columns = ["item_id", "c1", "c2"]

# Create DataFrame
df_Q4 = spark.createDataFrame(data, columns)
display(df_Q4)

item_id,c1,c2
1,"List(a, b, c)",d


In [0]:
from pyspark.sql.functions import split, col, explode, max, size

     # Use selectExpr to split the array into separate columns
     df_expr = df_Q4.selectExpr("item_id", "c1[0] as c1", "c1[1] as c2", "c1[2] as c3", "c2")
     display(df_expr)

In [0]:
# Split the c1 column into individual columns
df_split = df_Q4.withColumn("c1_split_0", df_Q4.c1[0]) \
                .withColumn("c1_split_1", df_Q4.c1[1]) \
                .withColumn("c1_split_2", df_Q4.c1[2]) \
                .withColumnRenamed("c2", "c4") \
                .drop('c1') \
                .withColumnRenamed("c1_split_0", "c1") \
                .withColumnRenamed("c1_split_1", "c2") \
                .withColumnRenamed("c1_split_2", "c3")

# Display the DataFrame
display(df_split)

item_id,c4,c1,c2,c3
1,d,a,b,c


#### **Question 03**

In [0]:
# Sample data
data = [(1, ["a", "b", "c"], "a"),
        (2, ["g", None, "c"], "b"),
        (3, ["m", "c"], "c"),
        (4, ["n"], "d")
]

# Define schema
columns = ["item_id", "value", "index"]

# Create DataFrame
df_Q41 = spark.createDataFrame(data, columns)
display(df_Q41)

item_id,value,index
1,"List(a, b, c)",a
2,"List(g, null, c)",b
3,"List(m, c)",c
4,List(n),d


#### **Split Array values into seperate columns**

**Method 01**

In [0]:
df_Q41.select("item_id", df_Q41.value[0], df_Q41.value[1], df_Q41.value[2], "index").display()

item_id,value[0],value[1],value[2],index
1,a,b,c,a
2,g,,c,b
3,m,c,,c
4,n,,,d


**Method 02**

In [0]:
# Determine the maximum number of elements in the 'value' column
dfsize = df_Q41.select("item_id", "value", size("value").alias('NoOfArrayElements'))
display(dfsize)

item_id,value,NoOfArrayElements
1,"List(a, b, c)",3
2,"List(g, null, c)",3
3,"List(m, c)",2
4,List(n),1


In [0]:
# max_value = dfsize.agg({"NoOfArrayElements": "max"}).collect()[0][0]
max_value = dfsize.agg(max(col("NoOfArrayElements")).alias('NoOfArrayElements')).collect()[0][0]
print(max_value)

3


In [0]:
# Function to split array into columns
def arraySplitIntoCols(df, maxElements):
    for i in range(maxElements):
        df = df.withColumn(f"new_col_{i}", df.value[i])
    return df

In [0]:
# Split the 'value' column into separate columns
dfout = arraySplitIntoCols(df_Q41, max_value)
# Display the DataFrame
display(dfout)

item_id,value,index,new_col_0,new_col_1,new_col_2
1,"List(a, b, c)",a,a,b,c
2,"List(g, null, c)",b,g,,c
3,"List(m, c)",c,m,c,
4,List(n),d,n,,
