In [0]:
# Challenge: Given a dataset with columns PERSON, TYPE, and AGE,
# create an output where the oldest adult is paired with the youngest child, producing pairs of ADULT and CHILD while ensuring appropriate data matching.

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, row_number, desc, asc
from pyspark.sql.window import Window



class AdultChildPair:
  """Provides the sample PERSON / TYPE / AGE dataset for the adult-child pairing exercise."""

  def createData(self):
    """Build the sample dataset as a Spark DataFrame.

    Returns:
      A DataFrame with the columns ``person``, ``type`` and ``age`` holding
      five ``ADULT`` rows and four ``CHILD`` rows.
    """
    data = [
      ('A1', 'ADULT', 54),
      ('A2', 'ADULT', 53),
      ('A3', 'ADULT', 52),
      ('A4', 'ADULT', 58),
      ('A5', 'ADULT', 54),
      ('C1', 'CHILD', 20),
      ('C2', 'CHILD', 19),
      ('C3', 'CHILD', 22),
      ('C4', 'CHILD', 15)
    ]
    columns = ['person', 'type', 'age']
    # Obtain the session explicitly instead of relying on an ambient `spark`
    # global, which only exists in some notebook runtimes. getOrCreate() reuses
    # the already-running session when there is one.
    session = SparkSession.builder.getOrCreate()
    return session.createDataFrame(data, columns)

In [0]:
# Build the sample dataset once; the filtering cells below read it through `inputDf`.
ob = AdultChildPair()
inputDf = ob.createData()

In [0]:
# Show the raw input rows before any filtering or ranking is applied.
display(inputDf)

person,type,age
A1,ADULT,54
A2,ADULT,53
A3,ADULT,52
A4,ADULT,58
A5,ADULT,54
C1,CHILD,20
C2,CHILD,19
C3,CHILD,22
C4,CHILD,15


In [0]:
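# Algorithm:
#     1. Split the input into two DataFrames: one with the ADULT rows and one with the CHILD rows.
#     2. Rank the adults by age in descending order (oldest first) and the children by age in ascending order (youngest first).
#     3. Join the two ranked DataFrames on rank to obtain the adult/child pairs.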


In [0]:

# Keep only the CHILD rows of the input.
ChildDf = inputDf.filter(col("type") == "CHILD")
# Youngest child first. `person` is a secondary sort key so that children of the
# same age always get the same row numbers; row_number() otherwise orders ties
# arbitrarily, which would make the pairing differ between runs.
# NOTE: a window without partitionBy is evaluated on a single partition, which
# is acceptable for a dataset of this size.
childAgeWindow = Window.orderBy(asc('age'), asc('person'))
RankChildDf = ChildDf.withColumn("rank", row_number().over(childAgeWindow))
display(RankChildDf)

person,type,age,rank
C4,CHILD,15,1
C2,CHILD,19,2
C1,CHILD,20,3
C3,CHILD,22,4


In [0]:
# Keep only the ADULT rows of the input.
AdultDf = inputDf.filter(col("type") == "ADULT")
# Oldest adult first. The sample data contains two adults aged 54 (A1 and A5),
# so `person` is used as a secondary sort key: without it row_number() orders
# the tied rows arbitrarily and the resulting pairs can change between runs.
# NOTE: a window without partitionBy is evaluated on a single partition, which
# is acceptable for a dataset of this size.
adultAgeWindow = Window.orderBy(desc('age'), asc('person'))
RankAdultDf = AdultDf.withColumn("rank", row_number().over(adultAgeWindow))
display(RankAdultDf)

person,type,age,rank
A4,ADULT,58,1
A1,ADULT,54,2
A5,ADULT,54,3
A2,ADULT,53,4
A3,ADULT,52,5


In [0]:
# Pair the adult and the child that share the same rank. The full outer join
# keeps anyone without a counterpart; the other side's columns are null for them.
# The aliases 'A' and 'C' allow the two `person` columns to be told apart later.
finalResultOutput = (
    RankAdultDf.alias('A')
    .join(RankChildDf.alias('C'), "rank", how='full')
    # The row order of a join result is not guaranteed, so sort explicitly to
    # always list the oldest-adult / youngest-child pair first.
    .orderBy("rank")
)
display(finalResultOutput)

rank,person,type,age,person.1,type.1,age.1
1,A4,ADULT,58,C4,CHILD,15.0
2,A1,ADULT,54,C2,CHILD,19.0
3,A5,ADULT,54,C1,CHILD,20.0
4,A2,ADULT,53,C3,CHILD,22.0
5,A3,ADULT,52,,,


In [0]:
# Final answer: one row per pair. The columns are named ADULT and CHILD
# explicitly, because selecting both `person` columns as-is yields two columns
# with the same name, which is ambiguous for anything that consumes the result.
display(
    finalResultOutput.select(
        col("A.person").alias("ADULT"),
        col("C.person").alias("CHILD"),
    )
)

person,person.1
A4,C4
A1,C2
A5,C1
A2,C3
A3,
