**1. First name middle name last name**

In [0]:
import pyspark.sql.functions as F

In [0]:
data = [
    ('Virat Kohli',),
    ('Narendra Damodardas Modi',),
    ('Salman',)
]

schema = "celebrity_name string"

df = spark.createDataFrame(data = data , schema = schema)
df.display()

celebrity_name
Virat Kohli
Narendra Damodardas Modi
Salman


In [0]:
df_final=(
    df.withColumn('Firstname',F.split(F.col('celebrity_name')," ")[0])
       .withColumn('Middlename',F.split(F.col('celebrity_name')," ")[1])
       .withColumn('Lastname',F.split(F.col('celebrity_name')," ")[2]) 
)
df_final.show(truncate=False)

+------------------------+---------+----------+--------+
|celebrity_name          |Firstname|Middlename|Lastname|
+------------------------+---------+----------+--------+
|Virat Kohli             |Virat    |Kohli     |null    |
|Narendra Damodardas Modi|Narendra |Damodardas|Modi    |
|Salman                  |Salman   |null      |null    |
+------------------------+---------+----------+--------+



**2. Lift capacity**

In [0]:
lift_data = [
    (1,300),
    (2,350)
]

lift_schema = "id int , capacity_kg int"

lift_df = spark.createDataFrame(data = lift_data , schema = lift_schema)

In [0]:
lift_passengers_data = [
    ('Rahul',85,1),
    ('Adarsh',73,1),
    ('Riti',95,1),
    ('Viraj',80,1),
    ('Vimal',83,2),
    ('Neha',77,2),
    ('Priti',73,2),
    ('Himanshi',85,2)
]

lift_passengers_schema = "passenger_name string , weight_kg int, lift_id int"

lift_passengers_df = spark.createDataFrame(data = lift_passengers_data , schema = lift_passengers_schema)

In [0]:
lift_df.display()
lift_passengers_df.display()

id,capacity_kg
1,300
2,350


passenger_name,weight_kg,lift_id
Rahul,85,1
Adarsh,73,1
Riti,95,1
Viraj,80,1
Vimal,83,2
Neha,77,2
Priti,73,2
Himanshi,85,2


In [0]:
lift_capacity_passenger_df=(
   lift_passengers_df.alias('p')
   .join(lift_df.alias('l'),F.col('p.lift_id')==F.col('l.id') , 'inner') 
   .drop('id')
)
lift_capacity_passenger_df.show()

+--------------+---------+-------+-----------+
|passenger_name|weight_kg|lift_id|capacity_kg|
+--------------+---------+-------+-----------+
|         Rahul|       85|      1|        300|
|        Adarsh|       73|      1|        300|
|          Riti|       95|      1|        300|
|         Viraj|       80|      1|        300|
|         Vimal|       83|      2|        350|
|          Neha|       77|      2|        350|
|         Priti|       73|      2|        350|
|      Himanshi|       85|      2|        350|
+--------------+---------+-------+-----------+



In [0]:
lift_capacity_passenger_df.printSchema()

root
 |-- passenger_name: string (nullable = true)
 |-- weight_kg: integer (nullable = true)
 |-- lift_id: integer (nullable = true)
 |-- capacity_kg: integer (nullable = true)



In [0]:
lift_capacity_passenger_df.createOrReplaceTempView('lift_capacity_passenger')

In [0]:
%sql
WITH DATASET AS (
SELECT *
,sum(weight_kg) over(PARTITION BY lift_id ORDER BY weight_kg) AS running_wt
 FROM lift_capacity_passenger
)
SELECT lift_id,
collect_list(passenger_name) AS passenger_list
--(passenger_name ,',') WITHIN GROUP (order by weight_kg) as passenger_list
 FROM DATASET
 WHERE running_wt<=capacity_kg
 GROUP BY lift_id

lift_id,passenger_list
1,"List(Adarsh, Viraj, Rahul)"
2,"List(Priti, Neha, Vimal, Himanshi)"


**3. Popularity percentage of user** 

In [0]:
data = [
    (1,5),
    (1,3),
    (1,6),
    (2,1),
    (2,6),
    (3,9),
    (4,1),
    (7,2),
    (8,3)
]
schema ="user1 int, user2 int"

df = spark.createDataFrame(data = data , schema = schema)
df.show()

+-----+-----+
|user1|user2|
+-----+-----+
|    1|    5|
|    1|    3|
|    1|    6|
|    2|    1|
|    2|    6|
|    3|    9|
|    4|    1|
|    7|    2|
|    8|    3|
+-----+-----+



In [0]:
df_friend=(
    #df.groupBy('user1').agg((F.countDistinct('user2')*100/F.countDistinct('user1')).alias('popularity_percentage'))
    df.groupBy('user1').agg((F.countDistinct('user2')).alias('frnd_per_user'))
)
df_friend.show()

+-----+-------------+
|user1|frnd_per_user|
+-----+-------------+
|    1|            3|
|    3|            1|
|    4|            1|
|    8|            1|
|    7|            1|
|    2|            2|
+-----+-------------+



In [0]:
df_frnd_cnt=(
    df.join(df_friend,df.user1==df_friend.user1)
    .drop(df_friend.user1)
)
df_frnd_cnt.show()

+-----+-----+-------------+
|user2|user1|frnd_per_user|
+-----+-----+-------------+
|    5|    1|            3|
|    3|    1|            3|
|    6|    1|            3|
|    1|    2|            2|
|    6|    2|            2|
|    9|    3|            1|
|    1|    4|            1|
|    3|    8|            1|
|    2|    7|            1|
+-----+-----+-------------+



In [0]:
df_pop=(
    df_frnd_cnt.groupBy('user1').agg((max('frnd_per_user')/))
)