In [0]:
data1=[(1,"A",1000,"IT"),(2,"B",1500,"IT"),(3,"C",2500,"IT"),(4,"D",3000,"HR"),(5,"E",2000,"HR"),(6,"F",1000,"HR")
       ,(7,"G",40000,"Sales"),(8,"H",4000,"Sales"),(9,"I",1000,"Sales"),(10,"J",2000,"Sales")]
schema1=["EmpId","EmpName","Salary","DeptName"]
df=spark.createDataFrame(data1,schema1)

df.printSchema()
display(df)

root
 |-- EmpId: long (nullable = true)
 |-- EmpName: string (nullable = true)
 |-- Salary: long (nullable = true)
 |-- DeptName: string (nullable = true)



EmpId,EmpName,Salary,DeptName
1,A,1000,IT
2,B,1500,IT
3,C,2500,IT
4,D,3000,HR
5,E,2000,HR
6,F,1000,HR
7,G,40000,Sales
8,H,4000,Sales
9,I,1000,Sales
10,J,2000,Sales


In [0]:
#Find out highset emp_salary Department wise
from pyspark.sql import Window
from pyspark.sql.functions import col,max,min,rank

window_spec=Window.partitionBy('DeptName').orderBy(col('Salary').desc())

df_mod=df.withColumn('score',rank().over(window_spec))
display(df_mod.filter(col('score')==1))

EmpId,EmpName,Salary,DeptName,score
4,D,3000,HR,1
3,C,2500,IT,1
7,G,40000,Sales,1


In [0]:
#Employees Salary info
data1=[(100,"Raj",None,1,"01-04-23",50000),
       (200,"Joanne",100,1,"01-04-23",4000),(200,"Joanne",100,1,"13-04-23",4500),(200,"Joanne",100,1,"14-04-23",4020)]
schema1=["EmpId","EmpName","Mgrid","deptid","salarydt","salary"]
df_salary=spark.createDataFrame(data1,schema1)
display(df_salary)
#department dataframe
data2=[(1,"IT"), (2,"HR")]
schema2=["deptid","deptname"]
df_dept=spark.createDataFrame(data2,schema2)
display(df_dept)

EmpId,EmpName,Mgrid,deptid,salarydt,salary
100,Raj,,1,01-04-23,50000
200,Joanne,100.0,1,01-04-23,4000
200,Joanne,100.0,1,13-04-23,4500
200,Joanne,100.0,1,14-04-23,4020


deptid,deptname
1,IT
2,HR


In [0]:
from pyspark.sql.functions import *
df=df_salary.withColumn('Newsaldt',to_date('salarydt','dd-MM-yy'))
df.show()

+-----+-------+-----+------+--------+------+----------+
|EmpId|EmpName|Mgrid|deptid|salarydt|salary|  Newsaldt|
+-----+-------+-----+------+--------+------+----------+
|  100|    Raj| null|     1|01-04-23| 50000|2023-04-01|
|  200| Joanne|  100|     1|01-04-23|  4000|2023-04-01|
|  200| Joanne|  100|     1|13-04-23|  4500|2023-04-13|
|  200| Joanne|  100|     1|14-04-23|  4020|2023-04-14|
+-----+-------+-----+------+--------+------+----------+



In [0]:
df1=df.join(df_dept,['deptid'])
df1.show()

+------+-----+-------+-----+--------+------+----------+--------+
|deptid|EmpId|EmpName|Mgrid|salarydt|salary|  Newsaldt|deptname|
+------+-----+-------+-----+--------+------+----------+--------+
|     1|  100|    Raj| null|01-04-23| 50000|2023-04-01|      IT|
|     1|  200| Joanne|  100|01-04-23|  4000|2023-04-01|      IT|
|     1|  200| Joanne|  100|13-04-23|  4500|2023-04-13|      IT|
|     1|  200| Joanne|  100|14-04-23|  4020|2023-04-14|      IT|
+------+-----+-------+-----+--------+------+----------+--------+



In [0]:
from pyspark.sql.functions import year,month
df2=df1.alias('a').join(df1.alias('b'),col('a.Mgrid')==col('b.EmpId'),how='left')\
    .select( col('a.deptname'), col('b.EmpName').alias('ManagerName'),col('a.EmpName'),\
    col('a.Newsaldt'),col('a.salary'))


In [0]:
display(df2)

deptname,ManagerName,EmpName,Newsaldt,salary
IT,,Raj,2023-04-01,50000
IT,Raj,Joanne,2023-04-01,4000
IT,Raj,Joanne,2023-04-13,4500
IT,Raj,Joanne,2023-04-14,4020


In [0]:
df3=df2.withColumn('year',year(col('Newsaldt')))\
     .withColumn('month',month(col('Newsaldt')))
                 


In [0]:
display(df3)

deptname,ManagerName,EmpName,Newsaldt,salary,year,month
IT,,Raj,2023-04-01,50000,2023,4
IT,Raj,Joanne,2023-04-01,4000,2023,4
IT,Raj,Joanne,2023-04-13,4500,2023,4
IT,Raj,Joanne,2023-04-14,4020,2023,4


**explode functions**

In [0]:
studentdata=[(1,"Julie","23||87||54"),(6,"Mr.T","93||67||24"),(5,"Mark","50||20||84")]
stuschema=["id","name","marks"]

df_st=spark.createDataFrame(studentdata,stuschema)
display(df_st)

id,name,marks
1,Julie,23||87||54
6,Mr.T,93||67||24
5,Mark,50||20||84


In [0]:
from pyspark.sql.functions import split
df_st=df_st.withColumn('Maths',split(col('marks'),"||")[0])\
        .withColumn('English',split(col('marks'),'||')[1])\
        .withColumn('Geography',split(col('marks'),'||')[2])

display(df_st)        

id,name,marks,Maths,English,Geography
1,Julie,23||87||54,2,3,|
6,Mr.T,93||67||24,9,3,|
5,Mark,50||20||84,5,0,|


In [0]:
### find no. of nulls in each columns

dt=[(1,"Julie",None),(None,"Mr.T","93||67||24"),(5,None,"50||20||84"),(None,"Mark",None)]
dtchema=["id","name","marks"]

In [0]:
dt_df=spark.createDataFrame(dt,dtchema)
display(dt_df)

id,name,marks
1.0,Julie,
,Mr.T,93||67||24
5.0,,50||20||84
,Mark,


In [0]:
from pyspark.sql.functions import *
#display(dt_df.select([count(i) for i in dt_df.columns]))

#display(dt_df.select([when(col(i).isNull(),i ) for i in dt_df.columns]))
dt_df.select([count(when(isnull(c), c)).alias(c) for c in dt_df.columns]).show()

+---+----+-----+
| id|name|marks|
+---+----+-----+
|  2|   1|    2|
+---+----+-----+



In [0]:
dict(dt_df.dtypes)

Out[15]: {'id': 'bigint', 'name': 'string', 'marks': 'string'}

In [0]:
for col in dt_df.columns:
    print(f"Column name is {col} and datatype is {dict(dt_df.dtypes)[col]}")

Column name is id and datatype is bigint
Column name is name and datatype is string
Column name is marks and datatype is string


In [0]:
## collect_list and aggregation

data = [
    ("john", "tomato", 2),
    ("𝚋𝚒𝚕𝚕", "𝚊𝚙𝚙𝚕𝚎", 2),
    ("john", "𝚋𝚊𝚗𝚊𝚗𝚊", 2),
    ("john", "tomato", 3),
    ("𝚋𝚒𝚕𝚕", "𝚝𝚊𝚌𝚘", 2),
    ("𝚋𝚒𝚕𝚕", "𝚊𝚙𝚙𝚕𝚎", 2),
]
schema = "name string,item string,weight int"
df = spark.createDataFrame(data, schema)

In [0]:
display(df)

name,item,weight
john,tomato,2
𝚋𝚒𝚕𝚕,𝚊𝚙𝚙𝚕𝚎,2
john,𝚋𝚊𝚗𝚊𝚗𝚊,2
john,tomato,3
𝚋𝚒𝚕𝚕,𝚝𝚊𝚌𝚘,2
𝚋𝚒𝚕𝚕,𝚊𝚙𝚙𝚕𝚎,2


In [0]:
df_final=df.groupBy("name").agg(collect_list("item"))
display(df_final)

name,collect_list(item)
john,"List(tomato, 𝚋𝚊𝚗𝚊𝚗𝚊, tomato)"
𝚋𝚒𝚕𝚕,"List(𝚊𝚙𝚙𝚕𝚎, 𝚝𝚊𝚌𝚘, 𝚊𝚙𝚙𝚕𝚎)"


In [0]:
df_final=df.groupBy("name").agg(collect_list(struct("item","weight")).alias("items"))
display(df_final)

name,items
john,"List(List(tomato, 2), List(𝚋𝚊𝚗𝚊𝚗𝚊, 2), List(tomato, 3))"
𝚋𝚒𝚕𝚕,"List(List(𝚊𝚙𝚙𝚕𝚎, 2), List(𝚝𝚊𝚌𝚘, 2), List(𝚊𝚙𝚙𝚕𝚎, 2))"


In [0]:
df_final.printSchema()

root
 |-- name: string (nullable = true)
 |-- items: array (nullable = false)
 |    |-- element: struct (containsNull = false)
 |    |    |-- item: string (nullable = true)
 |    |    |-- weight: integer (nullable = true)



In [0]:
df_final1=df_final.withColumn('newitem',explode(col('items'))).drop("items")
display(df_final1)


[0;31m---------------------------------------------------------------------------[0m
[0;31mTypeError[0m                                 Traceback (most recent call last)
File [0;32m<command-2026787703974019>:1[0m
[0;32m----> 1[0m df_final1[38;5;241m=[39mdf_final[38;5;241m.[39mwithColumn([38;5;124m'[39m[38;5;124mnewitem[39m[38;5;124m'[39m,explode(col([38;5;124m'[39m[38;5;124mitems[39m[38;5;124m'[39m)))[38;5;241m.[39mdrop([38;5;124m"[39m[38;5;124mitems[39m[38;5;124m"[39m)
[1;32m      2[0m display(df_final1)

[0;31mTypeError[0m: 'str' object is not callable

In [0]:
df_final1.printSchema()



In [0]:
## Check data types and multiply if doubletype

data_df = [
    ("john", "tomato", 2.00),
    ("𝚋𝚒𝚕𝚕", "𝚊𝚙𝚙𝚕𝚎", 2.99),
    ("john", "𝚋𝚊𝚗𝚊𝚗𝚊", 23.79),
    ("john", "tomato", 33.33),
    ("𝚋𝚒𝚕𝚕", "𝚝𝚊𝚌𝚘", 19.86),
    ("𝚋𝚒𝚕𝚕", "𝚊𝚙𝚙𝚕𝚎", 65.86),
]



In [0]:
from pyspark.sql.types import StructType,StringType,IntegerType,DoubleType,StructField

schema_1=StructType([StructField("name",StringType()),
                     StructField("fruit",StringType()),
                     StructField("price",DoubleType())])



In [0]:
df_ch=spark.createDataFrame(data_df,schema_1)
display(df_ch)



In [0]:
df_ch.printSchema()



In [0]:
dict(df_ch.dtypes)



In [0]:
for col,coltype in dict(df_ch.dtypes).items():
    print(f"col is {col} and datatype is {coltype})")



In [0]:
#Employees Salary info
data1=[(100,"Raj",None,1,"01-04-23",50000),
       (200,"Joanne",100,1,"01-04-23",4000),(200,"Joanne",100,1,"13-04-23",4500),(200,"Joanne",100,1,"14-04-23",4020)]
schema1=["EmpId","EmpName","Mgrid","deptid","salarydt","salary"]
df_salary=spark.createDataFrame(data1,schema1)
display(df_salary)
#department dataframe
data2=[(1,"IT"),
       (2,"HR")]
schema2=["deptid","deptname"]
df_dept=spark.createDataFrame(data2,schema2)
display(df_dept)



In [0]:
from pyspark.sql.functions import date_format,to_date,col,year,month

emp_dept1=df_salary.withColumn('salary_date',to_date(col('salarydt'),'dd-MM-yy'))
display(emp_dept1)



In [0]:
emp_dept2=emp_dept1.withColumn('salary_year',year(col('salary_date')))\
                    .withColumn('salary_month',month(col('salary_date')))\
                    .drop('salarydt')
emp_dept3=emp_dept2.select("Empid", "EmpName","deptid","salary_year","salary_month","salary","Mgrid")
display(emp_dept2)    
display(emp_dept3)



In [0]:
emp_dept4=emp_dept3.groupBy('salary_month','Empid','EmpName','deptid','Mgrid').sum('salary')
emp_dept4.show()



In [0]:
emp_dept_rv=emp_dept4.join(df_dept,emp_dept4.deptid==df_dept.deptid,'inner').drop(df_dept.deptid)
display(emp_dept_rv)



In [0]:
df_rv=emp_dept_rv.alias('a').join(emp_dept_rv.alias('b'),col('a.Mgrid')==col('b.EmpId'),'left').select(
    col('a.deptname'),
    col('b.EmpName').alias('ManagerName'),
    col('a.EmpName'),
    col('a.salary_month'),
    col('a.sum(salary)')
)
display(df_rv)



In [0]:
## q2
dept_data=[('IT','M'),('IT','F'),('IT','M'),('IT','M'),
           ('HR','F'),('HR','F'),('HR','F'),('HR','F'),('HR','F'),
           ('SALES','M'),('SALES','M'),('SALES','F')]

dept_schema=['Deptname','Gender']    

dept_df=spark.createDataFrame(dept_data,dept_schema)
display(dept_df)



In [0]:
from pyspark.sql.functions import when,col

dept_df_rv=dept_df.select('Deptname',when(col('Gender')=='M',1).alias('Male'),when(col('Gender')=='F',1).alias('Female'))

display(dept_df_rv)



In [0]:
from pyspark.sql.functions import count,sum

display(
        dept_df_rv.groupBy('Deptname').agg(count('Deptname').alias('TotaCount'),\
            sum('Male').alias('Malecount'),sum('Female').alias('Femalecount'))
        )



In [0]:
#q3
data=[(1,'John','ADF'),(1,'John','ADB'),(1,'John','PowerBI'),(2,'Joanne','ADF'),(2,'Joanne','SQL'),(2,'Joanne','Crystal Report'),(3,'Vikas','ADF'),(3,'Vikas','SQL'),(3,'Vikas','SSIS'),(4,'Monu','SQL'),(4,'Monu','SSIS'),(4,'Monu','SSAS'),(4,'Monu','ADF')]
schema=["EmpId","EmpName","Skill"]
df1=spark.createDataFrame(data,schema)
display(df1)



In [0]:
from pyspark.sql.functions import collect_list,concat_ws

df22=df1.groupBy('EmpName').agg(collect_list('Skill').alias('skills'))
display(df22)



In [0]:
display(df22.select(df22.EmpName,concat_ws(',',df22.skills).alias('skills')))



### Q4-Amazon-Remove all reversed pair

In [0]:

from pyspark.sql.types import *
data=[(1,2),(3,2),(2,4),(2,1),(5,6),(4,2)]
schema=['A','B']
emp_df=spark.createDataFrame(data,schema)
display(emp_df)

A,B
1,2
3,2
2,4
2,1
5,6
4,2


In [0]:
##1st Approach-- By having min max value
from pyspark.sql.functions import least, greatest, col

emp_df1 = emp_df.withColumn('min_val', least(col('A'), col('B'))) \
                .withColumn('max_val', greatest(col('A'), col('B')))

emp_df_unique = emp_df1.select('min_val', 'max_val').dropDuplicates() \
                       .withColumnRenamed('min_val', 'A') \
                       .withColumnRenamed('max_val', 'B')

emp_df_unique.show()


+---+---+
|  A|  B|
+---+---+
|  1|  2|
|  2|  3|
|  2|  4|
|  5|  6|
+---+---+



In [0]:
##2nd Approach- By self join

emp_df_joined = (
    emp_df.alias('e')
    .join(
        emp_df.alias('g'),
        (col('e.A') == col('g.B')) & (col('e.B') == col('g.A')),
        'left_outer'
    )
    .filter((col('g.A').isNull()) | (col('e.A') < col('e.B')))
    .select(col('e.A').alias('A'), col('e.B').alias('B'))
)
emp_df_joined.show()


+---+---+
|  A|  B|
+---+---+
|  1|  2|
|  3|  2|
|  2|  4|
|  5|  6|
+---+---+



### Q5-Amazon-Total amount recieved via cash and online merchant

In [0]:
data=[('2022-04-02','merchant_1',150,'cash'),('2022-04-02','merchant_1',500,'online'),('2022-04-03','merchant_2',450,'online'),('2022-04-03','merchant_1',100,'cash'), ('2022-04-03','merchant_3',600,'cash'),('2022-04-05','merchant_5',200,'online'),('2022-04-05','merchant_2',100,'online')]
headers=['trx_date', 'merchant', 'amount', 'pay_mode']

In [0]:
trx_df=spark.createDataFrame(data,headers)
trx_df.show()

+----------+----------+------+--------+
|  trx_date|  merchant|amount|pay_mode|
+----------+----------+------+--------+
|2022-04-02|merchant_1|   150|    cash|
|2022-04-02|merchant_1|   500|  online|
|2022-04-03|merchant_2|   450|  online|
|2022-04-03|merchant_1|   100|    cash|
|2022-04-03|merchant_3|   600|    cash|
|2022-04-05|merchant_5|   200|  online|
|2022-04-05|merchant_2|   100|  online|
+----------+----------+------+--------+



In [0]:
trx_df1=(
   trx_df.groupBy('merchant').pivot('pay_mode').sum('amount') 
)
trx_df1.show()

+----------+----+------+
|  merchant|cash|online|
+----------+----+------+
|merchant_3| 600|  null|
|merchant_1| 250|   500|
|merchant_2|null|   550|
|merchant_5|null|   200|
+----------+----+------+



### Q6-Google-Unique user count

In [0]:
data=[('2022-02-20',1,"abc"),('2022-02-20',2,"xyz"),('2022-02-22',1,"xyz"),('2022-02-22',3,"klm"),('2022-02-24',1,"abc"),
('2022-02-24',2,"abc"),('2022-02-24',3,"abc")]
schema=['date','user_id','activity']

user_df=spark.createDataFrame(data,schema)
user_df.show()
    

+----------+-------+--------+
|      date|user_id|activity|
+----------+-------+--------+
|2022-02-20|      1|     abc|
|2022-02-20|      2|     xyz|
|2022-02-22|      1|     xyz|
|2022-02-22|      3|     klm|
|2022-02-24|      1|     abc|
|2022-02-24|      2|     abc|
|2022-02-24|      3|     abc|
+----------+-------+--------+



In [0]:
from pyspark.sql.window import Window

window_spec=Window.partitionBy('user_id').orderBy('date')

display(user_df.withColumn('rnum',row_number().over(window_spec)))

date,user_id,activity,rnum
2022-02-20,1,abc,1
2022-02-22,1,xyz,2
2022-02-24,1,abc,3
2022-02-20,2,xyz,1
2022-02-24,2,abc,2
2022-02-22,3,klm,1
2022-02-24,3,abc,2
