In [0]:
data1=[(1,"A",1000,"IT"),(2,"B",1500,"IT"),(3,"C",2500,"IT"),(4,"D",3000,"HR"),(5,"E",2000,"HR"),(6,"F",1000,"HR")
       ,(7,"G",40000,"Sales"),(8,"H",4000,"Sales"),(9,"I",1000,"Sales"),(10,"J",2000,"Sales")]
schema1=["EmpId","EmpName","Salary","DeptName"]
df=spark.createDataFrame(data1,schema1)

df.printSchema()
display(df)

root
 |-- EmpId: long (nullable = true)
 |-- EmpName: string (nullable = true)
 |-- Salary: long (nullable = true)
 |-- DeptName: string (nullable = true)



EmpId,EmpName,Salary,DeptName
1,A,1000,IT
2,B,1500,IT
3,C,2500,IT
4,D,3000,HR
5,E,2000,HR
6,F,1000,HR
7,G,40000,Sales
8,H,4000,Sales
9,I,1000,Sales
10,J,2000,Sales


In [0]:
#Find out highset emp_salary Department wise
from pyspark.sql import Window
from pyspark.sql.functions import col,max,min,rank

window_spec=Window.partitionBy('DeptName').orderBy(col('Salary').desc())

df_mod=df.withColumn('score',rank().over(window_spec))
display(df_mod.filter(col('score')==1))

EmpId,EmpName,Salary,DeptName,score
4,D,3000,HR,1
3,C,2500,IT,1
7,G,40000,Sales,1


In [0]:
#Employees Salary info
data1=[(100,"Raj",None,1,"01-04-23",50000),
       (200,"Joanne",100,1,"01-04-23",4000),(200,"Joanne",100,1,"13-04-23",4500),(200,"Joanne",100,1,"14-04-23",4020)]
schema1=["EmpId","EmpName","Mgrid","deptid","salarydt","salary"]
df_salary=spark.createDataFrame(data1,schema1)
display(df_salary)
#department dataframe
data2=[(1,"IT"), (2,"HR")]
schema2=["deptid","deptname"]
df_dept=spark.createDataFrame(data2,schema2)
display(df_dept)

EmpId,EmpName,Mgrid,deptid,salarydt,salary
100,Raj,,1,01-04-23,50000
200,Joanne,100.0,1,01-04-23,4000
200,Joanne,100.0,1,13-04-23,4500
200,Joanne,100.0,1,14-04-23,4020


deptid,deptname
1,IT
2,HR


In [0]:
from pyspark.sql.functions import *
df=df_salary.withColumn('Newsaldt',to_date('salarydt','dd-MM-yy'))
df.show()

+-----+-------+-----+------+--------+------+----------+
|EmpId|EmpName|Mgrid|deptid|salarydt|salary|  Newsaldt|
+-----+-------+-----+------+--------+------+----------+
|  100|    Raj| NULL|     1|01-04-23| 50000|2023-04-01|
|  200| Joanne|  100|     1|01-04-23|  4000|2023-04-01|
|  200| Joanne|  100|     1|13-04-23|  4500|2023-04-13|
|  200| Joanne|  100|     1|14-04-23|  4020|2023-04-14|
+-----+-------+-----+------+--------+------+----------+



In [0]:
df1=df.join(df_dept,['deptid'])
df1.show()

+------+-----+-------+-----+--------+------+----------+--------+
|deptid|EmpId|EmpName|Mgrid|salarydt|salary|  Newsaldt|deptname|
+------+-----+-------+-----+--------+------+----------+--------+
|     1|  100|    Raj| NULL|01-04-23| 50000|2023-04-01|      IT|
|     1|  200| Joanne|  100|01-04-23|  4000|2023-04-01|      IT|
|     1|  200| Joanne|  100|13-04-23|  4500|2023-04-13|      IT|
|     1|  200| Joanne|  100|14-04-23|  4020|2023-04-14|      IT|
+------+-----+-------+-----+--------+------+----------+--------+



In [0]:
from pyspark.sql.functions import year,month
df2=df1.alias('a').join(df1.alias('b'),col('a.Mgrid')==col('b.EmpId'),how='left')\
    .select( col('a.deptname'), col('b.EmpName').alias('ManagerName'),col('a.EmpName'),\
    col('a.Newsaldt'),col('a.salary'))


In [0]:
display(df2)

deptname,ManagerName,EmpName,Newsaldt,salary
IT,,Raj,2023-04-01,50000
IT,Raj,Joanne,2023-04-01,4000
IT,Raj,Joanne,2023-04-13,4500
IT,Raj,Joanne,2023-04-14,4020


In [0]:
df3=df2.withColumn('year',year(col('Newsaldt')))\
     .withColumn('month',month(col('Newsaldt')))
                 


In [0]:
display(df3)

deptname,ManagerName,EmpName,Newsaldt,salary,year,month
IT,,Raj,2023-04-01,50000,2023,4
IT,Raj,Joanne,2023-04-01,4000,2023,4
IT,Raj,Joanne,2023-04-13,4500,2023,4
IT,Raj,Joanne,2023-04-14,4020,2023,4


**explode functions**

In [0]:
studentdata=[(1,"Julie","23||87||54"),(6,"Mr.T","93||67||24"),(5,"Mark","50||20||84")]
stuschema=["id","name","marks"]

df_st=spark.createDataFrame(studentdata,stuschema)
display(df_st)

id,name,marks
1,Julie,23||87||54
6,Mr.T,93||67||24
5,Mark,50||20||84


In [0]:
from pyspark.sql.functions import split
df_st=df_st.withColumn('Maths',split(col('marks'),"||")[0])\
        .withColumn('English',split(col('marks'),'||')[1])\
        .withColumn('Geography',split(col('marks'),'||')[2])

display(df_st)        

id,name,marks,Maths,English,Geography
1,Julie,23||87||54,2,3,|
6,Mr.T,93||67||24,9,3,|
5,Mark,50||20||84,5,0,|


In [0]:
### find no. of nulls in each columns

dt=[(1,"Julie",None),(None,"Mr.T","93||67||24"),(5,None,"50||20||84"),(None,"Mark",None)]
dtchema=["id","name","marks"]

In [0]:
dt_df=spark.createDataFrame(dt,dtchema)
display(dt_df)

id,name,marks
1.0,Julie,
,Mr.T,93||67||24
5.0,,50||20||84
,Mark,


In [0]:
from pyspark.sql.functions import *
#display(dt_df.select([count(i) for i in dt_df.columns]))

#display(dt_df.select([when(col(i).isNull(),i ) for i in dt_df.columns]))
dt_df.select([count(when(isnull(c), c)).alias(c) for c in dt_df.columns]).show()

+---+----+-----+
| id|name|marks|
+---+----+-----+
|  2|   1|    2|
+---+----+-----+



In [0]:
dict(dt_df.dtypes)

{'id': 'bigint', 'name': 'string', 'marks': 'string'}

In [0]:
for col in dt_df.columns:
    print(f"Column name is {col} and datatype is {dict(dt_df.dtypes)[col]}")

Column name is id and datatype is bigint
Column name is name and datatype is string
Column name is marks and datatype is string


In [0]:
## collect_list and aggregation

data = [
    ("john", "tomato", 2),
    ("𝚋𝚒𝚕𝚕", "𝚊𝚙𝚙𝚕𝚎", 2),
    ("john", "𝚋𝚊𝚗𝚊𝚗𝚊", 2),
    ("john", "tomato", 3),
    ("𝚋𝚒𝚕𝚕", "𝚝𝚊𝚌𝚘", 2),
    ("𝚋𝚒𝚕𝚕", "𝚊𝚙𝚙𝚕𝚎", 2),
]
schema = "name string,item string,weight int"
df = spark.createDataFrame(data, schema)

In [0]:
display(df)

name,item,weight
john,tomato,2
𝚋𝚒𝚕𝚕,𝚊𝚙𝚙𝚕𝚎,2
john,𝚋𝚊𝚗𝚊𝚗𝚊,2
john,tomato,3
𝚋𝚒𝚕𝚕,𝚝𝚊𝚌𝚘,2
𝚋𝚒𝚕𝚕,𝚊𝚙𝚙𝚕𝚎,2


In [0]:
df_final=df.groupBy("name").agg(collect_list("item"))
display(df_final)

name,collect_list(item)
john,"List(tomato, 𝚋𝚊𝚗𝚊𝚗𝚊, tomato)"
𝚋𝚒𝚕𝚕,"List(𝚊𝚙𝚙𝚕𝚎, 𝚝𝚊𝚌𝚘, 𝚊𝚙𝚙𝚕𝚎)"


In [0]:
df_final=df.groupBy("name").agg(collect_list(struct("item","weight")).alias("items"))
display(df_final)

name,items
john,"List(List(tomato, 2), List(𝚋𝚊𝚗𝚊𝚗𝚊, 2), List(tomato, 3))"
𝚋𝚒𝚕𝚕,"List(List(𝚊𝚙𝚙𝚕𝚎, 2), List(𝚝𝚊𝚌𝚘, 2), List(𝚊𝚙𝚙𝚕𝚎, 2))"


In [0]:
df_final.printSchema()

root
 |-- name: string (nullable = true)
 |-- items: array (nullable = false)
 |    |-- element: struct (containsNull = false)
 |    |    |-- item: string (nullable = true)
 |    |    |-- weight: integer (nullable = true)



In [0]:
df_final1=df_final.withColumn('newitem',explode(col('items'))).drop("items")
display(df_final1)


[0;31m---------------------------------------------------------------------------[0m
[0;31mTypeError[0m                                 Traceback (most recent call last)
File [0;32m<command-2026787703974019>, line 1[0m
[0;32m----> 1[0m df_final1[38;5;241m=[39mdf_final[38;5;241m.[39mwithColumn([38;5;124m'[39m[38;5;124mnewitem[39m[38;5;124m'[39m,explode(col([38;5;124m'[39m[38;5;124mitems[39m[38;5;124m'[39m)))[38;5;241m.[39mdrop([38;5;124m"[39m[38;5;124mitems[39m[38;5;124m"[39m)
[1;32m      2[0m display(df_final1)

[0;31mTypeError[0m: 'str' object is not callable

In [0]:
df_final1.printSchema()



In [0]:
## Check data types and multiply if doubletype

data_df = [
    ("john", "tomato", 2.00),
    ("𝚋𝚒𝚕𝚕", "𝚊𝚙𝚙𝚕𝚎", 2.99),
    ("john", "𝚋𝚊𝚗𝚊𝚗𝚊", 23.79),
    ("john", "tomato", 33.33),
    ("𝚋𝚒𝚕𝚕", "𝚝𝚊𝚌𝚘", 19.86),
    ("𝚋𝚒𝚕𝚕", "𝚊𝚙𝚙𝚕𝚎", 65.86),
]



In [0]:
from pyspark.sql.types import StructType,StringType,IntegerType,DoubleType,StructField

schema_1=StructType([StructField("name",StringType()),
                     StructField("fruit",StringType()),
                     StructField("price",DoubleType())])



In [0]:
df_ch=spark.createDataFrame(data_df,schema_1)
display(df_ch)



In [0]:
df_ch.printSchema()



In [0]:
dict(df_ch.dtypes)



In [0]:
for col,coltype in dict(df_ch.dtypes).items():
    print(f"col is {col} and datatype is {coltype})")



In [0]:
#Employees Salary info
data1=[(100,"Raj",None,1,"01-04-23",50000),
       (200,"Joanne",100,1,"01-04-23",4000),(200,"Joanne",100,1,"13-04-23",4500),(200,"Joanne",100,1,"14-04-23",4020)]
schema1=["EmpId","EmpName","Mgrid","deptid","salarydt","salary"]
df_salary=spark.createDataFrame(data1,schema1)
display(df_salary)
#department dataframe
data2=[(1,"IT"),
       (2,"HR")]
schema2=["deptid","deptname"]
df_dept=spark.createDataFrame(data2,schema2)
display(df_dept)



In [0]:
from pyspark.sql.functions import date_format,to_date,col,year,month

emp_dept1=df_salary.withColumn('salary_date',to_date(col('salarydt'),'dd-MM-yy'))
display(emp_dept1)



In [0]:
emp_dept2=emp_dept1.withColumn('salary_year',year(col('salary_date')))\
                    .withColumn('salary_month',month(col('salary_date')))\
                    .drop('salarydt')
emp_dept3=emp_dept2.select("Empid", "EmpName","deptid","salary_year","salary_month","salary","Mgrid")
display(emp_dept2)    
display(emp_dept3)



In [0]:
emp_dept4=emp_dept3.groupBy('salary_month','Empid','EmpName','deptid','Mgrid').sum('salary')
emp_dept4.show()



In [0]:
emp_dept_rv=emp_dept4.join(df_dept,emp_dept4.deptid==df_dept.deptid,'inner').drop(df_dept.deptid)
display(emp_dept_rv)



In [0]:
df_rv=emp_dept_rv.alias('a').join(emp_dept_rv.alias('b'),col('a.Mgrid')==col('b.EmpId'),'left').select(
    col('a.deptname'),
    col('b.EmpName').alias('ManagerName'),
    col('a.EmpName'),
    col('a.salary_month'),
    col('a.sum(salary)')
)
display(df_rv)



In [0]:
## q2
dept_data=[('IT','M'),('IT','F'),('IT','M'),('IT','M'),
           ('HR','F'),('HR','F'),('HR','F'),('HR','F'),('HR','F'),
           ('SALES','M'),('SALES','M'),('SALES','F')]

dept_schema=['Deptname','Gender']    

dept_df=spark.createDataFrame(dept_data,dept_schema)
display(dept_df)



In [0]:
from pyspark.sql.functions import when,col

dept_df_rv=dept_df.select('Deptname',when(col('Gender')=='M',1).alias('Male'),when(col('Gender')=='F',1).alias('Female'))

display(dept_df_rv)



In [0]:
from pyspark.sql.functions import count,sum

display(
        dept_df_rv.groupBy('Deptname').agg(count('Deptname').alias('TotaCount'),\
            sum('Male').alias('Malecount'),sum('Female').alias('Femalecount'))
        )



In [0]:
#q3
data=[(1,'John','ADF'),(1,'John','ADB'),(1,'John','PowerBI'),(2,'Joanne','ADF'),(2,'Joanne','SQL'),(2,'Joanne','Crystal Report'),(3,'Vikas','ADF'),(3,'Vikas','SQL'),(3,'Vikas','SSIS'),(4,'Monu','SQL'),(4,'Monu','SSIS'),(4,'Monu','SSAS'),(4,'Monu','ADF')]
schema=["EmpId","EmpName","Skill"]
df1=spark.createDataFrame(data,schema)
display(df1)



In [0]:
from pyspark.sql.functions import collect_list,concat_ws

df22=df1.groupBy('EmpName').agg(collect_list('Skill').alias('skills'))
display(df22)



In [0]:
display(df22.select(df22.EmpName,concat_ws(',',df22.skills).alias('skills')))



### Q4-Amazon-Remove all reversed pair

In [0]:

from pyspark.sql.types import *
data=[(1,2),(3,2),(2,4),(2,1),(5,6),(4,2)]
schema=['A','B']
emp_df=spark.createDataFrame(data,schema)
display(emp_df)



In [0]:
##1st Approach-- By having min max value
from pyspark.sql.functions import least, greatest, col

emp_df1 = emp_df.withColumn('min_val', least(col('A'), col('B'))) \
                .withColumn('max_val', greatest(col('A'), col('B')))

emp_df_unique = emp_df1.select('min_val', 'max_val').dropDuplicates() \
                       .withColumnRenamed('min_val', 'A') \
                       .withColumnRenamed('max_val', 'B')

emp_df_unique.show()




In [0]:
##2nd Approach- By self join

emp_df_joined = (
    emp_df.alias('e')
    .join(
        emp_df.alias('g'),
        (col('e.A') == col('g.B')) & (col('e.B') == col('g.A')),
        'left_outer'
    )
    .filter((col('g.A').isNull()) | (col('e.A') < col('e.B')))
    .select(col('e.A').alias('A'), col('e.B').alias('B'))
)
emp_df_joined.show()




### Q5-Amazon-Total amount recieved via cash and online merchant

In [0]:
data=[('2022-04-02','merchant_1',150,'cash'),('2022-04-02','merchant_1',500,'online'),('2022-04-03','merchant_2',450,'online'),('2022-04-03','merchant_1',100,'cash'), ('2022-04-03','merchant_3',600,'cash'),('2022-04-05','merchant_5',200,'online'),('2022-04-05','merchant_2',100,'online')]
headers=['trx_date', 'merchant', 'amount', 'pay_mode']



In [0]:
trx_df=spark.createDataFrame(data,headers)
trx_df.show()



In [0]:
trx_df1=(
   trx_df.groupBy('merchant').pivot('pay_mode').sum('amount') 
)
trx_df1.show()



### Q6-Google-Unique user count

In [0]:
data=[('2022-02-20',1,"abc"),('2022-02-20',2,"xyz"),('2022-02-22',1,"xyz"),('2022-02-22',3,"klm"),('2022-02-24',1,"abc"),
('2022-02-24',2,"abc"),('2022-02-24',3,"abc")]
schema=['date','user_id','activity']

user_df=spark.createDataFrame(data,schema)
user_df.show()
    



In [0]:
from pyspark.sql.window import Window

window_spec=Window.partitionBy('user_id').orderBy('date')

display(user_df.withColumn('rnum',row_number().over(window_spec)))



**Q7-Google-Content_word_question**
_--count the words which has occurence more that 1 across all files_

In [0]:
data = [('python bootcamp1.txt','python for data analytics 0 to hero bootcamp starting on Jan 6th')
,('python bootcamp2.txt','classes will be held on weekends from 11am to 1 pm for 5-6 weeks')
,('python bootcamp3.txt','use code NY2024 to get 33 percent off. You can register from namaste sql website. Link in pinned comment')]

schema = ["filename" , "content"]

df = spark.createDataFrame(data = data , schema = schema)

df.show(truncate=False)



In [0]:
import pyspark.sql.functions as F
df1=df.withColumn('wordsarray',F.split(df['content']," ")).drop('content')
df1.show(truncate=False)



In [0]:
df2=df1.withColumn('words',F.explode(F.col('wordsarray'))).drop('wordsarray')
df2.show()



In [0]:
df3=(
    df2.groupBy('words').agg(F.count('*').alias('number_of_words'))
)
df3.filter(F.col('number_of_words')> 1).show()



**Q7-TCS-Find the order whose sales increased every year**

In [0]:
sales_data = [
 (1, 2019, 1000.00),
 (1, 2020, 1200.00),
 (1, 2021, 1100.00),
 (2, 2019, 500.00),
 (2, 2020, 600.00),
 (2, 2021, 900.00),
 (3, 2019, 300.00),
 (3, 2020, 450.00),
 (3, 2021, 400.00)
 ]

sales_schema = ['product_id', 'year', 'total_sales_revenue']

sales_df = spark.createDataFrame(data = sales_data , schema = sales_schema)
sales_df.show()



In [0]:
product_data = [
 (1, 'Laptops', 'Electronics'),
 (2, 'Jeans', 'Clothing'),
 (3, 'Chairs', 'Home Appliances')
 ]

product_schema = ['product_id', 'product_name', 'category']

product_df = spark.createDataFrame(data = product_data , schema = product_schema)
product_df.show()



In [0]:
df_prod_joined=(
    sales_df.alias('s').join(product_df.alias('p') , F.col('s.product_id')==F.col('p.product_id'),"inner")
    .drop(F.col('p.product_id'))
)
df_prod_joined.show()



In [0]:
from pyspark.sql.window import Window

windspec=Window.partitionBy('product_id').orderBy('year')



In [0]:
df_order=(
    df_prod_joined
    .withColumn('prev_sales',F.lag(F.col('total_sales_revenue'),1,0).over(windspec))
    .withColumn('sales_diff',(F.col('total_sales_revenue') - F.col('prev_sales')))
)

df_order.show()



In [0]:
order_grouped=(
   df_order.groupBy('product_id')
   .agg(F.min('sales_diff').alias('min_')) 
)
order_grouped.show()



In [0]:
df_order_final=(
    order_grouped.join(product_df,order_grouped.product_id== product_df.product_id)
    .filter(F.col('min_')>0)
    .drop(product_df.product_id,order_grouped.min_)
    #.select('product_id','product_name','category')
)
df_order_final.show()



**Q8-Infosys-flatten json**

In [0]:
data=""" 
{
 "Institute_Name" : "ABC_Coaching_Center",
 "Course_type" : "Best_seller" ,
 "branches" : [
  {
   "State" : "Maharashtra",
   "City" : "Mumbai",
   "address" : "XYZ"
  },
  {
   "State" : "Gujrat",
   "City" : "Surat",
   "address" : "PQRX"
  }
 ],
 "Head_Office_Contact" : 8787878787
}
"""



In [0]:
from pyspark.sql.types import StringType,StructField,StructType,IntegerType,ArrayType,MapType

schema=StructType(
    [
     StructField('Institute_Name',StringType(),False),
     StructField('Course_type',StringType(),False),
     StructField('Head_Office_Contact',StringType(),False),
     #StructField('branches',ArrayType(StructType([
          #      StructField('State', StringType(), True),
           #     StructField('City', StringType(), True),
           #     StructField('address', StringType(), True)
          #  ]),False)
     StructField('branches',ArrayType((MapType(StringType(), StringType(), False)),False)             
                 
     )
     ]
)



In [0]:
df=spark.read.format('json')\
    .option('multiline',True)\
    .option('schema',schema)\
    .load('dbfs:/FileStore/tables/institute.txt')    
df.show(truncate=False)



In [0]:
df.printSchema()



In [0]:
df=df.withColumn('branches',F.explode(F.col('branches')))
df.show(truncate=False)



##Remove redundant pairs##


In [0]:
data = [
    ('apple', 'samsung', 2020, 1, 2, 1, 2),
    ('samsung', 'apple', 2020, 1, 2, 1, 2),
    ('apple', 'samsung', 2021, 1, 2, 5, 3),
    ('samsung', 'apple', 2021, 5, 3, 1, 2),
    ('google', None, 2020, 5, 9, None, None),
    ('oneplus', 'nothing', 2020, 5, 9, 6, 3)
]
schema = 'brand1 string , brand2 string , year int , custom1 int, custom2 int , custom3 int , custom4 int'

df = spark.createDataFrame(data = data , schema = schema)
df.show()

+-------+-------+----+-------+-------+-------+-------+
| brand1| brand2|year|custom1|custom2|custom3|custom4|
+-------+-------+----+-------+-------+-------+-------+
|  apple|samsung|2020|      1|      2|      1|      2|
|samsung|  apple|2020|      1|      2|      1|      2|
|  apple|samsung|2021|      1|      2|      5|      3|
|samsung|  apple|2021|      5|      3|      1|      2|
| google|   NULL|2020|      5|      9|   NULL|   NULL|
|oneplus|nothing|2020|      5|      9|      6|      3|
+-------+-------+----+-------+-------+-------+-------+



In [0]:
def sort_values(val):
    if val:
        return "".join(sorted(val))
    else:
        return val

In [0]:
#sort_values('samsungapple2020')
sort_udf=udf(sort_values)

In [0]:
import pyspark.sql.functions as F

df_pair=df.withColumn('pairid' ,F.concat(F.col('brand1'),F.col('brand2'),F.col('year')) )
#df_pair=df.withColumn('pairid' ,F.sort )
df_pair.show()

+-------+-------+----+-------+-------+-------+-------+------------------+
| brand1| brand2|year|custom1|custom2|custom3|custom4|            pairid|
+-------+-------+----+-------+-------+-------+-------+------------------+
|  apple|samsung|2020|      1|      2|      1|      2|  applesamsung2020|
|samsung|  apple|2020|      1|      2|      1|      2|  samsungapple2020|
|  apple|samsung|2021|      1|      2|      5|      3|  applesamsung2021|
|samsung|  apple|2021|      5|      3|      1|      2|  samsungapple2021|
| google|   NULL|2020|      5|      9|   NULL|   NULL|              NULL|
|oneplus|nothing|2020|      5|      9|      6|      3|oneplusnothing2020|
+-------+-------+----+-------+-------+-------+-------+------------------+



In [0]:
df_pair=df_pair.withColumn('pairid',sort_udf(F.col('pairid'))).drop('brand1','brand2','year')
df_pair.show()

+-------+-------+-------+-------+------------------+
|custom1|custom2|custom3|custom4|            pairid|
+-------+-------+-------+-------+------------------+
|      1|      2|      1|      2|  0022aaeglmnppssu|
|      1|      2|      1|      2|  0022aaeglmnppssu|
|      1|      2|      5|      3|  0122aaeglmnppssu|
|      5|      3|      1|      2|  0122aaeglmnppssu|
|      5|      9|   NULL|   NULL|              NULL|
|      5|      9|      6|      3|0022eghilnnnoopstu|
+-------+-------+-------+-------+------------------+



In [0]:
from pyspark.sql.window import Window

winspec=Window.partitionBy('pairid').orderBy('pairid')

In [0]:
df_pair=df_pair.withColumn('row_num',F.row_number().over(winspec))
df_pair.show()

+-------+-------+-------+-------+------------------+-------+
|custom1|custom2|custom3|custom4|            pairid|row_num|
+-------+-------+-------+-------+------------------+-------+
|      5|      9|   NULL|   NULL|              NULL|      1|
|      1|      2|      1|      2|  0022aaeglmnppssu|      1|
|      1|      2|      1|      2|  0022aaeglmnppssu|      2|
|      5|      9|      6|      3|0022eghilnnnoopstu|      1|
|      1|      2|      5|      3|  0122aaeglmnppssu|      1|
|      5|      3|      1|      2|  0122aaeglmnppssu|      2|
+-------+-------+-------+-------+------------------+-------+



In [0]:
df_pair=(
    df_pair.filter((F.col("row_num")==1 ) | ((F.col('custom1') !=F.col('custom3')) & (F.col('custom2') !=F.col('custom4')) ))
    
    )
df_pair.show()

+-------+-------+-------+-------+------------------+-------+
|custom1|custom2|custom3|custom4|            pairid|row_num|
+-------+-------+-------+-------+------------------+-------+
|      5|      9|   NULL|   NULL|              NULL|      1|
|      1|      2|      1|      2|  0022aaeglmnppssu|      1|
|      5|      9|      6|      3|0022eghilnnnoopstu|      1|
|      1|      2|      5|      3|  0122aaeglmnppssu|      1|
|      5|      3|      1|      2|  0122aaeglmnppssu|      2|
+-------+-------+-------+-------+------------------+-------+



**Calculate percentage increase**

In [0]:
data = [
    (20124 ,'2020-01-10'),
    (40133 ,'2020-01-15'),
    (65005 ,'2020-01-20'),
    (30005 ,'2020-02-08'),
    (35015 ,'2020-02-19'),
    (15015 ,'2020-03-03'),
    (35035 ,'2020-03-10'),
    (49099 ,'2020-03-14'),
    (84045 ,'2020-03-20'),
    (100106 ,'2020-03-31'),
    (17015 ,'2020-04-04'),
    (36035 ,'2020-04-11'),
    (50099 ,'2020-04-13'),
    (87045 ,'2020-04-22'),
    (101101 ,'2020-04-30'),
    (40015 ,'2020-05-01'),
    (54035 ,'2020-05-09'),
    (71099 ,'2020-05-14'),
    (82045 ,'2020-05-21'),
    (90103 ,'2020-05-25'),
    (99103 ,'2020-05-31'),
    (11015 ,'2020-06-03'),
    (28035 ,'2020-06-10'),
    (38099 ,'2020-06-14'),
    (45045 ,'2020-06-20'),
    (36033 ,'2020-07-09'),
    (40011 ,'2020-07-23'),
    (25001 ,'2020-08-12'),
    (29990 ,'2020-08-26'),
    (20112 ,'2020-09-04'),
    (43991 ,'2020-09-18'),
    (51002 ,'2020-09-29'),
    (26587 ,'2020-10-25'),
    (11000 ,'2020-11-07'),
    (35002 ,'2020-11-16'),
    (56010 ,'2020-11-28'),
    (15099 ,'2020-12-02'),
    (38042 ,'2020-12-11'),
    (73030 ,'2020-12-26')
]

schema = "cases_reported int , dates string"
df = spark.createDataFrame(data= data , schema = schema)
df.show(10)

+--------------+----------+
|cases_reported|     dates|
+--------------+----------+
|         20124|2020-01-10|
|         40133|2020-01-15|
|         65005|2020-01-20|
|         30005|2020-02-08|
|         35015|2020-02-19|
|         15015|2020-03-03|
|         35035|2020-03-10|
|         49099|2020-03-14|
|         84045|2020-03-20|
|        100106|2020-03-31|
+--------------+----------+
only showing top 10 rows



In [0]:
df.printSchema()

root
 |-- cases_reported: integer (nullable = true)
 |-- dates: string (nullable = true)



In [0]:
df=df.withColumn('dates',F.to_date(F.col('dates'), 'yyyy-MM-dd'))
df.show(11)

+--------------+----------+
|cases_reported|     dates|
+--------------+----------+
|         20124|2020-01-10|
|         40133|2020-01-15|
|         65005|2020-01-20|
|         30005|2020-02-08|
|         35015|2020-02-19|
|         15015|2020-03-03|
|         35035|2020-03-10|
|         49099|2020-03-14|
|         84045|2020-03-20|
|        100106|2020-03-31|
|         17015|2020-04-04|
+--------------+----------+
only showing top 11 rows



In [0]:
df=df.withColumn('month',F.month(F.col('dates')))
df.show(10)

+--------------+----------+-----+
|cases_reported|     dates|month|
+--------------+----------+-----+
|         20124|2020-01-10|    1|
|         40133|2020-01-15|    1|
|         65005|2020-01-20|    1|
|         30005|2020-02-08|    2|
|         35015|2020-02-19|    2|
|         15015|2020-03-03|    3|
|         35035|2020-03-10|    3|
|         49099|2020-03-14|    3|
|         84045|2020-03-20|    3|
|        100106|2020-03-31|    3|
+--------------+----------+-----+
only showing top 10 rows



In [0]:
df_grouped=df.groupBy('month').agg((sum('cases_reported')).alias('total_cases'))
df_grouped.show()

+-----+-----------+
|month|total_cases|
+-----+-----------+
|    1|     125262|
|    2|      65020|
|    3|     283300|
|    4|     291295|
|    5|     436400|
|    6|     122194|
|    8|      54991|
|    7|      76044|
|    9|     115105|
|   12|     126171|
|   10|      26587|
|   11|     102012|
+-----+-----------+



In [0]:
df_grouped.createOrReplaceTempView('casesdata')

In [0]:
%sql
with dataset as 
(
select *,
sum(total_cases) over(ORDER BY MONTH ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) running_sum 
 from casesdata
 ),
 final_data as (
 select *
 ,LAG(running_sum,1,0) over(order by month) as prev_sum
  from dataset
 )

 SELECT * 
 ,((total_cases/prev_sum)*100):: decimal(20,3)  as percentage_inc
 from final_data

month,total_cases,running_sum,prev_sum,percentage_inc
1,125262,125262,0,
2,65020,190282,125262,51.907
3,283300,473582,190282,148.884
4,291295,764877,473582,61.509
5,436400,1201277,764877,57.055
6,122194,1323471,1201277,10.172
7,76044,1399515,1323471,5.746
8,54991,1454506,1399515,3.929
9,115105,1569611,1454506,7.914
10,26587,1596198,1569611,1.694


**Return the user who has loggedin for consecutive 5 or more days**

In [0]:
data = [
(1, '2024-03-01'),
(1, '2024-03-02'),
(1, '2024-03-03'),
(1, '2024-03-04'),
(1, '2024-03-06'),
(1, '2024-03-10'),
(1, '2024-03-11'),
(1, '2024-03-12'),
(1, '2024-03-13'),
(1, '2024-03-14'),
(1, '2024-03-20'),
(1, '2024-03-25'),
(1, '2024-03-26'),
(1, '2024-03-27'),
(1, '2024-03-28'),
(1, '2024-03-29'),
(1, '2024-03-30'),
(2, '2024-03-01'),
(2, '2024-03-02'),
(2, '2024-03-03'),
(2, '2024-03-04'),
(3, '2024-03-01'),
(3, '2024-03-02'),
(3, '2024-03-03'),
(3, '2024-03-04'),
(3, '2024-03-04'),
(3, '2024-03-04'),
(3, '2024-03-05'),
(4, '2024-03-01'),
(4, '2024-03-02'),
(4, '2024-03-03'),
(4, '2024-03-04'),
(4, '2024-03-04')
]

schema = "user_id int , login_date string"

df = spark.createDataFrame(data = data , schema = schema)
df.display()

user_id,login_date
1,2024-03-01
1,2024-03-02
1,2024-03-03
1,2024-03-04
1,2024-03-06
1,2024-03-10
1,2024-03-11
1,2024-03-12
1,2024-03-13
1,2024-03-14


In [0]:
df_user  =(
    df.withColumn('prevday_login',lag('login_date').over(Window.partitionBy('user_id').orderBy('login_date')))
    #.withColumn('prev_day',F.date_add(F.col('login_date'),-1))
    .withColumn('rownum',F.row_number().over(Window.partitionBy('user_id').orderBy('login_date')))
    #.withColumn('prevday_login', when(F.col('prevday_login')==None,lit('1999-01-01'))
              # .otherwise(F.col('prevday_login')) )
)
display(df_user)

user_id,login_date,prevday_login,rownum
1,2024-03-01,,1
1,2024-03-02,2024-03-01,2
1,2024-03-03,2024-03-02,3
1,2024-03-04,2024-03-03,4
1,2024-03-06,2024-03-04,5
1,2024-03-10,2024-03-06,6
1,2024-03-11,2024-03-10,7
1,2024-03-12,2024-03-11,8
1,2024-03-13,2024-03-12,9
1,2024-03-14,2024-03-13,10


In [0]:
df_user=(
    df_user.withColumn('prevday_login', when(F.col('rownum')==1,lit('1999-01-01'))
               .otherwise(F.col('prevday_login')) )
               )

display(df_user)               

user_id,login_date,prevday_login,rownum
1,2024-03-01,1999-01-01,1
1,2024-03-02,2024-03-01,2
1,2024-03-03,2024-03-02,3
1,2024-03-04,2024-03-03,4
1,2024-03-06,2024-03-04,5
1,2024-03-10,2024-03-06,6
1,2024-03-11,2024-03-10,7
1,2024-03-12,2024-03-11,8
1,2024-03-13,2024-03-12,9
1,2024-03-14,2024-03-13,10


In [0]:
df_user_final=df_user.filter((F.col('login_date') != F.col('prevday_login')))
display(df_user_final)

user_id,login_date,prevday_login,rownum
1,2024-03-01,1999-01-01,1
1,2024-03-02,2024-03-01,2
1,2024-03-03,2024-03-02,3
1,2024-03-04,2024-03-03,4
1,2024-03-06,2024-03-04,5
1,2024-03-10,2024-03-06,6
1,2024-03-11,2024-03-10,7
1,2024-03-12,2024-03-11,8
1,2024-03-13,2024-03-12,9
1,2024-03-14,2024-03-13,10
