In [0]:
import pyspark.sql.functions as F
from pyspark.sql.types import *

In [0]:
## Question1:Find the Median salary for each department

In [0]:
cols=['Employee',	'Department',	'Salary']

data=[('Alice','HR','5000'),('Bob','HR','6000'),('Charlie','HR','6000'),('David','IT','8000'),('Suzan','IT','8500'),
('Fred','IT','8500'),('Eve','IT','8500'),('Frank','IT','9000'),('Ivan','Sales','9000'),('Judy','Sales','6500'),
('Cris','Sales','9000'),('Garry','Sales','6000')]

In [0]:
df_src=spark.createDataFrame(data,cols)
df_src.show()

+--------+----------+------+
|Employee|Department|Salary|
+--------+----------+------+
|   Alice|        HR|  5000|
|     Bob|        HR|  6000|
| Charlie|        HR|  6000|
|   David|        IT|  8000|
|   Suzan|        IT|  8500|
|    Fred|        IT|  8500|
|     Eve|        IT|  8500|
|   Frank|        IT|  9000|
|    Ivan|     Sales|  9000|
|    Judy|     Sales|  6500|
|    Cris|     Sales|  9000|
|   Garry|     Sales|  6000|
+--------+----------+------+



In [0]:
from pyspark.sql.functions import *  
from pyspark.sql.window import Window

In [0]:
# Define a window partitioned by department and ordered by salary
window_spec = Window.partitionBy("Department").orderBy("Salary")

# Add a row number and count of salaries within each department
df_with_row = df_src.withColumn("row_num", row_number().over(window_spec))\
                .withColumn("count", count("Salary").over(Window.partitionBy("Department")))

df_with_row.display()

Employee,Department,Salary,row_num,count
Alice,HR,5000,1,3
Bob,HR,6000,2,3
Charlie,HR,6000,3,3
David,IT,8000,1,5
Suzan,IT,8500,2,5
Fred,IT,8500,3,5
Eve,IT,8500,4,5
Frank,IT,9000,5,5
Garry,Sales,6000,1,4
Judy,Sales,6500,2,4


In [0]:
# Find the median position
# middle = middle index of the window. 3/2= 1.5 and ceil(1.5) ->
# next_to_middle = if the no of elements in window is even then middle index + 1

df_with_median = df_with_row.withColumn("middle", ceil(col("count") / 2))\
        .withColumn("next_to_middle", when(col("count")%2==0, col("middle")+1).otherwise(col("middle")))

df_with_median.display()        

Employee,Department,Salary,row_num,count,middle,next_to_middle
Alice,HR,5000,1,3,2,2
Bob,HR,6000,2,3,2,2
Charlie,HR,6000,3,3,2,2
David,IT,8000,1,5,3,3
Suzan,IT,8500,2,5,3,3
Fred,IT,8500,3,5,3,3
Eve,IT,8500,4,5,3,3
Frank,IT,9000,5,5,3,3
Garry,Sales,6000,1,4,2,3
Judy,Sales,6500,2,4,2,3


In [0]:
# Filter to get the median salary per department
median_pos_salary_df=df_with_median.filter((col('row_num')==col('middle')) | (col('row_num')==col('next_to_middle')))
median_pos_salary_df.display()
                                   

Employee,Department,Salary,row_num,count,middle,next_to_middle
Bob,HR,6000,2,3,2,2
Fred,IT,8500,3,5,3,3
Judy,Sales,6500,2,4,2,3
Ivan,Sales,9000,3,4,2,3


In [0]:
final_result = median_pos_salary_df.groupBy("department").agg(ceil(avg("salary"))
                                                              .alias("median_salary"))
final_result.display()  

department,median_salary
HR,6000
IT,8500
Sales,7750


In [0]:
##Find customers who have placed order on consecutive days

In [0]:
data = [
    (1, '2024-10-01'),
    (1, '2024-10-02'),
    (1, '2024-10-04'),
    (2, '2024-10-03'),
    (2, '2024-10-05'),
    (3, '2024-10-01'),
    (3, '2024-10-02'),
    (3, '2024-10-03'),
]

# Create DataFrame
df = spark.createDataFrame(data, ["customer_id", "order_date"])
df.display()

customer_id,order_date
1,2024-10-01
1,2024-10-02
1,2024-10-04
2,2024-10-03
2,2024-10-05
3,2024-10-01
3,2024-10-02
3,2024-10-03


In [0]:
df.printSchema()

root
 |-- customer_id: long (nullable = true)
 |-- order_date: string (nullable = true)



In [0]:
df=df.withColumn('order_date',to_date(col('order_date'),'yyyy-MM-dd'))

In [0]:
cust_wind=Window.partitionBy('customer_id').orderBy('order_date')

df1=(
    df.withColumn('prev_day',lag(col('order_date'),1,'1990_10_01').over(cust_wind))
           .withColumn('last_day',date_add(col('order_date'),-1))
           .withColumn('same',when(col('prev_day') ==col('last_day'), lit('yes')).otherwise(lit('no')))
          # .withColumn()
)

df1.display()

customer_id,order_date,prev_day,last_day,same
1,2024-10-01,,2024-09-30,no
1,2024-10-02,2024-10-01,2024-10-01,yes
1,2024-10-04,2024-10-02,2024-10-03,no
2,2024-10-03,,2024-10-02,no
2,2024-10-05,2024-10-03,2024-10-04,no
3,2024-10-01,,2024-09-30,no
3,2024-10-02,2024-10-01,2024-10-01,yes
3,2024-10-03,2024-10-02,2024-10-02,yes


In [0]:
df1_new=df1.filter(col('same')=="yes")
df1_new.display()

customer_id,order_date,prev_day,last_day,same
1,2024-10-02,2024-10-01,2024-10-01,yes
3,2024-10-02,2024-10-01,2024-10-01,yes
3,2024-10-03,2024-10-02,2024-10-02,yes


In [0]:
#  You need to calculate:
#  1. the total amount spent by each customer on products within a given year
#  2. identify the top 2 products they spent the most money on. Additionally
#  3. return the overall total spending of each customer across all year.

In [0]:
data = [
    (1, "2023-01-15", "A", 100.0),
    (1, "2023-03-10", "B", 150.0),
    (2, "2023-02-05", "A", 50.0),
    (1, "2023-02-20", "A", 200.0),
    (2, "2023-04-30", "C", 300.0),
    (1, "2022-05-12", "B", 120.0),
    (2, "2022-09-22", "A", 200.0),
    (1, "2023-05-15", "C", 250.0),
]

# Create DataFrame
columns = ["customer_id", "transaction_date", "product_id", "transaction_amount"]
df_cust = spark.createDataFrame(data, columns)
df_cust.display()

customer_id,transaction_date,product_id,transaction_amount
1,2023-01-15,A,100.0
1,2023-03-10,B,150.0
2,2023-02-05,A,50.0
1,2023-02-20,A,200.0
2,2023-04-30,C,300.0
1,2022-05-12,B,120.0
2,2022-09-22,A,200.0
1,2023-05-15,C,250.0


In [0]:
df_cust.printSchema()

root
 |-- customer_id: long (nullable = true)
 |-- transaction_date: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- transaction_amount: double (nullable = true)



In [0]:
display(df_cust.withColumn('total_amount',sum('transaction_amount')
                           .over(Window.partitionBy('customer_id'))))

customer_id,transaction_date,product_id,transaction_amount,total_amount
1,2023-01-15,A,100.0,820.0
1,2023-03-10,B,150.0,820.0
1,2023-02-20,A,200.0,820.0
1,2022-05-12,B,120.0,820.0
1,2023-05-15,C,250.0,820.0
2,2023-02-05,A,50.0,550.0
2,2023-04-30,C,300.0,550.0
2,2022-09-22,A,200.0,550.0


In [0]:
## ICC team and winner

In [0]:
data = [
    ("India", "Australia", "India"),
    ("England", "Pakistan", "England"),
    ("South Africa", "New Zealand", "New Zealand"),
    ("West Indies", "Sri Lanka", "Sri Lanka"),
    ("India", "Pakistan", "India"),
    ("Australia", "England", "Australia"),
    ("New Zealand", "Sri Lanka", "New Zealand"),
    ("South Africa", "West Indies", "South Africa"),
    ("India", "England", "India"),
    ("Pakistan", "Australia", "Pakistan"),
]

columns = ["Team_1", "Team_2", "Winner"]

df = spark.createDataFrame(data, columns)
df.display()

Team_1,Team_2,Winner
India,Australia,India
England,Pakistan,England
South Africa,New Zealand,New Zealand
West Indies,Sri Lanka,Sri Lanka
India,Pakistan,India
Australia,England,Australia
New Zealand,Sri Lanka,New Zealand
South Africa,West Indies,South Africa
India,England,India
Pakistan,Australia,Pakistan


In [0]:
df_team=(
    df.select(col('Team_1').alias('team'),col('Winner'))
    .union( df.select(col('Team_2').alias('team'),col('Winner')))
         )

df_team.display()         

team,Winner
India,India
England,England
South Africa,New Zealand
West Indies,Sri Lanka
India,India
Australia,Australia
New Zealand,New Zealand
South Africa,South Africa
India,India
Pakistan,Pakistan


In [0]:
# Flatten & Explode 

In [0]:
cols=['id','name','skills']

data=[(1,'Alice',['python','flask','C#'])
       ,(2,'Bob',['java','ADF'])
         ,(3,'Charlie',['Dtabricks','Snowflake','PySpark'])
      ]

sub_df=spark.createDataFrame(data,cols)    
sub_df.show(truncate=False)  

+---+-------+-------------------------------+
|id |name   |skills                         |
+---+-------+-------------------------------+
|1  |Alice  |[python, flask, C#]            |
|2  |Bob    |[java, ADF]                    |
|3  |Charlie|[Dtabricks, Snowflake, PySpark]|
+---+-------+-------------------------------+



In [0]:
display(
    sub_df.withColumn('subject',explode(col('skills')))
)

id,name,skills,subject
1,Alice,"List(python, flask, C#)",python
1,Alice,"List(python, flask, C#)",flask
1,Alice,"List(python, flask, C#)",C#
2,Bob,"List(java, ADF)",java
2,Bob,"List(java, ADF)",ADF
3,Charlie,"List(Dtabricks, Snowflake, PySpark)",Dtabricks
3,Charlie,"List(Dtabricks, Snowflake, PySpark)",Snowflake
3,Charlie,"List(Dtabricks, Snowflake, PySpark)",PySpark


In [0]:
# percentage diff of sales from prev month

data=[(1,'2024-01-01',"I1",10,1000),(2,"2024-01-15","I2",20,2000),(3,"2024-02-01","I3",10,1500),(4,"2024-02-15","I4",20,2500),(5,"2024-03-01","I5",30,3000),(6,"2024-03-10","I6",40,3500),(7,"2024-03-20","I7",20,2500),(8,"2024-03-30","I8",10,1000)]
schema=["SOId","SODate","ItemId","ItemQty","ItemValue"]
df1=spark.createDataFrame(data,schema)
display(df1)

SOId,SODate,ItemId,ItemQty,ItemValue
1,2024-01-01,I1,10,1000
2,2024-01-15,I2,20,2000
3,2024-02-01,I3,10,1500
4,2024-02-15,I4,20,2500
5,2024-03-01,I5,30,3000
6,2024-03-10,I6,40,3500
7,2024-03-20,I7,20,2500
8,2024-03-30,I8,10,1000


In [0]:
df1=df1.withColumn("SODate",df1.SODate.cast(DateType()))
df1.printSchema()

root
 |-- SOId: long (nullable = true)
 |-- SODate: date (nullable = true)
 |-- ItemId: string (nullable = true)
 |-- ItemQty: long (nullable = true)
 |-- ItemValue: long (nullable = true)



In [0]:
df2=(
    df1.withColumn('year',year('SODate'))
        .withColumn('month',month('SODate'))
       # .withColumn('total',(col('ItemQty') * col('ItemValue')))
        .withColumn('total',( col('ItemValue')))
)
df2.display()

SOId,SODate,ItemId,ItemQty,ItemValue,year,month,total
1,2024-01-01,I1,10,1000,2024,1,1000
2,2024-01-15,I2,20,2000,2024,1,2000
3,2024-02-01,I3,10,1500,2024,2,1500
4,2024-02-15,I4,20,2500,2024,2,2500
5,2024-03-01,I5,30,3000,2024,3,3000
6,2024-03-10,I6,40,3500,2024,3,3500
7,2024-03-20,I7,20,2500,2024,3,2500
8,2024-03-30,I8,10,1000,2024,3,1000


In [0]:
df3=(
    df2.groupBy('year','month').agg(sum(col('total')).alias('total_sale'))
    )
df3.display()


year,month,total_sale
2024,1,3000
2024,2,4000
2024,3,10000


In [0]:
sale_window=Window.partitionBy('year').orderBy('month')

In [0]:
df4=(
    df3.withColumn('prevmonthsale',lag(col('total_sale'),1).over(sale_window))
)
df4.display()

year,month,total_sale,prevmonthsale
2024,1,3000,
2024,2,4000,3000.0
2024,3,10000,4000.0


In [0]:
df5=(
    df4.withColumn('percntgae_diff', ((col('total_sale') - col('prevmonthsale'))*100 /col('total_sale') ))
)
df5.display()

year,month,total_sale,prevmonthsale,percntgae_diff
2024,1,3000,,
2024,2,4000,3000.0,25.0
2024,3,10000,4000.0,60.0


In [0]:
#collect_list()

In [0]:
data=[(1,'John','ADF'),(1,'John','ADB'),(1,'John','PowerBI'),(2,'Joanne','ADF'),(2,'Joanne','SQL'),(2,'Joanne','Crystal Report'),(3,'Vikas','ADF'),(3,'Vikas','SQL'),(3,'Vikas','SSIS'),(4,'Monu','SQL'),(4,'Monu','SSIS'),(4,'Monu','SSAS'),(4,'Monu','ADF')]
schema=["EmpId","EmpName","Skill"]
df1=spark.createDataFrame(data,schema)
display(df1)

EmpId,EmpName,Skill
1,John,ADF
1,John,ADB
1,John,PowerBI
2,Joanne,ADF
2,Joanne,SQL
2,Joanne,Crystal Report
3,Vikas,ADF
3,Vikas,SQL
3,Vikas,SSIS
4,Monu,SQL


In [0]:
df2=(
    df1.groupBy('EmpId','EmpName').agg(collect_list(col('Skill')).alias('skills_array'))
)
display(df2)

EmpId,EmpName,skills_array
1,John,"List(ADF, ADB, PowerBI)"
2,Joanne,"List(ADF, SQL, Crystal Report)"
3,Vikas,"List(ADF, SQL, SSIS)"
4,Monu,"List(SQL, SSIS, SSAS, ADF)"


In [0]:
help(concat_ws)

Help on function concat_ws in module pyspark.sql.functions:

concat_ws(sep: str, *cols: 'ColumnOrName') -> pyspark.sql.column.Column
    Concatenates multiple input string columns together into a single string column,
    using the given separator.
    
    .. versionadded:: 1.5.0
    
    .. versionchanged:: 3.4.0
        Support Spark Connect.
    
    Parameters
    ----------
    sep : str
        words separator.
    cols : :class:`~pyspark.sql.Column` or str
        list of columns to work on.
    
    Returns
    -------
    :class:`~pyspark.sql.Column`
        string of concatenated words.
    
    Examples
    --------
    >>> df = spark.createDataFrame([('abcd','123')], ['s', 'd'])
    >>> df.select(concat_ws('-', df.s, df.d).alias('s')).collect()
    [Row(s='abcd-123')]



In [0]:
df3=(
    df2.withColumn('all_skills',concat_ws(',',col('skills_array')))
)
display(df3)

EmpId,EmpName,skills_array,all_skills
1,John,"List(ADF, ADB, PowerBI)","ADF,ADB,PowerBI"
2,Joanne,"List(ADF, SQL, Crystal Report)","ADF,SQL,Crystal Report"
3,Vikas,"List(ADF, SQL, SSIS)","ADF,SQL,SSIS"
4,Monu,"List(SQL, SSIS, SSAS, ADF)","SQL,SSIS,SSAS,ADF"


In [0]:
#Count total no. of employees

In [0]:
dept_data=[('IT','M'),('IT','F'),('IT','M'),('IT','M'),
           ('HR','F'),('HR','F'),('HR','F'),('HR','F'),('HR','F'),
           ('SALES','M'),('SALES','M'),('SALES','F')]

dept_schema=['Deptname','Gender']    

dept_df=spark.createDataFrame(dept_data,dept_schema)
display(dept_df)

Deptname,Gender
IT,M
IT,F
IT,M
IT,M
HR,F
HR,F
HR,F
HR,F
HR,F
SALES,M


In [0]:
dept_df=(
    dept_df.withColumn('Gender',when(col('Gender')=='M','Male')
                        .when(col('Gender')=='F','Female')
                        )
)

In [0]:
dept_df1=(
    dept_df.groupBy('Deptname').pivot('Gender').agg(count(col('Gender')))
)
dept_df1.show()

+--------+------+----+
|Deptname|Female|Male|
+--------+------+----+
|      HR|     5|null|
|   SALES|     1|   2|
|      IT|     1|   3|
+--------+------+----+



In [0]:
dept_df1=dept_df1.na.fill(0)
dept_df1.show()

+--------+------+----+
|Deptname|Female|Male|
+--------+------+----+
|      HR|     5|   0|
|   SALES|     1|   2|
|      IT|     1|   3|
+--------+------+----+



In [0]:
df_final=(
    dept_df1.withColumn('total',(col('Female')+col('Male')))
)
df_final.display()

Deptname,Female,Male,total
HR,5,0,5
SALES,1,2,3
IT,1,3,4


In [0]:
#3 or more consecutive records with people more than or equal to 100

In [0]:

cols=['id','visit_date','no_of_people']

data=[(1,'2017-07-01',10)
,(2,'2017-07-02',109)
,(3,'2017-07-03',150)
,(4,'2017-07-04',99)
,(5,'2017-07-05',145)
,(6,'2017-07-06',1455)
,(7,'2017-07-07',199)
,(8,'2017-07-08',188)
]

In [0]:
people_df=spark.createDataFrame(data,cols)
people_df.show()

+---+----------+------------+
| id|visit_date|no_of_people|
+---+----------+------------+
|  1|2017-07-01|          10|
|  2|2017-07-02|         109|
|  3|2017-07-03|         150|
|  4|2017-07-04|          99|
|  5|2017-07-05|         145|
|  6|2017-07-06|        1455|
|  7|2017-07-07|         199|
|  8|2017-07-08|         188|
+---+----------+------------+



In [0]:
people_df1=(
    people_df.filter(col('no_of_people')>=100)
    .withColumn('rnum',row_number().over(Window.orderBy('visit_date')))   
    .withColumn('grp',(col('id') - col('rnum')))
)
people_df1.show()

+---+----------+------------+----+---+
| id|visit_date|no_of_people|rnum|grp|
+---+----------+------------+----+---+
|  2|2017-07-02|         109|   1|  1|
|  3|2017-07-03|         150|   2|  1|
|  5|2017-07-05|         145|   3|  2|
|  6|2017-07-06|        1455|   4|  2|
|  7|2017-07-07|         199|   5|  2|
|  8|2017-07-08|         188|   6|  2|
+---+----------+------------+----+---+



In [0]:
final_df= (
    people_df1.groupBy('grp').count().alias('total_count')
)
final_df.show()

+---+-----+
|grp|count|
+---+-----+
|  1|    2|
|  2|    4|
+---+-----+



In [0]:
# Find Median salary of employee

In [0]:

cols=['emp_id','company','salary']
salary=[(1,'A',2341) ,(2,'A',341) ,(3,'A',15), (4,'A',15314) ,(5,'A',451) ,(6,'A',513), (7,'B',15), (8,'B',13), (9,'B',1154),
         (10,'B',1345), (11,'B',1221) ,(12,'B',234), (13,'C',2345), (14,'C',2645), (15,'C',2645), (16,'C',2652), (17,'C',65)	]

In [0]:
sal_df=spark.createDataFrame(salary,cols)
sal_df.show(10)

+------+-------+------+
|emp_id|company|salary|
+------+-------+------+
|     1|      A|  2341|
|     2|      A|   341|
|     3|      A|    15|
|     4|      A| 15314|
|     5|      A|   451|
|     6|      A|   513|
|     7|      B|    15|
|     8|      B|    13|
|     9|      B|  1154|
|    10|      B|  1345|
+------+-------+------+
only showing top 10 rows



In [0]:
sal_df1=(
   sal_df
   .withColumn('count',count(col('emp_id')).over(Window.partitionBy('company')))
   .withColumn('rnow',row_number().over(Window.partitionBy('company').orderBy('salary'))) 
   .withColumn('middle',ceil(col('count')/2)) 
   .withColumn('middle+1',when(col('count')%2==0,col('middle')+1).otherwise(col('middle')))
)

sal_df1.show()

+------+-------+------+-----+----+------+--------+
|emp_id|company|salary|count|rnow|middle|middle+1|
+------+-------+------+-----+----+------+--------+
|     3|      A|    15|    6|   1|     3|       4|
|     2|      A|   341|    6|   2|     3|       4|
|     5|      A|   451|    6|   3|     3|       4|
|     6|      A|   513|    6|   4|     3|       4|
|     1|      A|  2341|    6|   5|     3|       4|
|     4|      A| 15314|    6|   6|     3|       4|
|     8|      B|    13|    6|   1|     3|       4|
|     7|      B|    15|    6|   2|     3|       4|
|    12|      B|   234|    6|   3|     3|       4|
|     9|      B|  1154|    6|   4|     3|       4|
|    11|      B|  1221|    6|   5|     3|       4|
|    10|      B|  1345|    6|   6|     3|       4|
|    17|      C|    65|    5|   1|     3|       3|
|    13|      C|  2345|    5|   2|     3|       3|
|    14|      C|  2645|    5|   3|     3|       3|
|    15|      C|  2645|    5|   4|     3|       3|
|    16|      C|  2652|    5|  

In [0]:
final_df=(
    sal_df1.filter((col('rnow')==col('middle')) | (col('rnow')==col('middle+1')) )
)
final_df.show()

+------+-------+------+-----+----+------+--------+
|emp_id|company|salary|count|rnow|middle|middle+1|
+------+-------+------+-----+----+------+--------+
|     5|      A|   451|    6|   3|     3|       4|
|     6|      A|   513|    6|   4|     3|       4|
|    12|      B|   234|    6|   3|     3|       4|
|     9|      B|  1154|    6|   4|     3|       4|
|    14|      C|  2645|    5|   3|     3|       3|
+------+-------+------+-----+----+------+--------+



In [0]:
median_df=final_df.groupBy('company').agg(avg(col('salary')).alias('median'))
median_df.show()

+-------+------+
|company|median|
+-------+------+
|      A| 482.0|
|      B| 694.0|
|      C|2645.0|
+-------+------+



In [0]:
#Player Location

In [0]:

cols=['name','city']
player_data=[ ('Sachin','Mumbai'),('Virat','Delhi') , ('Rahul','Bangalore'),('Rohit','Mumbai'),('Mayank','Bangalore')]

player_df=spark.createDataFrame(player_data,cols)
player_df.show()

+------+---------+
|  name|     city|
+------+---------+
|Sachin|   Mumbai|
| Virat|    Delhi|
| Rahul|Bangalore|
| Rohit|   Mumbai|
|Mayank|Bangalore|
+------+---------+



In [0]:
player_df1=(
    #player_df.withColumn('rnow',row_number().over(Window.orderBy('city')))
    player_df.withColumn('rnow',lit('yes'))
)
player_df1.show()

+------+---------+----+
|  name|     city|rnow|
+------+---------+----+
|Sachin|   Mumbai| yes|
| Virat|    Delhi| yes|
| Rahul|Bangalore| yes|
| Rohit|   Mumbai| yes|
|Mayank|Bangalore| yes|
+------+---------+----+



In [0]:
player_df1.createOrReplaceTempView('player_table')

In [0]:
%sql

with dataset as (
select rnow,city,
pivot(count(city))
 from player_table
 group by rnow 

)

select * from dataset


Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/databricks/python/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3378, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<command-1042775268348521>", line 7, in <module>
    _sqldf = ____databricks_percent_sql()
  File "<command-1042775268348521>", line 4, in ____databricks_percent_sql
    df = spark.sql(base64.standard_b64decode("d2l0aCBkYXRhc2V0IGFzICgKc2VsZWN0IHJub3csY2l0eSwKcGl2b3QoY291bnQoY2l0eSkpCiBmcm9tIHBsYXllcl90YWJsZQogZ3JvdXAgYnkgcm5vdyAKCikKCnNlbGVjdCAqIGZyb20gZGF0YXNldA==").decode())
  File "/databricks/spark/python/pyspark/instrumentation_utils.py", line 48, in wrapper
    res = func(*args, **kwargs)
  File "/databricks/spark/python/pyspark/sql/session.py", line 1387, in sql
    return DataFrame(self._jsparkSession.sql(sqlQuery, litArgs), self)
  File "/databricks/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/java_gateway.py", line 1321, in __call__
    return_value = get_return_value(




In [0]:
playre_grpd_df=(
    player_df1.groupBy('rnow').pivot('city').count('rnow')
)
playre_grpd_df.show()

[0;31m---------------------------------------------------------------------------[0m
[0;31mTypeError[0m                                 Traceback (most recent call last)
File [0;32m<command-1042775268348516>:2[0m
[1;32m      1[0m playre_grpd_df[38;5;241m=[39m(
[0;32m----> 2[0m     player_df1[38;5;241m.[39mgroupBy([38;5;124m'[39m[38;5;124mrnow[39m[38;5;124m'[39m)[38;5;241m.[39mpivot([38;5;124m'[39m[38;5;124mcity[39m[38;5;124m'[39m)[38;5;241m.[39mcount([38;5;124m'[39m[38;5;124mrnow[39m[38;5;124m'[39m)
[1;32m      3[0m )
[1;32m      4[0m playre_grpd_df[38;5;241m.[39mshow()

File [0;32m/databricks/spark/python/pyspark/instrumentation_utils.py:48[0m, in [0;36m_wrap_function.<locals>.wrapper[0;34m(*args, **kwargs)[0m
[1;32m     46[0m start [38;5;241m=[39m time[38;5;241m.[39mperf_counter()
[1;32m     47[0m [38;5;28;01mtry[39;00m:
[0;32m---> 48[0m     res [38;5;241m=[39m [43mfunc[49m[43m([49m[38;5;241;43m*[39;49m[43margs[49m