In [0]:
spark

-------------- Create Dataframe

In [0]:

emp_data = [
    (1,'manish',26,20000,'india','IT'),
    (2,'rahul',None,40000,'germany','engineering'),
    (3,'pawan',12,60000,'india','sales'),
    (4,'roshini',44,None,'uk','engineering'),
    (5,'raushan',35,70000,'india','sales'),
    (6,None,29,200000,'uk','IT'),
    (7,'adam',37,65000,'us','IT'),
    (8,'chris',16,40000,'us','sales'),
    (None,None,None,None,None,None),
    (7,'adam',37,65000,'us','IT')
]
schema = ['id', 'name', 'age', 'sal', 'country', 'dept']
df = spark.createDataFrame(emp_data, schema)
df.show()

# data for group by statement
group_by_data = [(1,'manish',50000,'IT'),
                (2,'vikash',60000,'sales'),
                (3,'raushan',70000,'marketing'),
                (4,'mukesh',80000,'IT'),
                (5,'pritam',90000,'sales'),
                (6,'nikita',45000,'marketing'),
                (7,'ragini',55000,'marketing'),
                (8,'rakesh',100000,'IT'),
                (9,'aditya',65000,'IT'),
                (10,'rahul',50000,'marketing')]
group_by_schema = ['id', 'name', 'salary', 'dept']
df1 = spark.createDataFrame(group_by_data, group_by_schema)
df1.show()
    

+----+-------+----+------+-------+-----------+
|  id|   name| age|   sal|country|       dept|
+----+-------+----+------+-------+-----------+
|   1| manish|  26| 20000|  india|         IT|
|   2|  rahul|null| 40000|germany|engineering|
|   3|  pawan|  12| 60000|  india|      sales|
|   4|roshini|  44|  null|     uk|engineering|
|   5|raushan|  35| 70000|  india|      sales|
|   6|   null|  29|200000|     uk|         IT|
|   7|   adam|  37| 65000|     us|         IT|
|   8|  chris|  16| 40000|     us|      sales|
|null|   null|null|  null|   null|       null|
|   7|   adam|  37| 65000|     us|         IT|
+----+-------+----+------+-------+-----------+

+---+-------+------+---------+
| id|   name|salary|     dept|
+---+-------+------+---------+
|  1| manish| 50000|       IT|
|  2| vikash| 60000|    sales|
|  3|raushan| 70000|marketing|
|  4| mukesh| 80000|       IT|
|  5| pritam| 90000|    sales|
|  6| nikita| 45000|marketing|
|  7| ragini| 55000|marketing|
|  8| rakesh|100000|       IT|


-------------- Find Count On Single And Munltiple Column As Action And Transformer

In [0]:
## use count as an action
# get count on entire columns of table
cnt = df.count()
print("action count is :", cnt)

## use count as transformers
# get count on single columns which will ignore the null value in count
cnt = df.select(count("name")).show()
print("transformation count on one column is :", cnt)
# get count on entire column then not ignore null value in count
cnt = df.select(count("*")).show()
print("transformation count on entire column is :", cnt)

action count is : 10
+-----------+
|count(name)|
+-----------+
|          8|
+-----------+

transformation count on one column is : None
+--------+
|count(1)|
+--------+
|      10|
+--------+

transformation count on entire column is : None


--------------- Use Aggregation Function Like SUM, MIN, MAX, AVG, COUNT

In [0]:
# get sum, min, max
df.select(sum("sal").alias("sal_sum"), min("sal").alias("min_sal"), max("sal").alias("max_sal")).show()

# get the sum, count, avg of salary column
df.select(sum("sal").alias("sal_sum"), count("sal").alias("sal_count"), avg("sal").alias("sal_average").cast('int')).show()

+-------+-------+-------+
|sal_sum|min_sal|max_sal|
+-------+-------+-------+
| 560000|  20000| 200000|
+-------+-------+-------+

+-------+---------+-----------+
|sal_sum|sal_count|sal_average|
+-------+---------+-----------+
| 560000|        8|      70000|
+-------+---------+-----------+



--------------- Work With Group By Statement

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.window import *


# find the total salary given to employe..
df1.select(sum('salary').alias("total salary given to employees")).show()

# find the total salary per department wise
df1.groupBy('dept').agg(sum("salary").alias("total_salary")).show()

## find the total salary per department wise with each column records using join or window function
# using joins
df2 = df1.groupBy('dept').agg(sum("salary").alias("total_salary"))
df2.alias('group_df1').join(
  df1.alias('actual_df1'),
  (col('group_df1.dept') == col('actual_df1.dept'))
).select(
  col('actual_df1.id'),
  col('actual_df1.name'),
  col('actual_df1.salary'),
  col('actual_df1.dept'),
  col('group_df1.total_salary')
).show()

# using window function.
# df2 = df1.alias('df2')
# df2.select(col('id'), col('name'), col('salary'), col('dept'),
#           sum(col('salary')).over(Window.partitionBy(col('dept'))).alias('total_salary')).show()

df1.select(col('id'), col('name'), col('salary'), col('dept')).withColumn(
  'total_salary', sum(col('salary')).over(Window.partitionBy(col('dept')))).show()

+-------------------------------+
|total salary given to employees|
+-------------------------------+
|                         665000|
+-------------------------------+

+---------+------------+
|     dept|total_salary|
+---------+------------+
|       IT|      295000|
|    sales|      150000|
|marketing|      220000|
+---------+------------+

+---+-------+------+---------+------------+
| id|   name|salary|     dept|total_salary|
+---+-------+------+---------+------------+
|  1| manish| 50000|       IT|      295000|
|  2| vikash| 60000|    sales|      150000|
|  3|raushan| 70000|marketing|      220000|
|  5| pritam| 90000|    sales|      150000|
|  4| mukesh| 80000|       IT|      295000|
|  6| nikita| 45000|marketing|      220000|
|  7| ragini| 55000|marketing|      220000|
|  8| rakesh|100000|       IT|      295000|
| 10|  rahul| 50000|marketing|      220000|
|  9| aditya| 65000|       IT|      295000|
+---+-------+------+---------+------------+

+---+-------+------+---------+------