# 0. **Install PySpark**

In [12]:
!pip install pyspark



# 1. **Imports**:

In [13]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import approx_count_distinct, collect_list, collect_set
from pyspark.sql.functions import sum, avg, max, countDistinct, count
from pyspark.sql.functions import first, last, kurtosis, min, mean, skewness
from pyspark.sql.functions import stddev, stddev_samp, stddev_pop, sumDistinct
from pyspark.sql.functions import sum_distinct
from pyspark.sql.functions import variance, var_samp, var_pop

   - Various functions from `pyspark.sql.functions` are imported to perform different aggregate operations.


# 2. **Initialize Spark session**:


In [14]:
# Configure the application name first, then obtain the session from the builder.
session_builder = SparkSession.builder.appName('SparkByExamples.com')
spark = session_builder.getOrCreate()

   - Initializes a Spark session.

# 3. **Sample data and schema**:


In [15]:
# Sample employee records as (employee_name, department, salary) tuples.
# The ("James", "Sales", 3000) row is listed twice, so the data set
# contains one exact duplicate record.
simpleData = [
    ("James", "Sales", 3000),
    ("Michael", "Sales", 4600),
    ("Robert", "Sales", 4100),
    ("Maria", "Finance", 3000),
    ("James", "Sales", 3000),
    ("Scott", "Finance", 3300),
    ("Jen", "Finance", 3900),
    ("Jeff", "Marketing", 3000),
    ("Kumar", "Marketing", 2000),
    ("Saif", "Sales", 4100),
]

# Column names, in the same order as the fields of each tuple above.
schema = ["employee_name", "department", "salary"]

   - Defines sample data and schema.


# 4. **Create DataFrame**:


In [8]:
# Build the DataFrame from the in-memory rows, naming the columns from `schema`.
df = spark.createDataFrame(simpleData, schema)

# Print the column types, then every row without truncating cell values.
df.printSchema()
df.show(truncate=False)

root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- salary: long (nullable = true)

+-------------+----------+------+
|employee_name|department|salary|
+-------------+----------+------+
|James        |Sales     |3000  |
|Michael      |Sales     |4600  |
|Robert       |Sales     |4100  |
|Maria        |Finance   |3000  |
|James        |Sales     |3000  |
|Scott        |Finance   |3300  |
|Jen          |Finance   |3900  |
|Jeff         |Marketing |3000  |
|Kumar        |Marketing |2000  |
|Saif         |Sales     |4100  |
+-------------+----------+------+



   - Creates a DataFrame from the sample data and displays its schema and content.


# 5. **Approximate Count Distinct**:


In [9]:
# Take the first collected row of the aggregate, then its first column.
approx_distinct_row = df.select(approx_count_distinct("salary")).collect()[0]
approx_distinct_count = approx_distinct_row[0]
print(f"approx_count_distinct: {approx_distinct_count}")

approx_count_distinct: 6


   - Computes an approximate count of distinct values in the `salary` column.


# 6. **Average**:


In [10]:
# Take the first collected row of the average, then its only value.
avg_row = df.select(avg("salary")).collect()[0]
average_salary = avg_row[0]
print(f"avg: {average_salary}")

avg: 3400.0


   - Calculates the average of the `salary` column.


# 7. **Collect List**:


In [30]:
# Gather every salary value, duplicates included, into one array column.
salary_list_df = df.select(collect_list("salary"))
salary_list_df.show(truncate=False)

+------------------------------------------------------------+
|collect_list(salary)                                        |
+------------------------------------------------------------+
|[3000, 4600, 4100, 3000, 3000, 3300, 3900, 3000, 2000, 4100]|
+------------------------------------------------------------+



   - Collects all values of the `salary` column into a list.


# 8. **Collect Set**:


In [16]:
# Gather the salary values into one array column with duplicates removed.
salary_set_df = df.select(collect_set("salary"))
salary_set_df.show(truncate=False)

+------------------------------------+
|collect_set(salary)                 |
+------------------------------------+
|[4600, 3000, 3900, 4100, 3300, 2000]|
+------------------------------------+



   - Collects the distinct values of the `salary` column into a single array (duplicates removed). The order of the elements in the array is not guaranteed.


# 9. **Count Distinct with Multiple Columns**:


In [17]:
# Count how many distinct (department, salary) combinations occur.
pair_columns = ("department", "salary")
df2 = df.select(countDistinct(*pair_columns))
df2.show(truncate=False)

# Read the count from the first column of the first collected row.
distinct_count = df2.collect()[0][0]
print(f"Distinct Count of Department & Salary: {distinct_count}")

+----------------------------------+
|count(DISTINCT department, salary)|
+----------------------------------+
|8                                 |
+----------------------------------+

Distinct Count of Department & Salary: 8


   - Counts distinct combinations of `department` and `salary` columns.


# 10. **Count**:


In [18]:
# Take the first collected row of the count, then its only value.
count_row = df.select(count("salary")).collect()[0]
salary_count = count_row[0]
print(f"count: {salary_count}")

count: 10


- Counts the number of non-null values in the `salary` column.


# 11. **First and Last Value**:


In [19]:
# Show the result of `first` and then of `last` on the salary column,
# each as its own one-row table.
for pick in (first, last):
    df.select(pick("salary")).show(truncate=False)

+-------------+
|first(salary)|
+-------------+
|3000         |
+-------------+

+------------+
|last(salary)|
+------------+
|4100        |
+------------+



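- Retrieves the first and last values of the `salary` column. No ordering is applied here, so which rows count as "first" and "last" depends on the DataFrame's current row order and is not guaranteed to be stable.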


# 12. **Kurtosis**:


In [20]:
# Kurtosis of the salary column, displayed as a one-row table.
kurtosis_df = df.select(kurtosis("salary"))
kurtosis_df.show(truncate=False)

+-------------------+
|kurtosis(salary)   |
+-------------------+
|-0.6467803030303032|
+-------------------+



- Computes the kurtosis of the `salary` column.


# 13. **Max and Min Values**:


In [21]:
# `max` and `min` here are the Spark column functions imported at the top of
# the notebook, not the Python builtins. Show the maximum first, then the minimum.
for extreme in (max, min):
    df.select(extreme("salary")).show(truncate=False)

+-----------+
|max(salary)|
+-----------+
|4600       |
+-----------+

+-----------+
|min(salary)|
+-----------+
|2000       |
+-----------+



- Retrieves the maximum and minimum values of the `salary` column.


# 14. **Mean Value**:


In [22]:
# Mean of the salary column, displayed as a one-row table.
mean_df = df.select(mean("salary"))
mean_df.show(truncate=False)

+-----------+
|avg(salary)|
+-----------+
|3400.0     |
+-----------+



- Computes the mean of the `salary` column. `mean` is an alias for `avg`, which is why the output column is named `avg(salary)`.

# 15. **Skewness**:


In [23]:
# Skewness of the salary column, displayed as a one-row table.
skewness_df = df.select(skewness("salary"))
skewness_df.show(truncate=False)

+--------------------+
|skewness(salary)    |
+--------------------+
|-0.12041791181069571|
+--------------------+



- Computes the skewness of the `salary` column.


# 16. **Standard Deviation**:


In [24]:
# One output column per standard-deviation function, each applied to `salary`.
stddev_columns = [fn("salary") for fn in (stddev, stddev_samp, stddev_pop)]
df.select(*stddev_columns).show(truncate=False)

+-----------------+-------------------+------------------+
|stddev(salary)   |stddev_samp(salary)|stddev_pop(salary)|
+-----------------+-------------------+------------------+
|765.9416862050705|765.9416862050705  |726.636084983398  |
+-----------------+-------------------+------------------+



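- Computes the standard deviation of the `salary` column three ways: `stddev` and `stddev_samp` both return the sample standard deviation (hence the identical values), while `stddev_pop` returns the population standard deviation.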


# 17. **Sum and Sum Distinct**:


In [25]:
# Total of all salary values, duplicates included. `sum` is the Spark column
# function imported at the top of the notebook, not the Python builtin.
df.select(sum("salary")).show(truncate=False)

# Total over the distinct salary values only. `sum_distinct` is the current name
# for this aggregate; the camelCase `sumDistinct` is deprecated since Spark 3.2.
df.select(sum_distinct("salary")).show(truncate=False)

+-----------+
|sum(salary)|
+-----------+
|34000      |
+-----------+





+--------------------+
|sum(DISTINCT salary)|
+--------------------+
|20900               |
+--------------------+



- Computes the sum and sum of distinct values of the `salary` column.


# 18. **Variance**:


In [26]:
# One output column per variance function, each applied to `salary`.
variance_columns = [fn("salary") for fn in (variance, var_samp, var_pop)]
df.select(*variance_columns).show(truncate=False)

+-----------------+-----------------+---------------+
|var_samp(salary) |var_samp(salary) |var_pop(salary)|
+-----------------+-----------------+---------------+
|586666.6666666666|586666.6666666666|528000.0       |
+-----------------+-----------------+---------------+



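- Computes the variance of the `salary` column three ways: `variance` and `var_samp` both return the sample variance (hence the identical values and column names), while `var_pop` returns the population variance.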
