## Aggregation Operations on a DataFrame

In [1]:
import findspark
findspark.init()
import pyspark

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Start a Spark session for this notebook, or reuse one that already exists.
# The bare name on the last line echoes the session object as the cell output.
spark = (
    SparkSession.builder
    .appName("EDA-Aggregated")
    .getOrCreate()
)
spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/12/26 11:19:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Explicit schema for the census file: one StructField per column, in file order.
# Every column is declared nullable.

col_age = StructField(name="age", dataType=IntegerType(), nullable=True)
col_workclass = StructField(name="workclass", dataType=StringType(), nullable=True)
col_fnlwgt = StructField(name="fnlwgt", dataType=DoubleType(), nullable=True)
col_education = StructField(name="education", dataType=StringType(), nullable=True)
col_education_num = StructField(name="education_num", dataType=DoubleType(), nullable=True)
col_marital_status = StructField(name="marital_status", dataType=StringType(), nullable=True)
col_occupation = StructField(name="occupation", dataType=StringType(), nullable=True)
col_relationship = StructField(name="relationship", dataType=StringType(), nullable=True)
col_race = StructField(name="race", dataType=StringType(), nullable=True)
col_sex = StructField(name="sex", dataType=StringType(), nullable=True)
col_capital_gain = StructField(name="capital_gain", dataType=DoubleType(), nullable=True)
col_capital_loss = StructField(name="capital_loss", dataType=DoubleType(), nullable=True)
col_hours_per_week = StructField(name="hours_per_week", dataType=DoubleType(), nullable=True)
col_native_country = StructField(name="native_country", dataType=StringType(), nullable=True)
col_income = StructField(name="income", dataType=StringType(), nullable=True)

# Ordered list of the fields; the order must match the column order in the file
# because the file is read without a header row.
df_cols_ = [
    col_age,
    col_workclass,
    col_fnlwgt,
    col_education,
    col_education_num,
    col_marital_status,
    col_occupation,
    col_relationship,
    col_race,
    col_sex,
    col_capital_gain,
    col_capital_loss,
    col_hours_per_week,
    col_native_country,
    col_income,
]

# Schema object handed to the CSV reader
df_schema = StructType(df_cols_)

In [3]:
# Load the raw census file with the explicit schema (the file has no header row).
#
# NOTE(review): the path is an absolute location on one machine; point it at
# your own copy of adult.data before running.
#
# Without ignoreLeadingWhiteSpace the string columns keep a leading blank
# (" Male", " >50K", " Exec-managerial", ...), which is visible in the column
# widths of the displayed tables. Equality filters such as income == ">50K"
# would then silently match nothing, so the blank is stripped at read time.
census_df = spark.read.csv(
    "file:////home/ashru/Downloads/adult.data",
    header=False,
    schema=df_schema,
    ignoreLeadingWhiteSpace=True,
)

# Show the first rows (DataFrame.show() defaults to 20) as a sanity check
census_df.show()

+---+-----------------+--------+-------------+-------------+--------------------+------------------+--------------+-------------------+-------+------------+------------+--------------+--------------+------+
|age|        workclass|  fnlwgt|    education|education_num|      marital_status|        occupation|  relationship|               race|    sex|capital_gain|capital_loss|hours_per_week|native_country|income|
+---+-----------------+--------+-------------+-------------+--------------------+------------------+--------------+-------------------+-------+------------+------------+--------------+--------------+------+
| 39|        State-gov| 77516.0|    Bachelors|         13.0|       Never-married|      Adm-clerical| Not-in-family|              White|   Male|      2174.0|         0.0|          40.0| United-States| <=50K|
| 50| Self-emp-not-inc| 83311.0|    Bachelors|         13.0|  Married-civ-spouse|   Exec-managerial|       Husband|              White|   Male|         0.0|         0.0|   

                                                                                

In [4]:
# Print the column names, data types and nullability of census_df as a tree
census_df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: double (nullable = true)
 |-- education: string (nullable = true)
 |-- education_num: double (nullable = true)
 |-- marital_status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- capital_gain: double (nullable = true)
 |-- capital_loss: double (nullable = true)
 |-- hours_per_week: double (nullable = true)
 |-- native_country: string (nullable = true)
 |-- income: string (nullable = true)



In [4]:
# Total number of rows in census_df

census_df.count()

                                                                                

32560

In [5]:
# How many records fall into each income bracket (>50K vs <=50K)
income_counts = census_df.groupBy('income').count()
income_counts.show()

[Stage 1:>                                                          (0 + 1) / 1]

+------+-----+
|income|count|
+------+-----+
|  >50K| 7841|
| <=50K|24720|
+------+-----+



                                                                                

In [6]:
# count / mean / stddev / min / max of the age column
age_stats = census_df.describe('age')
age_stats.show()

                                                                                

+-------+------------------+
|summary|               age|
+-------+------------------+
|  count|             32561|
|   mean| 38.58164675532078|
| stddev|13.640432553581356|
|    min|                17|
|    max|                90|
+-------+------------------+



In [7]:
# count / mean / stddev / min / max of the capital_gain column
capital_gain_stats = census_df.describe('capital_gain')
capital_gain_stats.show()

[Stage 7:>                                                          (0 + 1) / 1]

+-------+------------------+
|summary|      capital_gain|
+-------+------------------+
|  count|             32561|
|   mean|1077.6488437087312|
| stddev| 7385.292084840354|
|    min|               0.0|
|    max|           99999.0|
+-------+------------------+



                                                                                

In [8]:
# count / mean / stddev / min / max of the capital_loss column
capital_loss_stats = census_df.describe('capital_loss')
capital_loss_stats.show()

+-------+----------------+
|summary|    capital_loss|
+-------+----------------+
|  count|           32561|
|   mean| 87.303829734959|
| stddev|402.960218649002|
|    min|             0.0|
|    max|          4356.0|
+-------+----------------+



In [9]:
# count / mean / stddev / min / max of the hours_per_week column
hours_per_week_stats = census_df.describe('hours_per_week')
hours_per_week_stats.show()

+-------+------------------+
|summary|    hours_per_week|
+-------+------------------+
|  count|             32561|
|   mean|40.437455852092995|
| stddev|12.347428681731838|
|    min|               1.0|
|    max|              99.0|
+-------+------------------+



In [10]:
# Mean age of workers, split by sex
mean_age_by_sex = census_df.groupBy('sex').mean('age')
mean_age_by_sex.show()

[Stage 16:>                                                         (0 + 1) / 1]

+-------+-----------------+
|    sex|         avg(age)|
+-------+-----------------+
|   Male|39.43354749885268|
| Female|36.85823043357163|
+-------+-----------------+



                                                                                

In [11]:
# Mean working hours per week, split by sex
mean_hours_by_sex = census_df.groupBy('sex').mean('hours_per_week')
mean_hours_by_sex.show()

[Stage 19:>                                                         (0 + 1) / 1]

+-------+-------------------+
|    sex|avg(hours_per_week)|
+-------+-------------------+
|   Male|  42.42808627810923|
| Female| 36.410361154953115|
+-------+-------------------+



                                                                                

In [12]:
# Number of records for every (sex, income bracket) combination
sex_income_counts = census_df.groupBy('sex', 'income').count()
sex_income_counts.show()

+-------+------+-----+
|    sex|income|count|
+-------+------+-----+
|   Male| <=50K|15128|
|   Male|  >50K| 6662|
| Female|  >50K| 1179|
| Female| <=50K| 9592|
+-------+------+-----+



In [15]:
# Occupations with the most earners in the >50K bracket (top 5).
#
# The bracket is selected with an explicit filter rather than by sorting on the
# income label: the string order of ">50K" vs "<=50K" is incidental, and a
# label sort lets "<=50K" rows spill into the result when fewer than five
# occupations have >50K earners. trim() keeps the comparison working whether or
# not the loaded values carry surrounding blanks.
#
# NOTE(review): this ranks occupations by head-count, not by the share of each
# occupation that earns >50K — confirm that is the intended meaning of
# "highest paid".

(
    census_df
    .filter(trim(col('income')) == '>50K')
    .groupBy('occupation', 'income')
    .count()
    .sort('count', ascending=False)
    .show(5)
)

+----------------+------+-----+
|      occupation|income|count|
+----------------+------+-----+
| Exec-managerial|  >50K| 1968|
|  Prof-specialty|  >50K| 1859|
|           Sales|  >50K|  983|
|    Craft-repair|  >50K|  929|
|    Adm-clerical|  >50K|  507|
+----------------+------+-----+
only showing top 5 rows



### Execute SQL and HiveQL Queries on a DataFrame

**`createOrReplaceTempView()`**

- The `DataFrame` class provides the `createOrReplaceTempView()` method.

- It creates a temporary view of the DataFrame (replacing any existing view with the same name) that can be queried with SQL.

- The lifetime of this view is tied to the `SparkSession` that was used to create the DataFrame.

- Older PySpark versions used `registerTempTable()` for the same purpose; it has been deprecated since Spark 2.0 in favor of `createOrReplaceTempView()`.

In [4]:
# Register census_df as the temporary view "census_tbl" so that the
# spark.sql(...) queries in the following cells can refer to it by name
census_df.createOrReplaceTempView("census_tbl")

In [6]:
# Peek at five (age, income) pairs through the census_tbl temporary view
age_income_preview = spark.sql("select age, income from census_tbl limit 5")
age_income_preview.show()

+---+------+
|age|income|
+---+------+
| 39| <=50K|
| 50| <=50K|
| 38| <=50K|
| 53| <=50K|
| 28| <=50K|
+---+------+



In [9]:
# Mean weekly working hours per education level, rounded to 2 decimals and
# listed from the highest average to the lowest

hours_by_education_query = """
    SELECT 
        education, 
        round(avg(hours_per_week),2) as average_hours_per_week
    FROM
        census_tbl
    GROUP BY education
    ORDER BY average_hours_per_week DESC
    """
spark.sql(hours_by_education_query).show()

+-------------+----------------------+
|    education|average_hours_per_week|
+-------------+----------------------+
|  Prof-school|                 47.43|
|    Doctorate|                 46.97|
|      Masters|                 43.84|
|    Bachelors|                 42.61|
|    Assoc-voc|                 41.61|
|      HS-grad|                 40.58|
|   Assoc-acdm|                  40.5|
|      7th-8th|                 39.37|
|      5th-6th|                  38.9|
| Some-college|                 38.85|
|      1st-4th|                 38.26|
|          9th|                 38.04|
|         10th|                 37.05|
|    Preschool|                 36.65|
|         12th|                 35.78|
|         11th|                 33.93|
+-------------+----------------------+

