## **Aggregating Dataframes in PySpark**

In this lecture we will be going over how to aggregate dataframes in Pyspark. The commands we will be learning here will be super useful for doing quality checks on your dataframes and answering more simplistic business questions with your data.

So let's get to it! Here is what we will cover today:
- GroupBy;
- Pivot;
- Aggregate methods;
- Combos of each.

In [2]:
import pyspark
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('AggregateDataset').getOrCreate()

22/03/11 10:45:44 WARN Utils: Your hostname, DESKTOP-MR1I23T resolves to a loopback address: 127.0.1.1; using 172.18.70.248 instead (on interface eth0)
22/03/11 10:45:44 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/03/11 10:45:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
spark

In [4]:
airbnb = spark.read.csv('data/nyc_air_bnb.csv', header=True, inferSchema=True)
print(airbnb.printSchema())

airbnb.limit(5).toPandas()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- host_id: string (nullable = true)
 |-- host_name: string (nullable = true)
 |-- neighbourhood_group: string (nullable = true)
 |-- neighbourhood: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- room_type: string (nullable = true)
 |-- price: string (nullable = true)
 |-- minimum_nights: string (nullable = true)
 |-- number_of_reviews: string (nullable = true)
 |-- last_review: string (nullable = true)
 |-- reviews_per_month: string (nullable = true)
 |-- calculated_host_listings_count: string (nullable = true)
 |-- availability_365: integer (nullable = true)

None


Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [5]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

df_fixed = airbnb \
            .withColumn('id', airbnb['id'].cast(IntegerType())) \
            .withColumn('host_id', airbnb['host_id'].cast(IntegerType())) \
            .withColumn('latitude', airbnb['latitude'].cast(FloatType())) \
            .withColumn('longitude', airbnb['longitude'].cast(FloatType())) \
            .withColumn('price', airbnb['price'].cast(FloatType())) \
            .withColumn('minimum_nights', airbnb['minimum_nights'].cast(IntegerType())) \
            .withColumn('number_of_reviews', airbnb['number_of_reviews'].cast(IntegerType())) \
            .withColumn('last_review', to_date(airbnb['last_review'], 'yyyy-MM-dd')) \
            .withColumn('reviews_per_month', airbnb['reviews_per_month'].cast(FloatType())) \
            .withColumn('calculated_host_listings_count', airbnb['calculated_host_listings_count'].cast(IntegerType()))

df_fixed.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- host_id: integer (nullable = true)
 |-- host_name: string (nullable = true)
 |-- neighbourhood_group: string (nullable = true)
 |-- neighbourhood: string (nullable = true)
 |-- latitude: float (nullable = true)
 |-- longitude: float (nullable = true)
 |-- room_type: string (nullable = true)
 |-- price: float (nullable = true)
 |-- minimum_nights: integer (nullable = true)
 |-- number_of_reviews: integer (nullable = true)
 |-- last_review: date (nullable = true)
 |-- reviews_per_month: float (nullable = true)
 |-- calculated_host_listings_count: integer (nullable = true)
 |-- availability_365: integer (nullable = true)



In [6]:
df_fixed.limit(5).toPandas()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.647491,-73.972366,Private room,149.0,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.983772,Entire home/apt,225.0,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.809021,-73.941902,Private room,150.0,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.685139,-73.959763,Entire home/apt,89.0,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.798512,-73.943993,Entire home/apt,80.0,10,9,2018-11-19,0.1,1,0


In [7]:
df_fixed.groupBy('neighbourhood_group').count().show(7, False)

+-------------------+-----+
|neighbourhood_group|count|
+-------------------+-----+
|Douglaston         |1    |
|Queens             |5630 |
|Nadia              |1    |
|Midtown            |4    |
|Hell's Kitchen     |7    |
|Greenwich Village  |2    |
|Clinton Hill       |1    |
+-------------------+-----+
only showing top 7 rows



In [9]:
df_fixed.groupBy('neighbourhood_group').min('price').show(7)

+-------------------+----------+
|neighbourhood_group|min(price)|
+-------------------+----------+
|         Douglaston|       1.0|
|             Queens|      10.0|
|              Nadia|      null|
|            Midtown|       2.0|
|     Hell's Kitchen|       1.0|
|  Greenwich Village|      31.0|
|       Clinton Hill|      14.0|
+-------------------+----------+
only showing top 7 rows



In [12]:
df_fixed.groupBy('neighbourhood') \
    .agg({'price': 'mean'}).show()

+------------------+------------------+
|     neighbourhood|        avg(price)|
+------------------+------------------+
|            Corona|         59.171875|
|      Richmondtown|              78.0|
|      Prince's Bay|             409.5|
|       Westerleigh|              71.5|
|        Mill Basin|            179.75|
|      Civic Center|191.94230769230768|
|          40.83166|               1.0|
|        Douglaston| 88.14285714285714|
|        Mount Hope|              77.5|
|       Marble Hill| 89.16666666666667|
|         Rego Park| 83.87735849056604|
|          40.81225|               2.0|
|     Dyker Heights| 93.41666666666667|
| Kew Gardens Hills| 112.3076923076923|
|      Dongan Hills| 79.42857142857143|
|          40.81078|               3.0|
|Financial District|225.49059139784947|
|       Bay Terrace|             142.0|
|          40.83117|               1.0|
|           Midtown| 282.7839065541856|
+------------------+------------------+
only showing top 20 rows



In [13]:
from pyspark.sql.functions import *

df_fixed.groupBy('neighbourhood').agg(min(df_fixed.price), max(df_fixed.price)).show()

+------------------+----------+----------+
|     neighbourhood|min(price)|max(price)|
+------------------+----------+----------+
|            Corona|      23.0|     359.0|
|      Richmondtown|      78.0|      78.0|
|      Prince's Bay|      85.0|    1250.0|
|       Westerleigh|      40.0|     103.0|
|        Mill Basin|      85.0|     299.0|
|      Civic Center|      50.0|     950.0|
|          40.83166|       1.0|       1.0|
|        Douglaston|      40.0|     178.0|
|        Mount Hope|      24.0|     250.0|
|       Marble Hill|      40.0|     274.0|
|         Rego Park|      21.0|     300.0|
|          40.81225|       2.0|       2.0|
|     Dyker Heights|      30.0|     170.0|
| Kew Gardens Hills|      40.0|     399.0|
|      Dongan Hills|      37.0|     155.0|
|          40.81078|       3.0|       3.0|
|Financial District|      12.0|    3000.0|
|       Bay Terrace|      32.0|     258.0|
|          40.83117|       1.0|       1.0|
|           Midtown|      30.0|    5100.0|
+----------

In [20]:
summary = df_fixed.summary('count', 'min', '25%', '75%', 'max')
summary.toPandas().transpose()

                                                                                

Unnamed: 0,0,1,2,3,4
summary,count,min,25%,75%,max
id,48895,2539,9471893,29152899,36487245
name,49047,1 Bed Apt in Utopic Williamsburg,2.4544724E7,1.74786681E8,"ﾏﾝﾊｯﾀﾝ､駅から徒歩4分でどこに行くのにも便利な場所!女性の方希望,ｷﾚｲなお部屋｡"
host_id,48729,2438,7797690,107434423,274321313
host_name,48873,"very clean studio app""",475.0,,현선
neighbourhood_group,48894,194716858,1.94716858E8,1.97400421E8,Woodside
neighbourhood,48894,2,40.68771,40.78304,Woodside
latitude,48885,-74.16254,40.68981,40.76299,40.91306
longitude,48736,-74.24442,-73.98309,-73.93638,2.4906404E7
room_type,48894,-73.90783,56.0,145.0,Shared room


In [19]:
limit_summary = df_fixed.select('price', 'minimum_nights', 'number_of_reviews').summary('count', 'min', '25%', '75%', 'max')
limit_summary.toPandas().transpose()

Unnamed: 0,0,1,2,3,4
summary,count,min,25%,75%,max
price,48887,-74.00828,69.0,175.0,10000.0
minimum_nights,48891,0,1,5,1250
number_of_reviews,48738,0,1,23,629


In [21]:
df_fixed.agg(min(df_fixed.price), max(df_fixed.price)).show() 

+----------+----------+
|min(price)|max(price)|
+----------+----------+
| -74.00828|   10000.0|
+----------+----------+



In [24]:
df_fixed.select(countDistinct('neighbourhood_group'), avg('price'), stddev('price')).toPandas()

Unnamed: 0,count(DISTINCT neighbourhood_group),avg(price),stddev_samp(price)
0,77,152.222963,238.541486


In [29]:
df_fixed.filter('room_type=\'Shared room\'').groupBy('room_type').pivot('neighbourhood_group', ['Queens', 'Astoria', 'Brooklyn']).count().show()

+-----------+------+-------+--------+
|  room_type|Queens|Astoria|Brooklyn|
+-----------+------+-------+--------+
|Shared room|   198|   null|     413|
+-----------+------+-------+--------+



In [28]:
df_fixed.filter('room_type=\'Shared room\'').groupBy('room_type').pivot('neighbourhood_group', ['Queens', 'Astoria', 'Brooklyn']).agg(min(df_fixed.price), max(df_fixed.price)).show()

+-----------+-----------------+-----------------+------------------+------------------+-------------------+-------------------+
|  room_type|Queens_min(price)|Queens_max(price)|Astoria_min(price)|Astoria_max(price)|Brooklyn_min(price)|Brooklyn_max(price)|
+-----------+-----------------+-----------------+------------------+------------------+-------------------+-------------------+
|Shared room|             11.0|           1800.0|              null|              null|                0.0|              725.0|
+-----------+-----------------+-----------------+------------------+------------------+-------------------+-------------------+

