# DataFrame Operations

In [6]:
import os
# Point PySpark at the local Spark and Hadoop installations.
# NOTE(review): these are machine-specific absolute Windows paths; adjust them
# before running the notebook on another machine.
os.environ['SPARK_HOME'] = r"C:\Users\Dani\Documents\Python Scripts\Spark"
os.environ['PYSPARK_DRIVER_PYTHON'] = 'jupyter'
os.environ['PYSPARK_DRIVER_PYTHON_OPTS'] = 'lab'
os.environ['PYSPARK_PYTHON'] = 'python'
os.environ["HADOOP_HOME"] = r"C:\hadoop\hadoop-3.2.2"

# Append the Hadoop bin directory to PATH only when it is not already listed,
# so re-running this cell does not keep growing PATH. ";" is the PATH separator
# used by the original cell (Windows); a missing PATH is treated as empty.
hadoop_bin_dir = r"C:\hadoop\hadoop-3.2.2\bin"
current_path = os.environ.get("PATH", "")
if hadoop_bin_dir not in current_path.split(";"):
    os.environ["PATH"] = current_path + ";" + hadoop_bin_dir

In [21]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [8]:
# Configure the application name first, then obtain the session (an existing
# one is reused if present, otherwise a new one is created).
session_builder = SparkSession.builder.appName("DataFrame-Operations")
spark = session_builder.getOrCreate()

In [9]:
# Load the house-price dataset from Parquet, then inspect its schema and the
# first five rows.
parquet_path = "data/house-price.parquet"
parquet_reader = spark.read
df = parquet_reader.parquet(parquet_path)

df.printSchema()
df.show(n=5)

root
 |-- price: long (nullable = true)
 |-- area: long (nullable = true)
 |-- bedrooms: long (nullable = true)
 |-- bathrooms: long (nullable = true)
 |-- stories: long (nullable = true)
 |-- mainroad: string (nullable = true)
 |-- guestroom: string (nullable = true)
 |-- basement: string (nullable = true)
 |-- hotwaterheating: string (nullable = true)
 |-- airconditioning: string (nullable = true)
 |-- parking: long (nullable = true)
 |-- prefarea: string (nullable = true)
 |-- furnishingstatus: string (nullable = true)

+--------+----+--------+---------+-------+--------+---------+--------+---------------+---------------+-------+--------+----------------+
|   price|area|bedrooms|bathrooms|stories|mainroad|guestroom|basement|hotwaterheating|airconditioning|parking|prefarea|furnishingstatus|
+--------+----+--------+---------+-------+--------+---------+--------+---------------+---------------+-------+--------+----------------+
|13300000|7420|       4|        2|      3|     yes|       no

## Select Specific Columns

In [12]:
# Keep only the price and the size-related columns.
columns_to_keep = ("price", "area", "bedrooms", "bathrooms")
selected_cols = df.select(*columns_to_keep)
selected_cols.show(n=5)

+--------+----+--------+---------+
|   price|area|bedrooms|bathrooms|
+--------+----+--------+---------+
|13300000|7420|       4|        2|
|12250000|8960|       4|        4|
|12250000|9960|       3|        2|
|12215000|7500|       4|        2|
|11410000|7420|       4|        1|
+--------+----+--------+---------+
only showing top 5 rows



## Filter on any condition

In [15]:
# Houses with more than three bedrooms but fewer than two bathrooms.
more_than_three_bedrooms = selected_cols["bedrooms"] > 3
fewer_than_two_bathrooms = selected_cols["bathrooms"] < 2
filtered_df = selected_cols.filter(more_than_three_bedrooms & fewer_than_two_bathrooms)
filtered_df.show(n=5)

+--------+-----+--------+---------+
|   price| area|bedrooms|bathrooms|
+--------+-----+--------+---------+
|11410000| 7420|       4|        1|
| 9870000| 8100|       4|        1|
| 9100000| 6000|       4|        1|
| 8120000| 6840|       5|        1|
| 7343000|11440|       4|        1|
+--------+-----+--------+---------+
only showing top 5 rows



In [17]:
# Sanity-check the filter: the smallest bedroom count and the largest bathroom
# count in the filtered frame should respect the filter bounds.
bounds_to_check = {"bedrooms": "min", "bathrooms": "max"}
check = filtered_df.agg(bounds_to_check)
check.show()

+-------------+--------------+
|min(bedrooms)|max(bathrooms)|
+-------------+--------------+
|            4|             1|
+-------------+--------------+



## GroupBy

In [22]:
# Preview the full-width frame before grouping (show()'s defaults spelled out).
df.show(n=20, truncate=True)

+--------+-----+--------+---------+-------+--------+---------+--------+---------------+---------------+-------+--------+----------------+
|   price| area|bedrooms|bathrooms|stories|mainroad|guestroom|basement|hotwaterheating|airconditioning|parking|prefarea|furnishingstatus|
+--------+-----+--------+---------+-------+--------+---------+--------+---------------+---------------+-------+--------+----------------+
|13300000| 7420|       4|        2|      3|     yes|       no|      no|             no|            yes|      2|     yes|       furnished|
|12250000| 8960|       4|        4|      4|     yes|       no|      no|             no|            yes|      3|      no|       furnished|
|12250000| 9960|       3|        2|      2|     yes|       no|     yes|             no|             no|      2|     yes|  semi-furnished|
|12215000| 7500|       4|        2|      2|     yes|       no|     yes|             no|            yes|      3|     yes|       furnished|
|11410000| 7420|       4|        1

In [40]:
# Per-area summary statistics, ordered by the number of houses with a guest
# room (descending) and then by mean price (ascending).
#
# The conditional counts use a constant marker rather than the "area" column:
# F.count only counts non-null values and "area" is nullable, so counting
# F.col("area") would silently report 0 for every row in the null-area group.
# NOTE(review): F.median is only available in recent PySpark releases (3.4+).
grouped_data = (
    df.groupBy("area")
    .agg(
        F.mean("price").alias("mean_price"),
        F.median("price").alias("median_price"),
        F.stddev("price").alias("stddev_price"),
        F.mean("bathrooms").alias("mean_bathrooms"),
        F.count(F.when(F.col("basement") == "yes", F.lit(1))).alias("houses_with_basement"),
        F.count(F.when(F.col("guestroom") == "yes", F.lit(1))).alias("houses_with_guestroom"),
        F.sum(F.col("price")).alias("total_price"),
    )
    .orderBy(F.desc("houses_with_guestroom"), F.asc("mean_price"))
)

grouped_data.show()

+----+------------------+------------+------------------+------------------+--------------------+---------------------+-----------+
|area|        mean_price|median_price|      stddev_price|    mean_bathrooms|houses_with_basement|houses_with_guestroom|total_price|
+----+------------------+------------+------------------+------------------+--------------------+---------------------+-----------+
|6000| 7051479.166666667|   7280000.0|1362431.8848483576|1.7083333333333333|                   5|                    9|  169235500|
|5500| 5762555.555555556|   5873000.0| 1307983.190938545|1.5555555555555556|                   4|                    4|   51863000|
|6600| 6443111.111111111|   6230000.0|1227408.1884650725|1.6666666666666667|                   7|                    4|   57988000|
|4800|         5742800.0|   5810000.0|495583.19180537184|               1.4|                   3|                    3|   28714000|
|6900|         5771500.0|   5771500.0|351432.07024971413|               1.5|

## Join multiple dataframes

In [43]:
# Inner self-join of the full frame with a four-column projection of itself,
# matched on (area, price). Both sides carry "bathrooms" and "guestroom", so
# the joined frame contains those two column names twice.
df2 = df.select("area", "price", "bathrooms", "guestroom")
join_keys = ["area", "price"]
joined_data = df.join(df2, on=join_keys, how="inner")

print("Joined Data:")
joined_data.show()

Joined Data:
+-----+--------+--------+---------+-------+--------+---------+--------+---------------+---------------+-------+--------+----------------+---------+---------+
| area|   price|bedrooms|bathrooms|stories|mainroad|guestroom|basement|hotwaterheating|airconditioning|parking|prefarea|furnishingstatus|bathrooms|guestroom|
+-----+--------+--------+---------+-------+--------+---------+--------+---------------+---------------+-------+--------+----------------+---------+---------+
| 7420|13300000|       4|        2|      3|     yes|       no|      no|             no|            yes|      2|     yes|       furnished|        2|       no|
| 8960|12250000|       4|        4|      4|     yes|       no|      no|             no|            yes|      3|      no|       furnished|        4|       no|
| 9960|12250000|       3|        2|      2|     yes|       no|     yes|             no|             no|      2|     yes|  semi-furnished|        2|       no|
| 7500|12215000|       4|        2|    

## Get unique rows

In [44]:
# Project down to the "area" column, then keep one row per distinct value.
area_only = df.select("area")
distinct_areas = area_only.distinct()
distinct_areas.show()

+-----+
| area|
+-----+
| 1950|
| 3800|
| 5850|
| 2520|
| 3120|
| 7424|
| 3069|
| 7800|
| 3640|
| 3680|
| 4300|
| 2175|
| 2400|
|10240|
| 6862|
| 1905|
| 3930|
|13200|
| 3240|
| 5400|
+-----+
only showing top 20 rows



## Drop Columns

In [47]:
# Remove the "bathrooms" and "guestroom" columns from the joined frame and
# preview the result.
columns_to_drop = ("bathrooms", "guestroom")
joined_data = joined_data.drop(*columns_to_drop)
joined_data.show(n=5)

+----+--------+--------+-------+--------+--------+---------------+---------------+-------+--------+----------------+
|area|   price|bedrooms|stories|mainroad|basement|hotwaterheating|airconditioning|parking|prefarea|furnishingstatus|
+----+--------+--------+-------+--------+--------+---------------+---------------+-------+--------+----------------+
|7420|13300000|       4|      3|     yes|      no|             no|            yes|      2|     yes|       furnished|
|8960|12250000|       4|      4|     yes|      no|             no|            yes|      3|      no|       furnished|
|9960|12250000|       3|      2|     yes|     yes|             no|             no|      2|     yes|  semi-furnished|
|7500|12215000|       4|      2|     yes|     yes|             no|            yes|      3|     yes|       furnished|
|7420|11410000|       4|      2|     yes|     yes|             no|            yes|      2|      no|       furnished|
+----+--------+--------+-------+--------+--------+--------------

## WithColumn: Add new calculated columns.


In [48]:
# Quick look at the source columns before deriving a new one.
df.show(n=3)

+--------+----+--------+---------+-------+--------+---------+--------+---------------+---------------+-------+--------+----------------+
|   price|area|bedrooms|bathrooms|stories|mainroad|guestroom|basement|hotwaterheating|airconditioning|parking|prefarea|furnishingstatus|
+--------+----+--------+---------+-------+--------+---------+--------+---------------+---------------+-------+--------+----------------+
|13300000|7420|       4|        2|      3|     yes|       no|      no|             no|            yes|      2|     yes|       furnished|
|12250000|8960|       4|        4|      4|     yes|       no|      no|             no|            yes|      3|      no|       furnished|
|12250000|9960|       3|        2|      2|     yes|       no|     yes|             no|             no|      2|     yes|  semi-furnished|
+--------+----+--------+---------+-------+--------+---------+--------+---------------+---------------+-------+--------+----------------+
only showing top 3 rows



In [49]:
# Derive "total_places" as the row-wise sum of bedrooms, bathrooms and stories.
total_places_expr = df["bedrooms"] + df["bathrooms"] + df["stories"]
df_with_new_column = df.withColumn("total_places", total_places_expr)

print("DataFrame with New Column:")
df_with_new_column.show(n=10)

DataFrame with New Column:
+--------+-----+--------+---------+-------+--------+---------+--------+---------------+---------------+-------+--------+----------------+------------+
|   price| area|bedrooms|bathrooms|stories|mainroad|guestroom|basement|hotwaterheating|airconditioning|parking|prefarea|furnishingstatus|total_places|
+--------+-----+--------+---------+-------+--------+---------+--------+---------------+---------------+-------+--------+----------------+------------+
|13300000| 7420|       4|        2|      3|     yes|       no|      no|             no|            yes|      2|     yes|       furnished|           9|
|12250000| 8960|       4|        4|      4|     yes|       no|      no|             no|            yes|      3|      no|       furnished|          12|
|12250000| 9960|       3|        2|      2|     yes|       no|     yes|             no|             no|      2|     yes|  semi-furnished|           7|
|12215000| 7500|       4|        2|      2|     yes|       no|     

In [50]:
# Stop the SparkSession created at the top of the notebook now that all
# examples have run.
spark.stop()