# Apache Spark Demo

In [1]:
# Apache Spark uses Java, so first we must install that

#!apt-get install openjdk-8-jdk-headless -qq > /dev/null

The system cannot find the path specified.


In [2]:
from google.colab import drive
drive.mount('/content/drive')


ModuleNotFoundError: No module named 'google'

In [5]:
# Download and unpack the latest version of Spark (3.3.0 as of writing)
#!wget -q https://archive.apache.org/dist/spark/spark-3.3.0/spark-3.3.0-bin-hadoop3.tgz
!tar xzf spark-3.3.0-bin-hadoop3.tgz

In [7]:
# Set up environment variables
import os
#os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "spark-3.3.0-bin-hadoop3"

In [8]:
# Install findspark, which helps python locate the psyspark module files
!pip install -q findspark
import findspark
findspark.init()

In [None]:
# Finally, we initialse a "SparkSession", which handles the computations
from pyspark.sql import SparkSession
spark = SparkSession.builder\
        .master("local")\
        .appName("Colab")\
        .config('spark.ui.port', '4050')\
        .getOrCreate()

In [None]:
# Load in the .csv file to a DataFrame
usersCsvPath = "/content/sample_data/california_housing_test.csv"

housingDF = (spark
             .read
             .option('header', True)
             .option('inferSchema', True)
             .csv(usersCsvPath))

In [None]:
housingDF.show()
housingDF.printSchema()

+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+
|  -122.05|   37.37|              27.0|     3885.0|         661.0|    1537.0|     606.0|       6.6085|          344700.0|
|   -118.3|   34.26|              43.0|     1510.0|         310.0|     809.0|     277.0|        3.599|          176500.0|
|  -117.81|   33.78|              27.0|     3589.0|         507.0|    1484.0|     495.0|       5.7934|          270500.0|
|  -118.36|   33.82|              28.0|       67.0|          15.0|      49.0|      11.0|       6.1359|          330000.0|
|  -119.67|   36.33|              19.0|     1241.0|         244.0|     850.0|     237.0|       2.9375|           81700.0|
|  -119.56|   36.51|    

In [None]:
housingDF.select('median_house_value').show()

+------------------+
|median_house_value|
+------------------+
|          344700.0|
|          176500.0|
|          270500.0|
|          330000.0|
|           81700.0|
|           67000.0|
|           67000.0|
|          166900.0|
|          194400.0|
|          164200.0|
|          125000.0|
|           58300.0|
|          252600.0|
|          231200.0|
|          222500.0|
|          153100.0|
|          181300.0|
|          137500.0|
|          300000.0|
|          414300.0|
+------------------+
only showing top 20 rows



In [None]:
housingDF.select('median_house_value', 'longitude', 'latitude').show()

+------------------+---------+--------+
|median_house_value|longitude|latitude|
+------------------+---------+--------+
|          344700.0|  -122.05|   37.37|
|          176500.0|   -118.3|   34.26|
|          270500.0|  -117.81|   33.78|
|          330000.0|  -118.36|   33.82|
|           81700.0|  -119.67|   36.33|
|           67000.0|  -119.56|   36.51|
|           67000.0|  -121.43|   38.63|
|          166900.0|  -120.65|   35.48|
|          194400.0|  -122.84|    38.4|
|          164200.0|  -118.02|   34.08|
|          125000.0|  -118.24|   33.98|
|           58300.0|  -119.12|   35.85|
|          252600.0|  -121.93|   37.25|
|          231200.0|  -117.03|   32.97|
|          222500.0|  -117.97|   33.73|
|          153100.0|  -117.99|   33.81|
|          181300.0|  -120.81|   37.53|
|          137500.0|   -121.2|   38.69|
|          300000.0|  -118.88|   34.21|
|          414300.0|  -122.59|   38.01|
+------------------+---------+--------+
only showing top 20 rows



In [None]:
housingDF.select('median_house_value', 'longitude', 'latitude', 'housing_median_age')\
         .filter('housing_median_age > 30')\
         .show()

+------------------+---------+--------+------------------+
|median_house_value|longitude|latitude|housing_median_age|
+------------------+---------+--------+------------------+
|          176500.0|   -118.3|   34.26|              43.0|
|           67000.0|  -119.56|   36.51|              37.0|
|           67000.0|  -121.43|   38.63|              43.0|
|          164200.0|  -118.02|   34.08|              31.0|
|          125000.0|  -118.24|   33.98|              45.0|
|           58300.0|  -119.12|   35.85|              37.0|
|          252600.0|  -121.93|   37.25|              36.0|
|          153100.0|  -117.99|   33.81|              42.0|
|          414300.0|  -122.59|   38.01|              35.0|
|          126300.0|  -122.15|   37.75|              40.0|
|           83400.0|  -121.37|   38.68|              36.0|
|          241500.0|  -118.16|   34.07|              47.0|
|          115400.0|   -122.2|   37.79|              45.0|
|          484700.0|  -118.03|   34.16|              36.

In [None]:
old_houses = housingDF.select('median_house_value', 'longitude', 'latitude', 'housing_median_age')\
         .filter('housing_median_age > 30')
old_houses.show()

+------------------+---------+--------+------------------+
|median_house_value|longitude|latitude|housing_median_age|
+------------------+---------+--------+------------------+
|          176500.0|   -118.3|   34.26|              43.0|
|           67000.0|  -119.56|   36.51|              37.0|
|           67000.0|  -121.43|   38.63|              43.0|
|          164200.0|  -118.02|   34.08|              31.0|
|          125000.0|  -118.24|   33.98|              45.0|
|           58300.0|  -119.12|   35.85|              37.0|
|          252600.0|  -121.93|   37.25|              36.0|
|          153100.0|  -117.99|   33.81|              42.0|
|          414300.0|  -122.59|   38.01|              35.0|
|          126300.0|  -122.15|   37.75|              40.0|
|           83400.0|  -121.37|   38.68|              36.0|
|          241500.0|  -118.16|   34.07|              47.0|
|          115400.0|   -122.2|   37.79|              45.0|
|          484700.0|  -118.03|   34.16|              36.

# Lecture 3 - Aggregations

In [None]:
usersCsvPath = "/content/titanic_train.csv"

titanicDF = (spark
             .read
             .option('header', True)
             .option('inferSchema', True)
             .csv(usersCsvPath))

In [None]:
# Check the contents of the .csv file

titanicDF.show() # .show displays the first 20 rows by default
titanicDF.printSchema()

+--------+------+--------------------+------+---+-----------------------+-----------------------+-------+
|Survived|Pclass|                Name|   Sex|Age|Siblings/Spouses Aboard|Parents/Children Aboard|   Fare|
+--------+------+--------------------+------+---+-----------------------+-----------------------+-------+
|       0|     3|Mr. Owen Harris B...|  male| 22|                      1|                      0|   7.25|
|       1|     1|Mrs. John Bradley...|female| 38|                      1|                      0|71.2833|
|       1|     3|Miss. Laina Heikk...|female|  ?|                      0|                      0|  7.925|
|       1|     1|Mrs. Jacques Heat...|female| 35|                      1|                      0|   53.1|
|       0|     3|Mr. William Henry...|  male| 35|                      0|                      0|   8.05|
|       0|     3|     Mr. James Moran|  male| 27|                      0|                      0| 8.4583|
|       0|     1|Mr. Timothy J McC...|  male| 

In [None]:
titanicDF.groupBy('Pclass').mean('Fare').show()

+------+------------------+
|Pclass|         avg(Fare)|
+------+------------------+
|     1| 84.15468749999992|
|     3|13.707707392197129|
|     2| 20.66218315217391|
+------+------------------+



In [None]:
titanicDF.groupBy('sex').mean('Fare').show()

+------+------------------+
|   sex|         avg(Fare)|
+------+------------------+
|female| 44.47981783439487|
|  male|25.633935253054084|
+------+------------------+



In [None]:
titanicDF.groupBy('Pclass').mean('Fare').sort('Pclass').show()
titanicDF.groupBy('Pclass', 'sex').count().sort('Pclass').show()

+------+------------------+
|Pclass|         avg(Fare)|
+------+------------------+
|     1| 84.15468749999992|
|     2| 20.66218315217391|
|     3|13.707707392197129|
+------+------------------+

+------+------+-----+
|Pclass|   sex|count|
+------+------+-----+
|     1|female|   94|
|     1|  male|  122|
|     2|female|   76|
|     2|  male|  108|
|     3|  male|  343|
|     3|female|  144|
+------+------+-----+



In [None]:
from pyspark.sql import functions as F

In [None]:
titanicDF.groupBy('sex', 'Pclass')\
         .agg(F.mean('Siblings/Spouses Aboard'),
              F.mean('Parents/Children Aboard'))\
         .sort('Pclass', 'sex')\
         .show()

+------+------+----------------------------+----------------------------+
|   sex|Pclass|avg(Siblings/Spouses Aboard)|avg(Parents/Children Aboard)|
+------+------+----------------------------+----------------------------+
|female|     1|          0.5591397849462365|         0.46236559139784944|
|  male|     1|          0.3140495867768595|          0.2786885245901639|
|female|     2|                         0.5|          0.6081081081081081|
|  male|     2|          0.3592233009708738|         0.22549019607843138|
|female|     3|          0.9020979020979021|          0.8041958041958042|
|  male|     3|          0.5103244837758112|         0.22807017543859648|
+------+------+----------------------------+----------------------------+



In [None]:
sibsp = F.col('Siblings/Spouses Aboard')
parch = F.col('Parents/Children Aboard')

In [None]:
SibSpParCh = (sibsp + parch).alias('SibSpParCh')

In [None]:
titanicDF.groupBy('sex', 'Pclass')\
         .agg(F.mean(SibSpParCh).alias('Mean_SibSpParCh'))\
         .sort('Pclass', 'sex')\
         .show()

+------+------+------------------+
|   sex|Pclass|   Mean_SibSpParCh|
+------+------+------------------+
|female|     1| 1.021505376344086|
|  male|     1|0.5950413223140496|
|female|     2|1.1111111111111112|
|  male|     2| 0.594059405940594|
|female|     3|1.7062937062937062|
|  male|     3| 0.742603550295858|
+------+------+------------------+

