In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
!tar xf spark-3.1.1-bin-hadoop3.2.tgz
!pip install -q findspark

In [None]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop3.2"

In [None]:
!ls

sample_data  spark-3.1.1-bin-hadoop3.2	spark-3.1.1-bin-hadoop3.2.tgz


In [None]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()
spark.conf.set("spark.sql.repl.eagerEval.enabled", True) # Property used to format output tables better
spark

In [None]:
df = spark.read.format("csv").option("delimiter", "  ").load("housing.csv")
df.count()
from pyspark.sql.functions import split
df = df.select(df._c0.alias("crim"),
               df._c1.alias("zn"),
               df._c2.alias("indus"),
               df._c3.alias("chas"),
               df._c4.alias("nox"),
               df._c5.alias("rm"),
               df._c6.alias("age"),
               df._c7.alias("dis"),
               df._c8.alias("rad"),
               df._c9.alias("tax"),
               split(df._c10, " ").getItem(0).alias("ptratio"),
               split(df._c10, " ").getItem(1).alias("black"),
               df._c11.alias("lstat"),
               df._c12.alias("medv"))
print(df.count()) # 506

df = df[df["medv"] < 50]
# print(df.count()) # 450

df.show(5)

506
+--------+-----+------+----+------+------+-----+------+---+-----+-------+------+-----+-----+
|    crim|   zn| indus|chas|   nox|    rm|  age|   dis|rad|  tax|ptratio| black|lstat| medv|
+--------+-----+------+----+------+------+-----+------+---+-----+-------+------+-----+-----+
| 0.00632|18.00| 2.310|   0|0.5380|6.5750|65.20|4.0900|  1|296.0|  15.30|396.90| 4.98|24.00|
| 0.02731| 0.00| 7.070|   0|0.4690|6.4210|78.90|4.9671|  2|242.0|  17.80|396.90| 9.14|21.60|
| 0.02729| 0.00| 7.070|   0|0.4690|7.1850|61.10|4.9671|  2|242.0|  17.80|392.83| 4.03|34.70|
| 0.03237| 0.00| 2.180|   0|0.4580|6.9980|45.80|6.0622|  3|222.0|  18.70|394.63| 2.94|33.40|
| 0.06905| 0.00| 2.180|   0|0.4580|7.1470|54.20|6.0622|  3|222.0|  18.70|396.90| 5.33|36.20|
+--------+-----+------+----+------+------+-----+------+---+-----+-------+------+-----+-----+
only showing top 5 rows



In [None]:
from pyspark.sql.functions import col

# Check the data types of the columns
df.printSchema()

# Cast the medv column from string to double type
df = df.withColumn("medv", col("medv").cast("double"))

# Check the data types of the columns again
df.printSchema()

root
 |-- crim: string (nullable = true)
 |-- zn: string (nullable = true)
 |-- indus: string (nullable = true)
 |-- chas: string (nullable = true)
 |-- nox: string (nullable = true)
 |-- rm: string (nullable = true)
 |-- age: string (nullable = true)
 |-- dis: string (nullable = true)
 |-- rad: string (nullable = true)
 |-- tax: string (nullable = true)
 |-- ptratio: string (nullable = true)
 |-- black: string (nullable = true)
 |-- lstat: string (nullable = true)
 |-- medv: string (nullable = true)

root
 |-- crim: string (nullable = true)
 |-- zn: string (nullable = true)
 |-- indus: string (nullable = true)
 |-- chas: string (nullable = true)
 |-- nox: string (nullable = true)
 |-- rm: string (nullable = true)
 |-- age: string (nullable = true)
 |-- dis: string (nullable = true)
 |-- rad: string (nullable = true)
 |-- tax: string (nullable = true)
 |-- ptratio: string (nullable = true)
 |-- black: string (nullable = true)
 |-- lstat: string (nullable = true)
 |-- medv: double (null

In [None]:
from pyspark.ml.feature import Bucketizer
from pyspark.sql.functions import col
import numpy as np

# Define the number of bins
num_bins = 10

# Compute the histogram bin edges using np.histogram
hist, outcome_bins = np.histogram(df.select("medv").rdd.flatMap(lambda x: x).collect(), bins=num_bins)

print("Bin Edges: ", outcome_bins)

# Create the Bucketizer transformer with the updated splits
bucketizer = Bucketizer(splits=outcome_bins, inputCol="medv", outputCol="medv_bin")

# Apply the Bucketizer to the data
df = bucketizer.transform(df)

df.show()

Bin Edges:  [ 6.3  10.55 14.8  19.05 23.3  27.55 31.8  36.05 40.3  44.55 48.8 ]
+--------+-----+------+----+------+------+-----+------+---+-----+-------+------+-----+----+--------+
|    crim|   zn| indus|chas|   nox|    rm|  age|   dis|rad|  tax|ptratio| black|lstat|medv|medv_bin|
+--------+-----+------+----+------+------+-----+------+---+-----+-------+------+-----+----+--------+
| 0.00632|18.00| 2.310|   0|0.5380|6.5750|65.20|4.0900|  1|296.0|  15.30|396.90| 4.98|24.0|     4.0|
| 0.02731| 0.00| 7.070|   0|0.4690|6.4210|78.90|4.9671|  2|242.0|  17.80|396.90| 9.14|21.6|     3.0|
| 0.02729| 0.00| 7.070|   0|0.4690|7.1850|61.10|4.9671|  2|242.0|  17.80|392.83| 4.03|34.7|     6.0|
| 0.03237| 0.00| 2.180|   0|0.4580|6.9980|45.80|6.0622|  3|222.0|  18.70|394.63| 2.94|33.4|     6.0|
| 0.06905| 0.00| 2.180|   0|0.4580|7.1470|54.20|6.0622|  3|222.0|  18.70|396.90| 5.33|36.2|     7.0|
| 0.02985| 0.00| 2.180|   0|0.4580|6.4300|58.70|6.0622|  3|222.0|  18.70|394.12| 5.21|28.7|     5.0|
| 0.08829|1

In [None]:
# Check the result
df.select(col("medv"), col("medv_bin")).show()

outcomes = df.select(col("medv_bin"))

outcomes.distinct().show()

outcomes.count()

+----+--------+
|medv|medv_bin|
+----+--------+
|24.0|     4.0|
|21.6|     3.0|
|34.7|     6.0|
|33.4|     6.0|
|36.2|     7.0|
|28.7|     5.0|
|22.9|     3.0|
|27.1|     4.0|
|18.9|     2.0|
|15.0|     2.0|
|18.9|     2.0|
|21.7|     3.0|
|20.4|     3.0|
|18.2|     2.0|
|19.9|     3.0|
|23.1|     3.0|
|17.5|     2.0|
|20.2|     3.0|
|18.2|     2.0|
|13.6|     1.0|
+----+--------+
only showing top 20 rows

+--------+
|medv_bin|
+--------+
|     8.0|
|     0.0|
|     7.0|
|     1.0|
|     4.0|
|     3.0|
|     2.0|
|     6.0|
|     5.0|
|     9.0|
+--------+



450

In [None]:
discrete_df = spark.read.format("csv").option("header", True).option("delimiter", "\t").load("housing_discrete.csv")
print(discrete_df.count())
discrete_df.show(5)

490
+----+---+-----+----+---+---+---+---+---+---+-------+-----+-----+
|crim| zn|indus|chas|nox| rm|age|dis|rad|tax|ptratio|black|lstat|
+----+---+-----+----+---+---+---+---+---+---+-------+-----+-----+
|   0|  1|    0|   0|  2|  2|  1|  2|  0|  1|      0|    3|    0|
|   0|  0|    1|   0|  1|  2|  2|  2|  0|  0|      1|    3|    1|
|   0|  0|    1|   0|  1|  3|  1|  2|  0|  0|      1|    2|    0|
|   0|  0|    0|   0|  1|  3|  1|  3|  0|  0|      1|    2|    0|
|   0|  0|    0|   0|  1|  3|  1|  3|  0|  0|      1|    3|    0|
+----+---+-----+----+---+---+---+---+---+---+-------+-----+-----+
only showing top 5 rows



In [None]:
from pyspark.sql.functions import col, when

# example_slice_filter = (col("indus") == 0) & (col("dis") == 2)
# example_slice_df = discrete_df.filter(example_slice_filter)
discrete_df.show()

example_slice_df = discrete_df.withColumn("slice", 
                   when((col("indus") == 0) & (col("dis") == 2), True)
                   .otherwise(False))

print(example_slice_df.count(), "examples in slice")
example_slice_df.show()

+----+---+-----+----+---+---+---+---+---+---+-------+-----+-----+
|crim| zn|indus|chas|nox| rm|age|dis|rad|tax|ptratio|black|lstat|
+----+---+-----+----+---+---+---+---+---+---+-------+-----+-----+
|   0|  1|    0|   0|  2|  2|  1|  2|  0|  1|      0|    3|    0|
|   0|  0|    1|   0|  1|  2|  2|  2|  0|  0|      1|    3|    1|
|   0|  0|    1|   0|  1|  3|  1|  2|  0|  0|      1|    2|    0|
|   0|  0|    0|   0|  1|  3|  1|  3|  0|  0|      1|    2|    0|
|   0|  0|    0|   0|  1|  3|  1|  3|  0|  0|      1|    3|    0|
|   0|  0|    0|   0|  1|  2|  1|  3|  0|  0|      1|    2|    0|
|   0|  1|    1|   0|  1|  1|  1|  3|  2|  1|      0|    2|    2|
|   0|  1|    1|   0|  1|  1|  3|  3|  2|  1|      0|    3|    3|
|   0|  1|    1|   0|  1|  0|  3|  3|  2|  1|      0|    1|    3|
|   0|  1|    1|   0|  1|  1|  2|  3|  2|  1|      0|    1|    2|
|   0|  1|    1|   0|  1|  2|  3|  3|  2|  1|      0|    2|    3|
|   0|  1|    1|   0|  1|  1|  2|  3|  2|  1|      0|    3|    2|
|   0|  1|

In [None]:
from pyspark.sql.functions import count, sum, col, log2

outcomes.printSchema()

grouped_data = df.groupBy("medv_bin").count()
grouped_data = grouped_data.withColumn("count", col("count").cast("int"))
grouped_data.show()
total_count = grouped_data.agg(sum("count")).collect()[0][0]

probabilities = grouped_data.withColumn("probability", ((col("count") / total_count) * log2(col("count") / total_count)))
probabilities.show()

entropy = probabilities.agg(sum("probability")).collect()[0][0]

# filter out null values before taking the logarithm

grouped_data.printSchema()
grouped_data.count()
total_count

print ("entropy: ", entropy)

root
 |-- medv_bin: double (nullable = true)

+--------+-----+
|medv_bin|count|
+--------+-----+
|     8.0|    8|
|     0.0|   17|
|     7.0|   12|
|     1.0|   39|
|     4.0|   73|
|     3.0|  152|
|     2.0|   83|
|     6.0|   26|
|     5.0|   33|
|     9.0|    7|
+--------+-----+

+--------+-----+--------------------+
|medv_bin|count|         probability|
+--------+-----+--------------------+
|     8.0|    8|-0.10335611006608067|
|     0.0|   17|-0.17854980433207523|
|     7.0|   12|-0.13943516507989018|
|     1.0|   39| -0.3057928442707484|
|     4.0|   73|-0.42566407591244987|
|     3.0|  152| -0.5289105756034771|
|     2.0|   83| -0.4498123690427096|
|     6.0|   26|-0.23765972955549902|
|     5.0|   33|-0.27642171860296283|
|     9.0|    7|-0.09343329752025784|
+--------+-----+--------------------+

root
 |-- medv_bin: double (nullable = true)
 |-- count: integer (nullable = false)

entropy:  -2.7390356899861508


In [None]:
outcomes_df = spark.read.format("csv").option("header", True).option("delimiter", "\t").load("outcome_bins.csv")
outcomes_df.printSchema()
outcomes_df = outcomes_df.toDF("medv_bin")
outcomes_df.printSchema()

grouped_data = outcomes_df.groupBy("medv_bin").count()
grouped_data = grouped_data.withColumn("count", col("count").cast("int"))
grouped_data.show()
total_count = grouped_data.agg(sum("count")).collect()[0][0]

probabilities = grouped_data.withColumn("probability", ((col("count") / total_count) * log2(col("count") / total_count)))
probabilities.show()

entropy = probabilities.agg(sum("probability")).collect()[0][0]

# filter out null values before taking the logarithm

grouped_data.printSchema()
grouped_data.count()
total_count

print ("entropy: ", entropy)

root
 |-- 0: string (nullable = true)

root
 |-- medv_bin: string (nullable = true)

+--------+-----+
|medv_bin|count|
+--------+-----+
|       7|   30|
|      11|    1|
|       3|   81|
|       8|   13|
|       5|   96|
|       6|   39|
|       9|    8|
|       1|   21|
|      10|    6|
|       4|  147|
|       2|   48|
+--------+-----+

+--------+-----+--------------------+
|medv_bin|count|         probability|
+--------+-----+--------------------+
|       7|   30| -0.2467192251057583|
|      11|    1|-0.01823803661020933|
|       3|   81| -0.4292649445419462|
|       8|   13|-0.13891954463510045|
|       5|   96| -0.4607364123979914|
|       6|   39|-0.29060855731729096|
|       9|    8|-0.09692470104493994|
|       1|   21|  -0.194756593552449|
|      10|    6|-0.07777561761160916|
|       4|  147| -0.5210896782498619|
|       2|   48|-0.32832738987246507|
+--------+-----+--------------------+

root
 |-- medv_bin: string (nullable = true)
 |-- count: integer (nullable = false)

ent

In [None]:
from pyspark.sql.functions import monotonically_increasing_id 

example_slice_df = example_slice_df.withColumn("id", monotonically_increasing_id())
outcomes_df = outcomes_df.withColumn("id", monotonically_increasing_id())

merged_df = example_slice_df.join(outcomes_df, "id")

merged_df.show()

example_slice_df = merged_df.filter((col("slice") == "true")).select("medv_bin")

example_slice_df.count()
example_slice_df.show()

+---+----+---+-----+----+---+---+---+---+---+---+-------+-----+-----+-----+--------+
| id|crim| zn|indus|chas|nox| rm|age|dis|rad|tax|ptratio|black|lstat|slice|medv_bin|
+---+----+---+-----+----+---+---+---+---+---+---+-------+-----+-----+-----+--------+
|  0|   0|  1|    0|   0|  2|  2|  1|  2|  0|  1|      0|    3|    0| true|       5|
|  1|   0|  0|    1|   0|  1|  2|  2|  2|  0|  0|      1|    3|    1|false|       4|
|  2|   0|  0|    1|   0|  1|  3|  1|  2|  0|  0|      1|    2|    0|false|       7|
|  3|   0|  0|    0|   0|  1|  3|  1|  3|  0|  0|      1|    2|    0|false|       7|
|  4|   0|  0|    0|   0|  1|  3|  1|  3|  0|  0|      1|    3|    0|false|       8|
|  5|   0|  0|    0|   0|  1|  2|  1|  3|  0|  0|      1|    2|    0|false|       6|
|  6|   0|  1|    1|   0|  1|  1|  1|  3|  2|  1|      0|    2|    2|false|       5|
|  7|   0|  1|    1|   0|  1|  1|  3|  3|  2|  1|      0|    3|    3|false|       6|
|  8|   0|  1|    1|   0|  1|  0|  3|  3|  2|  1|      0|    1|  

In [None]:
grouped_data = example_slice_df.groupBy("medv_bin").count()
grouped_data = grouped_data.withColumn("count", col("count").cast("int"))
grouped_data.show()
total_count = grouped_data.agg(sum("count")).collect()[0][0]

probabilities = grouped_data.withColumn("probability", ((col("count") / total_count) * log2(col("count") / total_count)))
probabilities.show()

slice_entropy = probabilities.agg(sum("probability")).collect()[0][0]

# filter out null values before taking the logarithm

grouped_data.printSchema()
grouped_data.count()
total_count

print ("entropy: ", entropy)

+--------+-----+
|medv_bin|count|
+--------+-----+
|       7|    2|
|       3|    8|
|       8|    1|
|       5|    3|
|       6|    2|
|       4|   10|
|       2|    1|
+--------+-----+

+--------+-----+--------------------+
|medv_bin|count|         probability|
+--------+-----+--------------------+
|       7|    2|-0.27813981497507173|
|       3|    8| -0.5199666673076944|
|       8|    1|-0.17610694452457293|
|       5|    3| -0.3522138890491458|
|       6|    2|-0.27813981497507173|
|       4|   10| -0.5307257063985579|
|       2|    1|-0.17610694452457293|
+--------+-----+--------------------+

root
 |-- medv_bin: string (nullable = true)
 |-- count: integer (nullable = false)

entropy:  -2.3113997817546874
