In [2]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
!tar xf spark-3.1.1-bin-hadoop3.2.tgz
!pip install -q findspark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop3.2"
!ls
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()
spark.conf.set("spark.sql.repl.eagerEval.enabled", True) # Property used to format output tables better
spark

^C
housing_discrete.csv	   spark-3.1.1-bin-hadoop3.2.tgz
outcome_bins.csv	   spark-3.1.1-bin-hadoop3.2.tgz.1
sample_data		   spark-3.1.1-bin-hadoop3.2.tgz.2
spark-3.1.1-bin-hadoop3.2  spark-3.1.1-bin-hadoop3.2.tgz.3


In [5]:
# importing dataset (discrete dataset for variables as well as outcomes - preprocessing done in jupyter)
from pyspark.sql.functions import monotonically_increasing_id, col, count, sum, col, log2, mean, stddev

import pandas as pd

# NOTE: This cell will change according to dataset. The desired state of df should be a discrete dataset and an id field that starts from 0.
# TODO: Update as per dataset being used

discrete_df = spark.read.format("csv").option("header", True).option("delimiter", "\t").load("housing_discrete.csv")
discrete_df = discrete_df.withColumn("id", monotonically_increasing_id())

outcomes_df = spark.read.format("csv").option("header", True).option("delimiter", "\t").load("outcome_bins.csv")
outcomes_df = outcomes_df.toDF("medv_bin")
outcomes_df = outcomes_df.withColumn("id", monotonically_increasing_id())

df = discrete_df.join(outcomes_df, "id")
df.drop("id")

df.show()

+---+----+---+-----+----+---+---+---+---+---+---+-------+-----+-----+--------+
| id|crim| zn|indus|chas|nox| rm|age|dis|rad|tax|ptratio|black|lstat|medv_bin|
+---+----+---+-----+----+---+---+---+---+---+---+-------+-----+-----+--------+
|  0|   0|  1|    0|   0|  2|  2|  1|  2|  0|  1|      0|    3|    0|       5|
|  1|   0|  0|    1|   0|  1|  2|  2|  2|  0|  0|      1|    3|    1|       4|
|  2|   0|  0|    1|   0|  1|  3|  1|  2|  0|  0|      1|    2|    0|       7|
|  3|   0|  0|    0|   0|  1|  3|  1|  3|  0|  0|      1|    2|    0|       7|
|  4|   0|  0|    0|   0|  1|  3|  1|  3|  0|  0|      1|    3|    0|       8|
|  5|   0|  0|    0|   0|  1|  2|  1|  3|  0|  0|      1|    2|    0|       6|
|  6|   0|  1|    1|   0|  1|  1|  1|  3|  2|  1|      0|    2|    2|       5|
|  7|   0|  1|    1|   0|  1|  1|  3|  3|  2|  1|      0|    3|    3|       6|
|  8|   0|  1|    1|   0|  1|  0|  3|  3|  2|  1|      0|    1|    3|       3|
|  9|   0|  1|    1|   0|  1|  1|  2|  3|  2|  1|   

In [6]:
# calculate entropy, mean, stddev for outcomes (all rows) which is common irrespective of what slice we pick

# TODO: Update as per dataset being used
# outcome column
outcome_column = 'medv_bin'

outcomes = df.select(outcome_column)
outcomes.show()
grouped_data = outcomes_df.groupBy(outcome_column).count()
grouped_data = grouped_data.withColumn("count", col("count").cast("int"))
grouped_data.show()
total_count = grouped_data.agg(sum("count")).collect()[0][0]

probabilities = grouped_data.withColumn("probability", (-1 * (col("count") / total_count) * log2(col("count") / total_count)))
# probabilities.show()

outcome_entropy = probabilities.agg(sum("probability")).collect()[0][0]
outcome_mean = outcomes.select(mean(outcome_column)).collect()[0][0]
outcome_stddev = df.select(stddev(outcome_column)).collect()[0][0]

print ("outcome_entropy: ", outcome_entropy)
print ("outcome_mean: ", outcome_mean)
print ("outcome_stddev: ", outcome_stddev)

+--------+
|medv_bin|
+--------+
|       5|
|       4|
|       7|
|       7|
|       8|
|       6|
|       5|
|       6|
|       3|
|       4|
|       3|
|       4|
|       4|
|       4|
|       4|
|       4|
|       5|
|       3|
|       4|
|       4|
+--------+
only showing top 20 rows

+--------+-----+
|medv_bin|count|
+--------+-----+
|       7|   30|
|      11|    1|
|       3|   81|
|       8|   13|
|       5|   96|
|       6|   39|
|       9|    8|
|       1|   21|
|      10|    6|
|       4|  147|
|       2|   48|
+--------+-----+

outcome_entropy:  2.8033607009396215
outcome_mean:  4.3244897959183675
outcome_stddev:  1.811915865931041


In [29]:
from pyspark.sql.functions import col, when
import heapq
import random

def calculate_ranked_slices (k, M, feature_columns, number_of_samples):
  # Initialize ranked_slices dictionary
  ranked_slices = {}
  # Initialize score_cache dictionary
  score_cache = {}
  for sample_number in range(number_of_samples):
    random_seed = random.randint(0, 9999)
    sample_row = df.sample(0.1, seed=random_seed).limit(1)
    print ("sample_row:\n\n", sample_row)
    for m in range(1, M + 1):
      if (m == 1):
        for column in feature_columns:
          if (column == 'id'):
            continue
          # print (column)
          column_value = sample_row.first()[column]
          # print(column_value)
          example_slice_df = df.filter(col(column) == column_value)
          slice_outcomes = example_slice_df.select(outcome_column)
          slice_grouped_data = slice_outcomes.groupBy(outcome_column).count()
          slice_grouped_data = slice_grouped_data.withColumn("count", col("count").cast("int"))
          slice_count = slice_grouped_data.agg(sum("count")).collect()[0][0]

          slice_probabilities = slice_grouped_data.withColumn("probability", (-1 * (col("count") / slice_count) * log2(col("count") / slice_count)))

          slice_outcome_entropy = slice_probabilities.agg(sum("probability")).collect()[0][0]
          # slice_outcome_mean = slice_outcomes.select(mean(outcome_column)).collect()[0][0]

          # Check if the current slice's entropy score exceeds any existing entry
          exceed_existing = all(slice_outcome_entropy >= score for score in ranked_slices.values())
          # Add the slice to ranked_slices only if it exceeds any existing entry
          if exceed_existing or len(ranked_slices) < k:
            feature_vals = {}
            feature_vals[column] = column_value
            ranked_slices[frozenset(feature_vals.items())] = slice_outcome_entropy
            # Limit the ranked_slices dictionary to top k entries based on entropy score
            ranked_slices = dict(heapq.nlargest(k, ranked_slices.items(), key=lambda item: item[1]))
          score_cache[frozenset(feature_vals.items())] = slice_outcome_entropy
      else:
        # print ("m is ", m)
        new_ranked_slices = {}
        for base_slice in ranked_slices:
          if len(base_slice) >= M:
            continue
          for column in feature_columns:
            if column == 'id' or column in dict(base_slice):
              continue
            # create new slice for current feature and base slice
            feature_vals = dict(base_slice)
            column_value = sample_row.first()[column]
            feature_vals[column] = column_value

            # skip if the new slice already exists in ranked_slices
            if frozenset(feature_vals.items()) in score_cache:
              continue

            print ("running algo for slice: ", feature_vals.items())

            example_slice_df = df
            for feature, value in feature_vals.items():
                example_slice_df = example_slice_df.filter(col(feature) == value)

            slice_outcomes = example_slice_df.select(outcome_column)
            slice_grouped_data = slice_outcomes.groupBy(outcome_column).count()
            slice_grouped_data = slice_grouped_data.withColumn("count", col("count").cast("int"))
            slice_count = slice_grouped_data.agg(sum("count")).collect()[0][0]
            slice_probabilities = slice_grouped_data.withColumn("probability", (-1 * (col("count") / slice_count) * log2(col("count") / slice_count)))
            slice_outcome_entropy = slice_probabilities.agg(sum("probability")).collect()[0][0]
            new_ranked_slices[frozenset(feature_vals.items())] = slice_outcome_entropy
            score_cache[frozenset(feature_vals.items())] = slice_outcome_entropy

        ranked_slices.update(new_ranked_slices)
        ranked_slices = dict(heapq.nlargest(k, ranked_slices.items(), key=lambda item: item[1]))
  return ranked_slices

In [32]:
# top k slices
k = 10

# max number of features to consider for slice
M = 3

feature_columns = discrete_df.columns

# to consider all features as max number of features, uncomment following:
# M = len(feature_columns)

# number of iterations where each iteration will pick a random sample and run the slicing algo
number_of_samples = 3

ranked_slices = calculate_ranked_slices(k, M, feature_columns, number_of_samples)

sample_row:

 +---+----+---+-----+----+---+---+---+---+---+---+-------+-----+-----+--------+
| id|crim| zn|indus|chas|nox| rm|age|dis|rad|tax|ptratio|black|lstat|medv_bin|
+---+----+---+-----+----+---+---+---+---+---+---+-------+-----+-----+--------+
| 13|   0|  0|    1|   0|  2|  1|  1|  2|  1|  1|      3|    3|    1|       4|
+---+----+---+-----+----+---+---+---+---+---+---+-------+-----+-----+--------+

running algo for slice:  dict_items([('chas', '0'), ('crim', '0')])
running algo for slice:  dict_items([('chas', '0'), ('zn', '0')])
running algo for slice:  dict_items([('chas', '0'), ('indus', '1')])
running algo for slice:  dict_items([('chas', '0'), ('nox', '2')])
running algo for slice:  dict_items([('chas', '0'), ('rm', '1')])
running algo for slice:  dict_items([('chas', '0'), ('age', '1')])
running algo for slice:  dict_items([('chas', '0'), ('dis', '2')])
running algo for slice:  dict_items([('chas', '0'), ('rad', '1')])
running algo for slice:  dict_items([('chas', '0'), (



```
# This is formatted as code
```

# Resulting ranked slices

In [33]:
import pprint

sorted_ranked_slices = dict(sorted(ranked_slices.items(), key=lambda item: item[1], reverse=True))
pprint.pprint(sorted_ranked_slices)

{frozenset({('chas', '0')}): 2.831358701689329,
 frozenset({('crim', '0')}): 2.6368228152510924,
 frozenset({('chas', '0'), ('zn', '0')}): 2.624276289690301,
 frozenset({('crim', '0'), ('chas', '0')}): 2.6161707946803947,
 frozenset({('zn', '0')}): 2.5985738698922645,
 frozenset({('crim', '0'), ('zn', '0')}): 2.5835914453047373,
 frozenset({('rm', '1'), ('nox', '2'), ('indus', '1')}): 2.550340709546388,
 frozenset({('lstat', '1'), ('dis', '2'), ('ptratio', '3')}): 2.550340709546388,
 frozenset({('crim', '0'), ('chas', '0'), ('zn', '0')}): 2.5386789824456906,
 frozenset({('rm', '1'), ('black', '3'), ('nox', '2')}): 2.5220552088742}
