# Uncertainty Sampling

Uncertainty Sampling is one [Active Learning](https://en.wikipedia.org/wiki/Active_learning_(machine_learning))
strategy to use the uncertainty in model detection to find examples to be labelled.


In [1]:
# From https://pytorch.org/vision/0.11/models.html#object-detection-instance-segmentation-and-person-keypoint-detection

COCO_INSTANCE_CATEGORY_NAMES = [
    '__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
    'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'N/A', 'stop sign',
    'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
    'elephant', 'bear', 'zebra', 'giraffe', 'N/A', 'backpack', 'umbrella', 'N/A', 'N/A',
    'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
    'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
    'bottle', 'N/A', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
    'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
    'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'N/A', 'dining table',
    'N/A', 'N/A', 'toilet', 'N/A', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
    'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'N/A', 'book',
    'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'
]

In [2]:
import mlflow
from pyspark.sql import SparkSession

MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
spark = (
    SparkSession
    .builder
    .config("spark.jars.packages", "ai.eto:rikai_2.12:0.1.0")
    .config(
        "spark.sql.extensions",
        "ai.eto.rikai.sql.spark.RikaiSparkSessionExtensions",
    )
    .config(
        "spark.rikai.sql.ml.registry.mlflow.tracking_uri",
        MLFLOW_TRACKING_URI,
    )
    .config("spark.executor.memory", "8g")
    .config("spark.driver.memory", "4g")
    .master("local[2]")
    .getOrCreate()
);

22/01/27 14:11:30 WARN Utils: Your hostname, station resolves to a loopback address: 127.0.1.1; using 172.16.0.2 instead (on interface enp6s0)
22/01/27 14:11:30 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Ivy Default Cache set to: /home/lei/.ivy2/cache
The jars for the packages stored in: /home/lei/.ivy2/jars
ai.eto#rikai_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-900039a8-bc61-4d1b-ae62-92a57b9e25e2;1.0
	confs: [default]


:: loading settings :: url = jar:file:/home/lei/miniconda3/envs/coco/lib/python3.8/site-packages/pyspark/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


	found ai.eto#rikai_2.12;0.1.0 in central
	found org.antlr#antlr4-runtime;4.8-1 in central
	found com.thoughtworks.enableIf#enableif_2.12;1.1.7 in central
	found org.xerial.snappy#snappy-java;1.1.8.4 in central
	found com.typesafe.scala-logging#scala-logging_2.12;3.9.4 in central
	found org.slf4j#slf4j-api;1.7.30 in central
	found io.circe#circe-core_2.12;0.12.3 in central
	found io.circe#circe-numbers_2.12;0.12.3 in central
	found org.typelevel#cats-core_2.12;2.0.0 in central
	found org.typelevel#cats-macros_2.12;2.0.0 in central
	found org.typelevel#cats-kernel_2.12;2.0.0 in central
	found io.circe#circe-generic_2.12;0.12.3 in central
	found com.chuusai#shapeless_2.12;2.3.3 in central
	found org.typelevel#macro-compat_2.12;1.1.1 in central
	found io.circe#circe-parser_2.12;0.12.3 in central
	found io.circe#circe-jawn_2.12;0.12.3 in central
	found org.typelevel#jawn-parser_2.12;0.14.2 in central
	found org.mlflow#mlflow-client;1.21.0 in central
	found org.apache.logging.log4j#log4j-co

# Preparing data

Use rikai.contrib.coco.convert to create a Coco Rikai dataset stored under "./coco"

In [3]:
df = spark.read.format("rikai").load("coco")
df.registerTempTable("coco")

In [4]:
spark.sql("SHOW TABLES").show()

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
|        |     coco|       true|
+--------+---------+-----------+



In [5]:
spark.sql("select * from coco").printSchema()

root
 |-- date_captured: string (nullable = true)
 |-- width: long (nullable = true)
 |-- height: long (nullable = true)
 |-- file_name: string (nullable = true)
 |-- image_id: long (nullable = true)
 |-- annotations: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- image_id: long (nullable = true)
 |    |    |-- area: double (nullable = true)
 |    |    |-- label_id: long (nullable = true)
 |    |    |-- ann_id: long (nullable = true)
 |    |    |-- bbox: box2d (nullable = true)
 |    |    |-- segmentation: mask (nullable = true)
 |    |    |-- supercategory: string (nullable = true)
 |    |    |-- label: string (nullable = true)
 |-- image_inline: image (nullable = true)
 |-- image: image (nullable = true)



In [6]:
import rikai
from torchvision.models.detection.ssd import ssd300_vgg16
from rikai.contrib.torch.inspect.ssd import SSDClassScoresExtractor
from rikai.contrib.torch.detections import OUTPUT_SCHEMA

ssd = ssd300_vgg16(pretrained=True)
class_scores_extractor = SSDClassScoresExtractor(ssd, topk_candidates=90)

print(OUTPUT_SCHEMA)

with mlflow.start_run():
    rikai.mlflow.pytorch.log_model(
        ssd, 
        "model", 
        OUTPUT_SCHEMA,
        pre_processing="rikai.contrib.torch.transforms.ssd.pre_processing",
        post_processing="rikai.contrib.torch.transforms.ssd.post_processing",
        registered_model_name="ssd"
    )
with mlflow.start_run():
    rikai.mlflow.pytorch.log_model(
        class_scores_extractor,
        "model_scores",
        SSDClassScoresExtractor.SCHEMA,
        pre_processing="rikai.contrib.torch.inspect.ssd.class_scores_extractor_pre_processing",
        post_processing="rikai.contrib.torch.inspect.ssd.class_scores_extractor_post_processing",
        registered_model_name="class_scores"
    )

array<struct<box:box2d, score:float, label_id:int>>


Registered model 'ssd' already exists. Creating a new version of this model...
2022/01/27 14:11:37 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: ssd, version 5
Created version '5' of model 'ssd'.
Registered model 'class_scores' already exists. Creating a new version of this model...
2022/01/27 14:11:38 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: class_scores, version 5
Created version '5' of model 'class_scores'.


In [7]:
spark.sql("CREATE OR REPLACE MODEL ssd OPTIONS (batch_size=128) USING 'mlflow:/ssd'")
spark.sql("CREATE OR REPLACE MODEL class_scores OPTIONS (batch_size=128) USING 'mlflow:/class_scores'")

DataFrame[]

In [8]:
spark.sql("SHOW MODELS").show()


+------------+------+--------------------+--------------------+
|        name|flavor|                 uri|             options|
+------------+------+--------------------+--------------------+
|         ssd|      |         mlflow:/ssd|{"batch_size":"128"}|
|class_scores|      |mlflow:/class_scores|{"batch_size":"128"}|
+------------+------+--------------------+--------------------+



# Least Confidence

**Least Confidence** looks for predicted labels with the lowest degree of confidence

In [9]:
df = spark.sql("""
SELECT image_id, image, explode(ML_PREDICT(ssd, image)) AS ssd FROM (
    SELECT image_id, image FROM coco LIMIT 2000
) ORDER BY ssd.score ASC
""").cache()

Query took 0.063s


  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]
  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]
                                                                                

Unnamed: 0,image_id,image,ssd
0,236762,Image(uri=/mnt/data/datasets/coco/train2017/00...,"((386.9280700683594, 306.27435302734375, 489.9..."
1,437900,Image(uri=/mnt/data/datasets/coco/train2017/00...,"((228.3543701171875, 77.6168441772461, 284.191..."
2,286690,Image(uri=/mnt/data/datasets/coco/train2017/00...,"((322.5680847167969, 61.59935760498047, 418.09..."
3,579491,Image(uri=/mnt/data/datasets/coco/train2017/00...,"((67.3131332397461, 208.40911865234375, 202.24..."
4,277788,Image(uri=/mnt/data/datasets/coco/train2017/00...,"((5.052701950073242, 300.9232482910156, 196.42..."
5,445960,Image(uri=/mnt/data/datasets/coco/train2017/00...,"((270.5899353027344, 52.707969665527344, 480.2..."
6,76101,Image(uri=/mnt/data/datasets/coco/train2017/00...,"((270.4062194824219, 346.3324890136719, 294.95..."
7,262059,Image(uri=/mnt/data/datasets/coco/train2017/00...,"((209.10028076171875, 108.93500518798828, 365...."
8,418812,Image(uri=/mnt/data/datasets/coco/train2017/00...,"((301.00592041015625, 85.65565490722656, 363.2..."
9,37015,Image(uri=/mnt/data/datasets/coco/train2017/00...,"((555.59765625, 246.9672393798828, 597.7020874..."


In [None]:
from rikai.viz import Text

for row in df.take(3):
    text = COCO_INSTANCE_CATEGORY_NAMES[row.ssd.label_id]
    display(row.image 
        | row.ssd.box@{"color": "yellow", "width": 3} 
        | Text(f"{text} | {row.ssd.score:.3f}", (row.ssd.box.xmin, row.ssd.box.ymax + 3))@{"color": "yellow"}
    )


# Least Margin

In [13]:
%%sql

SELECT image_id, image, detection, detection.scores[0] - detection.scores[1] as margin FROM (
    SELECT image_id, image, explode(ML_PREDICT(class_scores, image)) AS detection FROM (
        SELECT image_id, image FROM coco LIMIT 100
    )
) ORDER BY margin

Query took 0.036s


                                                                                

Unnamed: 0,image_id,image,detection,margin
0,22223,Image(uri=/mnt/data/datasets/coco/train2017/00...,"((291.4421691894531, 55.997108459472656, 333.8...",4e-06
1,281809,Image(uri=/mnt/data/datasets/coco/train2017/00...,"((473.949462890625, 158.41424560546875, 496.81...",1.4e-05
2,210175,Image(uri=/mnt/data/datasets/coco/train2017/00...,"((512.45556640625, 355.595703125, 588.63012695...",1.7e-05
3,526962,Image(uri=/mnt/data/datasets/coco/train2017/00...,"((81.70887756347656, 0.0, 182.21600341796875, ...",5.5e-05
4,185513,Image(uri=/mnt/data/datasets/coco/train2017/00...,"((590.937255859375, 314.70623779296875, 640.00...",6.7e-05
5,185513,Image(uri=/mnt/data/datasets/coco/train2017/00...,"((590.937255859375, 314.70623779296875, 640.00...",6.7e-05
6,92188,Image(uri=/mnt/data/datasets/coco/train2017/00...,"((361.7630920410156, 149.69552612304688, 491.1...",9.4e-05
7,526767,Image(uri=/mnt/data/datasets/coco/train2017/00...,"((117.9936752319336, 494.9398498535156, 251.27...",0.000129
8,91784,Image(uri=/mnt/data/datasets/coco/train2017/00...,"((322.6991271972656, 73.09281158447266, 343.25...",0.000156
9,209326,Image(uri=/mnt/data/datasets/coco/train2017/00...,"((533.8094482421875, 4.429168224334717, 635.95...",0.000188


In [14]:
df.printSchema()
df.cache()

root
 |-- image_id: long (nullable = true)
 |-- image: image (nullable = true)
 |-- detection: struct (nullable = true)
 |    |-- box: box2d (nullable = true)
 |    |-- scores: array (nullable = true)
 |    |    |-- element: float (containsNull = true)
 |    |-- label_ids: array (nullable = true)
 |    |    |-- element: integer (containsNull = true)
 |-- margin: float (nullable = true)



DataFrame[image_id: bigint, image: ImageType, detection: struct<box:box2d,scores:array<float>,label_ids:array<int>>, margin: float]

In [15]:
first = df.first()
label1 = COCO_INSTANCE_CATEGORY_NAMES[first.detection.label_ids[0]]
label2 = COCO_INSTANCE_CATEGORY_NAMES[first.detection.label_ids[1]]
text = f"{label1} or {label2}"
box = first.detection.box
(
    first.image 
    | box@{"color": "yellow", "width": 3} 
    | Text(text, (box.xmin, box.ymax))@{"color": "yellow"}
)

                                                                                

TypeError: text() got multiple values for keyword argument 'color'

Draw(style({'color': 'yellow', 'width': 3}))

# Entropy

In [16]:
from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType
from scipy.stats import entropy as scipyEntropy

@udf(returnType=FloatType())
def entropy(arr) -> float:
    return float(scipyEntropy(arr))

spark.udf.register("entropy", entropy)

<function __main__.entropy(arr) -> float>

In [17]:
%%sql
SELECT image_id, image, detection, entropy(detection.scores) as entropy FROM (
    SELECT image_id, image, explode(ML_PREDICT(class_scores, image)) AS detection FROM (
        SELECT image_id, image FROM coco LIMIT 1000
    )
) ORDER BY entropy DESC

Query took 0.017s


                                                                                

Unnamed: 0,image_id,image,detection,entropy
0,22223,Image(uri=/mnt/data/datasets/coco/train2017/00...,"((291.4421691894531, 55.997108459472656, 333.8...",0.693147
1,210175,Image(uri=/mnt/data/datasets/coco/train2017/00...,"((512.45556640625, 355.595703125, 588.63012695...",0.693147
2,310156,Image(uri=/mnt/data/datasets/coco/train2017/00...,"((207.48019409179688, 19.748699188232422, 221....",0.693147
3,260311,Image(uri=/mnt/data/datasets/coco/train2017/00...,"((396.1365661621094, 101.37675476074219, 454.1...",0.693147
4,260311,Image(uri=/mnt/data/datasets/coco/train2017/00...,"((396.1365661621094, 101.37675476074219, 454.1...",0.693147
5,310156,Image(uri=/mnt/data/datasets/coco/train2017/00...,"((207.48019409179688, 19.748699188232422, 221....",0.693147
6,369878,Image(uri=/mnt/data/datasets/coco/train2017/00...,"((246.79067993164062, 356.8446960449219, 262.5...",0.693147
7,281809,Image(uri=/mnt/data/datasets/coco/train2017/00...,"((473.949462890625, 158.41424560546875, 496.81...",0.693147
8,21632,Image(uri=/mnt/data/datasets/coco/train2017/00...,"((311.2681579589844, 66.98974609375, 339.00177...",0.693147
9,55363,Image(uri=/mnt/data/datasets/coco/train2017/00...,"((293.353515625, 248.77833557128906, 342.00506...",0.693147


In [18]:
df.cache()

DataFrame[image_id: bigint, image: ImageType, detection: struct<box:box2d,scores:array<float>,label_ids:array<int>>, entropy: float]

In [None]:
first = df.first()
text = COCO_INSTANCE_CATEGORY_NAMES[first.detection.label_ids[0]]
box = first.detection.box
print(box)
(
    first.image 
    | box@{"color": "yellow", "width": 3} 
    | Text(text, (box.xmin, box.ymax))@{"color": "yellow"}
)

[Stage 27:>                                                         (0 + 1) / 1]