In [2]:
import pyspark

# Build (or reuse) the Spark session, requesting the Delta Lake core package
# through `spark.jars.packages`.
spark = (
    pyspark.sql.SparkSession.builder
    .appName("MyApp")
    .config("spark.jars.packages", "io.delta:delta-core_2.11:0.6.0")
    .getOrCreate()
)
sc = spark.sparkContext

# Register the Delta jar as a Python dependency before importing from it;
# presumably this is what makes `delta.tables` importable -- keep this order.
sc.addPyFile("/usr/lib/spark/jars/delta-core_2.11-0.6.0.jar")
from delta.tables import *


VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [14]:
import boto3
import os

def load_voc_image_names(bucket_name, prefix):
    """List the keys of all ``.jpg`` objects under ``<prefix>/JPEGImages``.

    Args:
        bucket_name: Name of the S3 bucket to list.
        prefix: Key prefix of the dataset version (e.g. ``'v1'``).

    Returns:
        list of str: Keys ending in ``.jpg``, in the order S3 lists them.
    """
    # S3 keys always use '/', so build the prefix with posixpath rather than
    # os.path, whose separator depends on the host platform.
    import posixpath

    bucket = boto3.resource('s3').Bucket(bucket_name)
    listing = bucket.objects.filter(Prefix=posixpath.join(prefix, "JPEGImages"))
    return [obj.key for obj in listing if obj.key.endswith('.jpg')]
    

# Keys of every JPEG under the 'v1' prefix of the source bucket.
keys = load_voc_image_names(bucket_name='david-voc-data', prefix='v1')


VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [15]:
import io
import random
# Seed the global RNG so the validation shuffle later in this cell is
# repeatable for a given input list.
random.seed(42)

def load_image_list(bucket_name, prefix, list_name):
    """Read an image-set list file from S3 and return its entries.

    The object read is ``<prefix>/ImageSets/Main/<list_name>.txt``; its body
    is decoded as UTF-8 and split into lines.

    Args:
        bucket_name: Name of the S3 bucket holding the dataset.
        prefix: Key prefix of the dataset version (e.g. ``'v1'``).
        list_name: Name of the list file without its ``.txt`` extension
            (e.g. ``'train'`` or ``'val'``).

    Returns:
        list of str: One entry per non-blank line, with surrounding
        whitespace removed.
    """
    # S3 keys always use '/', so join with posixpath rather than os.path.
    import posixpath

    key = posixpath.join(prefix, "ImageSets", "Main", f"{list_name}.txt")
    body = boto3.resource('s3').Bucket(bucket_name).Object(key).get()['Body']
    text = body.read().decode('utf8')

    # splitlines() copes with '\r\n' endings, and skipping blank lines keeps a
    # trailing newline from producing a bogus '' entry that could later be
    # sampled as if it were an image id.
    return [line.strip() for line in text.splitlines() if line.strip()]

# Image ids of the training split (used in full).
train_list = load_image_list(bucket_name='david-voc-data', prefix='v1', list_name='train')

# Image ids of the validation split: shuffle in place, then keep the first
# 1000 entries as the validation sample.
val_list = load_image_list(bucket_name='david-voc-data', prefix='v1', list_name='val')
random.shuffle(val_list)
val_list = val_list[:1000]

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [16]:
# Build sets once so each membership test is O(1) rather than a scan of the
# whole id list for every key.
_train_ids = set(train_list)
_val_ids = set(val_list)

# A key belongs to a split when its file name (up to the first '.') is one of
# that split's image ids.
train_files = [k for k in keys if os.path.basename(k).split('.')[0] in _train_ids]
val_files = [k for k in keys if os.path.basename(k).split('.')[0] in _val_ids]
print(f"{len(train_files)} files in training dataset")
print(f"{len(val_files)} files in val dataset")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

5148 files in training dataset
1000 files in val dataset

In [6]:
import os
import io
from PIL import Image
import numpy as np


def readFileFromS3(row, bucket='david-voc-data'):
    """Fetch one image and its annotation file from S3.

    The annotation key is derived from the image key by replacing
    ``JPEGImages`` with ``Annotations`` and ``.jpg`` with ``.xml``.

    Args:
        row: Object exposing an ``image_key`` attribute with the S3 key of
            the image to download.
        bucket: Name of the S3 bucket to read from.

    Returns:
        tuple or None: ``(image_bytes, annotations)`` where ``image_bytes`` is
        a ``bytearray`` with the raw object body and ``annotations`` is the
        annotation object decoded as UTF-8 text. ``None`` when the image
        object is empty, in which case the annotation is not fetched.
    """
    # Local import so the function is self-contained and does not depend on a
    # notebook-level `import boto3` having been run first.
    import boto3

    s3 = boto3.client('s3')

    def fetch(object_key):
        """Download one object, always closing the response stream."""
        body = s3.get_object(Bucket=bucket, Key=object_key)["Body"]
        try:
            return body.read()
        finally:
            body.close()

    key = row.image_key
    contents = bytearray(fetch(key))
    if not contents:
        # Nothing usable for this key; callers must cope with None.
        return None

    annotation_key = key.replace("JPEGImages", "Annotations").replace('.jpg', '.xml')
    annotations = fetch(annotation_key).decode('utf8')
    return (contents, annotations)


VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [7]:
from pyspark.sql.types import StructType, StructField, IntegerType, BinaryType, StringType
from pyspark.sql import Row

def write_to_delta(files, bucket, table, append=False):
    """Load images and annotations for ``files`` and write them as a Delta table.

    Each key is distributed over the cluster, fetched with ``readFileFromS3``
    and written to ``s3://<bucket>/<table>`` in Delta format with two columns:
    ``image`` (binary) and ``annotations`` (string).

    Args:
        files: List of S3 keys of the images to load.
        bucket: Name of the destination bucket.
        table: Path of the table inside ``bucket``.
        append: When true, rows are appended to the existing table; otherwise
            the table is overwritten.
    """
    # An explicit schema (rather than inference from the data) keeps an empty
    # `files` list from failing before anything is read.
    key_schema = StructType([StructField("image_key", StringType(), False)])
    keys_df = spark.createDataFrame(
        sc.parallelize(files).map(lambda key: (key,)),
        key_schema,
    )

    # readFileFromS3 returns None for empty images; drop those so they never
    # reach the non-nullable columns below.
    images_rdd = (
        keys_df
        .rdd
        .map(readFileFromS3)
        .filter(lambda record: record is not None)
    )

    image_schema = StructType([
        StructField("image", BinaryType(), False),
        StructField("annotations", StringType(), False),
    ])
    image_df = images_rdd.toDF(image_schema)

    mode = "append" if append else "overwrite"

    (
        image_df
        .write
        .format("delta")
        .mode(mode)
        .option("compression", "gzip")
        .save(f"s3://{bucket}/{table}")
    )


VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [26]:
# Write the validation sample to its own table (overwrite is the default mode).
write_to_delta(files=val_files, bucket='david-voc-delta', table='val')

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [None]:
# Write the training images to the 'train' table (overwrite is the default mode).
write_to_delta(files=train_files, bucket='david-voc-delta', table='train')

# Updating our Training Data

From our modeling experiments, we could see that the model's performance at detecting dogs was poor. Here we have collected an additional ~500 images, which we can append to the Delta table with a slight code update.

In [17]:
updated_keys = load_voc_image_names('david-voc-data', 'v2')
train_list2 = load_image_list('david-voc-data', 'v2', 'train')

# Build the id set once so each membership test is O(1) rather than a scan of
# the whole list for every key.
_train_ids2 = set(train_list2)
train_files2 = [k for k in updated_keys if os.path.basename(k).split('.')[0] in _train_ids2]
print(f"{len(train_files2)} files in the new training dataset")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

569 files in the new training dataset

In [None]:
# Append the newly collected images to the existing 'train' table.
write_to_delta(files=train_files2, bucket='david-voc-delta', table='train', append=True)