In [1]:
import pyspark
# Build (or attach to) the Spark session with the Delta Lake package requested
# through spark.jars.packages.
spark = (
    pyspark.sql.SparkSession.builder
    .appName("MyApp")
    .config("spark.jars.packages", "io.delta:delta-core_2.11:0.6.0")
    .getOrCreate()
)
sc = spark.sparkContext

# The jar is added to the Python path *before* importing `delta.tables`;
# presumably the `delta` Python package is shipped inside this jar, so the
# import below cannot be hoisted above this call -- TODO confirm on the cluster.
sc.addPyFile("/usr/lib/spark/jars/delta-core_2.11-0.6.0.jar")
from delta.tables import *

# Enable Arrow support.
spark.conf.set("spark.sql.execution.arrow.enabled", "true")
spark.conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", "128")

VBox()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
6,application_1590542943984_0007,pyspark,idle,,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [2]:
import os
import shutil
import tarfile
import time
import zipfile

try:
    from urllib.request import urlretrieve
except ImportError:
    from urllib import urlretrieve

import pandas as pd

import torch
from torch.utils.data import Dataset
from torchvision import datasets, models, transforms
from torchvision.datasets.folder import default_loader  # private API

from pyspark.sql.functions import col, pandas_udf, PandasUDFType
from pyspark.sql.types import ArrayType, FloatType

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
from determined.experimental import Determined

# Ask the Determined master for the top checkpoint of experiment 18.
determined_client = Determined(master="http://44.232.66.32:8080/")
experiment = determined_client.get_experiment(18)
checkpoint = experiment.top_checkpoint()

# Load the checkpoint onto the CPU of the driver.
cpu_device = torch.device('cpu')
model = checkpoint.load(path="/home/.config/ckpt", map_location=cpu_device)

# Broadcast the loaded model so that code running on executors can read it
# through `model_broadcast.value`.
model_broadcast = sc.broadcast(model)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…



In [4]:
import boto3
import os

def load_voc_image_names(bucket_name, prefix):
    """List the keys of all `.jpg` objects under `<prefix>/JPEGImages` in a bucket.

    Args:
        bucket_name: name of the S3 bucket to list.
        prefix: key prefix under which the `JPEGImages` folder lives.

    Returns:
        A list of S3 object keys (str) ending in `.jpg`.
    """
    # S3 keys always use "/" as separator, so join with posixpath instead of
    # the platform-dependent os.path (identical result on POSIX systems).
    import posixpath

    s3 = boto3.resource('s3')
    bucket = s3.Bucket(bucket_name)
    image_prefix = posixpath.join(prefix, "JPEGImages")
    return [
        obj.key
        for obj in bucket.objects.filter(Prefix=image_prefix)
        if obj.key.endswith('.jpg')
    ]
    

# Keys of every JPEG image stored under the 'v1' prefix of the bucket.
keys = load_voc_image_names(
    bucket_name='david-voc-data',
    prefix='v1',
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [20]:
import io

def load_val_list(bucket_name, prefix):
    """Read `<prefix>/ImageSets/Main/val.txt` from S3 and return its lines.

    Args:
        bucket_name: name of the S3 bucket holding the dataset.
        prefix: key prefix under which the `ImageSets` folder lives.

    Returns:
        A list with one string per non-blank line of the file, with
        surrounding whitespace (including any `\\r`) removed.
    """
    # S3 keys always use "/" as separator, so join with posixpath instead of
    # the platform-dependent os.path (identical result on POSIX systems).
    import posixpath

    s3 = boto3.resource('s3')
    bucket = s3.Bucket(bucket_name)
    path = posixpath.join(prefix, "ImageSets", "Main", "val.txt")
    data = bucket.Object(path).get()['Body'].read()
    # splitlines() + strip() avoids the empty trailing entry that
    # split('\n') produces when the file ends with a newline.
    return [line.strip() for line in data.decode('utf8').splitlines() if line.strip()]

val_list = load_val_list('david-voc-data', 'v1')

# Keep the first 400 image keys whose file stem appears in the validation list.
# A set gives constant-time membership tests instead of scanning the list per key.
val_ids = set(val_list)
files = [k for k in keys if os.path.basename(k).split('.')[0] in val_ids][:400]

# `files` has to exist before ids are derived from its length; computing `id`
# first raised NameError on a fresh kernel (or silently used a stale `files`).
# The name `id` shadows the builtin but is kept because later cells use it.
id = range(len(files))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [44]:
from pyspark.sql import Row
# Pair every selected image key with its integer id and load the pairs as a
# two-column DataFrame ('id', 'path').
rdd1 = sc.parallelize(zip(id, files))
row_rdd = rdd1.map(lambda x: Row(x[0], x[1]))
# Use the `spark` session created in the first cell rather than the implicit
# `sqlContext` global, which this notebook never defines itself.
rows_df = spark.createDataFrame(row_rdd, ['id', 'path']).repartition(8)
rows_df.show(5)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---+--------------------+
| id|                path|
+---+--------------------+
| 19|v1/JPEGImages/200...|
|143|v1/JPEGImages/200...|
|134|v1/JPEGImages/200...|
| 96|v1/JPEGImages/200...|
|102|v1/JPEGImages/200...|
+---+--------------------+
only showing top 5 rows

In [45]:
import io
from PIL import Image
from torchvision.transforms import Compose, ToTensor

class VOCDataset(Dataset):
  """Dataset that fetches images from an S3 bucket and returns them as tensors.

  Args:
    paths: indexable collection of S3 object keys, one per image.
    bucket: name of the S3 bucket that holds the keys.
  """

  def __init__(self, paths, bucket):
    self.paths = paths
    self.bucket = bucket
    self.transform = Compose([ToTensor()])
    # The S3 client is created on first use (see _client) rather than here.
    self._s3 = None

  def __getstate__(self):
    # Drop the cached client when the dataset is pickled so that every copy
    # builds its own client on first use.
    state = self.__dict__.copy()
    state['_s3'] = None
    return state

  def __len__(self):
    return len(self.paths)

  def _client(self):
    """Return this instance's S3 client, creating it on first use."""
    if getattr(self, '_s3', None) is None:
      self._s3 = boto3.client('s3')
    return self._s3

  def __getitem__(self, index):
    """Download the image at `index`, convert it to RGB and apply the transform."""
    path = self.paths[index]
    # Reuse one client per dataset instance instead of building one per image.
    response = self._client().get_object(Bucket=self.bucket, Key=path)
    body = response["Body"]
    try:
      contents = body.read()
    finally:
      # Close the response stream once the payload has been read.
      body.close()
    image = Image.open(io.BytesIO(contents)).convert('RGB')
    if self.transform is not None:
      image = self.transform(image)
    return image

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [46]:
import torch
from pyspark.ml.linalg import Vectors
from pyspark.ml.linalg import VectorUDT
from pyspark.sql.types import StructType, StructField, FloatType, ArrayType, IntegerType, StringType

def collate_fn(batch):
    """Return the samples of `batch` as a tuple, without merging them."""
    return (*batch,)

# Struct describing one prediction: nested float boxes, integer labels,
# float scores and the path string.
schema = (
    StructType()
    .add("boxes", ArrayType(ArrayType(FloatType())))
    .add("labels", ArrayType(IntegerType()))
    .add("scores", ArrayType(FloatType()))
    .add("path", StringType())
)

def predict_batch(paths):
    """Run the broadcast model on one image, or on a batch of images, from S3.

    Args:
        paths: either a single S3 key (str), which is what a row-at-a-time
            UDF receives for a string column, or a sequence of S3 keys.

    Returns:
        For a single key: a dict with the "boxes", "labels" and "scores" of
        that one image plus its "path", i.e. one value per field of the
        struct schema declared next to this function.
        For a sequence: a dict with "boxes", "labels" and "scores", each a
        list holding one entry per image in the order the loader yields them.
    """
    # A bare string would otherwise be indexed character by character by the
    # dataset, so wrap it into a one-element list.
    single = isinstance(paths, str)
    keys = [paths] if single else paths

    dataset = VOCDataset(keys, 'david-voc-data')
    loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=2,
        # Loading one image does not need worker processes.
        num_workers=0 if single else 8,
        collate_fn=collate_fn,
    )
    model = model_broadcast.value

    boxes = []
    labels = []
    scores = []
    with torch.no_grad():
        for images in loader:
            for prediction in model(list(images)):
                boxes.append(prediction['boxes'].cpu().numpy().tolist())
                labels.append(prediction['labels'].cpu().numpy().tolist())
                scores.append(prediction['scores'].cpu().numpy().tolist())

    if single:
        # Unwrap the one-image batch and report the path alongside it.
        return {"boxes": boxes[0], "labels": labels[0], "scores": scores[0], "path": paths}
    return {"boxes": boxes, "labels": labels, "scores": scores}

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [47]:
import pyspark.sql.functions as f
from pyspark.sql.types import StructType, StructField, FloatType, ArrayType, IntegerType, StringType


# Wrap predict_batch as a Spark UDF whose return type is the `schema` struct
# defined in the cell that defines predict_batch. That definition is reused
# here instead of repeating it, so the two copies cannot drift apart.
predict_udf = f.udf(
    predict_batch,
    schema
)


VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [48]:
# Keep the image path and attach the UDF result as a "prediction" column.
predictions_df = rows_df.select(
    col('path'),
    predict_udf(col('path')).alias("prediction"),
)


VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [49]:
# Print the first five prediction rows.
predictions_df.show(n=5)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

KeyboardInterrupt: 

In [None]:
# Save the predictions in Delta format, overwriting whatever is already
# stored at the target location.
delta_writer = predictions_df.write.format("delta")
delta_writer = delta_writer.mode("overwrite").option("compression", "gzip")
delta_writer.save("s3://david-voc-predictions/predictions")

In [42]:
from pyspark.sql.window import Window as w
nparts = 5000

# Number every row over a window with constant partition and ordering
# expressions, then fold the row number into `nparts` groups.
_row_number = f.row_number().over(w.partitionBy(f.lit(1)).orderBy(f.lit(1)))
_grouper = (_row_number % nparts).alias('grouper')

# Each row becomes a one-entry map {id: path} called 'features'.
_features = f.create_map(f.col('id'), f.col('path')).alias('features')

# Collect the maps of every group into a list, one output row per group.
_grouped = (
    rows_df
    .select(_features, _grouper)
    .groupby(f.col('grouper'))
    .agg(f.collect_list(f.col('features')).alias('features'))
)

# Apply the prediction UDF to each group's list of maps.
outcome_sdf = _grouped.select(predict_udf(f.col('features')).alias('results'))
preds = outcome_sdf

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [43]:
# Print the first five grouped prediction rows.
preds.show(n=5)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

An error was encountered:
An error occurred while calling o606.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 19.0 failed 4 times, most recent failure: Lost task 0.3 in stage 19.0 (TID 29, ip-172-31-25-116.us-west-2.compute.internal, executor 7): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/mnt/yarn/usercache/livy/appcache/application_1590542943984_0007/container_1590542943984_0007_01_000116/pyspark.zip/pyspark/worker.py", line 377, in main
    process()
  File "/mnt/yarn/usercache/livy/appcache/application_1590542943984_0007/container_1590542943984_0007_01_000116/pyspark.zip/pyspark/worker.py", line 372, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/mnt/yarn/usercache/livy/appcache/application_1590542943984_0007/container_1590542943984_0007_01_000116/pyspark.zip/pyspark/serializers.py", line 352, in dump_stream
    self.serializer.dump_stream(self._batch

In [24]:
# Spread the rows over 24 partitions, run predict_udf once per "split" group
# and print up to 50 result rows.
# NOTE(review): this rebinds `rows_df`, so re-running the cell starts from the
# already repartitioned frame.
rows_df = rows_df.repartition(24)
# NOTE(review): no "split" column is added to rows_df in the cells visible in
# this notebook, and the visible predict_udf is built with f.udf; groupby().apply
# presumably needs a grouped-map pandas UDF -- confirm which definitions were
# live in the kernel when this cell ran.
predictions_df = rows_df.groupby("split").apply(predict_udf)
# predictions_df = predictions_df.repartition(predictions_df.rdd.getNumPartitions())
predictions_df.show(50)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

An error was encountered:
An error occurred while calling o548.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 2 in stage 52.0 failed 4 times, most recent failure: Lost task 2.3 in stage 52.0 (TID 182, ip-172-31-19-189.us-west-2.compute.internal, executor 29): ExecutorLostFailure (executor 29 exited caused by one of the running tasks) Reason: Container killed by YARN for exceeding memory limits.  13.0 GB of 11 GB physical memory used. Consider boosting spark.yarn.executor.memoryOverhead or disabling yarn.nodemanager.vmem-check-enabled because of YARN-4714.
Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:2043)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:2031)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:2030)
	at scala.collection.mutable.ResizableArray$clas

In [17]:
# Read the executor memory setting through the public SparkContext.getConf()
# accessor instead of the private `_conf` attribute.
sc.getConf().get('spark.executor.memory')

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

'9486M'