In [1]:
import os
print(os.getcwd())
path_parent = os.path.dirname(os.getcwd())
os.chdir(path_parent)
print(os.getcwd())

C:\Users\danie\Documents\COGS402\cogs402longformer\Huggingfacetutorial
C:\Users\danie\Documents\COGS402\cogs402longformer


In [2]:
from transformers import BertTokenizer, BertForSequenceClassification, BertConfig

from captum.attr import visualization as viz
from captum.attr import IntegratedGradients, LayerConductance, LayerIntegratedGradients
from captum.attr import configure_interpretable_embedding_layer, remove_interpretable_embedding_layer

import torch

In [3]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [4]:
from transformers import LongformerForSequenceClassification, LongformerTokenizer, LongformerConfig
# replace <PATH-TO-SAVED-MODEL> with the real path of the saved model
model_path = 'models/longformer-finetuned_papers/checkpoint-2356'

# load model
model = LongformerForSequenceClassification.from_pretrained(model_path, num_labels = 2)
model.to(device)
model.eval()
model.zero_grad()

# load tokenizer
tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")

In [5]:
def predict(inputs, token_type_ids=None, position_ids=None):
    output = model(inputs, token_type_ids=token_type_ids,
                 position_ids=position_ids)
    return output.logits

In [6]:
ref_token_id = tokenizer.pad_token_id # A token used for generating token reference
sep_token_id = tokenizer.sep_token_id # A token used as a separator between question and text and it is also added to the end of the text.
cls_token_id = tokenizer.cls_token_id # A token used for prepending to the concatenated question-text word sequence

In [7]:
def construct_input_ref_pair(text, ref_token_id, sep_token_id, cls_token_id):

    text_ids = tokenizer.encode(text, truncation = True, add_special_tokens=False, max_length = 256)
    # construct input token ids
    input_ids = [cls_token_id] + text_ids + [sep_token_id]
    # construct reference token ids 
    ref_input_ids = [cls_token_id] + [ref_token_id] * len(text_ids) + [sep_token_id]

    return torch.tensor([input_ids], device=device), torch.tensor([ref_input_ids], device=device), len(text_ids)

def construct_input_ref_token_type_pair(input_ids, sep_ind=0):
    seq_len = input_ids.size(1)
    token_type_ids = torch.tensor([[0 if i <= sep_ind else 1 for i in range(seq_len)]], device=device)
    ref_token_type_ids = torch.zeros_like(token_type_ids, device=device)# * -1
    return token_type_ids, ref_token_type_ids

def construct_input_ref_pos_id_pair(input_ids):
    seq_length = input_ids.size(1)
    position_ids = torch.arange(seq_length, dtype=torch.long, device=device)
    # we could potentially also use random permutation with `torch.randperm(seq_length, device=device)`
    ref_position_ids = torch.zeros(seq_length, dtype=torch.long, device=device)

    position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
    ref_position_ids = ref_position_ids.unsqueeze(0).expand_as(input_ids)
    return position_ids, ref_position_ids
    
def construct_attention_mask(input_ids):
    return torch.ones_like(input_ids)

In [8]:
def custom_forward(inputs):
    preds = predict(inputs)
    print(torch.softmax(preds, dim = 1)[0][0])
    return torch.softmax(preds, dim = 1)[0][0].unsqueeze(0)

In [9]:
lig = LayerIntegratedGradients(custom_forward, model.longformer.embeddings)

In [10]:
from datasets import load_dataset
cogs402_ds = load_dataset("danielhou13/cogs402dataset")["test"]

Using custom data configuration danielhou13--cogs402dataset-5c7aa10e6c95142f
Reusing dataset parquet (C:\Users\danie\.cache\huggingface\datasets\parquet\danielhou13--cogs402dataset-5c7aa10e6c95142f\0.0.0\0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901)


  0%|          | 0/2 [00:00<?, ?it/s]

In [11]:
testval = 125
text = cogs402_ds['text'][testval]
label = cogs402_ds['labels'][testval]
print(label)

1


In [12]:
input_ids, ref_input_ids, sep_id = construct_input_ref_pair(text, ref_token_id, sep_token_id, cls_token_id)
token_type_ids, ref_token_type_ids = construct_input_ref_token_type_pair(input_ids, sep_id)
position_ids, ref_position_ids = construct_input_ref_pos_id_pair(input_ids)
attention_mask = construct_attention_mask(input_ids)

indices = input_ids[0].detach().tolist()
all_tokens = tokenizer.convert_ids_to_tokens(indices)

In [13]:
# model(input_ids)

In [14]:
# predict(input_ids)

In [15]:
# custom_forward(input_ids)

In [16]:
attributions, delta = lig.attribute(inputs=input_ids,
                                    baselines=ref_input_ids,
                                    return_convergence_delta=True,
                                    internal_batch_size = 2)

tensor(0.0005, device='cuda:0')
tensor(0.9026, device='cuda:0')
tensor(0.9030, device='cuda:0', grad_fn=<SelectBackward0>)
tensor(0.9057, device='cuda:0', grad_fn=<SelectBackward0>)
tensor(0.9108, device='cuda:0', grad_fn=<SelectBackward0>)
tensor(0.9172, device='cuda:0', grad_fn=<SelectBackward0>)
tensor(0.9145, device='cuda:0', grad_fn=<SelectBackward0>)
tensor(0.9062, device='cuda:0', grad_fn=<SelectBackward0>)
tensor(0.9069, device='cuda:0', grad_fn=<SelectBackward0>)
tensor(0.9111, device='cuda:0', grad_fn=<SelectBackward0>)
tensor(0.9213, device='cuda:0', grad_fn=<SelectBackward0>)
tensor(0.9489, device='cuda:0', grad_fn=<SelectBackward0>)
tensor(0.9607, device='cuda:0', grad_fn=<SelectBackward0>)
tensor(0.9601, device='cuda:0', grad_fn=<SelectBackward0>)
tensor(0.9562, device='cuda:0', grad_fn=<SelectBackward0>)
tensor(0.9524, device='cuda:0', grad_fn=<SelectBackward0>)
tensor(0.9587, device='cuda:0', grad_fn=<SelectBackward0>)
tensor(0.9813, device='cuda:0', grad_fn=<SelectBack

In [17]:
def summarize_attributions(attributions):
    attributions = attributions.sum(dim=-1).squeeze(0)
    attributions = attributions / torch.norm(attributions)
    return attributions

In [18]:
attributions_sum = summarize_attributions(attributions)

In [19]:
score = predict(input_ids)

In [20]:
# storing couple samples in an array for visualization purposes
score_vis = viz.VisualizationDataRecord(
                        attributions_sum,
                        torch.softmax(score, dim = 1)[0][0],
                        torch.argmax(torch.softmax(score, dim = 1)[0]),
                        label,
                        text,
                        attributions_sum.sum(),       
                        all_tokens,
                        delta)

print('\033[1m', 'Visualization For Score', '\033[0m')
viz.visualize_text([score_vis])

[1m Visualization For Score [0m


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
1.0,1 (0.00),"Learn-Memorize-Recall-Reduce: A Robotic Cloud Computing Paradigm Shaoshan Liu, Bolin Ding, Jie Tang, Dawei Sun, Zhe Zhang, Grace Tsai, and Jean-Luc Gaudiot ABSTRACT The rise of robotic applications has led to the generation of a huge volume of unstructured data, whereas the current cloud infrastructure was designed to process limited amounts of structured data. To address this problem, we propose a learn-memorize-recall-reduce paradigm for robotic cloud computing. The learning stage converts incoming unstructured data into structured data; the memorization stage provides effective storage for the massive amount of data; the recall stage provides efficient means to retrieve the raw data; while the reduction stage provides means to make sense of this massive amount of unstructured data with limited computing resources. Keywords Cloud Architecture; Distributed Computing; Storage; Robotics 1. INTRODUCTION Robots are mobile devices, and the rise of robotic applications has imposed tremendous pressure on our existing cloud infrastructure. For instance, every day, a mobile phone sends out at least 30 MB of structured data to the clouds of the service provider, the application operators etc. (structured data means data that can be easily understood, stored, and retrieved by machines). In contrast, even a very simple robot can easily generate over 1 GB of unstructured multimedia data per day. An extreme form of robot, driverless cars, can generate as much as 2 GB of unstructured data per second [1] (unstructured data means data that cannot be easily understood, stored, and retrieved by machines). Therefore, we are facing the urgent challenge of designing and implementing a cloud architecture to process this massive robotic unstructured data. For instance, in a real-world scenario, in-home service robots may act in a surveillance role, patrolling the users’ house and recording captured videos. Then, users demand the capability of video playback based on intelligent queries using time, location, as well as objects in scene as inputs. In sections 2, 3, and 4, we first present our implementation of a robotic cloud infrastructure and discuss how this infrastructure can meet this challenge. Specifically, we generalize the requirements into a robotic cloud computing paradigm - learn-memorize-recall, where learning is about how to automatically understand the unstructured data and convert it into structured data; memorization is about how to effectively store the massive amount of data, while recall is about how to efficiently retrieve the data when needed. Nonetheless, as the volumes of data collected from robotic devices exponentially grow over time, having a great robotic cloud infrastructure is not sufficient. Then in section 5, we delve into the final part of the proposed paradigm, reduction, the utilization of data reduction techniques to making sense of massive amounts of unstructured data with a limited budget of computing resources. 2. ROBOTIC CLOUD ARCHITECTURE To provide the learn-memorize-recall features, we have designed and implemented a cloud architecture, which is illustrated in Figure 1 below. The architecture consists of the following components: • Robotic Client Devices: these devices capture multimedia feeds and send the feeds to the cloud along with their meta data. • Streaming Server: this server handles incoming multimedia and streams on-demand live multimedia feeds to users as requested. • Object Recognition: this is a deep-learning evaluation engine for automatic extraction of semantic information from incoming videos. • Key-Value Store: this key-value store organizes the video feeds along with the learned/extracted semantic information. • Query Engine: this query engine supports retrieval of video feeds. One can search using any combination of time, location, as well as extracted labels. • Business Analytics Engine: this engine generates highlevel statistics of all multimedia data. For example, one can be interested in knowing which are the most common objects appear in living rooms, etc. • Storage Layer: the storage layer needs to provide high throughput for data persistence and low latency for fast retrieval of video feeds. Also, it must manage heterogeneous storage systems including S3, GCS, Swift, HDFS, OSS, GlusterFS, and NFS. of labels>>, and the value is a file path of the raw data in the storage layer. This poses two challenges: first, in real-world scenarios, users vary in their choice of persistent storage for their data. Some prefer Amazon S3, some their own deployment of HDFS, others Ceph, etc. One way to get around this problem is to create a set of APIs for each persistent storage, but this would become impossible to manage as the number of persistent storage options grows. A second, and probably better way, to handle this is to create a unified storage layer to abstract all underlying storages. Figure 1: Robotic Cloud Architecture Overview 2.1 Learning As multimedia data comes in to the cloud system, the first task is to learn from the raw data and to extract semantic information out of the multimedia data. For video streams, we can extract object labels from frames and associate these labels with the video stream. For audio streams, we can extract sentences of the spoken language. Then this semantic information can be used as keys, and the raw data streams can be used as values, and together they are stored in the key-value store. To this end, Alluxio enables effective data management across different storage systems through its use of transparent naming and mounting API [6]. Transparent naming maintains an identity between the Alluxio namespace and the underlying storage system namespace. When users create objects in the Alluxio namespace, they can decide whether these objects should be persisted in the underlying storage system. For objects that are persisted, Alluxio preserve the object paths, relative to the underlying storage system directory in which Alluxio objects are stored. With this feature, we can now manage multiple persistent storages using a single set of storage layer API, which greatly simplifies the management of the memorization part of the robotic cloud computing paradigm. The second challenge is that the write throughput directly impacts the performance of the whole system. If the write speed of the memorization is slower than the detection speed in learning stage, then it becomes the bottleneck and leads to “memory loss.” We will discuss in the next section how we can use Alluxio’s tiered storage feature, along with write optimization to improve write throughput. 2.3 Recall Figure 2: Video Stream Processing As presented in Figure 2, we need a learning engine that extracts object labels from a video stream. This engine needs to be accurate in terms of recognition rate, and this engine needs to be fast in order for us to capture as many objects as possible in the video. Therefore, in this implementation, we utilize faster r-cnn [3] network running on Caffe [4]. Faster r-cnn introduces a Region Proposal Network (RPN), a fullyconvolutional network that simultaneously predicts object bounds and object scores at each position, hence achieving very high detection speed without sacrificing detection accuracy. 2.2 Memorization After extracting the semantic information from the raw multimedia data, the extracted labels, along with the raw data get stored in the key-value store for easy retrieval (Figure 3). We implemented the key-value store using MongoDB [5]. In this case, the key is the meta data including > is selected into the store with a probability determined by the “list of labels”—we can select more sample rows for some hot and more important labels and less sample rows for the others. Online sampling during recall. When a query is issued in the recall stage, we can invoke a sampler to select only a subset of rows from the key-value store into the query execution engine (and of course, rewrite the query to rescale the answer). For example, we may want to count the number of times a label appears between a time interval. A number of sampler can be borrowed from AQP systems [12] to provide estimates of answers and error bounds for such queries. Note that we can either adopt all the three proposals at the same time or only some of them, based on the different types of underlying tasks and the amount of resource budget. Finally, it is important to note that we always want to guarantee that the errors are no more than some threshold in our analytical results or answers, or at least, we want to provide estimates of the errors. Such guarantees or estimations are relatively easier for SQL-like aggregation queries (e.g., counting the number of cars on a street), but become more challenging for complex tasks (e.g., predicting the trajectory of a car). We will start with adopting the sampling-based data reduction techniques in our robotic cloud to provide cheap and fast data-analytics services for users with relatively simpler analytical workloads, and extend them for more types of tasks in future. 6. CONCLUSIONS We are facing the data explosion problem from robotic applications. Unlike existing mobile devices, robots generate massive amount of unstructured data, which can not be easily understood, stored, and retrieved by machines. To address this problem, in this paper, we proposed a learn-memorize-recall-reduce paradigm for robotic clouds: the learning stage extracts semantic information from incoming unstructured data and convert it into structured data; the memorization stage provides effective storage for the massive amount of data; the recall stage provides efficient means to retrieve the raw data; and the reduction stage provides means to making sense of massive amount of unstructured data with limited computing resources. 7. REFERENCES [1] S. Liu, J. Tang, C. Wang, Q. Wang, J-L. Gaudiot, Implementing a Cloud Platform for Autonomous Driving. arXiv preprint arXiv:1704.02696, 2017 [2] V. N. Padmanabhan and J. C. Mogul, Using predictive prefetching to improve world wide web latency, Computer Communications Review, vol. 26, pp. 22-36, July 1996. [3] S. Ren, K. He, R. Girshick, and J. Sun. Faster r-cnn: Towards real-time object detection with region proposal networks. arXiv preprint arXiv:1506.01497, 2015. [4] Y. Jia, E. Shelhamer, J. Donahue, S. Karayev, J. Long, R. Girshick, S. Guadarrama, and T. Darrell. Caffe: Convolutional architecture for fast feature embedding. arXiv preprint arXiv:1408.5093, 2014 [5] K. Chodorow, MongoDB: the definitive guide. O’Reilly Media Inc, 2013 [6] H. Li, A. Ghodsi, M. Zaharia, S. Shenker, and I. Stoica. Reliable, Memory Speed Storage for Cluster Computing Frameworks. In Proc. SoCC, 2014. [7] Amazon S3, 2017. Amazon Simple Storage Service. Available from http://aws.amazon.com/s3/ [8] D. Borthakur: HDFS architecture guide. HADOOP APACHE PROJECT. 2008. [9] S. Weil, S. Brandt, E. Miller, D. Long, and C. Maltzahn, Ceph: A Scalable, High-Performance Distributed File System, Proceedings of the 7th Conference on Operating Systems Design and Implementation, November 2006. [10] B. Ding, S. Huang, S. Chaudhuri, K. Chakrabarti, and C. Wang, Sample + Seek: Approximating Aggregates with Distribution Precision Guarantee, In Proc. SIGMOD, 2016. [11] S. Kandula, A. Shanbhag, A. Vitorovic, M. Olma, R. Grandl, S. Chaudhuri, and B. Ding, Quickr: Lazily Approximating Complex Ad-Hoc Queries in Big Data Clusters, In Proc. SIGMOD, 2016. [12] S. Chaudhuri, B. Ding, and S. Kandula, Approximate Query Processing: No Silver Bullet, In Proc. SIGMOD, 2017. [13] S, Agarwal, B. Mozafari, A. Panda, H. Milner, S. Madden, and I. Stoica, BlinkDB: Queries with Bounded Errors and Bounded Response Times on Very Large Data, In Proc. EuroSys, 2013. Dr. Shaoshan Liu is currently the Co-Founder of PerceptIn, working on developing the next-generation robotics platform. Contact him at: shaoshan.liu@perceptin.io Dr. Bolin Ding is currently a Researcher in the Data Management, Exploration and Mining group at Microsoft Research, Redmond. Contact him at: bolin.ding@microsoft.com Dr. Jie Tang is the corresponding author and she is currently an associate professor in the School of Computer Science and Engineering of South China University of Technology, Guangzhou, China. Contact her at: cstangjie@scut.edu.cn Dawei Sun is currently with Tsinghua University and PerceptIn, working on Deep Learning and cloud infrastructures and autonomous robots. Contact him at: sdw14@mails.tsinghua.edu.cn Dr. Zhe Zhang is the co-founder of PerceptIn, working on developing the next-generation robotics platform. Contact him at: zhe.zhang@perceptin.io Dr. Grace Tsai is a founding engineer of PerceptIn, working on developing the next-generation robotics platform. Contact her at: grace.tsai@perceptin.io Dr. Jean-Luc Gaudiot is professor in the Electrical Engineering and Computer Science Department at the University of California, Irvine and is currently serving as the 2017 President of the IEEE Computer Society. Contact him at gaudiot@uci.edu",-5.47,"#s Learn - Mem or ize - Rec all - Red uce : ĠA ĠRob otic ĠCloud ĠComputing ĠParad igm ĠSha osh an ĠLiu , ĠBol in ĠDing , ĠJ ie ĠTang , ĠDaw ei ĠSun , ĠZ he ĠZhang , ĠGrace ĠTs ai , Ġand ĠJean - Luc ĠG aud iot Ġ ĠAB STR ACT ĠThe Ġrise Ġof Ġrobotic Ġapplications Ġhas Ġled Ġto Ġthe Ġgeneration Ġof Ġa Ġhuge Ġvolume Ġof Ġun struct ured Ġdata , Ġwhereas Ġthe Ġcurrent Ġcloud Ġinfrastructure Ġwas Ġdesigned Ġto Ġprocess Ġlimited Ġamounts Ġof Ġstructured Ġdata . ĠTo Ġaddress Ġthis Ġproblem , Ġwe Ġpropose Ġa Ġlearn - mem or ize - rec all - red uce Ġparadigm Ġfor Ġrobotic Ġcloud Ġcomputing . ĠThe Ġlearning Ġstage Ġconverts Ġincoming Ġun struct ured Ġdata Ġinto Ġstructured Ġdata ; Ġthe Ġmemor ization Ġstage Ġprovides Ġeffective Ġstorage Ġfor Ġthe Ġmassive Ġamount Ġof Ġdata ; Ġthe Ġrecall Ġstage Ġprovides Ġefficient Ġmeans Ġto Ġretrieve Ġthe Ġraw Ġdata ; Ġwhile Ġthe Ġreduction Ġstage Ġprovides Ġmeans Ġto Ġmake Ġsense Ġof Ġthis Ġmassive Ġamount Ġof Ġun struct ured Ġdata Ġwith Ġlimited Ġcomputing Ġresources . Ġ ĠKey words ĠCloud ĠArchitecture ; ĠDist ributed ĠComputing ; ĠStorage ; ĠRobotics Ġ Ġ1 . ĠIN TR ODUCT ION ĠRobots Ġare Ġmobile Ġdevices , Ġand Ġthe Ġrise Ġof Ġrobotic Ġapplications Ġhas Ġimposed Ġtremendous Ġpressure Ġon Ġour Ġexisting Ġcloud Ġinfrastructure . ĠFor Ġinstance , Ġevery Ġday , Ġa Ġmobile Ġphone Ġsends Ġout Ġat Ġleast Ġ30 ĠMB Ġof Ġstructured Ġdata Ġto Ġthe Ġclouds Ġof Ġthe Ġservice Ġprovider , Ġthe Ġapplication Ġoperators Ġetc . Ġ( struct ured Ġdata Ġmeans Ġdata Ġthat #/s"
,,,,


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
1.0,1 (0.00),"Learn-Memorize-Recall-Reduce: A Robotic Cloud Computing Paradigm Shaoshan Liu, Bolin Ding, Jie Tang, Dawei Sun, Zhe Zhang, Grace Tsai, and Jean-Luc Gaudiot ABSTRACT The rise of robotic applications has led to the generation of a huge volume of unstructured data, whereas the current cloud infrastructure was designed to process limited amounts of structured data. To address this problem, we propose a learn-memorize-recall-reduce paradigm for robotic cloud computing. The learning stage converts incoming unstructured data into structured data; the memorization stage provides effective storage for the massive amount of data; the recall stage provides efficient means to retrieve the raw data; while the reduction stage provides means to make sense of this massive amount of unstructured data with limited computing resources. Keywords Cloud Architecture; Distributed Computing; Storage; Robotics 1. INTRODUCTION Robots are mobile devices, and the rise of robotic applications has imposed tremendous pressure on our existing cloud infrastructure. For instance, every day, a mobile phone sends out at least 30 MB of structured data to the clouds of the service provider, the application operators etc. (structured data means data that can be easily understood, stored, and retrieved by machines). In contrast, even a very simple robot can easily generate over 1 GB of unstructured multimedia data per day. An extreme form of robot, driverless cars, can generate as much as 2 GB of unstructured data per second [1] (unstructured data means data that cannot be easily understood, stored, and retrieved by machines). Therefore, we are facing the urgent challenge of designing and implementing a cloud architecture to process this massive robotic unstructured data. For instance, in a real-world scenario, in-home service robots may act in a surveillance role, patrolling the users’ house and recording captured videos. Then, users demand the capability of video playback based on intelligent queries using time, location, as well as objects in scene as inputs. In sections 2, 3, and 4, we first present our implementation of a robotic cloud infrastructure and discuss how this infrastructure can meet this challenge. Specifically, we generalize the requirements into a robotic cloud computing paradigm - learn-memorize-recall, where learning is about how to automatically understand the unstructured data and convert it into structured data; memorization is about how to effectively store the massive amount of data, while recall is about how to efficiently retrieve the data when needed. Nonetheless, as the volumes of data collected from robotic devices exponentially grow over time, having a great robotic cloud infrastructure is not sufficient. Then in section 5, we delve into the final part of the proposed paradigm, reduction, the utilization of data reduction techniques to making sense of massive amounts of unstructured data with a limited budget of computing resources. 2. ROBOTIC CLOUD ARCHITECTURE To provide the learn-memorize-recall features, we have designed and implemented a cloud architecture, which is illustrated in Figure 1 below. The architecture consists of the following components: • Robotic Client Devices: these devices capture multimedia feeds and send the feeds to the cloud along with their meta data. • Streaming Server: this server handles incoming multimedia and streams on-demand live multimedia feeds to users as requested. • Object Recognition: this is a deep-learning evaluation engine for automatic extraction of semantic information from incoming videos. • Key-Value Store: this key-value store organizes the video feeds along with the learned/extracted semantic information. • Query Engine: this query engine supports retrieval of video feeds. One can search using any combination of time, location, as well as extracted labels. • Business Analytics Engine: this engine generates highlevel statistics of all multimedia data. For example, one can be interested in knowing which are the most common objects appear in living rooms, etc. • Storage Layer: the storage layer needs to provide high throughput for data persistence and low latency for fast retrieval of video feeds. Also, it must manage heterogeneous storage systems including S3, GCS, Swift, HDFS, OSS, GlusterFS, and NFS. of labels>>, and the value is a file path of the raw data in the storage layer. This poses two challenges: first, in real-world scenarios, users vary in their choice of persistent storage for their data. Some prefer Amazon S3, some their own deployment of HDFS, others Ceph, etc. One way to get around this problem is to create a set of APIs for each persistent storage, but this would become impossible to manage as the number of persistent storage options grows. A second, and probably better way, to handle this is to create a unified storage layer to abstract all underlying storages. Figure 1: Robotic Cloud Architecture Overview 2.1 Learning As multimedia data comes in to the cloud system, the first task is to learn from the raw data and to extract semantic information out of the multimedia data. For video streams, we can extract object labels from frames and associate these labels with the video stream. For audio streams, we can extract sentences of the spoken language. Then this semantic information can be used as keys, and the raw data streams can be used as values, and together they are stored in the key-value store. To this end, Alluxio enables effective data management across different storage systems through its use of transparent naming and mounting API [6]. Transparent naming maintains an identity between the Alluxio namespace and the underlying storage system namespace. When users create objects in the Alluxio namespace, they can decide whether these objects should be persisted in the underlying storage system. For objects that are persisted, Alluxio preserve the object paths, relative to the underlying storage system directory in which Alluxio objects are stored. With this feature, we can now manage multiple persistent storages using a single set of storage layer API, which greatly simplifies the management of the memorization part of the robotic cloud computing paradigm. The second challenge is that the write throughput directly impacts the performance of the whole system. If the write speed of the memorization is slower than the detection speed in learning stage, then it becomes the bottleneck and leads to “memory loss.” We will discuss in the next section how we can use Alluxio’s tiered storage feature, along with write optimization to improve write throughput. 2.3 Recall Figure 2: Video Stream Processing As presented in Figure 2, we need a learning engine that extracts object labels from a video stream. This engine needs to be accurate in terms of recognition rate, and this engine needs to be fast in order for us to capture as many objects as possible in the video. Therefore, in this implementation, we utilize faster r-cnn [3] network running on Caffe [4]. Faster r-cnn introduces a Region Proposal Network (RPN), a fullyconvolutional network that simultaneously predicts object bounds and object scores at each position, hence achieving very high detection speed without sacrificing detection accuracy. 2.2 Memorization After extracting the semantic information from the raw multimedia data, the extracted labels, along with the raw data get stored in the key-value store for easy retrieval (Figure 3). We implemented the key-value store using MongoDB [5]. In this case, the key is the meta data including > is selected into the store with a probability determined by the “list of labels”—we can select more sample rows for some hot and more important labels and less sample rows for the others. Online sampling during recall. When a query is issued in the recall stage, we can invoke a sampler to select only a subset of rows from the key-value store into the query execution engine (and of course, rewrite the query to rescale the answer). For example, we may want to count the number of times a label appears between a time interval. A number of sampler can be borrowed from AQP systems [12] to provide estimates of answers and error bounds for such queries. Note that we can either adopt all the three proposals at the same time or only some of them, based on the different types of underlying tasks and the amount of resource budget. Finally, it is important to note that we always want to guarantee that the errors are no more than some threshold in our analytical results or answers, or at least, we want to provide estimates of the errors. Such guarantees or estimations are relatively easier for SQL-like aggregation queries (e.g., counting the number of cars on a street), but become more challenging for complex tasks (e.g., predicting the trajectory of a car). We will start with adopting the sampling-based data reduction techniques in our robotic cloud to provide cheap and fast data-analytics services for users with relatively simpler analytical workloads, and extend them for more types of tasks in future. 6. CONCLUSIONS We are facing the data explosion problem from robotic applications. Unlike existing mobile devices, robots generate massive amount of unstructured data, which can not be easily understood, stored, and retrieved by machines. To address this problem, in this paper, we proposed a learn-memorize-recall-reduce paradigm for robotic clouds: the learning stage extracts semantic information from incoming unstructured data and convert it into structured data; the memorization stage provides effective storage for the massive amount of data; the recall stage provides efficient means to retrieve the raw data; and the reduction stage provides means to making sense of massive amount of unstructured data with limited computing resources. 7. REFERENCES [1] S. Liu, J. Tang, C. Wang, Q. Wang, J-L. Gaudiot, Implementing a Cloud Platform for Autonomous Driving. arXiv preprint arXiv:1704.02696, 2017 [2] V. N. Padmanabhan and J. C. Mogul, Using predictive prefetching to improve world wide web latency, Computer Communications Review, vol. 26, pp. 22-36, July 1996. [3] S. Ren, K. He, R. Girshick, and J. Sun. Faster r-cnn: Towards real-time object detection with region proposal networks. arXiv preprint arXiv:1506.01497, 2015. [4] Y. Jia, E. Shelhamer, J. Donahue, S. Karayev, J. Long, R. Girshick, S. Guadarrama, and T. Darrell. Caffe: Convolutional architecture for fast feature embedding. arXiv preprint arXiv:1408.5093, 2014 [5] K. Chodorow, MongoDB: the definitive guide. O’Reilly Media Inc, 2013 [6] H. Li, A. Ghodsi, M. Zaharia, S. Shenker, and I. Stoica. Reliable, Memory Speed Storage for Cluster Computing Frameworks. In Proc. SoCC, 2014. [7] Amazon S3, 2017. Amazon Simple Storage Service. Available from http://aws.amazon.com/s3/ [8] D. Borthakur: HDFS architecture guide. HADOOP APACHE PROJECT. 2008. [9] S. Weil, S. Brandt, E. Miller, D. Long, and C. Maltzahn, Ceph: A Scalable, High-Performance Distributed File System, Proceedings of the 7th Conference on Operating Systems Design and Implementation, November 2006. [10] B. Ding, S. Huang, S. Chaudhuri, K. Chakrabarti, and C. Wang, Sample + Seek: Approximating Aggregates with Distribution Precision Guarantee, In Proc. SIGMOD, 2016. [11] S. Kandula, A. Shanbhag, A. Vitorovic, M. Olma, R. Grandl, S. Chaudhuri, and B. Ding, Quickr: Lazily Approximating Complex Ad-Hoc Queries in Big Data Clusters, In Proc. SIGMOD, 2016. [12] S. Chaudhuri, B. Ding, and S. Kandula, Approximate Query Processing: No Silver Bullet, In Proc. SIGMOD, 2017. [13] S, Agarwal, B. Mozafari, A. Panda, H. Milner, S. Madden, and I. Stoica, BlinkDB: Queries with Bounded Errors and Bounded Response Times on Very Large Data, In Proc. EuroSys, 2013. Dr. Shaoshan Liu is currently the Co-Founder of PerceptIn, working on developing the next-generation robotics platform. Contact him at: shaoshan.liu@perceptin.io Dr. Bolin Ding is currently a Researcher in the Data Management, Exploration and Mining group at Microsoft Research, Redmond. Contact him at: bolin.ding@microsoft.com Dr. Jie Tang is the corresponding author and she is currently an associate professor in the School of Computer Science and Engineering of South China University of Technology, Guangzhou, China. Contact her at: cstangjie@scut.edu.cn Dawei Sun is currently with Tsinghua University and PerceptIn, working on Deep Learning and cloud infrastructures and autonomous robots. Contact him at: sdw14@mails.tsinghua.edu.cn Dr. Zhe Zhang is the co-founder of PerceptIn, working on developing the next-generation robotics platform. Contact him at: zhe.zhang@perceptin.io Dr. Grace Tsai is a founding engineer of PerceptIn, working on developing the next-generation robotics platform. Contact her at: grace.tsai@perceptin.io Dr. Jean-Luc Gaudiot is professor in the Electrical Engineering and Computer Science Department at the University of California, Irvine and is currently serving as the 2017 President of the IEEE Computer Society. Contact him at gaudiot@uci.edu",-5.47,"#s Learn - Mem or ize - Rec all - Red uce : ĠA ĠRob otic ĠCloud ĠComputing ĠParad igm ĠSha osh an ĠLiu , ĠBol in ĠDing , ĠJ ie ĠTang , ĠDaw ei ĠSun , ĠZ he ĠZhang , ĠGrace ĠTs ai , Ġand ĠJean - Luc ĠG aud iot Ġ ĠAB STR ACT ĠThe Ġrise Ġof Ġrobotic Ġapplications Ġhas Ġled Ġto Ġthe Ġgeneration Ġof Ġa Ġhuge Ġvolume Ġof Ġun struct ured Ġdata , Ġwhereas Ġthe Ġcurrent Ġcloud Ġinfrastructure Ġwas Ġdesigned Ġto Ġprocess Ġlimited Ġamounts Ġof Ġstructured Ġdata . ĠTo Ġaddress Ġthis Ġproblem , Ġwe Ġpropose Ġa Ġlearn - mem or ize - rec all - red uce Ġparadigm Ġfor Ġrobotic Ġcloud Ġcomputing . ĠThe Ġlearning Ġstage Ġconverts Ġincoming Ġun struct ured Ġdata Ġinto Ġstructured Ġdata ; Ġthe Ġmemor ization Ġstage Ġprovides Ġeffective Ġstorage Ġfor Ġthe Ġmassive Ġamount Ġof Ġdata ; Ġthe Ġrecall Ġstage Ġprovides Ġefficient Ġmeans Ġto Ġretrieve Ġthe Ġraw Ġdata ; Ġwhile Ġthe Ġreduction Ġstage Ġprovides Ġmeans Ġto Ġmake Ġsense Ġof Ġthis Ġmassive Ġamount Ġof Ġun struct ured Ġdata Ġwith Ġlimited Ġcomputing Ġresources . Ġ ĠKey words ĠCloud ĠArchitecture ; ĠDist ributed ĠComputing ; ĠStorage ; ĠRobotics Ġ Ġ1 . ĠIN TR ODUCT ION ĠRobots Ġare Ġmobile Ġdevices , Ġand Ġthe Ġrise Ġof Ġrobotic Ġapplications Ġhas Ġimposed Ġtremendous Ġpressure Ġon Ġour Ġexisting Ġcloud Ġinfrastructure . ĠFor Ġinstance , Ġevery Ġday , Ġa Ġmobile Ġphone Ġsends Ġout Ġat Ġleast Ġ30 ĠMB Ġof Ġstructured Ġdata Ġto Ġthe Ġclouds Ġof Ġthe Ġservice Ġprovider , Ġthe Ġapplication Ġoperators Ġetc . Ġ( struct ured Ġdata Ġmeans Ġdata Ġthat #/s"
,,,,
