# Set-up

## Import Datalogue libraries

Note, you'll need to have downloaded and installed the Datalogue SDK before this step will work.

Right now, to do so you will need to get access through Artifactory.

In [22]:
# Import Datalogue libraries 
from datalogue import *
from datalogue.version import __version__
from datalogue.models.ontology import *
from datalogue.models.datastore_collection import *
from datalogue.models.datastore import *
from datalogue.models.datastore import GCSDatastoreDef 
from datalogue.models.credentials import *
from datalogue.models.stream import *
from datalogue.models.transformations import *
from datalogue.models.transformations.structure import *
from datalogue.dtl import Dtl, DtlCredentials
from datalogue.models.training import DataRef

# Import Datalogue Bag of Tricks
from DTLBagOTricks import DTL as DTLHelper


# Import other useful libraries
from datetime import datetime, timedelta
from os import environ
import pandas
from IPython.display import Image

# Check that the installed SDK version is the expected one.
# The expected version is 0.28.3 (NOTE: the saved output below shows 0.30.2 - confirm which version this notebook targets).
# If the SDK is not installed, run `! pip install datalogue` and restart the Jupyter Notebook kernel.
# If the wrong version is installed, run `! pip install datalogue --upgrade` and restart the Jupyter Notebook kernel.
__version__

'0.30.2'

In [23]:
# Set host, username and password variables.
#
# Credentials are read from the environment so that no secret is stored in the
# notebook: export DTL_EMAIL and DTL_PASSWORD before starting Jupyter.
# DTL_HOST is optional and falls back to the internal deployment.

datalogue_host = environ.get("DTL_HOST", "https://internal.dtl.systems")

email = environ.get("DTL_EMAIL")
password = environ.get("DTL_PASSWORD")

# Fail early with an actionable message instead of attempting a login with
# missing credentials.
if not email or not password:
    raise RuntimeError(
        "Set the DTL_EMAIL and DTL_PASSWORD environment variables before running this cell."
    )

# Log in to Datalogue
BOT = DTLHelper(datalogue_host, email, password)
dtl = BOT.dtl

# Expected output:
# "Datalogue v[SDK version]"
# "Logged in '[host location]' with '[username]' account."

Datalogue v0.30.2
Logged in 'https://internal.dtl.systems/api' with 'chrisr@datalogue.io' account.


In [24]:
# First, let's clean up the assets this workbook creates from previous runs

# Warning! This deletes every datastore, datastore collection and stream
# collection whose name starts with DEMO_PREFIX. Assets with other names are
# left alone; credentials and ontologies are not touched by this cell.

DEMO_PREFIX = 'demo-'


def delete_demo_assets(client, prefix=DEMO_PREFIX):
    """Delete every asset listed by ``client`` whose name starts with ``prefix``.

    ``client`` is any object offering ``list()`` and ``delete(id)``; each
    listed item must expose ``name`` and ``id`` attributes.

    Returns the number of assets for which ``delete`` was called.
    """
    deleted = 0
    for asset in client.list():
        if asset.name.startswith(prefix):
            client.delete(asset.id)
            deleted += 1
    return deleted


#BOT.server_summary()

# Clear datastores, datastore collections and data pipelines (stream collections),
# in that order.
for asset_client in (dtl.datastore, dtl.datastore_collection, dtl.stream_collection):
    delete_demo_assets(asset_client)

## Clear ontologies
# for ontology in dtl.ontology.list():
#     dtl.ontology.delete(ontology.id)

#BOT.server_summary()

# After running the above, no datastore, datastore collection or stream
# collection with a 'demo-' prefixed name should remain.

## 2. Read Source Files from S3 bucket

In [25]:
from boto.s3.connection import S3Connection

S3_BUCKET_NAME = 'datalogue-demo'
S3_BASE_URL = "https://" + S3_BUCKET_NAME + ".s3.amazonaws.com/"

# No keys are passed on purpose: with no arguments boto looks the credentials up
# itself (AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY environment variables or its
# config file), which keeps secrets out of the notebook.
conn = S3Connection()
bucket = conn.get_bucket(S3_BUCKET_NAME)

keys = ["store_name", "URL"]

# One record per object whose key contains 'telemetry_': the datastore name
# (prefixed with 'demo-') and the HTTPS URL of the object.
telem_data = [
    dict(zip(keys, ("demo-" + key.name, S3_BASE_URL + key.name)))
    for key in bucket.list()
    if 'telemetry_' in key.name
]

print(type(telem_data))
if telem_data:
    print(type(telem_data[0]))
else:
    # Indexing [0] on an empty list would raise IndexError; report the situation instead.
    print("No telemetry files found in bucket '" + S3_BUCKET_NAME + "'.")
print(telem_data)


<class 'list'>
<class 'dict'>
[{'store_name': 'demo-mobile/telemetry/telemetry_EU_MOVI.csv', 'URL': 'https://datalogue-demo.s3.amazonaws.com/mobile/telemetry/telemetry_EU_MOVI.csv'}, {'store_name': 'demo-mobile/telemetry/telemetry_EU_O2.csv', 'URL': 'https://datalogue-demo.s3.amazonaws.com/mobile/telemetry/telemetry_EU_O2.csv'}, {'store_name': 'demo-mobile/telemetry/telemetry_EU_ORNG.csv', 'URL': 'https://datalogue-demo.s3.amazonaws.com/mobile/telemetry/telemetry_EU_ORNG.csv'}, {'store_name': 'demo-mobile/telemetry/telemetry_US_ATT.csv', 'URL': 'https://datalogue-demo.s3.amazonaws.com/mobile/telemetry/telemetry_US_ATT.csv'}, {'store_name': 'demo-mobile/telemetry/telemetry_US_SPR.csv', 'URL': 'https://datalogue-demo.s3.amazonaws.com/mobile/telemetry/telemetry_US_SPR.csv'}, {'store_name': 'demo-mobile/telemetry/telemetry_US_TMOB.csv', 'URL': 'https://datalogue-demo.s3.amazonaws.com/mobile/telemetry/telemetry_US_TMOB.csv'}, {'store_name': 'demo-mobile/telemetry/telemetry_US_VZW.csv', 'URL

In [26]:

# List the source files that will each be registered as a datastore.
heading = "\nCSV Machine Sources to connect to:\n" + "-------------------------"
print(heading)

# The generator keeps the lookups lazy, so each name is printed as soon as it is read.
for store_name in (entry["store_name"] for entry in telem_data):
    print("➜ " + store_name)

print("\n")



CSV Machine Sources to connect to:
-------------------------
➜ demo-mobile/telemetry/telemetry_EU_MOVI.csv
➜ demo-mobile/telemetry/telemetry_EU_O2.csv
➜ demo-mobile/telemetry/telemetry_EU_ORNG.csv
➜ demo-mobile/telemetry/telemetry_US_ATT.csv
➜ demo-mobile/telemetry/telemetry_US_SPR.csv
➜ demo-mobile/telemetry/telemetry_US_TMOB.csv
➜ demo-mobile/telemetry/telemetry_US_VZW.csv




## 3. Create datastore connections for each file in S3 bucket

In [27]:
# Register one HTTP datastore per source file, choosing the file format from
# the file extension.
FORMAT_BY_EXTENSION = {
    '.csv': FileFormat.Csv,
    '.xml': FileFormat.Xml,
    '.json': FileFormat.Json,
}


def file_format_for(store_name):
    """Return the FileFormat whose extension ``store_name`` ends with.

    The comparison is case-insensitive. Raises ValueError when the name ends
    with none of the extensions in FORMAT_BY_EXTENSION.
    """
    lowered = store_name.lower()
    for extension, file_format in FORMAT_BY_EXTENSION.items():
        if lowered.endswith(extension):
            return file_format
    raise ValueError(
        "Unsupported file type for '" + store_name + "'; expected one of "
        + ", ".join(sorted(FORMAT_BY_EXTENSION))
    )


# Resolve every format first so an unsupported file aborts the cell before any
# datastore has been created.
source_formats = [file_format_for(entry["store_name"]) for entry in telem_data]

current_stores = []

for data_store, source_format in zip(telem_data, source_formats):
    # The created object is also stored on the record itself; the datastore
    # collection cell reads it back from telem_data.
    data_store["datastore_object"] = dtl.datastore.create(
        Datastore(
            data_store["store_name"],
            HttpDatastoreDef(data_store["URL"], source_format),
        )
    )
    current_stores.append(data_store["datastore_object"])

print(type(current_stores))
if current_stores:
    # Only inspect the first store / last record when something was created.
    print(type(current_stores[0]))

    print(data_store)


<class 'list'>
<class 'datalogue.models.datastore.Datastore'>
{'store_name': 'demo-mobile/telemetry/telemetry_US_VZW.csv', 'URL': 'https://datalogue-demo.s3.amazonaws.com/mobile/telemetry/telemetry_US_VZW.csv', 'datastore_object': Datastore(id: 714b9616-706e-40b7-bb94-1c13f9427d2b, name: 'demo-mobile/telemetry/telemetry_US_VZW.csv', alias: None, credential_id: None, definition: <datalogue.models.datastore.HttpDatastoreDef object at 0x00000244C620AD30>, samples: None, schema_paths: [], schema_labels: [], schema_nodes: None)}


###           3b. Create datastore for RDBMS target

In [28]:
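# host: 34.74.11.127 (use jdbc:postgresql://34.74.11.127:5432/demo when creating the target store)
# user: postgres
# pw: <redacted - do not keep passwords in the notebook; obtain it from your team's secret store>

# Because of a bug in SDK versions < 1.0, the target store is created in the GUI for now;
# this cell is to be updated to create it from the SDK.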


## 4. Collecting data stores into a collection

This is just used for organization, and uses the command `dtl.datastore_collection.create`.

In [29]:
# Gather the IDs of the datastores created for each telemetry file.
telemetry_store_ids = []
for entry in telem_data:
    telemetry_store_ids.append(entry["datastore_object"].id)

# Describe the collection locally ...
telem_collection = DatastoreCollection(
    name="demo-Telemetry Collection",
    storeIds=telemetry_store_ids,
    description="Global Handset Telemetry Data",
)
# ... then create it and keep whatever the create call returns under its own name.
telem_collection2 = dtl.datastore_collection.create(telem_collection)

## 5. Creating a stream


In [30]:
# Check if any existing now 
#for stream in dtl.stream_collection.list():
#    print('Stream name: ', stream.name )
#    print('Stream ID: ', stream.id)
#print('-----------------------')
#print('Total of ', len(dtl.stream_collection.list()), ' stream(s).')



In [31]:
# Set the target of the stream: an already existing datastore, fetched by its ID.
OUTPUT_STORE_ID = "9c0b89e6-0e27-4f89-8c8d-e521b4b424cd"
my_output_store = dtl.datastore.get(OUTPUT_STORE_ID)

print(my_output_store)

Datastore(id: 9c0b89e6-0e27-4f89-8c8d-e521b4b424cd, name: 'dtl-demo telemetry', alias: None, credential_id: 87f2cf75-f182-45f5-84c1-db0926aae084, definition: <datalogue.models.datastore.JdbcDatastoreDef object at 0x00000244C5A9A080>, samples: None, schema_paths: [], schema_labels: [], schema_nodes: None)


In [32]:
# Show the target datastore's Python type, then the datastore itself.
print(type(my_output_store), my_output_store, sep="\n")

<class 'datalogue.models.datastore.Datastore'>
Datastore(id: 9c0b89e6-0e27-4f89-8c8d-e521b4b424cd, name: 'dtl-demo telemetry', alias: None, credential_id: 87f2cf75-f182-45f5-84c1-db0926aae084, definition: <datalogue.models.datastore.JdbcDatastoreDef object at 0x00000244C5A9A080>, samples: None, schema_paths: [], schema_labels: [], schema_nodes: None)


#### Sample pipeline

In [33]:
# Define the target output schema transformation using 'structure'.
#
# Every output field is described identically (path and tag equal to the field
# name, HighScore pick strategy, String data type), so the node descriptions
# are generated from this ordered list instead of being written out one by one.
# The order of the list is the order of the nodes in the structure.
STD_SCHEMA_FIELDS = [
    "Model",
    "Manufacturer",
    "OS",
    "Generation",
    "Language",
    "Timestamp",
    "Signal_Strength",
    "Latitude",
    "Transmitted_Bytes",
    "Longitude",
    "Received_Bytes",
    "Region",
    "City",
    "Country",
    "Provider",
]

std_schema = Structure([
        ClassNodeDescription(
            path = [field_name],
            tag = field_name,
            pick_strategy = PickStrategy.HighScore,
            data_type = DataType.String
        )
        for field_name in STD_SCHEMA_FIELDS
    ]
)

In [34]:
from datalogue.models.training import *
import uuid

# ID handed to get_trainings. NOTE(review): presumably the ontology the model
# was trained on - confirm against the SDK documentation.
TRAINING_LOOKUP_ID = uuid.UUID("9b39556e-d2d5-48c6-9a70-dec37579bf6e")

trainings = dtl.training.get_trainings(TRAINING_LOOKUP_ID)

# Indexing an empty result would raise a bare IndexError; explain what is missing instead.
if not trainings:
    raise LookupError("No trainings found for " + str(TRAINING_LOOKUP_ID))

# The first training returned is the one used by the Classify transformation.
modelUuid = trainings[0].id

print(type(modelUuid))
print(modelUuid)


<class 'uuid.UUID'>
b15e4b20-430a-4fa4-b429-56f3ab79b3c2


In [35]:
# Define the pipeline: classify the data with the selected training, then
# apply the std_schema structure transformation.
classify_step = Classify(
    training_id=modelUuid,
    use_context=True,
    include_classes=False,
    include_scores=False,
)
transformation_steps = [classify_step, std_schema]
nested_pipelines = []

# Positional arguments, in order: list of transformations, list of pipelines,
# target datastore.
tx_definition = Definition(transformation_steps, nested_pipelines, my_output_store)

print(type(tx_definition), tx_definition, sep="\n")

<class 'datalogue.models.stream.Definition'>
Pipeline(type: [Classify(training_id: b15e4b20-430a-4fa4-b429-56f3ab79b3c2, paths: , options: UseContext), Structure(structure: [ClassNodeDescription(path: ['Model'], tag: Model, strategy: PickStrategy.HighScore, dataType: DataType.String), ClassNodeDescription(path: ['Manufacturer'], tag: Manufacturer, strategy: PickStrategy.HighScore, dataType: DataType.String), ClassNodeDescription(path: ['OS'], tag: OS, strategy: PickStrategy.HighScore, dataType: DataType.String), ClassNodeDescription(path: ['Generation'], tag: Generation, strategy: PickStrategy.HighScore, dataType: DataType.String), ClassNodeDescription(path: ['Language'], tag: Language, strategy: PickStrategy.HighScore, dataType: DataType.String), ClassNodeDescription(path: ['Timestamp'], tag: Timestamp, strategy: PickStrategy.HighScore, dataType: DataType.String), ClassNodeDescription(path: ['Signal_Strength'], tag: Signal_Strength, strategy: PickStrategy.HighScore, dataType: DataType

In [36]:
# Define n stream(s), where n is the number of datastore connections created
# from the S3 bucket scan.
n = len(current_stores)

# One stream per source datastore; every stream gets its own one-element list
# holding the shared pipeline definition.
list_of_streams = [
    Stream(source_store, [tx_definition])
    for source_store in current_stores
]

print(type(list_of_streams))
if list_of_streams:
    # Only inspect the first stream when at least one was built.
    print(type(list_of_streams[0]))
    print(list_of_streams[0])

<class 'list'>
<class 'datalogue.models.stream.Stream'>
Stream(type: <datalogue.models.datastore.HttpDatastoreDef object at 0x00000244C643A240>, pipelines: [Pipeline(type: [Classify(training_id: b15e4b20-430a-4fa4-b429-56f3ab79b3c2, paths: , options: UseContext), Structure(structure: [ClassNodeDescription(path: ['Model'], tag: Model, strategy: PickStrategy.HighScore, dataType: DataType.String), ClassNodeDescription(path: ['Manufacturer'], tag: Manufacturer, strategy: PickStrategy.HighScore, dataType: DataType.String), ClassNodeDescription(path: ['OS'], tag: OS, strategy: PickStrategy.HighScore, dataType: DataType.String), ClassNodeDescription(path: ['Generation'], tag: Generation, strategy: PickStrategy.HighScore, dataType: DataType.String), ClassNodeDescription(path: ['Language'], tag: Language, strategy: PickStrategy.HighScore, dataType: DataType.String), ClassNodeDescription(path: ['Timestamp'], tag: Timestamp, strategy: PickStrategy.HighScore, dataType: DataType.String), ClassNodeD

In [37]:
# Show the first source datastore (type, then value) followed by two blank lines.
first_store = current_stores[0]
print(type(first_store), first_store, "\n", sep="\n")

# Number of streams that were built.
print(len(list_of_streams))

<class 'datalogue.models.datastore.Datastore'>
Datastore(id: ddab4a9c-87f6-451f-94b0-a3c06baac1c1, name: 'demo-mobile/telemetry/telemetry_EU_MOVI.csv', alias: None, credential_id: None, definition: <datalogue.models.datastore.HttpDatastoreDef object at 0x00000244C643A240>, samples: None, schema_paths: [], schema_labels: [], schema_nodes: None)


7


In [38]:
# search the existing stream collections
#x=dtl.stream_collection.list()
#scid = ''
#for i in x:
#    if i.name == 'MFGDemoPipeline':
#        print(i, "\n")
#        scid = i.id
#print("deleting stream ID: ", scid)

# delete the existing stream collection if it exists
#dtl.stream_collection.delete(stream_collection_id = scid)

In [39]:
# print(type(stream1))

In [40]:
# Put the streams in a collection.
# The name starts with 'demo-' so the clean-up cell at the top of the notebook
# deletes the collection on the next run.
STREAM_COLLECTION_NAME = 'demo-Telemetry Pipeline'
stream_collection = dtl.stream_collection.create(list_of_streams, STREAM_COLLECTION_NAME)

print(type(stream_collection), stream_collection, sep="\n")

<class 'datalogue.models.stream_collection.StreamCollection'>
StreamCollection(id: 9d210246-3e43-4026-ad82-44de404efeb2, name: 'demo-Telemetry Pipeline', streams: [StreamMetadata(id: f14035ca-210a-46d1-a0f1-cc68a6ab550c, is_ready: True, stream: Stream(type: <datalogue.models.datastore.HttpDatastoreDef object at 0x00000244C620A240>, pipelines: [Pipeline(type: [Classify(training_id: b15e4b20-430a-4fa4-b429-56f3ab79b3c2, paths: , options: UseContext), Structure(structure: [ClassNodeDescription(path: ['Model'], tag: Model, strategy: PickStrategy.HighScore, dataType: DataType.String), ClassNodeDescription(path: ['Manufacturer'], tag: Manufacturer, strategy: PickStrategy.HighScore, dataType: DataType.String), ClassNodeDescription(path: ['OS'], tag: OS, strategy: PickStrategy.HighScore, dataType: DataType.String), ClassNodeDescription(path: ['Generation'], tag: Generation, strategy: PickStrategy.HighScore, dataType: DataType.String), ClassNodeDescription(path: ['Language'], tag: Language, str

In [41]:
# deploy the model 

#dtl.training.deploy(trainingId, '9b39556e-d2d5-48c6-9a70-dec37579bf6e')

In [42]:
# Run the Collection
# The value returned by the run call is shown as the cell output.
scheduled_jobs = dtl.stream_collection.run(stream_collection.id)
scheduled_jobs

[Job(id: UUID('1fcb3f29-c9ba-449a-b8d8-b1a904aab203'), stream_id: UUID('f14035ca-210a-46d1-a0f1-cc68a6ab550c'), stream_collection_id: UUID('9d210246-3e43-4026-ad82-44de404efeb2'), status: Scheduled, run_at: datetime.datetime(2019, 10, 15, 16, 10, 54, tzinfo=tzutc()), created_by: UUID('5b333964-8fab-4ab0-9052-25f69fcb8689'), remaining_time_millis: 9223372036854775807, percent_progress: 0, errors: None), ended_at: None,
 Job(id: UUID('1e542d29-e1b0-4e1d-b560-0eb91dfdebcf'), stream_id: UUID('fb552a02-037c-4e06-8ed0-2ff22a8b87e1'), stream_collection_id: UUID('9d210246-3e43-4026-ad82-44de404efeb2'), status: Scheduled, run_at: datetime.datetime(2019, 10, 15, 16, 10, 54, tzinfo=tzutc()), created_by: UUID('5b333964-8fab-4ab0-9052-25f69fcb8689'), remaining_time_millis: 9223372036854775807, percent_progress: 0, errors: None), ended_at: None,
 Job(id: UUID('c99a1b12-92df-41a9-b255-f261d94ca408'), stream_id: UUID('bb930a9f-6d17-448f-8413-09ecbdcafcf2'), stream_collection_id: UUID('9d210246-3e43-40