# Set-up

## Import Datalogue libraries

Note: the Datalogue SDK must be downloaded and installed before this step will work.

For now, installing the SDK requires access through Artifactory.

In [43]:
# Import Datalogue libraries 
from datalogue import *
from datalogue.version import __version__
from datalogue.models.ontology import *
from datalogue.models.datastore_collection import *
from datalogue.models.datastore import *
from datalogue.models.datastore import GCSDatastoreDef 
from datalogue.models.credentials import *
from datalogue.models.stream import *
from datalogue.models.transformations import *
from datalogue.models.transformations.structure import *
from datalogue.dtl import Dtl, DtlCredentials
from datalogue.models.training import DataRef

# Import Datalogue Bag of Tricks
from DTLBagOTricks import DTL as DTLHelper


# Import other useful libraries
from datetime import datetime, timedelta
from os import environ
import pandas
from IPython.display import Image

# Checks the version of the SDK is correct
# The expected version is 0.28.3
# If the SDK is not installed, run `! pip install datalogue` and restart the Jupyter Notebook kernel
# If the wrong version is installed, run `! pip install datalogue --upgrade` and restart the Jupyter Notebook kernel
__version__

'0.28.3'

In [44]:
# Set host, username and password variables

datalogue_host = "https://internal.dtl.systems"  # for connecting to internal (note)

# datalogue_host = "http://10.2.161.119:3000"  # for connecting to Eric's DGX

# Credentials are read from the environment so that they are never typed into
# (and saved with) the notebook. DTL_EMAIL / DTL_PASSWORD are this notebook's
# own convention; export them before starting Jupyter. When they are unset the
# values fall back to empty strings, exactly as before.
email = environ.get("DTL_EMAIL", "")
password = environ.get("DTL_PASSWORD", "")

# Log in to Datalogue
BOT = DTLHelper(datalogue_host, email, password)
dtl = BOT.dtl

# Expected output Datalogue v0.28.3
# "Logged in '[host location]' with '[username]' account)"

Datalogue v0.28.3
Logged in 'https://internal.dtl.systems/api' with 'chrisr@datalogue.io' account)


## 2. Read Source Files from S3 bucket

In [45]:
from boto.s3.connection import S3Connection

# AWS credentials are taken from the standard environment variables rather than
# being pasted into the notebook. When a variable is unset, None is passed and
# boto falls back to its own credential lookup (e.g. its config file).
conn = S3Connection(
    environ.get('AWS_ACCESS_KEY_ID'),
    environ.get('AWS_SECRET_ACCESS_KEY'),
)

# Single source of truth for the bucket: the same name is used to list the
# objects and to build their public URLs, so the two can no longer drift apart.
# The value matches the bucket the generated URLs already pointed at.
s3_bucket_name = 'datalogue-demo'
s3_base_url = "https://" + s3_bucket_name + ".s3.amazonaws.com/"
bucket = conn.get_bucket(s3_bucket_name)

# One record per object whose key contains 'Loan':
#   store_name -> display name used for the datastore ("demo-" + object key)
#   URL        -> HTTP address of the object
npl_data = [
    {"store_name": "demo-" + key.name, "URL": s3_base_url + key.name}
    for key in bucket.list()
    if 'Loan' in key.name
]


<class 'list'>
<class 'dict'>


In [46]:

# List the display name of every source record collected from the bucket scan.
heading = "\nCSV Customers Sources to connect to:\n" "-------------------------"
print(heading)

for source in npl_data:
    bullet = "➜ " + source["store_name"]
    print(bullet)

# Trailing blank lines to separate this listing from whatever follows.
print("\n")



CSV Customers Sources to connect to:
-------------------------
➜ demo-LoanTape1USD.csv
➜ demo-LoanTape2USD.csv
➜ demo-LoanTape3EUR.csv
➜ demo-LoanTape4GBP.csv
➜ demo-LoanTape5USD.csv




## 3. Create datastore connections for each file in S3 bucket

In [47]:
# Create one HTTP-backed CSV datastore per source record. The created object is
# stored back on the record (key "datastore_object") and also collected in
# `current_stores`.
# NOTE(review): every run of this cell calls `dtl.datastore.create` again —
# presumably this registers duplicate datastores on re-run; confirm.
current_stores = []

for source in npl_data:
    csv_over_http = HttpDatastoreDef(source["URL"], FileFormat.Csv)
    created_store = dtl.datastore.create(
        Datastore(source["store_name"], csv_over_http)
    )
    source["datastore_object"] = created_store
    current_stores.append(created_store)


<class 'list'>
<class 'datalogue.models.datastore.Datastore'>


###           3b. Create datastore for RDBMS target

In [48]:
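# host: 34.74.11.127 (use jdbc:postgresql://34.74.11.127:5432/demo for creating target store)
# user: postgres
# pw: <redacted — never store passwords in the notebook; supply it via an environment variable or a secrets manager, and rotate the previously exposed password>

# The SDK has a bug for v<1.0, so the target datastore is created in the GUI for now; this cell will be updated to create it through the SDK once the bug is fixed.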


## 4. Collecting data stores into a collection

A collection is used purely for organization: it groups related datastores under one name. It is created with the command `dtl.datastore_collection.create`.

In [49]:
# Describe a collection that groups every datastore created for the source
# records. The ids are read from the "datastore_object" entry of each record.
# (The loop variable is deliberately not called `Datastore`, to avoid hiding
# the SDK class of that name from readers.)
npl_store_ids = [record["datastore_object"].id for record in npl_data]

npl_collection = DatastoreCollection(
    name="collectionName",
    storeIds=npl_store_ids,
    description="NPL tape data of various formats",
)


In [50]:
# Submit the collection description to Datalogue and keep the object returned
# by the SDK.
npl_collection2 = dtl.datastore_collection.create(npl_collection)

In [51]:
# Checking that the collection was created

# Ids of every datastore found in a collection with the expected name. If
# several collections share that name, the ids of all of them are accumulated.
input_store_list = []
separator = "------------------------------------------------------------------------------------"

for collection in dtl.datastore_collection.list():
    if collection.name != "collectionName":
        continue

    print(collection.id, "|", collection.name, "|")
    print("Datastores in Dataset:")
    # Each member is indexed by "name" and "id" here.
    for member in collection.storeIds:
        print("* " + " --> ".join([member["name"], member["id"]]))
        input_store_list.append(member["id"])
    print(separator)

print('input store id list: ' + str(input_store_list))

5bb0253e-a184-4fd6-ab6b-2dd507b14374 | demo-NPL Collection |
Datastores in Dataset:
* demo-LoanTape2USD.csv --> 0c7d336e-a625-4bf1-906e-b3f66a7cca09
* demo-LoanTape1USD.csv --> 51223aad-12ff-4336-89a6-2eb5cd35c272
* demo-LoanTape3EUR.csv --> a122e970-70b8-4fc2-8406-a8d80982e4ef
------------------------------------------------------------------------------------
3e7a2dfa-6a94-42a3-a86f-7021f89d60d8 | demo-NPL Collection |
Datastores in Dataset:
* demo-LoanTape3EUR.csv --> 50d64640-2251-4432-92e3-d5ead126fe7d
* demo-LoanTape1USD.csv --> d15bc2b0-2cd2-45ea-8b6c-50cb22c8367a
* demo-LoanTape2USD.csv --> ea0e139a-fdfe-409f-9fd6-273f0c4bc6a4
------------------------------------------------------------------------------------
77c60112-b491-4901-a584-30e036ae29e8 | demo-NPL Collection |
Datastores in Dataset:
* demo-LoanTape2USD.csv --> 3a565393-2844-47d0-be54-b9a5fee9ada2
* demo-LoanTape5USD.csv --> 9b7f5b18-6b83-4e9c-8e62-461b59c4456c
* demo-LoanTape4GBP.csv --> decfb804-32c3-4462-ba3a-346bf1

## 5. Creating a stream


In [52]:
# Check whether any stream collections already exist
#for stream in dtl.stream_collection.list():
#    print('Stream name: ', stream.name )
#    print('Stream ID: ', stream.id)
#print('-----------------------')
#print('Total of ', len(dtl.stream_collection.list()), ' stream(s).')


In [53]:
# Fetch, by id, the datastore that the pipeline definition below uses as its
# target.
# NOTE(review): the id is hard-coded — presumably it refers to the RDBMS target
# store created in the GUI; confirm it exists in the environment you log in to.
my_output_store = dtl.datastore.get("5d4f4f1b-55d6-4ef0-823e-d37d7f58dc3f")


#### Sample pipeline

In [55]:
# Define the target output schema transformation using 'structure'

# (path element, tag) for each field of the target schema, in output order.
# Every field uses the same pick strategy and data type, so only these two
# values vary from one node description to the next.
schema_fields = [
    ("LoanID", "Loan ID"),
    ("Unpaid_Principal", "Unpaid Principal Balance"),
    ("Orig_Val", "Origination Value"),
    ("Int_Type", "Interest Type"),
    ("Mat_Date", "Maturity Date"),
    ("Currency", "Currency"),
    ("Orig_Country", "Country of Origination"),
    ("Days_Past_Due", "Days Past Due"),
]

std_schema = Structure([
    ClassNodeDescription(
        path=[field_path],
        tag=field_tag,
        pick_strategy=PickStrategy.HighScore,
        data_type=DataType.String,
    )
    for field_path, field_tag in schema_fields
])

In [56]:
# Define classify transformation

from datalogue.models.transformations import ReplaceLabel

# First transformation: classification, using context and without emitting
# classes or scores.
classify_step = Classify(use_context=True, include_classes=False, include_scores=False)

# Definition is called positionally with: the list of transformations, the
# list of nested pipelines (none here), and the target datastore.
tx_definition = Definition(
    [classify_step, std_schema],
    [],
    my_output_store,
)

In [57]:
# Sanity check: show the type of the pipeline definition object.
print(type(tx_definition))

<class 'datalogue.models.stream.Definition'>


In [58]:
# Define n stream(s), where n is number of datastore connections created from S3 bucket scan
n = len(current_stores)

# One stream per source datastore; every stream shares the same pipeline
# definition. (The previous manual counter was overwritten by the loop itself
# and had no effect, so it has been dropped.)
list_of_streams = [
    Stream(source_store, [tx_definition]) for source_store in current_stores
]

print(type(list_of_streams))
# Only inspect the first element when there is one, so an empty bucket scan
# yields a clear message instead of an IndexError.
if list_of_streams:
    print(type(list_of_streams[0]))
else:
    print("No streams were defined: current_stores is empty")


<class 'list'>
<class 'datalogue.models.stream.Stream'>


In [60]:
# search the existing stream collections
matching_ids = [
    collection.id
    for collection in dtl.stream_collection.list()
    if collection.name == 'NPLDemoPipeline'
]

# As before, when several collections share the name only the last one listed
# is targeted; '' means no collection with that name exists.
scid = matching_ids[-1] if matching_ids else ''

# delete the existing stream collection — but only when one was found, so that
# the SDK is never asked to delete an empty id.
if scid:
    deleted = dtl.stream_collection.delete(stream_collection_id = scid)
else:
    deleted = False

# Displayed as the cell output: the result of the delete call, or False when
# there was nothing to delete.
deleted

StreamCollection(id: b8156451-8045-4d6c-a863-ae3d2ff32782, name: 'NPLDemoPipeline', streams: [StreamMetadata(id: 776f90e0-9d7b-4246-86c0-13c234f9a4c6, is_ready: True, stream: Stream(type: <datalogue.models.datastore.HttpDatastoreDef object at 0x0000024A1D7B2E80>, pipelines: [Pipeline(type: [Classify(paths: , options: UseContext), Structure(structure: [ClassNodeDescription(path: ['LoanID'], tag: Loan ID, strategy: PickStrategy.HighScore, dataType: DataType.String), ClassNodeDescription(path: ['Unpaid_Principal'], tag: Unpaid Principal Balance, strategy: PickStrategy.HighScore, dataType: DataType.String), ClassNodeDescription(path: ['Orig_Val'], tag: Origination Value, strategy: PickStrategy.HighScore, dataType: DataType.String), ClassNodeDescription(path: ['Int_Type'], tag: Interest Type, strategy: PickStrategy.HighScore, dataType: DataType.String), ClassNodeDescription(path: ['Mat_Date'], tag: Maturity Date, strategy: PickStrategy.HighScore, dataType: DataType.String), ClassNodeDescrip

True

In [62]:
# Put the streams in a collection

# Arguments are passed positionally: the list of Stream objects, then the
# collection name. The object returned by the SDK is kept in `stream_collection2`.
# NOTE(review): the clean-up cell searches for a collection named
# 'NPLDemoPipeline', while this one is created as "StreamCollectionName" —
# confirm which name is intended.
stream_collection2 = dtl.stream_collection.create(
    list_of_streams,
    "StreamCollectionName"
)

<class 'list'>
<class 'datalogue.models.stream.Stream'>


In [63]:
# Run the Collection

# Calls the SDK's `run` for the stream collection created above, identified by
# its id; the call's return value is displayed as the cell output.
dtl.stream_collection.run(stream_collection2.id)

True