In [1]:
import datarobot as dr
import pandas as pd
import boto3
import time
import os

import my_creds

### Establish DataRobot Environment Connectivity

In [2]:
# The API token is read from the local my_creds module rather than written into the notebook.
API_TOKEN = my_creds.DR_API_TOKEN
DR_APP_ENDPOINT = 'https://app.datarobot.com' + '/api/v2/'

# Create the DataRobot client for this endpoint; as the cell's last
# expression, the resulting client object is displayed below.
dr.Client(endpoint=DR_APP_ENDPOINT, token=API_TOKEN)

<datarobot.rest.RESTClientObject at 0x121d73310>

### Establish AWS Environment Connectivity
Establish a connection to AWS using the credentials in ~/.aws/credentials.
Rather than the default profile, the [support] profile from the credentials file is loaded below.

In [3]:
# Use the named profile from the AWS credentials file instead of the default one.
aws_profile = 'support'
session = boto3.Session(profile_name=aws_profile)

#### Athena Helper Functions
Based on https://gist.github.com/schledererj/b2e2a800998d61af2bbdd1cd50e08b76

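fetchall_athena_sql - run a query and return its results to the local environment as a list of dictionaries (one per row)

fetch_athena_file - run a query and return the name of the results file that Athena writes to S3, so it can be downloaded to the local environment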

### Create Project from Paginated SQL Results

In [4]:
def fetchall_athena_sql(query_string, client, db, s3_out_loc, poll_interval=10):
    """Run a query on Athena and return all result rows as dictionaries.

    Args:
        query_string: a SQL-like query that Athena will execute.
        client: an Athena client created with boto3.
        db: database name used as the query execution context.
        s3_out_loc: S3 output location for the query results.
        poll_interval: seconds to wait between status checks while the
            query is still QUEUED or RUNNING (defaults to 10).

    Returns:
        A list with one dict per data row. The first row returned by the
        paginator is treated as the header and supplies the keys; each
        value is the cell's 'VarCharValue', or None when the cell has none.

    Raises:
        Exception: if the query ends in the FAILED or CANCELLED state.
    """
    query_id = client.start_query_execution(
        QueryString=query_string,
        QueryExecutionContext={'Database': db},
        ResultConfiguration={'OutputLocation': s3_out_loc},
    )['QueryExecutionId']

    # Poll until the query leaves the QUEUED/RUNNING states. Sleeping only
    # happens while the query is still pending, so a finished query returns
    # immediately instead of paying one extra wait.
    while True:
        status = client.get_query_execution(QueryExecutionId=query_id)['QueryExecution']['Status']
        query_status = status['State']
        if query_status in ('FAILED', 'CANCELLED'):
            message = 'Athena query with the string "{}" failed or was cancelled'.format(query_string)
            # Surface the service-provided reason when there is one.
            reason = status.get('StateChangeReason')
            if reason:
                message += ': {}'.format(reason)
            raise Exception(message)
        if query_status not in ('QUEUED', 'RUNNING'):
            break
        time.sleep(poll_interval)

    results_iter = client.get_paginator('get_query_results').paginate(
        QueryExecutionId=query_id,
        PaginationConfig={'PageSize': 1000},
    )

    results = []
    column_names = None
    for results_page in results_iter:
        for row in results_page['ResultSet']['Rows']:
            column_values = [col.get('VarCharValue') for col in row['Data']]
            if column_names is None:
                # The very first row carries the column headers.
                column_names = column_values
            else:
                results.append(dict(zip(column_names, column_values)))
    return results

In [5]:
# Query inputs: target database, S3 location for Athena's output, and the SQL text.
database = 'community_athena_demo_db'
s3_out = 's3://engineering/athena/output/'
query = "select * from loan_history limit 100"

# Athena client created from the profile-backed session.
athena_client = session.client('athena')

query_results = fetchall_athena_sql(
    query,
    athena_client,
    database,
    s3_out,
)

In [6]:
# Load the query result rows into a DataFrame so they can be viewed and manipulated.
df = pd.DataFrame(data=query_results)

# Preview the first two rows.
df.head(n=2)

Unnamed: 0,loan_id,loan_app_date,loan_amnt,funded_amnt,term,int_rate,installment,grade,sub_grade,emp_title,...,purpose,title,zip_code,addr_state,dti,initial_list_status,policy_code,is_bad,year,month
0,10001,+47994-03-21 00:00:00.000,16300,16200,60 months,7.29%,416.48,A,A4,Time Warner Cable,...,medical,Medical,766xx,TX,10.87,f,1,0,2016,1
1,10002,+47996-12-15 00:00:00.000,9100,9300,60 months,18.25%,277.15,F,F1,Ottawa University,...,debt_consolidation,My Debt Consolidation Loan,660xx,KS,9.15,f,1,0,2016,1


In [7]:
# Create the project directly from the in-memory DataFrame.
proj = dr.Project.create(
    sourcedata=df,
    project_name='athena load query',
)

# Continue with the project through the Python API, or open it in the GUI
# using the link printed below.
app_root = DR_APP_ENDPOINT[:-7]  # drop the trailing 7-character 'api/v2/' suffix
project_link = app_root + 'projects/{}'.format(proj.id)
print(project_link)

https://app.datarobot.com/projects/5e9044e1e2c1


### Create Project from Downloaded Query Results in S3

In [8]:
def fetch_athena_file(query_string, ath_client, db, s3_out_loc, s3_client, loc_path, poll_interval=10):
    """Run a query on Athena and return the name of its results file in S3.

    This function does not download anything; it only waits for the query
    to finish and reports the file name so the caller can fetch it.

    Args:
        query_string: a SQL-like query that Athena will execute.
        ath_client: an Athena client created with boto3.
        db: database name used as the query execution context.
        s3_out_loc: S3 output location for the query results.
        s3_client: an S3 client created with boto3. Not used by this
            function; kept so existing callers keep working.
        loc_path: path to store the file locally. Not used by this
            function; kept so existing callers keep working.
        poll_interval: seconds to wait between status checks while the
            query is still QUEUED or RUNNING (defaults to 10).

    Returns:
        The final path component of the 'OutputLocation' that Athena
        reports for the finished query.

    Raises:
        Exception: if the query ends in the FAILED or CANCELLED state.
    """
    query_id = ath_client.start_query_execution(
        QueryString=query_string,
        QueryExecutionContext={'Database': db},
        ResultConfiguration={'OutputLocation': s3_out_loc},
    )['QueryExecutionId']

    # Poll until the query leaves the QUEUED/RUNNING states. Sleeping only
    # happens while the query is still pending, so a finished query returns
    # immediately instead of paying one extra wait.
    while True:
        ath_result = ath_client.get_query_execution(QueryExecutionId=query_id)
        status = ath_result['QueryExecution']['Status']
        query_status = status['State']
        if query_status in ('FAILED', 'CANCELLED'):
            message = 'Athena query with the string "{}" failed or was cancelled'.format(query_string)
            # Surface the service-provided reason when there is one.
            reason = status.get('StateChangeReason')
            if reason:
                message += ': {}'.format(reason)
            raise Exception(message)
        if query_status not in ('QUEUED', 'RUNNING'):
            break
        time.sleep(poll_interval)

    s3_file = ath_result['QueryExecution']['ResultConfiguration']['OutputLocation']

    # The location is an S3 URI, which always uses '/' as its separator, so
    # split on it explicitly instead of relying on the local OS path rules.
    return s3_file.rsplit('/', 1)[-1]

In [9]:
# Clients for running the query (Athena) and retrieving its output (S3).
athena_client = session.client('athena')
s3_client = session.client('s3')

# Where Athena runs the query and where it writes the results.
database = 'community_athena_demo_db'
s3_out_bucket = 'engineering'
s3_out_path = 'athena/output/'
s3_out = 's3://' + s3_out_bucket + '/' + s3_out_path

# Store the downloaded file in the notebook's current working directory.
# os.getcwd() avoids both a machine-specific absolute path and the
# IPython-only `!pwd` shell escape.
local_path = os.getcwd()

query = (
    "select lh.loan_id, "
    "lh.loan_amnt, lh.term, lh.int_rate, lh.installment, lh.grade, lh.sub_grade, "
    "lh.emp_title, lh.emp_length, lh.home_ownership, lh.annual_inc, lh.verification_status,  "
    "lh.pymnt_plan, lh.purpose, lh.title, lh.zip_code, lh.addr_state, lh.dti,  "
    "lh.installment / (lh.annual_inc / 12) as mnthly_paymt_to_income_ratio, "
    "lh.is_bad, "
    "lc.delinq_2yrs, lc.earliest_cr_line, lc.inq_last_6mths, lc.mths_since_last_delinq, lc.mths_since_last_record, "
    "lc.open_acc, lc.pub_rec, lc.revol_bal, lc.revol_util, lc.total_acc, lc.mths_since_last_major_derog "
    "from community_athena_demo_db.loan_credit lc "
    "join community_athena_demo_db.loan_history lh on lc.loan_id = lh.loan_id"
)

# Run the query; the return value is the name of the results file in S3.
s3_file = fetch_athena_file(query, athena_client, database, s3_out, s3_client, local_path)

In [10]:
# Build the local destination once and reuse it for both the download and
# the project upload, so the two steps cannot drift apart.
local_file = os.path.join(local_path, s3_file)

# get results file from S3
s3_client.download_file(s3_out_bucket, s3_out_path + s3_file, local_file)

proj = dr.Project.create(local_file,
    project_name='athena load file')

# further work with project via the python API, or work in GUI (link to project printed below)
# The slice drops the trailing 7-character 'api/v2/' suffix from the endpoint.
print(DR_APP_ENDPOINT[:-7] + 'projects/{}'.format(proj.id))

https://app.datarobot.com/projects/04d0b2fa41


### Create Project from Signed S3 URL

In [11]:
# Identify the results object in S3, then request a presigned GET URL for it
# with ExpiresIn set to 3600.
presign_params = {
    'Bucket': s3_out_bucket,
    'Key': s3_out_path + s3_file,
}
response = s3_client.generate_presigned_url(
    'get_object',
    Params=presign_params,
    ExpiresIn=3600,
)

In [12]:
# Create the project from the presigned URL held in `response`.
proj = dr.Project.create(
    response,
    project_name='athena signed url',
)

# Continue with the project through the Python API, or open it in the GUI
# using the link printed below.
app_root = DR_APP_ENDPOINT[:-7]  # drop the trailing 7-character 'api/v2/' suffix
project_link = app_root + 'projects/{}'.format(proj.id)
print(project_link)

https://app.datarobot.com/projects/5ef04c2af1967
