In [1]:
from google.cloud import bigquery

In [2]:
import yaml
# Load the notebook configuration from config.yaml in the working directory.
# NOTE(review): yaml.FullLoader is used here; if config.yaml is ever not fully
# trusted, yaml.safe_load is the safer choice.
with open("config.yaml") as config_file:
    config = yaml.load(config_file, Loader=yaml.FullLoader)

# "<project_id>.<dataset>." prefix that the table names below are appended to.
table_header = ".".join(
    (config['project']['project_id'], config['databases']['dataset'], "")
)

In [3]:
# BigQuery client used by the query cells below. No project or credentials
# are passed explicitly, so both come from whatever the environment provides.
client = bigquery.Client()

In [10]:
# Suffix appended to the pipeline-specific table names built below
# (ml_baseline, account_mapping, embeddings). Hard-coded; presumably the id of
# one particular pipeline run — TODO confirm before reusing this notebook.
pipeline_uri = "0c57e63e49134502a0b2813c9a0e6d49"

In [6]:
# Fully-qualified id of the pipeline-specific ML baseline table:
# <table_header><ml_baseline table name><pipeline_uri>.
table_id = "".join(
    (table_header, config['databases']['ml_baseline'], pipeline_uri)
)

# Presumably the display name for the training dataset; it is not referenced
# in the cells shown here.
display_name = "elliptic_baseline_training_dataset"

# NOTE(review): hard-coded "bq://" URI. Unlike table_id it is not derived from
# config.yaml — confirm it points at the intended table.
bq_source = "bq://katana-clusters-beta.fsi_elliptic.auto_ml_baseline_training"

In [30]:
# Fully-qualified table ids used by the SQL cells below.
# The mapping and embeddings tables are pipeline-specific (suffixed with
# pipeline_uri); the account features table is not.
mapping_table = "".join(
    (table_header, config['databases']['account_mapping'], pipeline_uri)
)
features = "".join((table_header, config['databases']['account_features']))
embeddings = "".join(
    (table_header, config['databases']['embeddings'], pipeline_uri)
)

In [27]:
# Baseline sample: every account-features column plus the split code and the
# target from the account mapping table, joined on account_id.
#
# The table paths are wrapped in backticks so that a hyphenated project id or
# an unusual table-name suffix is always parsed as a single identifier.
# LIMIT 10 keeps this a quick preview; there is no ORDER BY, so which ten rows
# come back is not guaranteed to be stable between runs.
sql = f"""
SELECT af.*, am.split, am.target
FROM `{mapping_table}` AS am
JOIN `{features}` AS af
  ON am.account_id = af.account_id
LIMIT 10"""

# Run the query and materialise the result as a pandas DataFrame.
# Note: `df` is reassigned by the sql_enhanced cell that follows.
df = client.query(sql).to_dataframe()

In [33]:
# Enhanced sample: the baseline columns (account features + split + target)
# plus every column of the embeddings table, joined on account_gid.
#
# The table paths are wrapped in backticks so that a hyphenated project id or
# an unusual table-name suffix is always parsed as a single identifier.
# Because both af.* and em.* are selected, columns whose names collide appear
# to come back with a numeric suffix (e.g. event_timestamp_1 in the output).
# LIMIT 10 keeps this a quick preview; there is no ORDER BY, so which ten rows
# come back is not guaranteed to be stable between runs.
sql_enhanced = f"""
SELECT af.*, am.split, am.target, em.*
FROM `{mapping_table}` AS am
JOIN `{features}` AS af
  ON am.account_id = af.account_id
JOIN `{embeddings}` AS em
  ON am.account_gid = em.account_gid
LIMIT 10"""

# Run the query and materialise the result as a pandas DataFrame.
df = client.query(sql_enhanced).to_dataframe()

In [34]:
# Preview the first rows of the most recently queried frame.
df.head()

Unnamed: 0,account_id,local_feat_2,local_feat_3,local_feat_4,local_feat_5,local_feat_6,local_feat_7,local_feat_8,local_feat_9,local_feat_10,...,embed_7,embed_8,embed_9,embed_10,embed_11,embed_12,embed_13,embed_14,embed_15,event_timestamp_1
0,232061267,-0.172216,-0.184668,-1.201369,0.028105,-0.024025,0.054722,-0.061584,-0.163639,-0.168723,...,0.954046,0.0,0.015782,0.0,0.0,0.0,0.0,0.381836,0.0,2022-09-23 00:00:00+00:00
1,116792090,0.455667,-0.043449,1.018602,0.17818,0.095076,0.222447,-0.061584,-0.163508,0.196333,...,0.084079,0.0,0.0,0.299205,0.0,0.0,0.065872,0.0,0.0,2022-09-23 00:00:00+00:00
2,87125434,-0.146885,-0.184668,-1.201369,-0.12197,0.055376,-0.113002,-0.061584,-0.136943,-0.143132,...,0.96944,0.0,0.127611,0.0,0.0,0.0,0.176746,0.506385,0.0,2022-09-23 00:00:00+00:00
3,94478361,-0.172873,-0.190531,0.463609,-0.12197,-0.024025,-0.113002,-0.061584,-0.163534,-0.16935,...,0.0,0.0,0.039523,0.778526,0.0,0.0,1.124122,0.0,0.0,2022-09-23 00:00:00+00:00
4,21849470,1.33063,-0.190168,0.463609,-0.12197,-0.004174,-0.113002,-0.061584,1.374772,1.347396,...,0.172755,0.0,0.709343,0.425316,0.0,0.0,0.764778,0.0,0.0,2022-09-23 00:00:00+00:00


In [35]:
# Set the train / validate / test split column and drop the columns that are
# no longer needed.
#
# The numeric split code is translated to a string label (0 -> TRAIN,
# 1 -> VALIDATE, 2 -> TEST); any other code becomes NaN. The numeric `split`
# column, the account id and both timestamp columns are then removed.
#
# The guard makes the cell safe to re-run: once `split_str` exists the frame
# has already been transformed, and repeating the step would raise a KeyError
# because `split` has been dropped. On a fresh frame a missing column still
# raises, exactly as before.
if "split_str" not in df.columns:
    df = (
        df
        .assign(split_str=df["split"].map({0: "TRAIN", 1: "VALIDATE", 2: "TEST"}))
        .drop(columns=["split", "account_id", "event_timestamp", "event_timestamp_1"])
    )

In [36]:
# Re-inspect the frame after the split relabel and column drop.
df.head()

Unnamed: 0,local_feat_2,local_feat_3,local_feat_4,local_feat_5,local_feat_6,local_feat_7,local_feat_8,local_feat_9,local_feat_10,local_feat_11,...,embed_7,embed_8,embed_9,embed_10,embed_11,embed_12,embed_13,embed_14,embed_15,split_str
0,-0.172216,-0.184668,-1.201369,0.028105,-0.024025,0.054722,-0.061584,-0.163639,-0.168723,-0.044339,...,0.954046,0.0,0.015782,0.0,0.0,0.0,0.0,0.381836,0.0,TRAIN
1,0.455667,-0.043449,1.018602,0.17818,0.095076,0.222447,-0.061584,-0.163508,0.196333,2.171767,...,0.084079,0.0,0.0,0.299205,0.0,0.0,0.065872,0.0,0.0,TEST
2,-0.146885,-0.184668,-1.201369,-0.12197,0.055376,-0.113002,-0.061584,-0.136943,-0.143132,-0.049707,...,0.96944,0.0,0.127611,0.0,0.0,0.0,0.176746,0.506385,0.0,TRAIN
3,-0.172873,-0.190531,0.463609,-0.12197,-0.024025,-0.113002,-0.061584,-0.163534,-0.16935,-0.049707,...,0.0,0.0,0.039523,0.778526,0.0,0.0,1.124122,0.0,0.0,VALIDATE
4,1.33063,-0.190168,0.463609,-0.12197,-0.004174,-0.113002,-0.061584,1.374772,1.347396,-0.049707,...,0.172755,0.0,0.709343,0.425316,0.0,0.0,0.764778,0.0,0.0,TRAIN


In [37]:
# List the column labels that remain in the frame.
df.columns

Index(['local_feat_2', 'local_feat_3', 'local_feat_4', 'local_feat_5',
       'local_feat_6', 'local_feat_7', 'local_feat_8', 'local_feat_9',
       'local_feat_10', 'local_feat_11',
       ...
       'embed_7', 'embed_8', 'embed_9', 'embed_10', 'embed_11', 'embed_12',
       'embed_13', 'embed_14', 'embed_15', 'split_str'],
      dtype='object', length=112)