# Introduction

This notebook demonstrates how to use Gretel.ai to generate synthetic data for a financial transactions dataset.

The generated data will maintain the statistical properties of the original data while preserving privacy.


## Prerequisites

In [6]:
!pip install -U --quiet pandas gretel-client


In [4]:
# Interactive setup: prompts for the endpoint, artifact endpoint, default runner,
# API key and default project, then writes the configuration file to disk.
# NOTE(review): the captured output includes the logged-in account email —
# clear this cell's output before sharing the notebook.
!gretel configure

[33m
Gretel.ai COPYRIGHT Notice
[0m

The Gretel CLI and Python SDK, installed through the "gretel-client"
package or other mechanism is free and open source software under
the Apache 2.0 License.

When using the CLI or SDK, you may launch "Gretel Worker(s)"
that are hosted in your local environment as containers. These
workers are launched automatically when running commands that create
models or process data records.

The "Gretel Worker" and all code within it is copyrighted and an
extension of the Gretel Service and licensed under the Gretel.ai
Terms of Service.  These terms can be found at https://gretel.ai/terms
section G paragraph 2.



Endpoint [https://api.gretel.cloud]: 
Artifact Endpoint [cloud]: 
Default Runner (cloud, local, hybrid) [cloud]: 
Gretel API Key [None]: 
Default Project [none]: 
Using endpoint https://api.gretel.cloud
Logged in as ericphamhung@gmail.com ✅
[32mINFO: [0mConfiguration written to /root/.gretel/config.json. Done.
{
    "endpoint": "https://api.gre

In [5]:
# Confirm which account and configuration the CLI is using.
# The API key is printed masked, but the account email is shown in full.
!gretel whoami

{
    "email": "ericphamhung@gmail.com",
    "config": {
        "endpoint": "https://api.gretel.cloud",
        "artifact_endpoint": "cloud",
        "api_key": "grtu6836****",
        "default_project_name": null,
        "default_runner": "cloud",
        "preview_features": "disabled"
    }
}


# Main
This section covers the main steps of data loading, model training, data generation, and analysis.


In [7]:
# Import required libraries

import pandas as pd
from gretel_client import Gretel



In [8]:
# Load the transaction data from the previous notebook. This simulates the "real" data a customer would have.
# The path is relative, so transactions_merged.csv must sit in the notebook's working directory.
df = pd.read_csv('transactions_merged.csv')
# Preview the first rows to check the columns that were loaded.
df.head()

  df_html=dataframe._repr_html_(),  # pylint: disable=protected-access
  df_html=dataframe._repr_html_(),  # pylint: disable=protected-access


Unnamed: 0,customer_id,transaction_date,transaction_amount,transaction_type,transaction_category,starting_amount,account_balance
0,CUST_001,2022-01-01,100.0,credit,rent,1000.0,1100.0
1,CUST_001,2022-01-01,50.0,debit,groceries,1000.0,1050.0
2,CUST_001,2022-01-02,75.0,credit,salary,1000.0,1125.0
3,CUST_001,2022-01-02,25.0,debit,entertainment,1000.0,1100.0
4,CUST_001,2022-01-03,150.0,credit,investment,1000.0,1250.0


In [67]:
# Inspect every original transaction recorded for customer CUST_055.
df.loc[df['customer_id'] == 'CUST_055']

Unnamed: 0,customer_id,transaction_date,transaction_amount,transaction_type,transaction_category,starting_amount,account_balance
281,CUST_055,2022-08-05,15000.0,debit,Purchase,750.0,-14250.0
874,CUST_055,2022-02-06,60.0,debit,subscription,750.0,-14310.0


In [9]:
# Prepare the training data.
# Work on an independent copy: with a plain `train_df = df` both names point at
# the same object, so dropping a column from train_df in place would also remove
# it from df, which later cells still inspect with its account_balance column.
train_df = df.copy()
# Optional hold-out split (disabled: the whole dataset is used for training).
# train_df = df.sample(frac=0.8, random_state=42)
# test_df = df.drop(train_df.index)

In [10]:
# We are removing account balance here because it is a calculated column.
# The intention is to derive the account balance values from the simulated transaction amounts.
# drop() without inplace returns a new frame, so the original df keeps its
# account_balance column; errors='ignore' makes re-running this cell a no-op
# instead of raising KeyError once the column is already gone.
train_df = train_df.drop(columns=['account_balance'], errors='ignore')
train_df

  df_html=dataframe._repr_html_(),  # pylint: disable=protected-access


Unnamed: 0,customer_id,transaction_date,transaction_amount,transaction_type,transaction_category,starting_amount
0,CUST_001,2022-01-01,100.0,credit,rent,1000.0
1,CUST_001,2022-01-01,50.0,debit,groceries,1000.0
2,CUST_001,2022-01-02,75.0,credit,salary,1000.0
3,CUST_001,2022-01-02,25.0,debit,entertainment,1000.0
4,CUST_001,2022-01-03,150.0,credit,investment,1000.0
...,...,...,...,...,...,...
995,CUST_048,2023-01-17,1210.0,credit,salary,0.0
996,CUST_049,2023-01-18,1000.0,debit,groceries,0.0
997,CUST_049,2023-01-18,1230.0,credit,salary,0.0
998,CUST_050,2023-01-19,1020.0,debit,dining,0.0


In [46]:
# Load the model configuration

from gretel_client.projects.models import read_model_config

# Read the "synthetics/default" configuration template so its settings can be inspected.
# NOTE(review): config_dict is only displayed; the training cell passes the
# "synthetics/tabular-lstm" template name instead — confirm the two are meant to match.
config_dict = read_model_config("synthetics/default")

config_dict



{'schema_version': '1.0',
 'name': 'tabular-lstm',
 'models': [{'synthetics': {'data_source': '__tmp__',
    'params': {'epochs': 'auto',
     'vocab_size': 'auto',
     'learning_rate': 'auto',
     'batch_size': 'auto',
     'rnn_units': 'auto'},
    'generate': {'num_records': 5000},
    'privacy_filters': {'outliers': 'auto', 'similarity': 'auto'}}}]}

In [47]:
import pandas as pd

from gretel_client import create_or_get_unique_project
from gretel_client import poll

# Make every column label a string before handing the frame to Gretel.
train_df.columns = train_df.columns.astype(str)
# NOTE(review): presumably reuses the "gretel-assignment" project when it already exists — confirm.
proj = create_or_get_unique_project(name="gretel-assignment")


# Train model on "real" data
model = proj.create_model_obj(model_config="synthetics/tabular-lstm", data_source=train_df)

# Submit the training job to Gretel Cloud; the next cell follows its progress with poll().
model.submit_cloud()


Model(id=6751f8ff0de2667a8fff96dc, project=proj_2pidXfwDmSfJeUhxqsVlaLv2mGm)

In [48]:
# Monitor the model training progress.
# poll() prints the job's status changes and worker log lines into the cell output.
poll(model)

INFO: Starting poller


{
    "uid": "6751f8ff0de2667a8fff96dc",
    "guid": "model_2poHFyGpScdh6HLLNoZMpKYFKXP",
    "model_name": "tabular-lstm",
    "model_key": "",
    "runner_mode": "cloud",
    "user_id": "674f25bb04ad3047687b5568",
    "user_guid": "user_2piDxsBzv9Bui42s1SGlF8FOQ2F",
    "billing_domain": "47b326e1cf4946efafc6298a7030aeba.gretel",
    "billing_domain_guid": "domain_2plpw9z74e3cBK2BtTVyERBueqt",
    "project_id": "674f5706002447fa52833677",
    "project_guid": "proj_2pidXfwDmSfJeUhxqsVlaLv2mGm",
    "cluster_guid": null,
    "status_history": {
        "created": "2024-12-05T19:03:27.819895Z"
    },
    "last_modified": "2024-12-05T19:03:28.043895Z",
    "status": "created",
    "last_active_hb": null,
    "duration_minutes": null,
    "error_msg": null,
    "error_id": null,
    "traceback": null,
    "annotations": null,
    "provenance": null,
    "container_image": "074762682575.dkr.ecr.us-west-2.amazonaws.com/gretelai/synthetics@sha256:e09c09a1469d611d51739d4d8a8aa01c76b591ed1e05d

INFO: Status is created. Model creation has been queued.
INFO: Status is pending. A Gretel Cloud worker is being allocated to begin model creation.
INFO: Status is active. A worker has started creating your model!
2024-12-05T19:04:05.609246Z  Analyzing input data and checking for auto-params...
2024-12-05T19:04:05.627197Z  Found 5 auto-params that were set based on input data.
{
    "epochs": 100,
    "batch_size": 64,
    "vocab_size": 20000,
    "rnn_units": 256,
    "learning_rate": 0.01
}
2024-12-05T19:04:05.627798Z  Using updated model configuration: 
{
    "schema_version": "1.0",
    "name": "tabular-lstm",
    "models": [
        {
            "synthetics": {
                "privacy_filters": {
                    "outliers": "auto",
                    "similarity": "auto",
                    "max_iterations": 10
                },
                "data_source": [
                    "gretel_f53171e906a14c96a1b71948aa3a1b27_dataframe-56093101-621d-4d9c-ab8e-a02e0c061ac5.csv"

In [49]:
# Generate synthetic data
# Ask the trained model for 1000 synthetic records.
handler = model.create_record_handler_obj(params={"num_records": 1000})
# Queue the generation job on Gretel Cloud.
handler.submit_cloud()

<gretel_client.projects.records.RecordHandler at 0x7c94081f5990>

In [50]:
# Monitor the data generation progress.
# The output reports status changes and running valid/invalid record counts.
poll(handler)

INFO: Starting poller


{
    "uid": "6751fb48c80c76731f505df5",
    "guid": "model_run_2poIRQjo7zvp3jbRbMbIPvsig9i",
    "model_name": null,
    "model_key": "",
    "runner_mode": "cloud",
    "user_id": "674f25bb04ad3047687b5568",
    "user_guid": "user_2piDxsBzv9Bui42s1SGlF8FOQ2F",
    "billing_domain": "47b326e1cf4946efafc6298a7030aeba.gretel",
    "billing_domain_guid": "domain_2plpw9z74e3cBK2BtTVyERBueqt",
    "project_id": "674f5706002447fa52833677",
    "project_guid": "proj_2pidXfwDmSfJeUhxqsVlaLv2mGm",
    "cluster_guid": null,
    "status_history": {
        "created": "2024-12-05T19:13:12.739000Z"
    },
    "last_modified": "2024-12-05T19:13:12.842000Z",
    "status": "created",
    "last_active_hb": null,
    "duration_minutes": null,
    "error_msg": null,
    "error_id": null,
    "traceback": null,
    "annotations": null,
    "provenance": null,
    "container_image": "074762682575.dkr.ecr.us-west-2.amazonaws.com/gretelai/synthetics@sha256:e09c09a1469d611d51739d4d8a8aa01c76b591ed1e05deea50f

INFO: Status is created. A Record generation job has been queued.
INFO: Status is pending. A Gretel Cloud worker is being allocated to begin generating synthetic records.
INFO: Status is active. A worker has started!
2024-12-05T19:13:29.007805Z  Loading model to worker
2024-12-05T19:13:47.496557Z  Checking for synthetic smart seeds
2024-12-05T19:13:47.496938Z  No smart seeds provided, will attempt generation without them
2024-12-05T19:13:47.497189Z  Loading model
2024-12-05T19:13:47.497557Z  Fallback model is available to use if needed.
2024-12-05T19:13:53.737434Z  LSTM model is available for generation.
2024-12-05T19:13:53.738228Z  Generating records...
{
    "num_records": 1000
}
2024-12-05T19:13:53.739549Z  Generation in progress
{
    "current_valid_count": 0,
    "current_invalid_count": 0,
    "new_valid_count": 0,
    "new_invalid_count": 0,
    "completion_percent": 0.0
}
2024-12-05T19:13:58.745663Z  Generation in progress
{
    "current_valid_count": 0,
    "current_invalid_co

In [51]:
# Download the generated data.
# Artifacts are written under more_syn_data/; the next cell expects data.gz there.
handler.download_artifacts("more_syn_data/")

In [52]:
# Unzip the downloaded data file
!gzip -d more_syn_data/data.gz

gzip: more_syn_data/data already exists; do you wish to overwrite (y or n)? y


In [53]:
# Load the synthetic data into a pandas DataFrame.
# more_syn_data/data is the decompressed file produced by the gzip step.
synth_df = pd.read_csv("more_syn_data/data")

In [54]:
# Inspect the synthetic transactions generated for customer CUST_001.
synth_df.loc[synth_df['customer_id'] == 'CUST_001']

Unnamed: 0,customer_id,transaction_date,transaction_amount,transaction_type,transaction_category,starting_amount
74,CUST_001,2022-01-04,150.0,credit,loan,1000.0
75,CUST_001,2022-01-01,980.25,credit,deposit,1000.0
90,CUST_001,2022-01-01,80.0,credit,loan,1000.0
113,CUST_001,2022-01-02,25.0,credit,salary,1000.0
123,CUST_001,2022-01-02,300.0,debit,entertainment,1000.0
153,CUST_001,2022-01-04,30.0,credit,housing,1000.0
184,CUST_001,2022-01-02,25.0,credit,salary,1000.0
191,CUST_001,2022-01-01,80.0,credit,loan,1000.0
195,CUST_001,2022-01-03,300.0,credit,entertainment,1000.0
225,CUST_001,2022-01-01,50.0,credit,sports,1000.0


In [55]:
# Analyze the distribution of customer IDs in the synthetic data:
# number of synthetic transactions per customer.
synth_df['customer_id'].value_counts()

Unnamed: 0,customer_id
CUST_004,51
CUST_009,47
CUST_014,44
CUST_013,44
CUST_006,42
...,...
CUST_060,1
CUST_052,1
CUST_069,1
CUST_071,1


In [56]:
# Compare with the distribution in the original data:
# number of training transactions per customer.
train_df['customer_id'].value_counts()

Unnamed: 0,customer_id
CUST_002,43
CUST_001,41
CUST_005,38
CUST_009,38
CUST_006,37
...,...
CUST_059,1
CUST_058,1
CUST_057,1
CUST_056,1


In [59]:
# Define a function to compute the account balance
def compute_balance(row, balances):
    """Return the running account balance after applying one transaction.

    Args:
        row: Mapping-like record (for example a DataFrame row) providing the keys
            'customer_id', 'starting_amount', 'transaction_type' and
            'transaction_amount'.
        balances: Dict mapping customer_id to that customer's latest balance.
            It is updated in place, so the same dict must be passed for every
            row of one run.

    Returns:
        The customer's balance after this transaction has been applied.
    """
    customer = row['customer_id']
    # The first transaction seen for a customer starts from that row's starting_amount.
    opening = balances.get(customer, row['starting_amount'])
    # Only the exact string "credit" adds to the balance; any other type subtracts.
    if row['transaction_type'] == "credit":
        delta = row['transaction_amount']
    else:
        delta = -row['transaction_amount']
    closing = opening + delta
    balances[customer] = closing
    return closing

# Compute the account balance for each transaction.
# Rows are visited in their existing order (nothing sorts them by date here), and
# the shared dict carries each customer's latest balance from one row to the next.
balances = {}
synth_df['account_balance'] = synth_df.apply(lambda txn: compute_balance(txn, balances), axis=1)


In [60]:
# Display the synthetic transactions together with the derived account_balance column.
synth_df

Unnamed: 0,customer_id,transaction_date,transaction_amount,transaction_type,transaction_category,starting_amount,account_balance
0,CUST_012,2022-01-05,120.00,debit,rent,750.0,630.00
1,CUST_040,2023-01-09,500.00,debit,Food,0.0,-500.00
2,CUST_030,2022-02-01,1700.00,credit,housing,0.0,1700.00
3,CUST_012,2022-01-12,350.00,credit,investment,750.0,980.00
4,CUST_015,2022-02-26,590.00,debit,salary,450.0,-140.00
...,...,...,...,...,...,...,...
995,CUST_014,2022-03-17,25.00,debit,groceries,100.0,25930.00
996,CUST_012,2022-03-10,35.99,credit,bonus,750.0,30860.99
997,CUST_009,2022-01-28,1200.00,credit,rent,700.0,14785.99
998,CUST_004,2022-01-21,40.00,credit,rent,1200.0,6487.97


In [63]:
# Inspect the synthetic transactions and running balance for customer CUST_012.
synth_df.loc[synth_df['customer_id'] == 'CUST_012']

Unnamed: 0,customer_id,transaction_date,transaction_amount,transaction_type,transaction_category,starting_amount,account_balance
0,CUST_012,2022-01-05,120.0,debit,rent,750.0,630.0
3,CUST_012,2022-01-12,350.0,credit,investment,750.0,980.0
25,CUST_012,2022-01-10,9600.0,credit,Deposit,750.0,10580.0
79,CUST_012,2022-01-12,850.0,credit,Shopping,750.0,11430.0
95,CUST_012,2022-01-12,850.0,credit,Shopping,750.0,12280.0
104,CUST_012,2022-02-14,450.0,credit,rent,750.0,12730.0
137,CUST_012,2022-03-07,25.99,debit,groceries,750.0,12704.01
142,CUST_012,2022-01-03,90.0,credit,housing,750.0,12794.01
146,CUST_012,2022-01-13,130.0,debit,subscription,750.0,12664.01
151,CUST_012,2022-01-12,700.0,credit,Rent,750.0,13364.01


In [68]:
# Inspect the original transactions for customer CUST_012 for comparison.
df.loc[df['customer_id'] == 'CUST_012']

Unnamed: 0,customer_id,transaction_date,transaction_amount,transaction_type,transaction_category,starting_amount,account_balance
76,CUST_012,2022-02-14,520.0,credit,rent,750.0,1270.0
77,CUST_012,2022-02-14,440.0,debit,salary,750.0,830.0
78,CUST_012,2022-02-15,530.0,credit,groceries,750.0,1360.0
79,CUST_012,2022-02-15,470.0,debit,entertainment,750.0,890.0
80,CUST_012,2022-02-16,540.0,credit,investment,750.0,1430.0
81,CUST_012,2022-02-16,470.0,debit,transportation,750.0,960.0
82,CUST_012,2022-02-17,550.0,debit,loan,750.0,410.0
159,CUST_012,2022-01-03,1500.0,credit,salary,750.0,1910.0
160,CUST_012,2022-01-03,50.0,debit,groceries,750.0,1860.0
161,CUST_012,2022-01-04,900.0,credit,gift,750.0,2760.0


In [62]:
# Save Output
# Write synth_df to synthetic_data.csv, omitting the DataFrame index.
synth_df.to_csv('synthetic_data.csv', index=False)