In [None]:
from sdv.tabular import CTGAN


In [None]:
import sdv
import pandas as pd

In [None]:
# Read the source table from a CSV path relative to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer='data/loan_defaults.csv')

In [None]:
# Preview the first five rows of the loaded table.
data.head(n=5)

In [None]:
from sdv.constraints import Unique, Inequality, create_custom_constraint

In [None]:
# DAYS_BIRTH is the low column and DAYS_EMPLOYED the high column of the inequality.
# NOTE(review): presumably both columns are signed day offsets where earlier dates
# are more negative, so this expresses "employment cannot start before birth" --
# confirm against the dataset.
age_gt_age_when_joined_constraint = Inequality(low_column_name='DAYS_BIRTH', high_column_name='DAYS_EMPLOYED')

In [None]:
def no_children_working_is_valid(column_names, data, working_age_years):
    """Mark rows whose employment did not start before the minimum working age.

    Args:
        column_names: Sequence whose first entry names the DAYS_BIRTH column
            and whose second entry names the DAYS_EMPLOYED column.
        data: Table indexable by those column names (e.g. a pandas DataFrame).
        working_age_years: Minimum age, in years, at which employment may start.

    Returns:
        Element-wise boolean result, True where the row is valid.
    """
    # NOTE(review): assumes both columns are signed day offsets where earlier
    # dates are more negative -- confirm against the dataset.
    days_birth = data[column_names[0]]
    days_employed = data[column_names[1]]

    # A year is approximated as 365 days (leap days are ignored). The threshold
    # is negative because it is compared with (birth - employment start).
    days_employed_working_age = -working_age_years * 365

    # `<=` rather than `<`: starting work exactly at the working age is allowed,
    # only starting earlier than that is rejected.
    return (days_birth - days_employed) <= days_employed_working_age


# Build a constraint class around the validity check defined for working age.
NoChildWorkerConstraint = create_custom_constraint(is_valid_fn=no_children_working_is_valid)

In [None]:
def no_unemployed_work_phones_is_valid(column_names, data):
    """Mark rows that are not both 'unemployed' and flagged with a work phone.

    Args:
        column_names: Sequence whose first entry names the DAYS_EMPLOYED column
            and whose second entry names the FLAG_WORK_PHONE column.
        data: Table indexable by those column names.

    Returns:
        Element-wise boolean result, False only where the first column is
        greater than 0 (treated here as unemployed) and the second equals 1.
    """
    is_unemployed = data[column_names[0]] > 0
    has_work_phone = data[column_names[1]] == 1

    violating_rows = is_unemployed & has_work_phone
    return ~violating_rows


# Build a constraint class around the unemployed / work-phone validity check.
NoUmemployedWorkPhoneConstraint = create_custom_constraint(is_valid_fn=no_unemployed_work_phones_is_valid)

In [None]:
def no_col1_gt_col2_not_match_is_valid(column_names, data, gt_val, match_val):
    """Reject rows where the first column exceeds a threshold and the second differs from a value.

    Args:
        column_names: Sequence naming the two columns to inspect, in order.
        data: Table indexable by those column names.
        gt_val: Threshold compared against the first column with `>`.
        match_val: Value the second column is compared against with `!=`.

    Returns:
        Element-wise boolean result, False only where the first column is
        greater than `gt_val` AND the second column is not equal to `match_val`.
    """
    above_threshold = data[column_names[0]] > gt_val
    differs_from_match = data[column_names[1]] != match_val

    violating_rows = above_threshold & differs_from_match
    return ~violating_rows


# Build a constraint class around the "above threshold and not matching" check.
NotColumn1GtColumn2NotMatchConstraint = create_custom_constraint(is_valid_fn=no_col1_gt_col2_not_match_is_valid)

In [None]:
def no_col1_gt_col2_match_is_valid(column_names, data, gt_val, match_val):
    """Reject rows where the first column exceeds a threshold and the second equals a value.

    Args:
        column_names: Sequence naming the two columns to inspect, in order.
        data: Table indexable by those column names.
        gt_val: Threshold compared against the first column with `>`.
        match_val: Value the second column is compared against with `==`.

    Returns:
        Element-wise boolean result, False only where the first column is
        greater than `gt_val` AND the second column equals `match_val`.
    """
    above_threshold = data[column_names[0]] > gt_val
    equals_match = data[column_names[1]] == match_val

    violating_rows = above_threshold & equals_match
    return ~violating_rows


# Build a constraint class around the "above threshold and matching" check.
NotColumn1GtColumn2MatchConstraint = create_custom_constraint(is_valid_fn=no_col1_gt_col2_match_is_valid)

In [None]:
# Working-age check on DAYS_BIRTH / DAYS_EMPLOYED with a working age of 16 years.
no_clients_worked_before_age_X = NoChildWorkerConstraint(
    column_names=['DAYS_BIRTH', 'DAYS_EMPLOYED'],
    working_age_years=16,
)

# Rows with DAYS_EMPLOYED > 0 must not have FLAG_WORK_PHONE == 1.
positive_days_employed_dont_have_work_phones = NotColumn1GtColumn2MatchConstraint(
    column_names=['DAYS_EMPLOYED', 'FLAG_WORK_PHONE'],
    gt_val=0,
    match_val=1,
)

# Rows with DAYS_EMPLOYED > 0 must have NAME_INCOME_TYPE == 'Pensioner'.
positive_days_employed_are_pensioners = NotColumn1GtColumn2NotMatchConstraint(
    column_names=['DAYS_EMPLOYED', 'NAME_INCOME_TYPE'],
    gt_val=0,
    match_val='Pensioner',
)

In [None]:
# Constraints passed to the model.
constraints = [
    age_gt_age_when_joined_constraint,
    no_clients_worked_before_age_X,
    # Disabled: positive_days_employed_dont_have_work_phones below applies the
    # same rule (first column > 0 and second column == 1 is rejected).
    # NoUmemployedWorkPhoneConstraint(column_names=['DAYS_EMPLOYED', 'FLAG_WORK_PHONE']),
    positive_days_employed_dont_have_work_phones,
    positive_days_employed_are_pensioners,
]

In [None]:
# CTGAN model with 'ID' as the primary key and the constraint list attached.
model = CTGAN(
    primary_key='ID',
    constraints=constraints,
)

In [None]:
# Fit on the first 500 rows of the table only.
model.fit(data.head(500))

In [None]:
# Draw 1000 rows from the fitted model.
x = model.sample(num_rows=1000)

In [None]:
# Preview the first five sampled rows.
x.head(n=5)