In [1]:
# Imports
import numpy as np
import csv 
from random import seed
from random import random
from tqdm import tqdm

In [2]:
# Configure some parameters
num_data_points = 100000
percentage_fraud = 0.01
error_in_percentage = 0.05
outputFile = "credit2.csv"

In [3]:
# Configure some ranges 
max_remaining_credit = 100000
max_payment = 5000 
max_interest_rate = 0.05
seed(23)

In [4]:
# Synthetic data set: repayment of a credit 
# attributes 
# recent_credit, interest_rate, payment, remaining_credit
# equation: remaining_credit = recent_credit + recent_credit * interest_rate - payment 
# interest rate / 12 --> monthly rate 

# Open the output file 

with open(outputFile, 'w') as o_file:
    writer = csv.writer(o_file)
    writer.writerow(["lastCred", "interest", "payment","remCred", "isFraud"])
    # Create n data points 
    for i in tqdm(range(0,num_data_points)): 
        successful = False 
        while successful == False: 
            rc = round( random() * max_remaining_credit , 2)
            ir = round( random() * max_interest_rate , 2)
            p =  round( random() * max_payment ,2 )
            
            if (rc + rc*(ir/12.0) - p) > 0 and (rc*(ir/12.0) < p):
                successful = True 
        
        # Calc attributes 
        attr_recent_credit = rc
        attr_interest_rate = ir 
        attr_payment = p 
        attr_remaining_credit = rc + rc*(ir/12.0) - p 
        
        # Fraud? 
        attr_fraud = 0
        if random() < percentage_fraud: 
            error = attr_payment * np.random.uniform(0.01, 0.1) #error_in_percentage
            attr_remaining_credit = attr_remaining_credit + error 
            attr_fraud = 1 
        
        # Write 
        attr_remaining_credit =  round( attr_remaining_credit ,2 )
        #print(attr_recent_credit,attr_interest_rate,attr_payment,attr_remaining_credit,attr_fraud)
        writer.writerow([attr_recent_credit,attr_interest_rate,attr_payment,attr_remaining_credit,attr_fraud])
        

100%|██████████| 100000/100000 [00:00<00:00, 100663.96it/s]


In [5]:
import pandas as pd

df = pd.read_csv("output.csv")

df["isFraud"].value_counts()


0    98967
1     1033
Name: isFraud, dtype: int64