In [80]:
# imports
import os
import math
import tensorflow as tf
from tensorflow import keras
import numpy as np
from numpy import genfromtxt
from numpy.lib import recfunctions as rfn
#print(tf.__version__)

In [81]:
# data
# NumPy dtype code for each CSV column, looked up by column name when the
# structured dtype for genfromtxt is built.
# Entries tagged "STR" are the columns that have a string -> int encoder in
# `strconvs`; the dtype given here is the type of the encoded value.
dtypes = {
    'fraud_bool':                       '?',
    'income':                           'f4',
    'name_email_similarity':            'f2',
    'prev_address_months_count':        'i2',
    'current_address_months_count':     'i2',
    'customer_age':                     'u1',
    'days_since_request':               'f2',
    'intended_balcon_amount':           'i2',
    'payment_type':                     'u1',  # STR
    'zip_count_4w':                     'u2',
    'velocity_6h':                      'i4',
    'velocity_24h':                     'u2',
    'velocity_4w':                      'u2',
    'bank_branch_count_8w':             'u2',
    'date_of_birth_distinct_emails_4w': 'u1',
    'employment_status':                'u2',  # STR
    'credit_risk_score':                'i2',
    'email_is_free':                    'u1',
    'housing_status':                   'u1',  # STR
    'phone_home_valid':                 'u1',
    'phone_mobile_valid':               'u1',
    'bank_months_count':                'i1',
    'has_other_cards':                  'u1',
    'proposed_credit_limit':            'u2',
    'foreign_request':                  'u1',
    'source':                           'u1',  # STR
    'session_length_in_minutes':        'i1',
    'device_os':                        'u1',  # STR
    'keep_alive_session':               'u1',
    'device_distinct_emails_8w':        'i1',
    'device_fraud_count':               'u1',
    'month':                            'u1',
}

def strconv(cat):
    """Build a stateful encoder that maps each distinct value to an integer.

    The returned callable hands out codes in order of first appearance:
    the first unseen value gets 0, the next unseen value gets 1, and so on.
    A value that was seen before always gets its original code back.
    Each call to ``strconv`` creates an independent code table.

    Args:
        cat: Label for the column being encoded. It is not used by the
            encoder itself.

    Returns:
        A one-argument function ``value -> int``.
    """
    codes = {}

    def encode(value):
        # len(codes) is evaluated before any insertion, so a new value
        # receives the next free code; known values return their stored code.
        return codes.setdefault(value, len(codes))

    return encode

# One independent string -> int encoder per string-valued column,
# keyed by column name.
strconvs = {
    column_name: strconv(column_name)
    for column_name in (
        'source',
        'device_os',
        'housing_status',
        'payment_type',
        'employment_status',
    )
}

# Read only the header line to learn the column order. The `with` block
# closes the file handle instead of leaving it open until garbage collection.
with open('Base.csv') as header_file:
    column_labels = list(next(header_file).strip().split(','))

# Column 0 is parsed as int and then turned into a bool; every column listed
# in `strconvs` is routed through its string -> int encoder.
converters = {0: lambda s: bool(int(s))}
for k, v in strconvs.items():
    converters[column_labels.index(k)] = v

# The header line is parsed like a data row and then sliced off with [1:].
# NOTE(review): because of that, the stateful encoders in `strconvs`
# presumably see the column label before any real value, so switching to
# `skip_header=1` would likely shift the category codes - confirm before
# changing how the header is skipped.
rawdata = genfromtxt('Base.csv', dtype=[dtypes[n] for n in column_labels],
                  names=column_labels, delimiter=',', converters=converters)[1:] # skip header

# Keep the unparsed text rows (header dropped) so that parsed values can be
# compared against their source line.
with open('Base.csv') as text_file:
    txtdata = list(text_file.read().split('\n'))[1:]

print(f'Example row: {rawdata[0]} : {type(rawdata[0])}')
print(f'Computed type of elements: {rawdata.dtype}')
print(f'Same row but unparsed, does it match? {txtdata[0]})')
# Sanity-check every parsed value against its declared dtype:
#   * column 0 must be 0 or 1,
#   * no value may be NaN,
#   * no value may sit exactly on the min/max of its dtype, which would
#     suggest that the chosen dtype overflowed or saturated.
# `issue` tracks the current row, `anyissue` tracks the whole dataset.
issue = False
anyissue = False
for rowix, (txtrow, rawrow) in enumerate(zip(txtdata, rawdata)):
    if not (rawrow[0] == 0 or rawrow[0] == 1):
        print(f'WARN: fraud_bool for row {rowix} is {rawrow[0]}, not 0 or 1!')
        anyissue = True

    for ix, c in enumerate(rawrow):
        # NOTE(review): 65535 is only representable if column 4 is an
        # unsigned 16-bit column; verify this against the dtype declared for
        # that column in `dtypes`.
        if ix == 4 and c == 65535:
            continue # this is the one column where -1 as missing is OK
        if type(c) != np.str_ and np.isnan(c):
            # A NaN is a data problem too: record it so that the final
            # summary cannot claim that no issues were found.
            issue = True
            print(f'{rawrow} has a nan at {ix}!')
        elif np.issubdtype(type(c), np.integer):
            info = np.iinfo(type(c))
            # Zero is excluded so that unsigned columns (min == 0) do not
            # warn on every legitimate zero.
            if (c == info.min or c == info.max) and c!=0:
                issue = True
                print(f'WARN: column {ix} row {rowix} ({column_labels[ix]}) has maxed out the irange of {type(c)}!')
        elif np.issubdtype(type(c), np.floating):
            info = np.finfo(type(c))
            if c == info.min or c == info.max:
                issue = True
                print(f'WARN: column {ix} row {rowix} ({column_labels[ix]}) has maxed out the frange of {type(c)}!')
    if issue:
        # Show the parsed row next to its source text, then reset the
        # per-row flag for the next iteration.
        print(f'row was: {rawrow}')
        print(f'source data was: {txtrow}')
        anyissue = True
        issue = False

# The loop never breaks, so the summary is ordinary post-loop code.
if not anyissue:
    print('No issues found with datatype!')
else:
    print('Issues found with datatype! Look closely!')

# Flatten the structured array into a plain 2-D float32 matrix
# (one row per record, one column per field).
data = rfn.structured_to_unstructured(rawdata, dtype=np.float32)


In [None]:
# Column 0 of `data` is the fraud flag; keep it as an (N, 1) integer array.
is_fraud_ground_truth = np.int_(data[:, :1])
# Everything after column 0 is the feature matrix.
real_dataset = data[:, 1:]

# The first 20% of the rows are used for training, the rest for evaluation.
training_size = math.floor(len(real_dataset) * .2)

# Features
training_dataset = real_dataset[:training_size]
evaluation_dataset = real_dataset[training_size:]

# Labels, sliced at the same boundary as the features
is_fraud_ground_truth_training = is_fraud_ground_truth[:training_size]
evaluation_dataset_labels = is_fraud_ground_truth[training_size:]


In [None]:
def dbg_ndarray(val):
    """Print three facts about an array-like object, one per line.

    Printed in order: the Python type of ``val``, the class of its
    ``dtype`` attribute, and its ``shape``.

    Args:
        val: Object exposing ``dtype`` and ``shape`` attributes.
    """
    container_type = type(val)
    print(container_type)
    dtype_class = type(val.dtype)
    print(dtype_class)
    print(val.shape)
# Inspect the full label array and its training slice.
for _labels in (is_fraud_ground_truth, is_fraud_ground_truth_training):
    dbg_ndarray(_labels)

# Peek at a few training rows, then inspect the training feature matrix.
print(training_dataset[3:10])
dbg_ndarray(training_dataset)

<class 'numpy.ndarray'>
<class 'numpy.dtype[int64]'>
(1000000, 1)
<class 'numpy.ndarray'>
<class 'numpy.dtype[int64]'>
(200000, 1)
[[ 8.9999998e-01  1.5954590e-01 -1.0000000e+00  2.2000000e+01
   5.0000000e+01  1.9073486e-02 -1.0000000e+00  2.0000000e+00
   8.1000000e+02  3.4570000e+03  4.0540000e+03  3.0220000e+03
   1.9210000e+03  6.0000000e+00  1.0000000e+00  1.1000000e+02
   1.0000000e+00  1.0000000e+00  0.0000000e+00  1.0000000e+00
   3.1000000e+01  1.0000000e+00  2.0000000e+02  0.0000000e+00
   1.0000000e+00  2.0000000e+00  3.0000000e+00  0.0000000e+00
   1.0000000e+00  0.0000000e+00  7.0000000e+00]
 [ 8.9999998e-01  5.9619141e-01 -1.0000000e+00  2.1800000e+02
   5.0000000e+01  4.4403076e-03  0.0000000e+00  2.0000000e+00
   8.9000000e+02  5.0200000e+03  2.7280000e+03  3.0870000e+03
   1.9900000e+03  2.0000000e+00  1.0000000e+00  2.9500000e+02
   1.0000000e+00  1.0000000e+00  1.0000000e+00  0.0000000e+00
   3.1000000e+01  0.0000000e+00  1.5000000e+03  0.0000000e+00
   1.0000000e+0

In [None]:
print(real_dataset[0:2])

[[ 8.9999998e-01  1.6687012e-01 -1.0000000e+00  8.8000000e+01
   5.0000000e+01  2.0919800e-02 -1.0000000e+00  1.0000000e+00
   7.6900000e+02  1.0650000e+04  3.1340000e+03  3.8630000e+03
   1.0000000e+00  6.0000000e+00  1.0000000e+00  1.8500000e+02
   0.0000000e+00  1.0000000e+00  1.0000000e+00  0.0000000e+00
   2.4000000e+01  0.0000000e+00  5.0000000e+02  0.0000000e+00
   1.0000000e+00  3.0000000e+00  1.0000000e+00  0.0000000e+00
   1.0000000e+00  0.0000000e+00  7.0000000e+00]
 [ 8.9999998e-01  2.9638672e-01 -1.0000000e+00  1.4400000e+02
   5.0000000e+01  5.4168701e-03  0.0000000e+00  2.0000000e+00
   3.6600000e+02  5.3400000e+02  2.6700000e+03  3.1240000e+03
   7.1800000e+02  3.0000000e+00  1.0000000e+00  2.5900000e+02
   1.0000000e+00  1.0000000e+00  0.0000000e+00  0.0000000e+00
   1.5000000e+01  0.0000000e+00  1.5000000e+03  0.0000000e+00
   1.0000000e+00  3.1000000e+01  1.0000000e+00  0.0000000e+00
   1.0000000e+00  0.0000000e+00  7.0000000e+00]]


In [None]:
#Model old
# Fully connected binary classifier: five ReLU hidden layers that narrow from
# 2048 to 64 units, followed by one sigmoid unit.
# The output layer emits a single probability so that it matches both the
# (N, 1) label arrays and a BinaryCrossentropy loss left at its default
# from_logits=False, which expects probabilities rather than raw scores.
# Unit counts use integer division so that Dense receives ints, not floats.
model: keras.Model = keras.Sequential([
    keras.layers.Dense((32**2) * 2, activation='relu'),
    keras.layers.Dense(32**2, activation='relu'),
    keras.layers.Dense((32**2) // 4, activation='relu'),
    keras.layers.Dense((32**2) // 8, activation='relu'),
    keras.layers.Dense((32**2) // 16, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])

In [None]:
#compile
# Adam optimizer, binary cross-entropy loss with its default arguments,
# and accuracy reported as the metric.
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=['accuracy'],
)


In [None]:
#fit
# Train on the training slice for 25 epochs; 10% of the training rows are
# held out for validation.
history = model.fit(
    x=training_dataset,
    y=is_fraud_ground_truth_training,
    epochs=25,
    validation_split=0.1,
)

Epoch 1/2


2022-12-11 00:28:51.022606: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2022-12-11 00:29:24.653135: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/2


In [None]:
# Score the trained model on the evaluation rows.
# NOTE(review): this rebinds `history`, which the fit cell assigned to the
# return value of model.fit; consider a separate name if both are needed.
history = model.evaluate(
    evaluation_dataset,
    evaluation_dataset_labels,
)

