In [9]:
# imports
import os
import math
#import tensorflow as tf
#from tensorflow import keras
import numpy as np
from numpy import genfromtxt
#print(tf.__version__)

In [14]:
# data
dtypes = {
    'fraud_bool': '?',
    'income': 'f4',
    'name_email_similarity': 'f2',
    'prev_address_months_count': 'i2',
    'current_address_months_count': 'i2',
    'customer_age': 'u1',
    'days_since_request': 'f2',
    'intended_balcon_amount': 'i2',
    'payment_type': 'u1', # STR
    'zip_count_4w': 'u2',
    'velocity_6h': 'i4',
    'velocity_24h': 'u2',
    'velocity_4w': 'u2',
    'bank_branch_count_8w': 'u2',
    'date_of_birth_distinct_emails_4w': 'u1',
    'employment_status': 'u2',
    'credit_risk_score': 'i2',
    'email_is_free': '?',
    'housing_status': 'u1', # STR
    'phone_home_valid': '?',
    'phone_mobile_valid': '?',
    'bank_months_count': 'i1',
    'has_other_cards': '?',
    'proposed_credit_limit':
    'u2', 'foreign_request': '?',
    'source': 'u1', # STR
    'session_length_in_minutes': 'i1',
    'device_os': 'u1', # STR
    'keep_alive_session': '?',
    'device_distinct_emails_8w': 'i1',
    'device_fraud_count': '?',
    'month': 'u1'}

def strconv(cat):
    seen = {}
    def inner(s):
        nonlocal seen
        if s not in seen:
            seen[s] = len(seen)
        return seen[s]
    return inner

strconvs = {
    'source': strconv("source"),
    'device_os': strconv("device_os"),
    'housing_status': strconv("housing_status"),
    'payment_type': strconv("payment_type"),
    'employment_status': strconv("employment_status")
}

column_labels = list(next(open('Base.csv')).strip().split(','))
converters = {0: lambda s: bool(int(s))}
for k, v in strconvs.items():
    converters[column_labels.index(k)] = v
rawdata = genfromtxt('Base.csv', dtype=[dtypes[n] for n in column_labels],
                  names=column_labels, delimiter=',', converters=converters)[1:] # skip header
txtdata = list(open('Base.csv').read().split('\n'))[1:]
print(f'Example row: {rawdata[0]} : {type(rawdata[0])}')
print(f'Computed type of elements: {rawdata.dtype}')
print(f'Same row but unparsed, does it match? {txtdata[0]})')
issue = False
anyissue = False
for rowix, (txtrow, rawrow) in enumerate(zip(txtdata, rawdata)):
    for ix, c in enumerate(rawrow):
        if ix == 4 and c == 65535:
            continue # this is the one column where -1 as missing is OK
        if type(c) != np.str_ and np.isnan(c):
            print(f'{rawrow} has a nan at {ix}!')
        elif np.issubdtype(type(c), np.integer):
            info = np.iinfo(type(c))
            if (c == info.min or c == info.max) and c!=0:
                issue = True
                print(f'WARN: column {ix} row {rowix} ({column_labels[ix]}) has maxed out the irange of {type(c)}!')
        elif np.issubdtype(type(c), np.floating):
            info = np.finfo(type(c))
            if c == info.min or c == info.max:
                issue = True
                print(f'WARN: column {ix} row {rowix} ({column_labels[ix]}) has maxed out the frange of {type(c)}!')
    if issue:
        print(f'row was: {rawrow}')
        print(f'source data was: {txtrow}')
        anyissue = True
        issue = False
else:
    if not anyissue:
        print('No issues found with datatype!')
    else:
        print('Issues found with datatype! Look closely!')
is_fraud_ground_truth = [row[0] for row in rawdata]
data = np.array([np.array(row)[1:] for row in rawdata]) # skip the fraud bool

Example row: (True, 0.9, 0.1669, -1, 88, 50, 0.02092, -1, 1, 769, 10650, 3134, 3863, 1, 6, 1, 185, False, 1, False, False, 24, False, 500, False, 1, 3, 1, False, 1, False, 7) : <class 'numpy.void'>
Computed type of elements: [('fraud_bool', '?'), ('income', '<f4'), ('name_email_similarity', '<f2'), ('prev_address_months_count', '<i2'), ('current_address_months_count', '<i2'), ('customer_age', 'u1'), ('days_since_request', '<f2'), ('intended_balcon_amount', '<i2'), ('payment_type', 'u1'), ('zip_count_4w', '<u2'), ('velocity_6h', '<i4'), ('velocity_24h', '<u2'), ('velocity_4w', '<u2'), ('bank_branch_count_8w', '<u2'), ('date_of_birth_distinct_emails_4w', 'u1'), ('employment_status', '<u2'), ('credit_risk_score', '<i2'), ('email_is_free', '?'), ('housing_status', 'u1'), ('phone_home_valid', '?'), ('phone_mobile_valid', '?'), ('bank_months_count', 'i1'), ('has_other_cards', '?'), ('proposed_credit_limit', '<u2'), ('foreign_request', '?'), ('source', 'u1'), ('session_length_in_minutes', 'i1

In [None]:
real_dataset = data
training_size = math.floor(len(real_dataset)*.2)
training_dataset = real_dataset[:training_size,:]
evaluation_dataset = real_dataset[training_size:,:]
evaluation_dataset_labels = is_fraud_ground_truth[training_size:]
is_fraud_ground_truth_training = is_fraud_ground_truth[:training_size]

6200000
31000000
200000
200000
24800000


In [None]:
print(column_labels)

['fraud_bool', 'income', 'name_email_similarity', 'prev_address_months_count', 'current_address_months_count', 'customer_age', 'days_since_request', 'intended_balcon_amount', 'payment_type', 'zip_count_4w', 'velocity_6h', 'velocity_24h', 'velocity_4w', 'bank_branch_count_8w', 'date_of_birth_distinct_emails_4w', 'employment_status', 'credit_risk_score', 'email_is_free', 'housing_status', 'phone_home_valid', 'phone_mobile_valid', 'bank_months_count', 'has_other_cards', 'proposed_credit_limit', 'foreign_request', 'source', 'session_length_in_minutes', 'device_os', 'keep_alive_session', 'device_distinct_emails_8w', 'device_fraud_count', 'month\n']


In [None]:
print(real_dataset[0:2])

[[ 9.00000000e-01  1.66827734e-01 -1.00000000e+00  8.80000000e+01
   5.00000000e+01  2.09251728e-02 -1.33134496e+00             nan
   7.69000000e+02  1.06507655e+04  3.13431963e+03  3.86364774e+03
   1.00000000e+00  6.00000000e+00             nan  1.85000000e+02
   0.00000000e+00             nan  1.00000000e+00  0.00000000e+00
   2.40000000e+01  0.00000000e+00  5.00000000e+02  0.00000000e+00
              nan  3.88811460e+00             nan  0.00000000e+00
   1.00000000e+00  0.00000000e+00  7.00000000e+00]
 [ 9.00000000e-01  2.96286005e-01 -1.00000000e+00  1.44000000e+02
   5.00000000e+01  5.41753833e-03 -8.16223755e-01             nan
   3.66000000e+02  5.34047319e+02  2.67091829e+03  3.12429817e+03
   7.18000000e+02  3.00000000e+00             nan  2.59000000e+02
   1.00000000e+00             nan  0.00000000e+00  0.00000000e+00
   1.50000000e+01  0.00000000e+00  1.50000000e+03  0.00000000e+00
              nan  3.17988194e+01             nan  0.00000000e+00
   1.00000000e+00  0.0000

In [None]:
#Model old
model = keras.Sequential([
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dense(2)
])

Metal device set to: Apple M1


2022-12-06 18:01:31.502547: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-12-06 18:01:31.502634: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [None]:
#compile
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])
#fit
history = model.fit(training_dataset, is_fraud_ground_truth_training, epochs=1, validation_split=0.1)

2022-12-06 18:01:31.628887: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2022-12-06 18:01:31.825069: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2022-12-06 18:03:35.081177: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.





In [None]:
history = model.evaluate(evaluation_dataset, evaluation_dataset_labels)

  numdigits = int(np.log10(self.target)) + 1


OverflowError: cannot convert float infinity to integer