In [2]:
# imports
import os
import math
import tensorflow as tf
from tensorflow import keras
import numpy as np
from numpy import genfromtxt
from numpy.lib import recfunctions as rfn
#print(tf.__version__)

In [None]:
# colab config
#from google.colab import drive
#drive.mount('/content/drive')
#os.chdir('/content/drive/My Drive/cs470')

In [3]:
# data
dtypes = {
    'fraud_bool': '?',
    'income': 'f4',
    'name_email_similarity': 'f2',
    'prev_address_months_count': 'i2',
    'current_address_months_count': 'i2',
    'customer_age': 'u1',
    'days_since_request': 'f2',
    'intended_balcon_amount': 'f2',
    'payment_type': 'u1', # STR
    'zip_count_4w': 'u2',
    'velocity_6h': 'f4',
    'velocity_24h': 'f4',
    'velocity_4w': 'f4',
    'bank_branch_count_8w': 'u2',
    'date_of_birth_distinct_emails_4w': 'u1',
    'employment_status': 'u2',
    'credit_risk_score': 'i2',
    'email_is_free': 'u1',
    'housing_status': 'u1', # STR
    'phone_home_valid': 'u1',
    'phone_mobile_valid': 'u1',
    'bank_months_count': 'i1',
    'has_other_cards': 'u1',
    'proposed_credit_limit':
    'f2', 'foreign_request': 'u1',
    'source': 'u1', # STR
    'session_length_in_minutes': 'f4',
    'device_os': 'u1', # STR
    'keep_alive_session': 'u1',
    'device_distinct_emails_8w': 'i1',
    'device_fraud_count': 'u1',
    'month': 'u1'}

def strconv(cat):
    """Returns a function that maps its input string to the same small number each time."""
    seen = {}
    def inner(s):
        nonlocal seen
        if s not in seen:
            seen[s] = len(seen)
        return seen[s]
    return inner

strconvs = {
    'source': strconv("source"),
    'device_os': strconv("device_os"),
    'housing_status': strconv("housing_status"),
    'payment_type': strconv("payment_type"),
    'employment_status': strconv("employment_status")
}

column_labels = list(next(open('Base.csv')).strip().split(','))
converters = {0: lambda s: bool(int(s))}
for k, v in strconvs.items():
    converters[column_labels.index(k)] = v
rawdata = genfromtxt('Base.csv', dtype=[dtypes[n] for n in column_labels],
                  names=column_labels, delimiter=',', converters=converters)[1:] # skip header
txtdata = list(open('Base.csv').read().split('\n'))[1:]
print(f'Example row: {rawdata[0]} : {type(rawdata[0])}')
print(f'Computed type of elements: {rawdata.dtype}')
print(f'Same row but unparsed, does it match? {txtdata[0]})')
issue = False
anyissue = False
for rowix, (txtrow, rawrow) in enumerate(zip(txtdata, rawdata)):
    if not (rawrow[0] == 0 or rawrow[0] == 1):
        print(f'WARN: fraud_bool for row {rowix} is {rawrow[0]}, not 0 or 1!')
        anyissue = True

    for ix, c in enumerate(rawrow):
        if ix == 4 and c == 65535:
            continue # this is the one column where -1 as missing is OK
        if type(c) != np.str_ and np.isnan(c):
            print(f'{rawrow} has a nan at {ix}!')
        elif np.issubdtype(type(c), np.integer):
            info = np.iinfo(type(c))
            if (c == info.min or c == info.max) and c!=0:
                issue = True
                print(f'WARN: column {ix} row {rowix} ({column_labels[ix]}) has maxed out the irange of {type(c)}!')
        elif np.issubdtype(type(c), np.floating):
            info = np.finfo(type(c))
            if c == info.min or c == info.max:
                issue = True
                print(f'WARN: column {ix} row {rowix} ({column_labels[ix]}) has maxed out the frange of {type(c)}!')
    if issue:
        print(f'row was: {rawrow}')
        print(f'source data was: {txtrow}')
        anyissue = True
        issue = False
else:
    if not anyissue:
        print('No issues found with datatype!')
    else:
        print('Issues found with datatype! Look closely!')
data = rfn.structured_to_unstructured(rawdata, dtype=np.float32)
print(f'Unstructured data: {data[0]} : {type(data[0])}')

Example row: (True, 0.9, 0.1669, -1, 88, 50, 0.02092, -1.331, 1, 769, 10650.766, 3134.3196, 3863.6477, 1, 6, 1, 185, 0, 1, 1, 0, 24, 0, 500., 0, 1, 3.8881147, 1, 0, 1, 0, 7) : <class 'numpy.void'>
Computed type of elements: [('fraud_bool', '?'), ('income', '<f4'), ('name_email_similarity', '<f2'), ('prev_address_months_count', '<i2'), ('current_address_months_count', '<i2'), ('customer_age', 'u1'), ('days_since_request', '<f2'), ('intended_balcon_amount', '<f2'), ('payment_type', 'u1'), ('zip_count_4w', '<u2'), ('velocity_6h', '<f4'), ('velocity_24h', '<f4'), ('velocity_4w', '<f4'), ('bank_branch_count_8w', '<u2'), ('date_of_birth_distinct_emails_4w', 'u1'), ('employment_status', '<u2'), ('credit_risk_score', '<i2'), ('email_is_free', 'u1'), ('housing_status', 'u1'), ('phone_home_valid', 'u1'), ('phone_mobile_valid', 'u1'), ('bank_months_count', 'i1'), ('has_other_cards', 'u1'), ('proposed_credit_limit', '<f2'), ('foreign_request', 'u1'), ('source', 'u1'), ('session_length_in_minutes',

Now that we've parsed the data and verified some basic properties about it, we are ready to further process it into a useful form. First, we will min-max normalize the data:

In [4]:
# min-max normalize each column
normdata = np.zeros(data.shape, dtype=np.float32)
for colix in range(data.shape[1]):
    col = data[:, colix]
    colmin = col.min()
    colmax = col.max()
    colrange = colmax - colmin
    if colrange != 0:
        normdata[:, colix] = (col - colmin) / colrange
    else:
        normdata[:,colix] = data[:,colix]
print(normdata[3])

[1.0000000e+00 1.0000000e+00 1.5954469e-01 0.0000000e+00 5.3613052e-02
 5.0000000e-01 2.4316796e-04 1.1151484e-01 2.5000000e-01 1.2076429e-01
 2.1483068e-01 3.3565724e-01 4.7136649e-02 8.0545074e-01 1.5384616e-01
 0.0000000e+00 5.0089443e-01 1.0000000e+00 0.0000000e+00 0.0000000e+00
 1.0000000e+00 9.6969700e-01 1.0000000e+00 5.2356021e-03 0.0000000e+00
 0.0000000e+00 3.5074048e-02 5.0000000e-01 0.0000000e+00 6.6666669e-01
 0.0000000e+00 1.0000000e+00]


In [22]:
is_fraud_ground_truth = np.int_(normdata[:,:1])
real_dataset = normdata[:,1:]  # skip the fraud bool
#np.array([np.array(row)[1:] for row in rawdata]) # skip the fraud bool
training_size = math.floor(len(real_dataset)*0.85)
training_dataset = real_dataset[:training_size]
evaluation_dataset = real_dataset[training_size:]
evaluation_dataset_labels = is_fraud_ground_truth[training_size:]
training_labels = is_fraud_ground_truth[:training_size]
print(f'in the training set, {training_labels.sum()} are fraud ({100*training_labels.sum()/len(training_labels):.2f}%)')
print(f'in the evaluation set, {evaluation_dataset_labels.sum()} are fraud ({100*evaluation_dataset_labels.sum()/len(evaluation_dataset_labels):.2f}%)')

in the training set, 9577 are fraud (1.13%)
in the evaluation set, 1452 are fraud (0.97%)


In [12]:
def dbg_ndarray(val):
    print(type(val))
    print(type(val.dtype))
    print(val.shape)
dbg_ndarray(is_fraud_ground_truth)
dbg_ndarray(training_labels)
print(training_dataset[3:10])
dbg_ndarray(training_dataset)

<class 'numpy.ndarray'>
<class 'numpy.dtype[int64]'>
(1000000, 1)
<class 'numpy.ndarray'>
<class 'numpy.dtype[int64]'>
(850000, 1)
[[1.00000000e+00 1.59544691e-01 0.00000000e+00 5.36130518e-02
  5.00000000e-01 2.43167960e-04 1.11514837e-01 2.50000000e-01
  1.20764293e-01 2.14830682e-01 3.35657239e-01 4.71366495e-02
  8.05450737e-01 1.53846160e-01 0.00000000e+00 5.00894427e-01
  1.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00
  9.69696999e-01 1.00000000e+00 5.23560215e-03 0.00000000e+00
  0.00000000e+00 3.50740477e-02 5.00000000e-01 0.00000000e+00
  6.66666687e-01 0.00000000e+00 1.00000000e+00]
 [1.00000000e+00 5.96190810e-01 0.00000000e+00 5.10489523e-01
  5.00000000e-01 5.66095005e-05 1.14874728e-01 2.50000000e-01
  1.32706374e-01 3.07408094e-01 1.73997954e-01 6.28260076e-02
  8.34381580e-01 5.12820520e-02 0.00000000e+00 8.31842601e-01
  1.00000000e+00 0.00000000e+00 1.00000000e+00 0.00000000e+00
  9.69696999e-01 0.00000000e+00 6.85863853e-01 0.00000000e+00
  0.00000000e+0

In [13]:
print(real_dataset[0:2])

[[1.00000000e+00 1.66868925e-01 0.00000000e+00 2.07459211e-01
  5.00000000e-01 2.66706600e-04 1.10534236e-01 0.00000000e+00
  1.14643976e-01 6.40842199e-01 2.23480448e-01 2.48955518e-01
  4.19287215e-04 1.53846160e-01 0.00000000e+00 6.35062635e-01
  0.00000000e+00 0.00000000e+00 1.00000000e+00 0.00000000e+00
  7.57575750e-01 0.00000000e+00 1.62303671e-01 0.00000000e+00
  0.00000000e+00 5.62504418e-02 0.00000000e+00 0.00000000e+00
  6.66666687e-01 0.00000000e+00 1.00000000e+00]
 [1.00000000e+00 2.96385705e-01 0.00000000e+00 3.37995350e-01
  5.00000000e-01 6.90597008e-05 1.14540257e-01 2.50000000e-01
  5.44857457e-02 4.17294428e-02 1.67013466e-01 7.16115832e-02
  3.01048219e-01 7.69230798e-02 0.00000000e+00 7.67441869e-01
  1.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  4.84848499e-01 0.00000000e+00 6.85863853e-01 0.00000000e+00
  0.00000000e+00 3.77435505e-01 0.00000000e+00 0.00000000e+00
  6.66666687e-01 0.00000000e+00 1.00000000e+00]]


In [25]:
#Model old
model: keras.Model = keras.Sequential([
    #keras.layers.Dense((32**2) * 4, activation='relu'),
    #keras.layers.Dense((32**2) * 2, activation='relu'),
    keras.layers.Dense(32**2, activation='relu'),
    keras.layers.Dense((32**2)/4, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])

In [26]:
#compile
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=[tf.keras.metrics.BinaryAccuracy(), tf.keras.metrics.Precision(), tf.keras.metrics.Recall(), tf.keras.metrics.FalsePositives(), tf.keras.metrics.FalseNegatives(), tf.keras.metrics.TrueNegatives(), tf.keras.metrics.TruePositives()])


In [27]:
#fit
history = model.fit(x=training_dataset, y=is_fraud_ground_truth_training, epochs=3, validation_split=0.1)

Epoch 1/3


2022-12-11 15:08:15.274111: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2022-12-11 15:11:37.991306: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/3
Epoch 3/3


In [28]:
history = model.evaluate(evaluation_dataset, evaluation_dataset_labels)



In [29]:
model.predict(evaluation_dataset[5])



ValueError: in user code:

    File "/Users/ember/Library/Python/3.10/lib/python/site-packages/keras/engine/training.py", line 2041, in predict_function  *
        return step_function(self, iterator)
    File "/Users/ember/Library/Python/3.10/lib/python/site-packages/keras/engine/training.py", line 2027, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/ember/Library/Python/3.10/lib/python/site-packages/keras/engine/training.py", line 2015, in run_step  **
        outputs = model.predict_step(data)
    File "/Users/ember/Library/Python/3.10/lib/python/site-packages/keras/engine/training.py", line 1983, in predict_step
        return self(x, training=False)
    File "/Users/ember/Library/Python/3.10/lib/python/site-packages/keras/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/Users/ember/Library/Python/3.10/lib/python/site-packages/keras/engine/input_spec.py", line 250, in assert_input_compatibility
        raise ValueError(

    ValueError: Exception encountered when calling layer "sequential_2" "                 f"(type Sequential).
    
    Input 0 of layer "dense_10" is incompatible with the layer: expected min_ndim=2, found ndim=1. Full shape received: (None,)
    
    Call arguments received by layer "sequential_2" "                 f"(type Sequential):
      • inputs=tf.Tensor(shape=(None,), dtype=float32)
      • training=False
      • mask=None
