### Anomaly Detector


##### PURPOSE:  
This program implements an embedded neural network in TensorFlow that serves as an encoder/decoder anomaly detector for different loan types.  


##### INPUT: 
Loan data by type and features developed in the prior programs.

##### OUTPUT: 
Anomaly detection on a known data set. 

In [2]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

Import standard python and sklearn libraries

In [5]:
import pandas as pd
import numpy as np
from datetime import datetime as dt
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import mean_absolute_percentage_error as mape
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, RobustScaler, Normalizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import os as os
from joblib import dump,load

Import tensorflow and tensorflow libraries

In [3]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers,callbacks,losses,optimizers,initializers,models,regularizers
from tensorflow.keras.layers import Dense,Dropout,BatchNormalization,Embedding,Flatten,concatenate,Input
from tensorflow.keras.callbacks import CSVLogger, ReduceLROnPlateau, ModelCheckpoint, EarlyStopping
from tensorflow.keras.models import Model
from tensorflow.keras.losses import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from tensorflow.keras.optimizers import SGD,RMSprop,Adam,Adamax
# `Glorot_Normal` and `Normal` are not names exported by tensorflow.keras.initializers,
# so the original import raised ImportError; the lowercase aliases are used instead.
from tensorflow.keras.initializers import RandomNormal,RandomUniform,TruncatedNormal,glorot_normal,random_normal
# NOTE(review): mae/mse/mape are also imported as sklearn aliases in the standard-library
# import cell; whichever cell runs last decides what these names refer to.
from tensorflow.keras.metrics import mae, mse, mape
from tensorflow.keras.backend import clear_session
from tensorflow.keras.models import Model, save_model, load_model
from tensorflow.keras.utils import plot_model
from tensorflow.keras.metrics import mae, mse, mape

## Set seed for initializers

In [7]:
# Seed TensorFlow's global random generator so weight initializers are reproducible.
# TF 2.x exposes this as tf.random.set_seed; tf.set_random_seed only exists in TF 1.x.
if hasattr(tf, 'random') and hasattr(tf.random, 'set_seed'):
    tf.random.set_seed(79)
else:
    tf.set_random_seed(79)

ModuleNotFoundError: No module named 'tensorflow'

Check for gpu and expect this output:

[
  name: "/cpu:0"device_type: "CPU",
  name: "/gpu:0"device_type: "GPU"
]


In [5]:
# Print every compute device TensorFlow can see so the GPU check above can be verified.
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 12355073655885624654
, name: "/device:XLA_CPU:0"
device_type: "XLA_CPU"
memory_limit: 17179869184
locality {
}
incarnation: 13582938859459649461
physical_device_desc: "device: XLA_CPU device"
]


#### Read data file

In [35]:
# Load the joblib-serialised loan data from the current working directory.
# os.path.join inserts the path separator that plain string concatenation omitted.
# NOTE(review): joblib.load unpickles the file - only load data from a trusted source.
df = load(os.path.join(os.getcwd(), 'loan_data'))
# The empty strings are template placeholders: replace them with the sort-key column names.
df.sort_values(by=['',''],inplace=True)
df.reset_index(drop=True,inplace=True)

#### Function to convert dates to categorical variables

In [None]:
# Replace a datetime column with categorical year / quarter columns (modifies the frame in place)
def add_date_features(data, date, name):
    """Derive year and quarter columns from a datetime column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the datetime column; it is modified in place.
    date : str
        Name of the datetime column; it is dropped after the features are built.
    name : str
        Prefix for the new columns, which are named ``name + 'Yr'`` and
        ``name + 'Qtr'``.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with its index reset to 0..n-1.
    """
    # Day-of-year, week and month features are intentionally not generated.
    for suffix, part in (('Yr', 'year'), ('Qtr', 'quarter')):
        data[name + suffix] = getattr(data[date].dt, part)
    data.drop([date], axis=1, inplace=True)
    data.reset_index(drop=True, inplace=True)
    return data

In [None]:
# Form date features and drop datetime entry
# The empty strings are template placeholders: replace them with the datetime column names.
dates = ['','']
# Start from df so dt is defined even when the list is empty.
# NOTE: rebinding dt shadows the `datetime as dt` alias from the import cell.
dt = df
for d in dates:
    # Call the function (the original subscripted it with [...]) and use the column
    # name itself as the prefix of the new '<name>Yr' / '<name>Qtr' columns.
    dt = add_date_features(dt, d, d)

#### Identify categorical, continuous, and time columns

In [None]:
# Display column names so the feature lists below can be filled in
dt.columns

# Choose Categorical Feature Vars (placeholders - replace '' with column names)
cat_vars = ['', '', '']

# Choose Time Features Vars (placeholders - replace '' with column names)
time_vars = ['', '']

# Choose Continuous Feature Vars To Be Scaled (placeholders - replace '' with column names)
cont_vars = ['','']


### Create categorical input layer

In [None]:
def cat_encode(dt,cat_vars):
    """Integer-encode the categorical columns of a frame.

    The original wrapped ``LabelEncoder`` in a ``ColumnTransformer`` (with an
    unbalanced parenthesis). ``LabelEncoder`` only accepts a single 1-D array,
    so it cannot be used as a ``ColumnTransformer`` step; each column is
    encoded with its own ``LabelEncoder`` instead.

    Parameters
    ----------
    dt : pandas.DataFrame
        Input frame; it is not modified.
    cat_vars : list of str
        Names of the categorical columns to encode.

    Returns
    -------
    pandas.DataFrame
        Copy of ``dt`` in which every column in ``cat_vars`` holds integer
        class codes; all other columns are unchanged.
    """
    encoded = dt.copy()
    for var in cat_vars:
        encoded[var] = LabelEncoder().fit_transform(encoded[var])
    return encoded

In [38]:
def cat_embed(dt, cat_vars, emax=6, emin=3):
    """Compute class counts and embedding widths for categorical columns.

    Fixes two defects: the counts were read from the global ``df`` rather than
    the ``dt`` argument, and the function returned the undefined name
    ``cat_vars_dict`` instead of the embedding-width dictionary it had built.

    Parameters
    ----------
    dt : pandas.DataFrame
        Frame containing the categorical columns.
    cat_vars : list of str
        Names of the categorical columns.
    emax : int, default 6
        Upper bound on the embedding width.
    emin : int, default 3
        Lower bound on the embedding width (applied after ``emax``).

    Returns
    -------
    tuple of (dict, dict)
        ``(cat_class_dict, cat_emb_dict)``: column name -> number of distinct
        values, and column name -> embedding width (the class count capped at
        ``emax`` and then floored at ``emin``).
    """
    cat_class_dict = {c: len(dt[c].unique()) for c in cat_vars}
    # Cap first, then floor, matching the order of the original two passes.
    cat_emb_dict = {c: max(min(n, emax), emin) for c, n in cat_class_dict.items()}
    return cat_class_dict, cat_emb_dict

In [None]:
def cat_input(var,cat_vars_dict,cat_class_dict,r=.2):
    """Build the input branch for one categorical feature.

    Parameters
    ----------
    var : str
        Feature name; used to name the input (``var + '_in'``) and flatten
        (``var + '_flt'``) layers.
    cat_vars_dict : dict
        Feature name -> embedding width.
    cat_class_dict : dict
        Feature name -> number of classes.
    r : float, default .2
        Dropout rate applied after the dense layer.

    Returns
    -------
    tuple
        ``(inp, ct)``: the Keras input tensor and the branch output tensor.
    """
    n_classes = cat_class_dict[var]
    emb_width = cat_vars_dict[var]
    inp = Input(shape=(1,), dtype='int64', name=var + '_in')
    # The embedding maps each of the n_classes codes to emb_width learned features.
    embedded = Embedding(n_classes, emb_width, embeddings_initializer='glorot_normal')(inp)
    ct = Flatten(name=var + '_flt')(embedded)
    # Dense projection followed by dropout.
    dense = Dense(128, activation='relu', kernel_initializer='glorot_normal', bias_initializer='normal')
    ct = Dropout(rate=r)(dense(ct))
    return inp, ct

### Create time input layers

In [None]:
def time_encode(dt,time_vars):
    """Integer-encode the time-feature columns of a frame.

    The original wrapped ``LabelEncoder`` in a ``ColumnTransformer`` (with an
    unbalanced parenthesis). ``LabelEncoder`` only accepts a single 1-D array,
    so it cannot be used as a ``ColumnTransformer`` step; each column is
    encoded with its own ``LabelEncoder`` instead.

    Parameters
    ----------
    dt : pandas.DataFrame
        Input frame; it is not modified.
    time_vars : list of str
        Names of the time-feature columns to encode.

    Returns
    -------
    pandas.DataFrame
        Copy of ``dt`` in which every column in ``time_vars`` holds integer
        class codes; all other columns are unchanged.
    """
    encoded = dt.copy()
    for var in time_vars:
        encoded[var] = LabelEncoder().fit_transform(encoded[var])
    return encoded

In [None]:
def time_embed(dt, time_vars, emax=6, emin=3):
    """Compute class counts and embedding widths for time-feature columns.

    Fixes two defects: the counts were read from the global ``df`` rather than
    the ``dt`` argument, and the function returned the undefined name
    ``time_vars_dict`` instead of the embedding-width dictionary it had built.

    Parameters
    ----------
    dt : pandas.DataFrame
        Frame containing the time-feature columns.
    time_vars : list of str
        Names of the time-feature columns.
    emax : int, default 6
        Upper bound on the embedding width.
    emin : int, default 3
        Lower bound on the embedding width (applied after ``emax``).

    Returns
    -------
    tuple of (dict, dict)
        ``(time_class_dict, time_emb_dict)``: column name -> number of distinct
        values, and column name -> embedding width (the class count capped at
        ``emax`` and then floored at ``emin``).
    """
    time_class_dict = {t: len(dt[t].unique()) for t in time_vars}
    # Cap first, then floor, matching the order of the original two passes.
    time_emb_dict = {t: max(min(n, emax), emin) for t, n in time_class_dict.items()}
    return time_class_dict, time_emb_dict

In [None]:
def time_input(var,time_vars_dict,time_class_dict,r=.2):
    """Build the input branch for one time feature.

    Parameters
    ----------
    var : str
        Feature name; used to name the input (``var + '_in'``) and flatten
        (``var + '_flt'``) layers.
    time_vars_dict : dict
        Feature name -> embedding width.
    time_class_dict : dict
        Feature name -> number of classes.
    r : float, default .2
        Dropout rate applied after the dense layer.

    Returns
    -------
    tuple
        ``(inp, t)``: the Keras input tensor and the branch output tensor.
    """
    n_classes = time_class_dict[var]
    emb_width = time_vars_dict[var]
    inp = Input(shape=(1,), dtype='int64', name=var + '_in')
    # The embedding maps each of the n_classes codes to emb_width learned features.
    embedded = Embedding(n_classes, emb_width, embeddings_initializer='glorot_normal')(inp)
    t = Flatten(name=var + '_flt')(embedded)
    dense = Dense(128, activation='relu', kernel_initializer='glorot_normal', bias_initializer='normal')
    t = Dropout(rate=r)(dense(t))
    return inp, t

### Create continuous input layer

##### Scalers

In [None]:
# Scaler
# scaler can be 'standard', 'robust' or 'minmax'; default is 'minmax'
# x,y is the output range for the min-max scaler; default is 1,5
# l,u is the percentile range for the robust scaler (centred on the median); default is 10,90

def cont_scaler(dt, var, scaler='minmax', x=1, y=5, l=10, u=90):
    """Fit a scaler on the selected column(s) and return the scaled values.

    Fixes: the function returned the undefined name ``cont_scaled``; a single
    column name selected a 1-D Series, which the scikit-learn scalers reject;
    and an unrecognised ``scaler`` fell through to an unbound variable.

    Parameters
    ----------
    dt : pandas.DataFrame
        Frame holding the continuous column(s).
    var : str or list of str
        One column name, or a list of column names.
    scaler : {'minmax', 'standard', 'robust'}, default 'minmax'
        Which scaler to apply.
    x, y : numbers, default 1 and 5
        ``feature_range`` of the min-max scaler.
    l, u : numbers, default 10 and 90
        ``quantile_range`` of the robust scaler.

    Returns
    -------
    numpy.ndarray
        1-D array when ``var`` is a single column name, otherwise a 2-D array
        with one column per entry of ``var``.

    Raises
    ------
    ValueError
        If ``scaler`` is not one of the supported names.
    """
    if scaler == 'standard':
        transformer = StandardScaler()
    elif scaler == 'robust':
        transformer = RobustScaler(with_centering=True, with_scaling=True, quantile_range=(l, u))
    elif scaler == 'minmax':
        transformer = MinMaxScaler(feature_range=(x, y))
    else:
        raise ValueError(
            "scaler must be 'standard', 'robust' or 'minmax', got %r" % (scaler,)
        )

    single_column = isinstance(var, str)
    # The scalers need 2-D input, so a single name is wrapped in a list.
    columns = [var] if single_column else list(var)
    var_scaled = transformer.fit_transform(dt[columns])
    return var_scaled.ravel() if single_column else var_scaled

##### Scaler Function

In [None]:
def cont_scale_var(dt, var, scaler):  #scaler can be 'standard','robust','minmax', var is a 'column name'
    """Scale one continuous column of ``dt`` in place and return the frame.

    Fixes the call to ``cont_scaler``: ``dt`` was omitted, so ``var`` was
    passed as the frame and ``scaler`` as the column.

    Parameters
    ----------
    dt : pandas.DataFrame
        Frame holding the column; it is modified in place.
    var : str
        Name of the column to scale.
    scaler : str
        Scaler name forwarded to ``cont_scaler``.

    Returns
    -------
    pandas.DataFrame
        The same ``dt`` object with ``var`` replaced by its scaled values.
    """
    scaled = cont_scaler(dt, var, scaler)
    # Flatten an (n, 1) result so it can be assigned to a single column.
    dt[var] = np.ravel(scaled) if isinstance(var, str) else scaled
    return dt
    

##### Normalize and scale continuous variables to relieve skew and kurtosis

In [None]:
def standarize_scale(dt,cont_vars):
    """Apply standard scaling and then min-max scaling to each continuous column.

    Parameters
    ----------
    dt : pandas.DataFrame
        Frame holding the continuous columns.
    cont_vars : iterable of str
        Names of the columns to scale.

    Returns
    -------
    pandas.DataFrame
        The frame returned by the last ``cont_scale_var`` call, or ``dt``
        itself when ``cont_vars`` is empty.
    """
    # 'standard' centres on mean 0; 'minmax' then maps into the scaler's positive range.
    passes = ('standard', 'minmax')
    for column in cont_vars:
        for method in passes:
            dt = cont_scale_var(dt, column, method)
    return dt

In [None]:
def cont_input(var, r=.2):
    """Build the input branch for one continuous feature.

    Fixes a NameError: the dropout rate ``r`` was used without being defined.
    It is now a keyword parameter with the same default as ``cat_input`` and
    ``time_input``.

    Parameters
    ----------
    var : str
        Feature name; used to name the input (``var + '_in'``) and first dense
        (``var + '_d'``) layers.
    r : float, default .2
        Dropout rate applied after the 128-unit dense layer.

    Returns
    -------
    tuple
        ``(inp, d)``: the Keras input tensor and the branch output tensor.
    """
    name = var
    inp = Input((1,), name=name+'_in')
    d = Dense(1, name = name + '_d')(inp)
    d = Dense(128, activation='relu',kernel_initializer='glorot_normal',bias_initializer='normal')(d)
    d = Dropout(rate=r)(d)
    return inp,d

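Split the data into training and validation sets: the last `vstart` distinct dates (default 2) form the validation set and all earlier dates form the training set. A separate test split (`tstart`) is not produced by the current code.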

In [44]:
def split_data(df,vstart=2,tstart=1):
    """Split a frame into training and validation rows by date.

    The last ``vstart`` distinct ``Date`` values form the validation set and
    all earlier dates form the training set. With ``vstart=0`` the original
    slices (``dates[-0:]`` / ``dates[:-0]``) put every row in validation and
    none in training; the split point is now computed explicitly so that case
    yields an empty validation set.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``ID`` and ``Date`` columns.
    vstart : int, default 2
        Number of most recent distinct dates held out for validation.
    tstart : int, default 1
        Unused; kept so existing callers keep working.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(data_train, data_validate, data)`` where ``data`` is ``df`` sorted
        by ``ID`` then ``Date`` and the other two are row subsets of it.

    Raises
    ------
    ValueError
        If ``vstart`` is negative.
    """
    if vstart < 0:
        raise ValueError("vstart must be non-negative, got %r" % (vstart,))
    dates = sorted(df.Date.unique())
    # Index of the first validation date; clamped so vstart > len(dates) holds out everything.
    cut = max(len(dates) - vstart, 0)
    dates_train, dates_validate = dates[:cut], dates[cut:]
    data = df.sort_values(by=['ID','Date'])
    data_train = data.loc[data.Date.isin(dates_train)]
    data_validate = data.loc[data.Date.isin(dates_validate)]
    return data_train,data_validate,data

In [59]:
# Split with the default hold-out; `data` is the full frame sorted by ID and Date.
data_train,data_validate,data=split_data(df)

Encode and scale data and reshape into array of vectors. 
___________________________________________________________________________________________________________
The input layer of the neural network is a horizontal concatenation of per-feature branches: each categorical variable has its own embedding input and each continuous variable has its own dense input. The train and validation data (and any test data) therefore need to be reshaped into a list of vectors, one per feature. To preserve the mixed dtypes (i.e., int and float), the input data is a list of arrays, with each element in the list being the column vector for one input feature.

In [60]:
def map_shape_data(data_train,data_validate,cat_map_fit,cont_map_fit):
    """Turn the train/validation frames into targets and per-feature input lists.

    Parameters
    ----------
    data_train, data_validate : pandas.DataFrame
        Frames with a ``REV`` target column plus the feature columns.
    cat_map_fit, cont_map_fit : objects with a ``transform(frame)`` method
        Fitted mappers for the categorical and continuous features; each must
        return a 2-D array.

    Returns
    -------
    tuple
        ``(y_tr, y_val, data_tr, data_val)``. The targets are ``(n, 1)``
        arrays. Each data entry is a list of ``(n, 1)`` arrays: the int64
        categorical columns first, followed by the float32 continuous columns.
    """
    frames = (data_train, data_validate)

    # Targets as column vectors.
    y_tr, y_val = [frame.REV.values.reshape(-1, 1) for frame in frames]

    # Categorical codes feed embeddings, so they are cast to int64.
    cat_blocks = [cat_map_fit.transform(frame).astype(np.int64) for frame in frames]
    # Continuous features are cast to float32.
    cont_blocks = [cont_map_fit.transform(frame).astype(np.float32) for frame in frames]

    # One (n, 1) array per feature; a list keeps the mixed dtypes intact.
    data_tr, data_val = [
        np.hsplit(cats, cats.shape[1]) + np.hsplit(conts, conts.shape[1])
        for cats, conts in zip(cat_blocks, cont_blocks)
    ]
    return y_tr, y_val, data_tr, data_val

In [61]:
# Build targets and per-feature input lists; cat_map_fit / cont_map_fit must already be fitted.
y_tr,y_val,data_tr,data_val = map_shape_data(data_train,data_validate,cat_map_fit,cont_map_fit)

Function to create single input vector (input_shape = 1) for categorical input layer

In [62]:
def cat_input(feat,cat_vars_dict):
    """Build the input and flattened embedding for one categorical feature.

    Parameters
    ----------
    feat : sequence
        ``feat[0]`` is the feature name and ``feat[1]`` is a fitted encoder
        exposing ``classes_``.
    cat_vars_dict : dict
        Feature name -> requested embedding width; the value is clamped to
        the range 5..50.

    Returns
    -------
    tuple
        ``(inp, u)``: the Keras input tensor and the flattened embedding.
    """
    name = feat[0]
    n_classes = len(feat[1].classes_)
    # Keep the embedding width between 5 and 50.
    emb_width = max(5, min(50, cat_vars_dict[name]))
    inp = Input(shape=(1,), dtype='int64', name=name + '_in')
    # Each input is a single code, so the embedding output is flattened to one row
    # of emb_width features per sample.
    embedded = Embedding(n_classes, emb_width, input_length=1)(inp)
    u = Flatten(name=name + '_flt')(embedded)
    return inp, u

Create list of Input,Flatten,and Embedding layers for the categorical features

In [63]:
# One (input, flattened embedding) pair per categorical feature in the fitted mapper.
embs = [cat_input(feat,cat_vars_dict) for feat in cat_map_fit.features]

The deprecation warning is an incompatibility between keras and tensorflow.keras.  The error message is an outstanding bug in tensorflow and does not occur in keras.  Tensorflow has an open issue report regarding this error message.

Function to create Input and Dense layer for continuous features

In [64]:
def cont_input(feat):
    """Build the input and single-unit dense layer for one continuous feature.

    Parameters
    ----------
    feat : sequence
        Mapper feature entry; ``feat[0][0]`` is the feature name.

    Returns
    -------
    tuple
        ``(inp, d)``: the Keras input tensor and the dense layer output.
    """
    label = feat[0][0]
    inp = Input((1,), name=label + '_in')
    projection = Dense(1, name=label + '_d')
    return inp, projection(inp)

Create list of Input and Dense layers for continuous features

In [65]:
# One (input, dense output) pair per continuous feature in the fitted mapper.
conts = [cont_input(feat) for feat in cont_map_fit.features]

Build a four-layer model using a shared input layer for the categorical and continuous variables.  The hidden layers use high node counts because the sample count in the input data is large. 

In [66]:
def embed_model(conts,embs):
    """Assemble and compile the network from the per-feature input branches.

    Parameters
    ----------
    conts : list of (input, output) pairs
        Branches for the continuous features.
    embs : list of (input, output) pairs
        Branches for the categorical features.

    Returns
    -------
    Model
        Model compiled with the Adam optimizer, mean-absolute-error loss and
        the ``mape`` metric.
    """
    # Embedding branches come first, then the continuous branches, for both the
    # concatenated tensor and the model's input list.
    branch_outputs = [out for _, out in embs] + [out for _, out in conts]
    branch_inputs = [inp for inp, _ in embs] + [inp for inp, _ in conts]

    x = concatenate(branch_outputs)
    # Three hidden stages, each Dense(relu) -> Dropout(0.6) -> BatchNormalization.
    for units in (128, 128, 64):
        x = Dense(units, activation='relu', kernel_initializer='uniform', bias_initializer='zeros')(x)
        x = Dropout(rate=0.6)(x)
        x = BatchNormalization()(x)
    # Single-unit output layer.
    x = Dense(1, activation='relu', kernel_initializer='uniform', bias_initializer='zeros')(x)

    model = Model(branch_inputs, x)
    model.compile(optimizer='Adam', loss='mean_absolute_error', metrics=['mape'])
    return model

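Implement the CSV logger, reduce the learning rate when the validation loss stops improving, checkpoint the best model, and build the model. (An `EarlyStopping` callback is imported but is not configured in the cell below.)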

In [67]:
# Write per-epoch training metrics to 'Partner_Error.csv'.
csv_logger = CSVLogger('Partner_Error.csv')
# Multiply the learning rate by 0.1 when val_loss has not improved for 5 epochs (floor 0.0001).
rlr = ReduceLROnPlateau(monitor='val_loss',factor=0.1,patience=5,min_lr=0.0001)
# Checkpoint to 'Partner_Best_Model', keeping only the best result seen so far.
mc = ModelCheckpoint('Partner_Best_Model',save_best_only=True)
# Build and compile the network from the per-feature input branches.
model = embed_model(conts,embs)

This next process is cpu/gpu intensive.  This code should be run on a gpu.

In [68]:
# Train for 25 epochs in batches of 64, validating on the held-out dates each epoch.
model.fit(data_tr,y_tr,batch_size=64,epochs=25,verbose=1,validation_data = (data_val,y_val),callbacks=[csv_logger,rlr,mc])

Train on 7000 samples, validate on 1000 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<tensorflow.python.keras.callbacks.History at 0x7f9541b062b0>

In [79]:
# Restore the weights saved by the ModelCheckpoint callback.
model.load_weights('Partner_Best_Model')

In [80]:
def prediction(model_data,model=model):
    """Return ``model.predict(model_data)``.

    ``model`` defaults to the global ``model`` object that exists when this
    function is defined.
    """
    return model.predict(model_data)

In [81]:
# Predict on the training and validation inputs with the restored model.
pred_tr = prediction(data_tr)
pred_val = prediction(data_val)

In [82]:
def array_to_list(arr):
    """Flatten one level of nesting: return a list of every item of every element of ``arr``."""
    listed = []
    for row in arr:
        listed.extend(row)
    return listed

In [83]:
def results_to_dataframe(df,pred_tr,pred_val,y_tr,y_val):
    """Combine actuals and predictions with ID/Date and pickle the result.

    The predictions were flattened but the actuals were not
    (``list(y_tr)`` on an ``(n, 1)`` array yields one-element arrays), so the
    'Actual' column held array objects. Both are now flattened the same way
    with NumPy, which also removes the dependency on a separate helper.

    NOTE(review): 'ID' and 'Date' are taken from ``df`` in its own row order,
    while the value columns are training rows followed by validation rows.
    The rows only line up if ``df`` is ordered the same way - confirm against
    the caller.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``ID`` and ``Date`` columns, one row per prediction.
    pred_tr, pred_val : array-like
        Model predictions for the training and validation rows.
    y_tr, y_val : array-like
        Actual target values for the training and validation rows.

    Returns
    -------
    None
        The frame is written to 'Scaled_Predictions_Qtr.pkl'.

    Raises
    ------
    ValueError
        If the number of values does not match the number of rows in ``df``.
    """
    preds = np.concatenate([np.ravel(pred_tr), np.ravel(pred_val)])
    actuals = np.concatenate([np.ravel(y_tr), np.ravel(y_val)])
    if len(preds) != len(df) or len(actuals) != len(df):
        raise ValueError(
            "expected %d values, got %d actuals and %d predictions"
            % (len(df), len(actuals), len(preds))
        )
    dr = pd.DataFrame({'ID': df.ID, 'Date': df.Date})
    dr['Actual'] = actuals
    dr['Predict'] = preds
    dr = dr.loc[:,['ID','Date','Actual','Predict']]
    dr.to_pickle('Scaled_Predictions_Qtr.pkl')
    return

In [84]:
# Write the combined actual/predicted table to 'Scaled_Predictions_Qtr.pkl'.
results_to_dataframe(df,pred_tr,pred_val,y_tr,y_val)

##### End of code: Close this file using File 'Close and Halt' from dropdown menu