In [1]:
import os
colab = os.environ.get('COLAB_GPU', '10')
if (int(colab) == 0):
    from google.colab import drive
    drive.mount('/content/drive')
    #Path="drive/My Drive/Colab Notebooks/StockAnalysis"
    Path=""
    DataPath="/content"
    RootPath="/root"    
else:
    #Path="/c/DataScience/Repo/Imbalanced_data"
    Path="/DataScience/Repo/Imbalanced_data"
    DataPath=Path
    RootPath="/Users/iowahawk89"

In [9]:
import numpy as np
import pandas as pd

X = pd.read_csv('creditcard.csv', na_filter=True)

y_original = np.array(X['Class'], dtype='int')

print(X['Class'].value_counts())
X.drop(['Class'], inplace=True, axis=1)

rolling_window_size = 10  ### this selects how many historical transactions should be analyzed to judge the transaction at hand -- RNN width

X_interim = np.zeros([(X.shape[0]-rolling_window_size)*10,30])
y = []
for i in range((X.shape[0]-rolling_window_size)):
    beg = 0+i
    end = beg+rolling_window_size
    s = np.array(X[beg:end], dtype='float')
    X_interim[(rolling_window_size*i):(rolling_window_size*(i+1)),:] = s
    y.append(y_original[end])


y = np.array(y, dtype='int')
X_interim = X_interim[:,1::]

In [14]:
def display_metrics(model_name, train_features, test_features, train_label, test_label, pred):
    try:
        print(model_name.score(test_features, test_label)) 
        print("Accuracy score (training): {0:.3f}".format(model_name.score(train_features, train_label))) 
        print("Accuracy score (validation): {0:.3f}".format(model_name.score(test_features, test_label))) 
    except Exception as e:
        print("error")  
    try:
        print(pd.Series(model_name.feature_importances_, index=train_features.columns[:]).nlargest(10).plot(kind='barh')) 
    except Exception as e:
        print("error")  

    print("Confusion Matrix:")
    tn, fp, fn, tp = confusion_matrix(test_label, pred).ravel()
    total = tn+ fp+ fn+ tp 
    print("false positive pct:",(fp/total)*100) 
    print("tn", " fp", " fn", " tp") 
    print(tn, fp, fn, tp) 
    print(confusion_matrix(test_label, pred)) 
    print("Classification Report") 
    print(classification_report(test_label, pred))
    print("Specificity =", tn/(tn+fp))
    print("Sensitivity =", tp/(tp+fn))
    return tn, fp, fn, tp

In [15]:
X_tensor = X_interim.reshape(int(np.shape(X_interim)[0]/rolling_window_size), rolling_window_size, np.shape(X_interim)[1])
#TypeError: 'float' object cannot be interpreted as an integer

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers.recurrent import LSTM
from keras.layers import Embedding
#from keras.callbacks import ModelCheckpoint, TensorBoard
from keras import regularizers

test_train_split = 0.5
stratify = True

if stratify:
    y = np.vstack((range(len(y)),y)).T
    y_pos = y[y[:,1]==1]
    y_neg = y[y[:,1]==0]
    
    y_pos = y_pos[np.random.choice(y_pos.shape[0], int(y_pos.shape[0]*test_train_split), replace=False),:]
    y_neg = y_neg[np.random.choice(y_neg.shape[0], int(y_neg.shape[0]*test_train_split), replace=False),:]
    
    train_idx = np.array(np.hstack((y_pos[:,0],y_neg[:,0])), dtype='int')
    
    X_train = X_tensor[train_idx, :, :]
    X_test = np.delete(X_tensor, train_idx, axis=0)
    y_train = y[train_idx,1]
    y_test = np.delete(y, train_idx, axis=0)
    y_test = y_test[:,1]
else: 
    train_idx = np.random.choice(X_tensor.shape[0], int(X_tensor.shape[0]*test_train_split), replace=False)
    X_train = X_tensor[train_idx, :, :]
    X_test = np.delete(X_tensor, train_idx, axis=0)
    y_train = y[train_idx]
    y_test = np.delete(y, train_idx, axis=0)

del (X_tensor, y, stratify, test_train_split, train_idx, y_neg, y_pos)


### Hyperparameters Tuning
# First test optimal epochs holding everything else constant
# Dropout: 0.1-0.6
# GradientClipping: 0.1-10
# BatchSize: 32,64,128,256,512 (power of 2)


### Train LSTM using Keras 2 API ###
model = Sequential()
model.add(LSTM(20, input_shape=X_train.shape[1:], kernel_initializer='lecun_uniform', activation='relu', kernel_regularizer=regularizers.l1(0.1), recurrent_regularizer=regularizers.l1(0.01), bias_regularizer=None, activity_regularizer=None, dropout=0.2, recurrent_dropout=0.2))#, return_sequences=True))
#model.add(LSTM(12, activation='relu', return_sequences=True))
#model.add(LSTM(8, activation='relu'))
#model.add(Dense(1, kernel_initializer='lecun_uniform', activation='sigmoid'))
model.add(Dense(1, kernel_initializer='uniform', activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='sgd', metrics=['accuracy']) #optimizer='rmsprop', optimizer='sgd', optimizer='adam'
print(model.summary())



Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_2 (LSTM)                (None, 20)                4000      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 21        
Total params: 4,021
Trainable params: 4,021
Non-trainable params: 0
_________________________________________________________________
None


In [16]:
class_names=[0,1] # name  of classes 1=fraudulent transaction
#model.fit(X_train, y_train, epochs=200, batch_size=10000, class_weight={0 : 1., 1: float(int(1/np.mean(y_train)))}, validation_split=0.3)
model.fit(X_train, y_train, epochs=4, batch_size=32, class_weight=np.where(y_train == 1,4.0,1.0).flatten() )

train_predict = model.predict_classes(X_train)
test_predict = model.predict_classes(X_test)

### test AUC ###
from sklearn import metrics 

fpr, tpr, thresholds = metrics.roc_curve(y_train, train_predict, pos_label=1)
print('TRAIN | AUC Score: ' + str((metrics.auc(fpr, tpr))))
fpr, tpr, thresholds = metrics.roc_curve(y_test, test_predict, pos_label=1)
print('TEST | AUC Score: ' + str((metrics.auc(fpr, tpr))))

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
TRAIN | AUC Score: 0.5
TEST | AUC Score: 0.5


In [17]:
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
tn, fp, fn, tp = display_metrics(model, X_train, X_test, y_train, y_test, test_predict)

error
error
Confusion Matrix:
false positive pct: 0.0
tn  fp  fn  tp
142153 0 246 0
[[142153      0]
 [   246      0]]
Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    142153
           1       0.00      0.00      0.00       246

    accuracy                           1.00    142399
   macro avg       0.50      0.50      0.50    142399
weighted avg       1.00      1.00      1.00    142399

Specificity = 1.0
Sensitivity = 0.0


  _warn_prf(average, modifier, msg_start, len(result))
