# Tackling imbalanced dataset


# Baseline - original dataset without any data sampling techniques

In [3]:
# source: https://www.kaggle.com/ntnu-testimon/paysim1
import os
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers.core import Dense
from keras.optimizers import Adam
from keras.utils import np_utils
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import seaborn as sn



df = pd.read_csv("PS_20174392719_1491204439457_log.csv", na_values=['NA', '?'])
df = df.reindex(np.random.permutation(df.index))
df.head()

Using TensorFlow backend.


Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
988091,45,PAYMENT,9995.19,C582306715,0.0,0.0,M851906868,0.0,0.0,0,0
5510167,380,CASH_OUT,255843.13,C740843483,1629.0,0.0,C2002336407,35667.07,186000.25,0,0
353740,17,CASH_OUT,551496.84,C163854984,0.0,0.0,C363476583,1801470.48,2352967.32,0,0
1800044,162,CASH_OUT,40857.05,C1214901421,0.0,0.0,C944858368,4578162.1,4619019.15,0,0
5710043,398,PAYMENT,3544.93,C784730036,0.0,0.0,M904019815,0.0,0.0,0,0


In [4]:
enc = LabelEncoder()

def columnEncoder(columns):
    for column_name,space in columns:
        enc.fit(df[column_name])
        enc_type = enc.transform(df[column_name])
        df.insert(space,"encoded_" + column_name,enc_type)
        df.pop(column_name)

change = [['nameOrig',4],['nameDest',7],['type',1]]
col_enc = columnEncoder(change)
df[:5]

Unnamed: 0,step,encoded_type,amount,encoded_nameOrig,oldbalanceOrg,newbalanceOrig,encoded_nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
988091,45,3,9995.19,4979193,0.0,0.0,2556944,0.0,0.0,0,0
5510167,380,1,255843.13,5500828,1629.0,0.0,296654,35667.07,186000.25,0,0
353740,17,1,551496.84,2097253,0.0,0.0,383840,1801470.48,2352967.32,0,0
1800044,162,1,40857.05,705081,0.0,0.0,555792,4578162.1,4619019.15,0,0
5710043,398,3,3544.93,5646123,0.0,0.0,2615181,0.0,0.0,0,0


In [5]:

#Splitting into samples and labels
x = np.array(df.iloc[:,:8])
y = np.array(df.iloc[:,9])
one_hot = np_utils.to_categorical(y)
X_train, X_test, y_train, y_test = train_test_split(x,one_hot,test_size=0.20,random_state=21)



In [6]:
model = Sequential()
model.add(Dense(60, input_dim=x.shape[1], activation='sigmoid'))
model.add(Dense(20, activation='sigmoid'))
model.add(Dense(one_hot.shape[1],activation='softmax'))

In [7]:
opt = Adam(learning_rate=0.001)
model.compile(loss='binary_crossentropy', optimizer=opt)
model.fit(X_train,y_train,batch_size=128,verbose=2,epochs=5)


Epoch 1/5
 - 52s - loss: 0.0078
Epoch 2/5
 - 52s - loss: 0.0081
Epoch 3/5
 - 54s - loss: 0.0081
Epoch 4/5
 - 53s - loss: 0.0078
Epoch 5/5
 - 56s - loss: 0.0077


<keras.callbacks.callbacks.History at 0x116095b90>

In [9]:
from sklearn import metrics

pred = model.predict(X_test) 
pred = np.argmax(pred,axis=1)
y_compare = np.argmax(y_test,axis=1) 
accuracy = metrics.accuracy_score(y_compare, pred)
precision = metrics.average_precision_score(y_compare, pred)
print("Accuracy score: {}".format(accuracy))
print("precision score: {}".format(precision))

y_test_1d = np.argmax(y_test, axis=1)
matrix = confusion_matrix(y_test_1d, pred)
np.set_printoptions(suppress=True)
normalized_m = preprocessing.normalize(matrix)
print("Confusion matrix: \n",normalized_m)


Accuracy score: 0.9989061110045862
precision score: 0.15646058584252998
Confusion matrix: 
 [[1.         0.0000181 ]
 [0.98013762 0.19831857]]


In [11]:
from sklearn.metrics import f1_score
f1=f1_score(y_test_1d, pred)
f1

0.2846865364850976

# First technique: undersampling

In [12]:
# source: https://www.kaggle.com/ntnu-testimon/paysim1
import os
import pandas as pd
import numpy as np

df = pd.read_csv("PS_20174392719_1491204439457_log.csv", na_values=['NA', '?'])
df = df.reindex(np.random.permutation(df.index))
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
1595368,156,PAYMENT,863.57,C536133662,99971.0,99107.43,M717485208,0.0,0.0,0,0
4006047,298,TRANSFER,1649793.38,C23322457,0.0,0.0,C1117590961,1878584.74,3528378.11,0,0
6251879,596,PAYMENT,14688.51,C495369611,0.0,0.0,M1063576313,0.0,0.0,0,0
3062051,234,CASH_IN,79624.44,C859513068,7424.0,87048.44,C576172791,0.0,0.0,0,0
983025,44,PAYMENT,13788.31,C1740847955,130507.0,116718.69,M1274936624,0.0,0.0,0,0


In [13]:
""" 
The creator of this simulation writes that:
isFlaggedFraud - The business model aims to control massive transfers from one account to another and flags illegal attempts. 
An illegal attempt in this dataset is an attempt to transfer more than 200.000 in a single transaction.

Therefore, only using 'isFraud' label to separate actual fraudulent transactions.
"""

dfLegitimate = df[df.isFraud == 0]
dfFraud = df[df.isFraud == 1]
dfLegitimate[:5]


Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
1595368,156,PAYMENT,863.57,C536133662,99971.0,99107.43,M717485208,0.0,0.0,0,0
4006047,298,TRANSFER,1649793.38,C23322457,0.0,0.0,C1117590961,1878584.74,3528378.11,0,0
6251879,596,PAYMENT,14688.51,C495369611,0.0,0.0,M1063576313,0.0,0.0,0,0
3062051,234,CASH_IN,79624.44,C859513068,7424.0,87048.44,C576172791,0.0,0.0,0,0
983025,44,PAYMENT,13788.31,C1740847955,130507.0,116718.69,M1274936624,0.0,0.0,0,0


In [14]:
print(dfFraud.shape)
print(dfLegitimate.shape)

(8213, 11)
(6354407, 11)


In [15]:
#Setting exactly the same size for both categories
dfLegit = dfLegitimate.head(8213)

In [16]:
undersampled = pd.concat([dfLegit,dfFraud])
undersampled

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
1595368,156,PAYMENT,863.57,C536133662,99971.00,99107.43,M717485208,0.00,0.00,0,0
4006047,298,TRANSFER,1649793.38,C23322457,0.00,0.00,C1117590961,1878584.74,3528378.11,0,0
6251879,596,PAYMENT,14688.51,C495369611,0.00,0.00,M1063576313,0.00,0.00,0,0
3062051,234,CASH_IN,79624.44,C859513068,7424.00,87048.44,C576172791,0.00,0.00,0,0
983025,44,PAYMENT,13788.31,C1740847955,130507.00,116718.69,M1274936624,0.00,0.00,0,0
...,...,...,...,...,...,...,...,...,...,...,...
6272963,627,CASH_OUT,953529.64,C969013741,953529.64,0.00,C782712589,0.00,953529.64,1,0
3193206,245,CASH_OUT,688034.32,C1899112763,688034.32,0.00,C362018196,0.00,688034.32,1,0
6030085,472,CASH_OUT,653983.93,C1069288733,653983.93,0.00,C1731869105,659112.38,1313096.31,1,0
1059504,99,TRANSFER,286670.90,C2005707747,286670.90,0.00,C1830145397,0.00,0.00,1,0


In [18]:
from sklearn.preprocessing import LabelEncoder

#undersampled.pop("isFlaggedFraud")

enc = LabelEncoder()
def columnEncoder(columns):
    for column_name,space in columns:
        enc.fit(undersampled[column_name])
        enc_type = enc.transform(undersampled[column_name])
        undersampled.insert(space,"encoded_" + column_name,enc_type)
        undersampled.pop(column_name)

change = [['nameOrig',4],['nameDest',7],['type',1]]

undersampled_enc = columnEncoder(change)

KeyError: 'nameOrig'

In [19]:
# Second reshuffling for to mix both categories
undersampled = undersampled.reindex(np.random.permutation(undersampled.index))
undersampled[:5]

Unnamed: 0,step,encoded_type,amount,encoded_nameOrig,oldbalanceOrg,newbalanceOrig,encoded_nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
6360384,715,1,212028.15,6861,212028.15,0.0,8916,3708381.73,3920409.88,1,0
1511329,148,1,288032.02,11756,288032.02,0.0,4083,0.0,288032.02,1,0
782395,39,4,39013.3,796,39013.3,0.0,3258,0.0,0.0,1,0
1876102,164,3,12588.49,11994,24349.0,11760.51,16021,0.0,0.0,0,0
5381206,376,1,166330.22,4721,71617.26,0.0,8630,190152.28,356482.5,0,0


In [20]:
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
#Splitting into samples and labels
x = np.array(undersampled.iloc[:,:8])
y = np.array(undersampled.iloc[:,9])
x[:5]

array([[    715.  ,       1.  ,  212028.15,    6861.  ,  212028.15,
              0.  ,    8916.  , 3708381.73],
       [    148.  ,       1.  ,  288032.02,   11756.  ,  288032.02,
              0.  ,    4083.  ,       0.  ],
       [     39.  ,       4.  ,   39013.3 ,     796.  ,   39013.3 ,
              0.  ,    3258.  ,       0.  ],
       [    164.  ,       3.  ,   12588.49,   11994.  ,   24349.  ,
          11760.51,   16021.  ,       0.  ],
       [    376.  ,       1.  ,  166330.22,    4721.  ,   71617.26,
              0.  ,    8630.  ,  190152.28]])

In [21]:
from keras.utils import np_utils

# convert integers to dummy variables (i.e. one hot encoded)
one_hot = np_utils.to_categorical(y)
one_hot[:5]

array([[0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.]], dtype=float32)

In [22]:
X_train, X_test, y_train, y_test = train_test_split(x,one_hot,test_size=0.20,random_state=21)


### Training part

In [39]:
from keras.models import Sequential
from keras.layers.core import Dense

model = Sequential()
model.add(Dense(60, input_dim=x.shape[1], activation='sigmoid'))
model.add(Dense(20, activation='sigmoid'))
model.add(Dense(one_hot.shape[1],activation='softmax'))

In [40]:
from keras.optimizers import Adam

opt = Adam(learning_rate=0.001)
model.compile(loss='binary_crossentropy', optimizer=opt)
model.fit(X_train,y_train,batch_size=64,verbose=2,epochs=50)
#action = model.predict(image)

Epoch 1/50
 - 1s - loss: 0.3201
Epoch 2/50
 - 0s - loss: 0.1709
Epoch 3/50
 - 0s - loss: 0.1721
Epoch 4/50
 - 0s - loss: 0.1578
Epoch 5/50
 - 0s - loss: 0.1448
Epoch 6/50
 - 0s - loss: 0.1423
Epoch 7/50
 - 0s - loss: 0.1600
Epoch 8/50
 - 0s - loss: 0.1626
Epoch 9/50
 - 0s - loss: 0.1514
Epoch 10/50
 - 0s - loss: 0.1523
Epoch 11/50
 - 0s - loss: 0.1454
Epoch 12/50
 - 0s - loss: 0.1440
Epoch 13/50
 - 0s - loss: 0.1525
Epoch 14/50
 - 0s - loss: 0.1558
Epoch 15/50
 - 0s - loss: 0.1544
Epoch 16/50
 - 0s - loss: 0.1566
Epoch 17/50
 - 1s - loss: 0.1593
Epoch 18/50
 - 0s - loss: 0.1545
Epoch 19/50
 - 0s - loss: 0.1580
Epoch 20/50
 - 0s - loss: 0.1556
Epoch 21/50
 - 0s - loss: 0.1451
Epoch 22/50
 - 0s - loss: 0.1434
Epoch 23/50
 - 1s - loss: 0.1429
Epoch 24/50
 - 1s - loss: 0.1398
Epoch 25/50
 - 0s - loss: 0.1436
Epoch 26/50
 - 1s - loss: 0.1471
Epoch 27/50
 - 0s - loss: 0.1451
Epoch 28/50
 - 0s - loss: 0.1439
Epoch 29/50
 - 0s - loss: 0.1399
Epoch 30/50
 - 1s - loss: 0.1392
Epoch 31/50
 - 1s -

<keras.callbacks.callbacks.History at 0x119607d10>

In [41]:
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import preprocessing

#using test set for prediction
pred = model.predict(X_test) 
pred = np.argmax(pred,axis=1)
y_compare = np.argmax(y_test,axis=1) 
accuracy = metrics.accuracy_score(y_compare, pred)
precision = metrics.average_precision_score(y_compare, pred)
print("Accuracy score: {}".format(accuracy))
print("precision score: {}".format(precision))

#formatting labels from one_hot encoding
y_test_1d = np.argmax(y_test, axis=1)
matrix = confusion_matrix(y_test_1d, pred)

normalized_m = preprocessing.normalize(matrix)
print(normalized_m)

from sklearn.metrics import f1_score
f1=f1_score(y_test_1d, pred)
print("F1 score: ",f1)

Accuracy score: 0.9433962264150944
precision score: 0.9316156334243828
[[0.99950769 0.03137468]
 [0.08866822 0.99606122]]
F1 score:  0.9430147058823529


# Another techniques
# weight adjustment

In [1]:
# source: https://www.kaggle.com/ntnu-testimon/paysim1
import os
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers.core import Dense
from keras.optimizers import Adam
from keras.utils import np_utils
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import seaborn as sn
import matplotlib.pyplot as plt
%matplotlib inline



df2 = pd.read_csv("PS_20174392719_1491204439457_log.csv", na_values=['NA', '?'])
df2 = df2.reindex(np.random.permutation(df2.index))
df2.head()

Using TensorFlow backend.


Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
2892050,228,PAYMENT,12348.84,C1984975377,555962.0,543613.16,M865101115,0.0,0.0,0,0
1849081,163,PAYMENT,14534.41,C1425462448,25533.0,10998.59,M1616539308,0.0,0.0,0,0
6328585,688,CASH_IN,39846.67,C601240722,20640.0,60486.67,C616209107,2908578.83,2868732.15,0,0
3696063,277,PAYMENT,5438.7,C1670301032,0.0,0.0,M521814532,0.0,0.0,0,0
5151126,357,CASH_OUT,7396.85,C1408043643,50754.0,43357.15,C1807882356,98922.71,106319.56,0,0


In [2]:
enc = LabelEncoder()

def columnEncoder(columns):
    for column_name,space in columns:
        enc.fit(df2[column_name])
        enc_type = enc.transform(df2[column_name])
        df2.insert(space,"encoded_" + column_name,enc_type)
        df2.pop(column_name)

change = [['nameOrig',4],['nameDest',7],['type',1]]

col_enc = columnEncoder(change)


df2[:5]

Unnamed: 0,step,encoded_type,amount,encoded_nameOrig,oldbalanceOrg,newbalanceOrig,encoded_nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
2892050,228,3,12348.84,3236851,555962.0,543613.16,2571642,0.0,0.0,0,0
1849081,163,3,14534.41,1397221,25533.0,10998.59,1258567,0.0,0.0,0,0
6328585,688,0,39846.67,5041810,20640.0,60486.67,458166,2908578.83,2868732.15,0,0
3696063,277,3,5438.7,2201309,0.0,0.0,2189967,0.0,0.0,0,0
5151126,357,1,7396.85,1339643,50754.0,43357.15,239015,98922.71,106319.56,0,0


In [9]:
df2.head()

Unnamed: 0,step,encoded_type,amount,encoded_nameOrig,oldbalanceOrg,newbalanceOrig,encoded_nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
3466159,257,3,19465.33,2454799,163256.17,143790.84,1640552,0.0,0.0,0,0
2112758,183,3,8370.94,5580091,398885.99,390515.05,2452910,0.0,0.0,0,0
3502303,259,4,596881.02,375457,104549.0,0.0,152654,1053756.15,1650637.18,0,0
68581,9,3,3882.62,6143570,539325.42,535442.8,1130168,0.0,0.0,0,0
516916,20,3,2261.25,3720867,0.0,0.0,2259223,0.0,0.0,0,0


In [3]:
#Splitting into samples and labels
x = np.array(df2.iloc[:,:8])
y = np.array(df2.iloc[:,9])
one_hot = np_utils.to_categorical(y)
X_train, X_test, y_train, y_test = train_test_split(x,one_hot,test_size=0.20,random_state=21)

In [4]:
model = Sequential()
model.add(Dense(60, input_dim=x.shape[1], activation='sigmoid'))
model.add(Dense(20, activation='sigmoid'))
model.add(Dense(one_hot.shape[1],activation='softmax'))

In [7]:
opt = Adam(learning_rate=0.001)
model.compile(loss='categorical_crossentropy', optimizer=opt)
model.fit(X_train,y_train,batch_size=128,class_weight="balanced",verbose=2,epochs=10)
#action = model.predict(image)

Epoch 1/10
 - 53s - loss: 0.0077
Epoch 2/10
 - 48s - loss: 0.0077
Epoch 3/10
 - 55s - loss: 0.0079
Epoch 4/10
 - 57s - loss: 0.0076
Epoch 5/10
 - 59s - loss: 0.0076
Epoch 6/10
 - 46s - loss: 0.0076
Epoch 7/10
 - 46s - loss: 0.0080
Epoch 8/10
 - 48s - loss: 0.0078
Epoch 9/10
 - 46s - loss: 0.0080
Epoch 10/10
 - 46s - loss: 0.0080


<keras.callbacks.callbacks.History at 0x1a33ad9b10>

In [11]:
from sklearn import metrics

pred = model.predict(X_test) 
pred = np.argmax(pred,axis=1)
y_compare = np.argmax(y_test,axis=1) 
accuracy = metrics.accuracy_score(y_compare, pred)
precision = metrics.average_precision_score(y_compare, pred)
print("Accuracy score: {}".format(accuracy))
print("Precision score: {}".format(precision))

y_test_1d = np.argmax(y_test, axis=1)
matrix = confusion_matrix(y_test_1d, pred)
np.set_printoptions(suppress=True)
normalized_m = preprocessing.normalize(matrix)
print("Confusion matrix: \n",normalized_m)


Accuracy score: 0.9988558172576706
Precision score: 0.12588265072781843
Confusion matrix: 
 [[1.         0.00001416]
 [0.98808398 0.15391572]]


In [12]:
from sklearn.metrics import f1_score
f1=f1_score(y_test_1d, pred)
f1

0.23529411764705882

# Upsampling of minority class

In [13]:
from sklearn.utils import resample

dfLegitimate = df2[df2.isFraud == 0]
dfFraud = df2[df2.isFraud == 1]

upsample_f = resample(dfFraud,
                          replace=True,
                          n_samples=len(dfLegitimate),
                          random_state=21)


In [14]:
len(upsample_f)

6354407

In [15]:
df2.size

69988820

In [16]:

#Splitting into samples and labels
x = np.array(df2.iloc[:,:8])
y = np.array(df2.iloc[:,9])
one_hot = np_utils.to_categorical(y)
X_train, X_test, y_train, y_test = train_test_split(x,one_hot,test_size=0.20,random_state=21)



In [17]:
model = Sequential()
model.add(Dense(60, input_dim=x.shape[1], activation='sigmoid'))
model.add(Dense(20, activation='sigmoid'))
model.add(Dense(one_hot.shape[1],activation='softmax'))

In [18]:
opt = Adam(learning_rate=0.001)
model.compile(loss='categorical_crossentropy', optimizer=opt)
model.fit(X_train,y_train,batch_size=128,verbose=2,epochs=5)


Epoch 1/5
 - 57s - loss: 0.0081
Epoch 2/5
 - 62s - loss: 0.0081
Epoch 3/5
 - 51s - loss: 0.0081
Epoch 4/5
 - 51s - loss: 0.0078
Epoch 5/5
 - 49s - loss: 0.0076


<keras.callbacks.callbacks.History at 0x1a325e8710>

In [19]:
pred = model.predict(X_test) 
pred = np.argmax(pred,axis=1)
y_compare = np.argmax(y_test,axis=1) 
accuracy = metrics.accuracy_score(y_compare, pred)
precision = metrics.average_precision_score(y_compare, pred)
print("Accuracy score: {}".format(accuracy))
print("precision score: {}".format(precision))

y_test_1d = np.argmax(y_test, axis=1)
matrix = confusion_matrix(y_test_1d, pred)
np.set_printoptions(suppress=True)
normalized_m = preprocessing.normalize(matrix)
print("Confusion matrix: \n",normalized_m)


Accuracy score: 0.9989163269219283
precision score: 0.17714978278780935
Confusion matrix: 
 [[1.         0.00004958]
 [0.96713163 0.25427625]]


In [20]:
from sklearn.metrics import f1_score
f1=f1_score(y_test_1d, pred)
f1

0.3341380975374215