In [30]:
import pandas as pd

from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, recall_score, precision_score, f1_score, accuracy_score

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

In [31]:
RANDOM_STATE = 23
TEST_SIZE = 0.2

In [32]:
# download from https://www.kaggle.com/datasets/rupakroy/online-payments-fraud-detection-dataset
data = pd.read_csv("dataset.csv")

In [33]:
data.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [34]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB


In [35]:
data.isnull().sum().max()

0

No hay valores nulos en todo el dataset

La variable de salida es la columna 'isFraud'. El resto serian variables de entrada.

Veamos la distribucion de esa variable

In [36]:
y = data['isFraud']

In [37]:
y.value_counts()

isFraud
0    6354407
1       8213
Name: count, dtype: int64

Vemos que hay muy pocas transacciones fraudulentas

In [38]:
X = data.drop('isFraud', axis=1)

Analicemos la variable 'type'



In [39]:
X['type'].value_counts()

type
CASH_OUT    2237500
PAYMENT     2151495
CASH_IN     1399284
TRANSFER     532909
DEBIT         41432
Name: count, dtype: int64

Vamos a reemplazarla por dummies

In [40]:
dummies_type = pd.get_dummies(X['type'], prefix='type')
dummies_type.head()

Unnamed: 0,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER
0,False,False,False,True,False
1,False,False,False,True,False
2,False,False,False,False,True
3,False,True,False,False,False
4,False,False,False,True,False


In [41]:
X.drop('type', axis=1, inplace=True)
X = pd.concat([X, dummies_type], axis=1)
X.head()

Unnamed: 0,step,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFlaggedFraud,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER
0,1,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,False,False,False,True,False
1,1,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,False,False,False,True,False
2,1,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,0,False,False,False,False,True
3,1,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,0,False,True,False,False,False
4,1,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,False,False,False,True,False


Analicemos ahora las columnas de nameOrig y nameDest

In [42]:
names = ['nameDest', 'nameOrig']
X.filter(items=names).describe()

Unnamed: 0,nameDest,nameOrig
count,6362620,6362620
unique,2722362,6353307
top,C1286084959,C1902386530
freq,113,3


Tenemos aproximadamente 2700000 y 6350000 valores distintos para estas columnas categoricas que no parecieran brindar mucha informacion, y reemplazarla por dummies seria contraproducente ya que estariamos agregando casi 9000000 columnas nuevas a nuestro dataset.

Por lo que no las tomaremos como variables de entrada

In [43]:
X.drop('nameDest', axis=1, inplace=True)
X.drop('nameOrig', axis=1, inplace=True)

Veamos la matriz de correlacion

In [44]:
corr = data.corr(numeric_only=True)
corr['isFraud'].sort_values()

newbalanceOrig   -0.008148
oldbalanceDest   -0.005885
newbalanceDest    0.000535
oldbalanceOrg     0.010154
step              0.031578
isFlaggedFraud    0.044109
amount            0.076688
isFraud           1.000000
Name: isFraud, dtype: float64

Vamos a escalar las variables numericas

In [45]:
numeric = ['newbalanceOrig', 'oldbalanceDest', 'newbalanceDest', 'oldbalanceOrg', 'step', 'amount']

In [46]:
X.filter(items=numeric).describe()

Unnamed: 0,newbalanceOrig,oldbalanceDest,newbalanceDest,oldbalanceOrg,step,amount
count,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0
mean,855113.7,1100702.0,1224996.0,833883.1,243.3972,179861.9
std,2924049.0,3399180.0,3674129.0,2888243.0,142.332,603858.2
min,0.0,0.0,0.0,0.0,1.0,0.0
25%,0.0,0.0,0.0,0.0,156.0,13389.57
50%,0.0,132705.7,214661.4,14208.0,239.0,74871.94
75%,144258.4,943036.7,1111909.0,107315.2,335.0,208721.5
max,49585040.0,356015900.0,356179300.0,59585040.0,743.0,92445520.0


In [47]:
scaler = StandardScaler()
for col in numeric:
    X[col] = scaler.fit_transform(X[col].values.reshape((-1, 1)))

In [48]:
X.filter(items=numeric).describe()

Unnamed: 0,newbalanceOrig,oldbalanceDest,newbalanceDest,oldbalanceOrg,step,amount
count,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0
mean,6.590049e-16,1.498762e-16,-1.340452e-16,-1.779646e-17,-1.909724e-16,-4.9467360000000006e-17
std,1.0,1.0,1.0,1.0,1.0,1.0
min,-0.2924417,-0.3238139,-0.3334114,-0.2887164,-1.703042,-0.2978545
25%,-0.2924417,-0.3238139,-0.3334114,-0.2887164,-0.6140381,-0.2756812
50%,-0.2924417,-0.2847734,-0.2749863,-0.2837972,-0.0308943,-0.1738653
75%,-0.2431065,-0.04638324,-0.03077931,-0.2515606,0.6435853,0.04779197
max,16.66523,104.412,96.60911,20.34149,3.510123,152.7936


In [49]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=RANDOM_STATE, test_size=TEST_SIZE)

In [50]:
y_train.value_counts()

isFraud
0    5083526
1       6570
Name: count, dtype: int64

In [51]:
y_test.value_counts()

isFraud
0    1270881
1       1643
Name: count, dtype: int64

Utilizando oversampling, generaremos otro dataset de entrenamiento, para poder tener mas transacciones fraudulentas a la hora de entrenar el modelo y compararlo contra el dataset original.

In [52]:
ros = RandomOverSampler(sampling_strategy='minority')

In [53]:
X_train_ros, y_train_ros = ros.fit_resample(X_train, y_train)

In [54]:
y_train_ros.value_counts()

isFraud
0    5083526
1    5083526
Name: count, dtype: int64

Llego la hora de probar varios modelos para quedarnos con el mejor
Compararemos los siguientes modelos:

- Logistic regression
- Decision Tree
- SVC (solo linearSVC debido a la gran cantidad de data que tenemos)

In [55]:
results = []

In [56]:
def analyze_model(y, y_pred):
    #cm = confusion_matrix(y, y_pred)
    #sns.heatmap(cm, annot=True, fmt=',d', cmap='rocket')
    #plt.ylabel('Valor real')
    #plt.xlabel('Valor predicho')

    return get_model_stats(y, y_pred)
'''
Sabemos que por cada transacción aprobada el porcentaje de ganancia es de un
20%, y por cada fraude aprobado se pierde el 100% del dinero de la transacción.
'''
def calculate_model_income(y_pred):
    income = 0
    for index, result in enumerate(y_pred):
        # no marcada como fraude
        if result == 0:
            if data['isFraud'][index] == 1:
                # efectivamente era un fraude
                income -= data['amount'][index]
            else:
                # no era fraude
                income += data['amount'][index] * 0.2

    return income


def get_model_stats(y, y_pred):
    recall = recall_score(y, y_pred)
    precision = precision_score(y, y_pred)
    f1 = f1_score(y, y_pred)
    accuracy = accuracy_score(y, y_pred)

    income = calculate_model_income(y_pred)
    print(income, recall, precision, f1, accuracy)
    return income, recall, precision, f1, accuracy

Logistic Regression - No Oversampling

In [57]:
lr_no_os = LogisticRegression(random_state=RANDOM_STATE)
lr_no_os.fit(X_train, y_train)
y_test_pred = lr_no_os.predict(X_test)
results.append(['Logistic Regression', False, *analyze_model(y_test, y_test_pred)])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


39136616050.79571 0.43274497869750456 0.9330708661417323 0.5912681912681913 0.9992275194809686


Logistic Regression - Oversampling

In [58]:
lr_os = LogisticRegression(random_state=RANDOM_STATE)
lr_os.fit(X_train_ros, y_train_ros)
y_test_pred = lr_os.predict(X_test)
results.append(['Logistic Regression', True, *analyze_model(y_test, y_test_pred)])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


37245622195.34243 0.9762629336579428 0.025788195951703403 0.05024905234798409 0.9523513898362624


Decision Tree - No Oversampling

In [59]:
criterions = ['entropy', 'gini']
depths = [3, 5]

In [60]:
for c in criterions:
    for d in depths:
        tree_no_os = DecisionTreeClassifier(criterion=c, max_depth=d, random_state=RANDOM_STATE)
        tree_no_os.fit(X_train, y_train)
        y_test_pred = tree_no_os.predict(X_test)
        results.append([f'Decision Tree - {c} - max_depth={str(d)}', False, *analyze_model(y_test, y_test_pred)])

39109784704.031715 0.699939135727328 0.7334183673469388 0.7162877608221738 0.9992840999462486
39119775289.64167 0.714546561168594 0.9310071371927042 0.8085399449035813 0.9995630730736709
39150776081.93973 0.1898965307364577 0.9968051118210862 0.31901840490797545 0.9989532613923195
39109608728.83373 0.7285453438831406 0.7595177664974619 0.7437092264678471 0.9993516821686663


Decision Tree - Oversampling

In [61]:
for c in criterions:
    for d in depths:
        tree_os = DecisionTreeClassifier(criterion=c, max_depth=d, random_state=RANDOM_STATE)
        tree_os.fit(X_train_ros, y_train_ros)
        y_test_pred = tree_os.predict(X_test)
        results.append([f'Decision Tree - {c} - max_depth={str(d)}', True, *analyze_model(y_test, y_test_pred)])

34644922809.97459 0.9981740718198417 0.011089473114789571 0.021935250884431993 0.8850701440601513
37783530595.821884 0.9987827145465612 0.037049580059604445 0.07144878632850768 0.9664815752001534
36735418274.12688 0.9202678027997565 0.019382627422828428 0.03796562503923366 0.9397834539859367
38582466759.20006 0.9586122945830797 0.08542141230068337 0.1568646979732085 0.9866949464214427


SVC - No Oversampling

In [62]:
svc_no_os = LinearSVC(random_state=RANDOM_STATE)
svc_no_os.fit(X_train, y_train)
y_test_pred = svc_no_os.predict(X_test)
results.append(['Linear SVC', False, *analyze_model(y_test, y_test_pred)])



39138605755.65974 0.4023128423615338 0.9593613933236574 0.5668953687821613 0.9992063018064885


SVC - Oversampling

In [63]:
svc_os = LinearSVC(random_state=RANDOM_STATE)
svc_os.fit(X_train_ros, y_train_ros)
y_test_pred = svc_os.predict(X_test)
results.append(['Linear SVC', False, *analyze_model(y_test, y_test_pred)])



37317053634.25821 0.9659160073037127 0.02684189161761721 0.05223229713495812 0.954741128654548


In [91]:
results_df = pd.DataFrame(results, columns=['Model', 'oversampling', 'income', 'recall', 'precision', 'f1', 'accuracy'])

In [None]:
results_df.sort_values(by='income', ascending=False)

In [None]:
results_df.sort_values(by='recall', ascending=False)

In [None]:
results_df.sort_values(by='precision', ascending=False)

El modelo que nos maximiza la ganancia es el Decision Tree, usando gini y con una profundidad maxima de 3, sin oversamplear la data de training. Sin embargo, es el modelo que tiene el peor recall de todos, por lo que emplearemos un algoritmo para decidir cual es el mejor modelo

Para ello, pondremos foco en 3 variables, sinedo el income la mas importante, seguida por el recall, y por ultimo la precision.
Se le asignara un puntaje a cada modelo, donde se evaluara que tan lejos estuvo del mejor modelo para esa columna especifica

In [95]:
results_df['score'] = 0

In [96]:
MAX_INCOME = results_df['income'].max()
MAX_RECALL = results_df['recall'].max()
MAX_PRECISION = results_df['precision'].max()

In [97]:
def get_score(row):
    income_points = row['income']/MAX_INCOME
    recall_points = row['recall']/MAX_RECALL
    precision_points = row['precision']/MAX_PRECISION

    return income_points * 5 + recall_points * 4 + precision_points * 2


results_df['score'] = results_df.apply(get_score, axis=1)

In [98]:
results_df.drop('f1', axis=1, inplace=True)
results_df.drop('accuracy', axis=1, inplace=True)

In [113]:
results_df.sort_values(by='score', ascending=False)

Unnamed: 0,Model,oversampling,income,recall,precision,score
3,Decision Tree - entropy - max_depth=5,False,39119780000.0,0.714547,0.931007,9.725693
5,Decision Tree - gini - max_depth=5,False,39109610000.0,0.728545,0.759518,9.43638
2,Decision Tree - entropy - max_depth=3,False,39109780000.0,0.699939,0.733418,9.269472
9,Decision Tree - gini - max_depth=5,True,38582470000.0,0.958612,0.085421,8.937933
7,Decision Tree - entropy - max_depth=5,True,37783530000.0,0.998783,0.03705,8.899724
1,Logistic Regression,True,37245620000.0,0.976263,0.025788,8.718243
11,Linear SVC,False,37317050000.0,0.965916,0.026842,8.688042
0,Logistic Regression,False,39136620000.0,0.432745,0.933071,8.603404
10,Linear SVC,False,39138610000.0,0.402313,0.959361,8.534531
6,Decision Tree - entropy - max_depth=3,True,34644920000.0,0.998174,0.011089,8.444364


En la tabla se puede ver como en los modelos donde se hizo el oversampling se obtuvo un recall bastante alto, pero la precision es bastante mala. A si mismo, se puede observar como pasa tambien lo contrario, es decir, donde no hubo oversampling, el recall no supera el 70%, pero la precision es bastante alta. Esto tiene sentido al comprender como cambia la composicion del material de entrenamiento dado en cada caso vs el material sobre el que se los evalua.

Esto afecta a nuestro problema de la siguiente manera:

- Los modelos que se entrenaron con la data oversampleada tienen buen recall, ya que no se les escapa casi ninguna transaccion fraudelenta, pero terminan marcando como fraudes muchisimas transacciones que eran validas (menor precision). Por lo que el banco deberia tener una persona/equipo analizando manualmente todas las transacciones marcadas como fraudulentas, y se podria sumar el costo por hora o transaccion analizada por ese equipo, para jugar con los valores de recall vs precision.

- Los modelos donde no se hizo el oversampling a la data, tienen recall mas bajo, por lo que se les escapan mas transacciones fraudulentas, pero la precision general es mayor. Esto tampoco dice mucho, ya que si ponemos un modelo que siempre prediga que las transacciones son verdaderas, por mas que el recall de 0, la precision va a dar de 99%, ya que esa es la proporcion de transacciones validas.

Asumiendo lo que mencione anteriormente, que seria que haya un costo asociado al hecho de tener un equipo que revise de manera manual todas las transacciones que marcamos como fraudulenta, me quedaria con el modelo de  Decision Tree, con entropy y max depth de 5, sin oversamplear la data de entrenamiento.
Este modelo tuvo el 4to mejor income, sin estar tan alejado de los primeros puestos, y un recall medianamente aceptable, de ~0.7. Esto quiere decir que se le escapan aproximadamente 3 de 10 transacciones fraudulentas, pero asi y todo, para este caso concreto, tiene una de las ganancias mas altas y una precision considerablemente alta, por lo que no habia que andar revisando muchas transacciones .