In [74]:
import seaborn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [75]:
transactions = pd.read_csv(r"C:\Users\Carlos\Documents\Python Projects\Predict Credit Card Fraud\transactions.txt")

In [76]:
transactions

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud
0,8,CASH_OUT,158007.12,C424875646,0.00,0.00,C1298177219,474016.32,1618631.97,0
1,236,CASH_OUT,457948.30,C1342616552,0.00,0.00,C1323169990,2720411.37,3178359.67,0
2,37,CASH_IN,153602.99,C900876541,11160428.67,11314031.67,C608741097,3274930.56,3121327.56,0
3,331,CASH_OUT,49555.14,C177696810,10865.00,0.00,C462716348,0.00,49555.14,0
4,250,CASH_OUT,29648.02,C788941490,0.00,0.00,C1971700992,56933.09,86581.10,0
...,...,...,...,...,...,...,...,...,...,...
199994,201,PAYMENT,1274.97,C361372882,69376.00,68101.03,M671547467,0.00,0.00,0
199995,212,CASH_OUT,204041.98,C1621547576,40423.00,0.00,C1571552280,0.00,204041.98,0
199996,160,CASH_IN,66378.61,C1431532774,2852895.88,2919274.49,C1831159716,264223.39,197844.78,0
199997,12,CASH_IN,87473.93,C695681900,1674588.27,1762062.20,C371793088,245563.25,158089.32,0


In [77]:
transactions.describe()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud
count,199999.0,199999.0,199999.0,199999.0,199999.0,199999.0,199999.0
mean,243.289836,180242.5,831436.1,852333.3,1093644.0,1218886.0,0.00141
std,141.800473,625548.2,2882314.0,2917352.0,3302878.0,3627192.0,0.037524
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,156.0,13387.46,0.0,0.0,0.0,0.0,0.0
50%,238.0,74266.95,14201.0,0.0,132057.2,213809.8,0.0
75%,334.0,208637.6,107848.9,144962.9,941029.1,1109082.0,0.0
max,741.0,52042800.0,50399050.0,40399050.0,235932700.0,311404900.0,1.0


In [78]:
transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199999 entries, 0 to 199998
Data columns (total 10 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   step            199999 non-null  int64  
 1   type            199999 non-null  object 
 2   amount          199999 non-null  float64
 3   nameOrig        199999 non-null  object 
 4   oldbalanceOrg   199999 non-null  float64
 5   newbalanceOrig  199999 non-null  float64
 6   nameDest        199999 non-null  object 
 7   oldbalanceDest  199999 non-null  float64
 8   newbalanceDest  199999 non-null  float64
 9   isFraud         199999 non-null  int64  
dtypes: float64(5), int64(2), object(3)
memory usage: 15.3+ MB


In [79]:
transactions.nunique()

step                 524
type                   5
amount            198565
nameOrig          199990
oldbalanceOrg      98238
newbalanceOrig     86422
nameDest          174523
oldbalanceDest    114777
newbalanceDest    122915
isFraud                2
dtype: int64

Looking at the dataset, combined with our knowledge of credit card transactions in general, we can see that there are a few interesting columns to look at. We know that the amount of a given transaction is going to be important. Calculate summary statistics for this column. What does the distribution look like?




In [80]:
transactions.amount.describe()

count    1.999990e+05
mean     1.802425e+05
std      6.255482e+05
min      0.000000e+00
25%      1.338746e+04
50%      7.426695e+04
75%      2.086376e+05
max      5.204280e+07
Name: amount, dtype: float64

We have a lot of information about the type of transaction we are looking at. Let’s create a new column called isPayment that assigns a 1 when type is “PAYMENT” or “DEBIT”, and a 0 otherwise.




In [81]:
transactions['isPayment'] = 0

In [82]:
transactions['isPayment'][transactions['type'].isin(['PAYMENT', 'DEBIT'])] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  transactions['isPayment'][transactions['type'].isin(['PAYMENT', 'DEBIT'])] = 1


Similarly, create a column called isMovement, which will capture if money moved out of the origin account. This column will have a value of 1 when type is either “CASH_OUT” or “TRANSFER”, and a 0 otherwise.




In [83]:
transactions['isMovement'] = 0

In [84]:
transactions['isMovement'][transactions['type'].isin(['CASH_OUT', 'TRANSFER'])] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  transactions['isMovement'][transactions['type'].isin(['CASH_OUT', 'TRANSFER'])] = 1


With financial fraud, another key factor to investigate would be the difference in value between the origin and destination account. Our theory, in this case, being that destination accounts with a significantly different value could be suspect of fraud. Let’s create a column called accountDiff with the absolute difference of the oldbalanceOrg and oldbalanceDest columns.




In [85]:
transactions['accountDiff'] = abs(transactions['oldbalanceOrg'] - transactions['oldbalanceDest'])

In [86]:
transactions.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isPayment,isMovement,accountDiff
0,8,CASH_OUT,158007.12,C424875646,0.0,0.0,C1298177219,474016.32,1618631.97,0,0,1,474016.32
1,236,CASH_OUT,457948.3,C1342616552,0.0,0.0,C1323169990,2720411.37,3178359.67,0,0,1,2720411.37
2,37,CASH_IN,153602.99,C900876541,11160428.67,11314031.67,C608741097,3274930.56,3121327.56,0,0,0,7885498.11
3,331,CASH_OUT,49555.14,C177696810,10865.0,0.0,C462716348,0.0,49555.14,0,0,1,10865.0
4,250,CASH_OUT,29648.02,C788941490,0.0,0.0,C1971700992,56933.09,86581.1,0,0,1,56933.09


Before we can start training our model, we need to define our features and label columns. Our label column in this dataset is the isFraud field. Create a variable called features which will be an array consisting of the following fields:

- amount
- isPayment
- isMovement
- accountDiff

Also create a variable called label with the column isFraud.




In [87]:
features = transactions[['amount', 'isPayment', 'isMovement', 'accountDiff']]

In [88]:
label = transactions['isFraud']

Split the data into training and test sets using sklearn‘s train_test_split() method. We’ll use the training set to train the model and the test set to evaluate the model. Use a test_size value of 0.3.




In [89]:
X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.3)

Since sklearn‘s Logistic Regression implementation uses Regularization, we need to scale our feature data. Create a StandardScaler object, .fit_transform() it on the training features, and .transform() the test features.




In [90]:
scaler = StandardScaler()

In [91]:
X_train = scaler.fit_transform(X_train)

In [92]:
X_test = scaler.transform(X_test)

Create a LogisticRegression model with sklearn and .fit() it on the training data.

Fitting the model find the best coefficients for our selected features so it can more accurately predict our label. We will start with the default threshold of 0.5.




In [93]:
model = LogisticRegression()

In [94]:
model.fit(X_train, y_train)

Run the model’s .score() method on the training data and print the training score.

Scoring the model on the training data will process the training data through the trained model and will predict which transactions are fraudulent. The score returned is the percentage of correct classifications, or the accuracy.




In [95]:
model.score(X_train, y_train)

0.9986142758162558

Run the model’s .score() method on the test data and print the test score.

Scoring the model on the test data will process the test data through the trained model and will predict which transactions are fraudulent. The score returned is the percentage of correct classifications, or the accuracy, and will be an indicator for the sucess of your model.

How did you model perform?




In [96]:
model.score(X_test, y_test)

0.99845

Print the coefficients for our model to see how important each feature column was for prediction. Which feature was most important? Least important?




In [97]:
model.coef_

array([[ 0.21493917, -0.72573687,  2.25466587, -0.52769801]])

Let’s use our model to process more transactions that have gone through our systems. There are three numpy arrays pre-loaded in the workspace with information on new sample transactions under “New transaction data”

Create a fourth array, your_transaction, and add any transaction information you’d like. Make sure to enter all values as floats with a .!





In [98]:
# New transaction data
transaction1 = np.array([123456.78, 0.0, 1.0, 54670.1])
transaction2 = np.array([98765.43, 1.0, 0.0, 8524.75])
transaction3 = np.array([543678.31, 1.0, 0.0, 510025.5])

In [99]:
your_transaction = np.array([6472.54, 1.0, 0.0, 55901.23])

Combine the new transactions and your_transaction into a single numpy array called sample_transactions.




In [100]:
sample_transactions = np.stack((transaction1, transaction2, transaction3, your_transaction))

In [101]:
sample_transactions = scaler.transform(sample_transactions)



Which transactions are fraudulent? Use your model’s .predict() method on sample_transactions and print the result to find out.

Want to see the probabilities that led to these predictions? Call your model’s .predict_proba() method on sample_transactions and print the result. The 1st column is the probability of a transaction not being fraudulent, and the 2nd column is the probability of a transaction being fraudulent (which was calculated by our model to make the final classification decision).




In [104]:
model.predict(sample_transactions)

array([0, 0, 0, 0], dtype=int64)

In [103]:
model.predict_proba(sample_transactions)

array([[9.96764579e-01, 3.23542135e-03],
       [9.99992547e-01, 7.45325359e-06],
       [9.99991871e-01, 8.12898791e-06],
       [9.99992832e-01, 7.16789439e-06]])