In [1]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [12]:
# Load the data
transactions = pd.read_csv('transactions_modified.csv')
print(transactions.info())
transactions.head()

# How many fraudulent transactions?
transactions.isFraud.sum()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   step            1000 non-null   int64  
 1   type            1000 non-null   object 
 2   amount          1000 non-null   float64
 3   nameOrig        1000 non-null   object 
 4   oldbalanceOrg   1000 non-null   float64
 5   newbalanceOrig  1000 non-null   float64
 6   nameDest        1000 non-null   object 
 7   oldbalanceDest  1000 non-null   float64
 8   newbalanceDest  1000 non-null   float64
 9   isFraud         1000 non-null   int64  
 10  isPayment       1000 non-null   int64  
 11  isMovement      1000 non-null   int64  
 12  accountDiff     1000 non-null   float64
dtypes: float64(6), int64(4), object(3)
memory usage: 101.7+ KB
None


282

In [13]:
# Summary statistics on amount column
transactions.amount.describe()

count    1.000000e+03
mean     5.373080e+05
std      1.423692e+06
min      0.000000e+00
25%      2.933705e+04
50%      1.265305e+05
75%      3.010378e+05
max      1.000000e+07
Name: amount, dtype: float64

In [15]:
## Create isPayment field
transactions['isPayment'] = 0
transactions.isPayment[transactions['type'].isin(['PAYMENT', 'DEBIT'])] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  transactions.isPayment[transactions['type'].isin(['PAYMENT', 'DEBIT'])] = 1


In [17]:
## Create isMovement field
transactions['isMovement'] = 0
transactions.isMovement[transactions['type'].isin(['CASH_OUT', 'TRANSFER'])] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  transactions.isMovement[transactions['type'].isin(['CASH_OUT', 'TRANSFER'])] = 1


In [18]:
## Create accountDiff field
transactions['accountDiff'] = abs(\
    transactions.oldbalanceDest - transactions.oldbalanceOrg)

In [20]:
# Create features and label variables
features = transactions[['amount', 'isPayment', 'isMovement', 'accountDiff']]
label = transactions.isFraud

In [22]:
# Split dataset
X_train, X_test, y_train, y_test = train_test_split(\
    features, label, test_size=0.3)

In [23]:
# Normalize the features variables
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [24]:
# Fit the model to the training data
model = LogisticRegression()
model.fit(X_train, y_train)

In [25]:
# Score the model on the training data
print(model.score(X_train, y_train))

0.8371428571428572


In [26]:
# Score the model on the test data
print(model.score(X_test, y_test))

0.8466666666666667


In [27]:
# Print the model coefficients
print(model.coef_)

[[ 2.54967119 -0.64515568  2.11654798 -1.66591394]]


In [30]:
# New transaction data
transaction1 = np.array([123456.78, 0.0, 1.0, 54670.1])
transaction2 = np.array([98765.43, 1.0, 0.0, 8524.75])
transaction3 = np.array([543678.31, 1.0, 0.0, 510025.5])

In [51]:
# Create a new transaction
your_transaction = np.array([320768.65, 0.0, 1.0, 40083.3])

In [52]:
# Combine new transactions into a single array
sample_transactions = np.stack((transaction1, transaction2, transaction3, your_transaction))

In [53]:
# Normalize the new transactions
sample_transactions = scaler.transform(sample_transactions)



In [54]:
# Predict fraud on the new transactions
predictions = model.predict(sample_transactions)
predictions

array([0, 0, 0, 1], dtype=int64)

In [55]:

# Show probabilities on the new transactions
predictions_prob = model.predict_proba(sample_transactions)
predictions_prob

array([[0.55112259, 0.44887741],
       [0.99787624, 0.00212376],
       [0.9963797 , 0.0036203 ],
       [0.46391882, 0.53608118]])