### Read the data from the CSV file into a pandas dataframe

In [193]:
import pandas as pd
import numpy as np

In [194]:
df = pd.read_csv('fraud_train.csv')

### Pre-process and clean data

In [195]:
df.isnull().values.any() 

False

### Extract features
From table and the fraud column to use as label

In [196]:
features = df[['distance_from_home', 'distance_from_last_transaction', 'ratio_to_median_purchase_price', 'repeat_retailer', 'used_chip', 'used_pin_number', 'online_order']].values
label = df['fraud'].values 

In [197]:
features.shape

(80000, 7)

In [198]:
label.shape

(80000,)

Normalize

In [199]:
import sklearn
from sklearn.preprocessing import StandardScaler

In [200]:
sc = StandardScaler()

In [201]:
features = sc.fit_transform(features)

In [202]:
features

array([[-0.36073415, -0.1922989 , -0.34155142, ..., -0.6855018 ,
        -0.26837809, -1.73130058],
       [-0.33613008, -0.14466279,  0.17848452, ..., -0.6855018 ,
        -0.26837809,  0.57760045],
       [-0.00246312, -0.02951842, -0.43495647, ...,  1.45878536,
        -0.26837809,  0.57760045],
       ...,
       [-0.36952032, -0.19675162,  0.62551328, ...,  1.45878536,
        -0.26837809,  0.57760045],
       [-0.14343299, -0.22550408,  0.7995906 , ...,  1.45878536,
        -0.26837809,  0.57760045],
       [-0.28492826,  0.99422044,  0.15451168, ...,  1.45878536,
        -0.26837809,  0.57760045]])

In [203]:
label

array([0, 1, 0, ..., 1, 1, 0], dtype=int64)

### Split the dataset into train and test sets

In [204]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.2, random_state=42)
X_train.shape, X_test.shape

((64000, 7), (16000, 7))

### Logistic Regression model

In [205]:
from sklearn.linear_model import LogisticRegression
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)

In [206]:
accuracy = lr_model.score(X_test, y_test)
print("Logistic Regression Accuracy:", accuracy)

Logistic Regression Accuracy: 0.9390625


### Other models

Multi Layer Perceptron (MLP)

In [208]:
from sklearn.neural_network import MLPClassifier

In [209]:
mlp_model = MLPClassifier(hidden_layer_sizes=(50, 50))

In [210]:
mlp_model.fit(X_train, y_train)

In [211]:
accuracy = mlp_model.score(X_test, y_test)
print("MLP Accuracy:", accuracy)

MLP Accuracy: 0.9979375


Support Vector Machines (SVM)

In [212]:
from sklearn import svm

In [213]:
svm_model = svm.SVC()

In [214]:
svm_model.fit(X_train, y_train)

In [215]:
svm_model.score(X_test, y_test)

0.98825

Decision Tree Classifier

In [216]:
from sklearn.tree import DecisionTreeClassifier

In [217]:
dt_model = DecisionTreeClassifier()

In [218]:
dt_model.fit(X_train, y_train)

In [219]:
dt_accuracy = dt_model.score(X_test, y_test)
print("Decision Tree Classifier Accuracy:", dt_accuracy)

Decision Tree Classifier Accuracy: 0.9998125


### Neural Network using Tensor Flow

In [246]:
import tensorflow as tf

In [247]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [248]:
model = Sequential([
    Dense(100, activation='relu'),
    Dense(100, activation='relu'),
    Dense(100, activation='relu'),
    Dense(1, activation='sigmoid')
])

In [249]:
model.compile(optimizer='sgd', loss=tf.keras.losses.BinaryCrossentropy(), metrics=[tf.keras.metrics.BinaryAccuracy()])

In [250]:
model.fit(X_train, y_train, batch_size=32, epochs=20)

Epoch 1/20


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x285dca82a50>

In [251]:
_, accuracy = model.evaluate(X_test, y_test)



### Predictions

In [237]:
df_test = pd.read_csv('fraud_test.csv')

In [238]:
X_new = df_test[['distance_from_home', 'distance_from_last_transaction', 'ratio_to_median_purchase_price', 'repeat_retailer', 'used_chip', 'used_pin_number', 'online_order']].values
X_new = sc.fit_transform(X_new)

In [239]:
X_new

array([[-0.42148017, -0.21928316, -0.45377841, ...,  1.45422819,
         3.69486328,  0.57196155],
       [-0.39350843, -0.24377492, -0.64670019, ..., -0.68764999,
        -0.27064601, -1.74836928],
       [-0.30782729, -0.23063695,  7.14621143, ..., -0.68764999,
        -0.27064601,  0.57196155],
       ...,
       [-0.16950877, -0.16655673, -0.57641984, ..., -0.68764999,
        -0.27064601,  0.57196155],
       [-0.40567535, -0.24087477, -0.64835175, ..., -0.68764999,
        -0.27064601,  0.57196155],
       [-0.09648299, -0.23953776, -0.57450714, ...,  1.45422819,
        -0.27064601,  0.57196155]])

In [240]:
y_pred = dt_model.predict(X_new)

In [241]:
y_pred = (np.rint(y_pred)).astype(int)
y_pred.shape
y_pred = y_pred.reshape(20000)

In [242]:
y_pred

array([0, 0, 1, ..., 0, 0, 0])

In [243]:
df_submission = {'card_no': data_test['card_no'], 'fraud': y_pred}
df_submission = pd.DataFrame(data=df_submission)

In [244]:
df_submission.head()

Unnamed: 0,card_no,fraud
0,14487,0
1,16733,0
2,90819,1
3,38793,1
4,19484,0


In [245]:
df_submission.to_csv('dt_submission.csv', index=False)