# Credit Card Fraud Detection

In the Credit Card Fraud Detection project, I utilized Automated Machine Learning (AutoML) to address the challenge of identifying fraudulent transactions. Credit card fraud is a growing concern in the financial industry, leading to significant financial losses and undermining consumer trust. Traditional methods of detecting fraud often rely on manually crafted rules and limited feature engineering, which may not adapt well to the evolving tactics of fraudsters.

In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
ss=pd.read_csv("/content/sample_submission.csv")
test=pd.read_csv("/content/test.csv")
train=pd.read_csv("/content/train.csv")

In [3]:
ss.head()

Unnamed: 0,id,IsFraud
0,150000,0.5
1,150001,0.5
2,150002,0.5
3,150003,0.5
4,150004,0.5


In [4]:
test.head()

Unnamed: 0,id,Time,feat1,feat2,feat3,feat4,feat5,feat6,feat7,feat8,...,feat20,feat21,feat22,feat23,feat24,feat25,feat26,feat27,feat28,Transaction_Amount
0,150000,73899,-0.681234,-0.371212,0.385966,0.955703,2.064078,0.338827,-0.539452,-0.254046,...,0.18356,-0.253324,0.266668,-0.153829,0.455969,-0.503628,0.257588,-0.456685,-0.298919,30.42
1,150001,73899,-1.098947,-0.959377,0.324934,0.703908,1.090582,-1.595909,0.584548,0.260069,...,0.334764,0.130108,0.676928,-0.16107,-0.638011,-0.273424,0.711132,0.349967,0.141233,23.0
2,150002,73899,0.977029,-0.270984,0.471526,-1.23257,0.957537,-0.636602,-0.95306,-1.491744,...,0.355728,0.517912,1.175087,-0.325895,-0.362636,0.306037,0.004828,0.037389,0.058222,198.0
3,150003,73900,1.176658,-0.225816,-0.2466,0.015513,1.103831,1.229516,-1.527098,-0.459769,...,-0.152613,-0.1046,0.0038,-0.02318,-0.458338,0.481427,-0.381415,0.080165,0.027372,9.99
4,150004,73900,0.804828,-0.272967,0.95991,-1.117567,0.395748,0.589855,1.05988,-1.101203,...,-0.127818,-0.011849,0.07854,-0.2417,0.01013,0.302614,-0.259568,0.023127,0.056957,239.0


In [5]:
train.head()

Unnamed: 0,id,Time,feat1,feat2,feat3,feat4,feat5,feat6,feat7,feat8,...,feat21,feat22,feat23,feat24,feat25,feat26,feat27,feat28,Transaction_Amount,IsFraud
0,0,0.0,2.074329,-0.129425,-1.137418,0.412846,-0.192638,-1.210144,0.110697,-0.263477,...,-0.334701,-0.88784,0.336701,-0.110835,-0.291459,0.207733,-0.076576,-0.059577,1.98,0.0
1,1,0.0,1.998827,-1.250891,-0.520969,-0.894539,-1.122528,-0.270866,-1.029289,0.050198,...,0.054848,-0.038367,0.133518,-0.461928,-0.465491,-0.464655,-0.009413,-0.038238,84.0,0.0
2,2,0.0,0.091535,1.004517,-0.223445,-0.435249,0.667548,-0.988351,0.948146,-0.084789,...,-0.326725,-0.803736,0.154495,0.951233,-0.506919,0.085046,0.224458,0.087356,2.69,0.0
3,3,0.0,1.979649,-0.184949,-1.064206,0.120125,-0.215238,-0.648829,-0.087826,-0.035367,...,-0.095514,-0.079792,0.167701,-0.042939,0.000799,-0.096148,-0.05778,-0.073839,1.0,0.0
4,4,0.0,1.025898,-0.171827,1.203717,1.2439,-0.636572,1.099074,-0.938651,0.569239,...,0.099157,0.608908,0.027901,-0.262813,0.257834,-0.252829,0.108338,0.021051,1.0,0.0


In [6]:
train.describe()

Unnamed: 0,id,Time,feat1,feat2,feat3,feat4,feat5,feat6,feat7,feat8,...,feat21,feat22,feat23,feat24,feat25,feat26,feat27,feat28,Transaction_Amount,IsFraud
count,14366.0,14366.0,14366.0,14366.0,14366.0,14366.0,14366.0,14366.0,14366.0,14366.0,...,14365.0,14365.0,14365.0,14365.0,14365.0,14365.0,14365.0,14365.0,14365.0,14365.0
mean,7182.5,6873.773072,0.248537,0.218925,0.715189,0.096392,-0.110662,-0.056791,-0.084031,0.043335,...,-0.101754,-0.126056,-0.02802,0.026596,0.083139,0.082677,0.006328,0.012296,40.624447,0.002715
std,4147.251319,5897.208532,1.287947,0.890647,1.093515,1.197663,0.873706,1.059438,0.723162,0.638792,...,0.397317,0.552574,0.226343,0.540655,0.383677,0.545754,0.226586,0.133804,92.928358,0.052036
min,0.0,0.0,-15.327861,-11.988986,-9.749795,-4.368057,-8.60773,-4.289413,-11.29167,-28.903442,...,-13.424967,-3.554416,-4.005097,-2.762916,-2.539074,-1.268044,-5.392926,-1.945036,0.0,0.0
25%,3591.25,1423.25,-0.73268,-0.263822,0.175114,-0.661726,-0.686898,-0.686469,-0.586622,-0.148226,...,-0.25291,-0.552389,-0.130014,-0.279865,-0.165769,-0.330575,-0.069089,-0.015994,4.49,0.0
50%,7182.5,4956.0,0.921422,0.164862,0.834597,0.105694,-0.18074,-0.270968,-0.081597,0.011052,...,-0.117041,-0.097152,-0.032835,0.039473,0.111705,0.043565,-0.007806,0.012915,14.95,0.0
75%,10773.75,11067.0,1.21824,0.82926,1.399929,0.822982,0.314951,0.195414,0.418736,0.2206,...,0.011209,0.233368,0.073557,0.385724,0.356913,0.395527,0.072271,0.058464,39.9,0.0
max,14365.0,20036.0,2.390816,9.936778,3.988147,6.534876,12.868453,5.520868,8.695427,7.908886,...,11.145524,3.443314,5.922873,3.5777,2.215975,3.119824,3.017098,1.929157,2622.26,1.0


In [7]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14366 entries, 0 to 14365
Data columns (total 32 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  14366 non-null  int64  
 1   Time                14366 non-null  float64
 2   feat1               14366 non-null  float64
 3   feat2               14366 non-null  float64
 4   feat3               14366 non-null  float64
 5   feat4               14366 non-null  float64
 6   feat5               14366 non-null  float64
 7   feat6               14366 non-null  float64
 8   feat7               14366 non-null  float64
 9   feat8               14366 non-null  float64
 10  feat9               14366 non-null  float64
 11  feat10              14366 non-null  float64
 12  feat11              14366 non-null  float64
 13  feat12              14366 non-null  float64
 14  feat13              14366 non-null  float64
 15  feat14              14366 non-null  float64
 16  feat

In [8]:
train.isnull().sum()

id                    0
Time                  0
feat1                 0
feat2                 0
feat3                 0
feat4                 0
feat5                 0
feat6                 0
feat7                 0
feat8                 0
feat9                 0
feat10                0
feat11                0
feat12                0
feat13                0
feat14                0
feat15                0
feat16                0
feat17                1
feat18                1
feat19                1
feat20                1
feat21                1
feat22                1
feat23                1
feat24                1
feat25                1
feat26                1
feat27                1
feat28                1
Transaction_Amount    1
IsFraud               1
dtype: int64

In [9]:
train.dropna(inplace=True)

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
x=train.drop(["id","IsFraud"],axis=1)
y=train.IsFraud
x_train, x_test, y_train, y_test=train_test_split(x,y,test_size=.2,random_state=42)
model=DecisionTreeClassifier()
model.fit(x_train,y_train)
pred=model.predict(x_test)
print(accuracy_score(y_test,pred))

0.9926905673512009


In [11]:
from sklearn.linear_model import LogisticRegression
model2=LogisticRegression()
model2.fit(x_train,y_train)
pred2=model2.predict(x_test)
print(accuracy_score(y_test,pred2))

0.9947789766794292


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [12]:
from sklearn.ensemble import RandomForestClassifier
model3=RandomForestClassifier()
model3.fit(x_train,y_train)
pred3=model3.predict(x_test)
print(accuracy_score(y_test,pred3))

0.995475113122172


In [13]:
feature_importance = model3.feature_importances_
columns=x.columns
feature_importance_sorting = sorted(zip(columns, feature_importance), key=lambda x: x[1], reverse=True)
for f, i in feature_importance_sorting:
    print(f"{f}: {i}")

feat14: 0.05326015020644415
feat17: 0.05232999293406308
feat4: 0.04888532985473971
feat11: 0.04447602085276543
feat12: 0.04434991113951727
feat19: 0.04433982263253656
feat24: 0.042488735064665745
feat18: 0.03728052119943399
feat25: 0.036273216508249446
feat20: 0.035227582773827565
feat8: 0.03480501009318192
feat7: 0.03379049916865258
feat22: 0.033485435892753504
feat16: 0.03320774434880554
feat3: 0.03158745509425419
feat23: 0.03027823352431157
feat2: 0.029728958650072686
feat13: 0.029427054353079733
feat9: 0.029059584275557167
feat6: 0.0289424876754128
feat21: 0.0277230363986793
feat26: 0.02741756524934006
feat1: 0.02711610260162411
Time: 0.02604909391645214
feat15: 0.025688781108370598
feat27: 0.02526760795936582
feat28: 0.024394127583044004
Transaction_Amount: 0.022385873574534143
feat10: 0.020397318915964902
feat5: 0.020336746450300317


In [14]:
for f, i in feature_importance_sorting:
    if i < 0.031:
        train.drop(f, axis=1, inplace=True)
        test.drop(f, axis=1, inplace=True)
    else:
        continue

In [15]:
test.isnull().sum()

id        0
feat3     0
feat4     0
feat7     1
feat8     1
feat11    1
feat12    1
feat14    1
feat16    1
feat17    1
feat18    1
feat19    1
feat20    1
feat22    1
feat24    1
feat25    1
dtype: int64

In [16]:
test.dropna(inplace=True)

In [17]:
train.drop("id",axis=1,inplace=True)

In [18]:
x=train.drop("IsFraud",axis=1)
y=train.IsFraud
x_train, x_test, y_train, y_test=train_test_split(x,y,random_state=42,test_size=.2)

In [19]:
from sklearn.preprocessing import scale
x_train=scale(x_train)
x_test=scale(x_test)

In [20]:
pip install pycaret



In [21]:
from pycaret.classification import*

In [22]:
setup(train,target="IsFraud")

Unnamed: 0,Description,Value
0,Session id,2056
1,Target,IsFraud
2,Target type,Binary
3,Original data shape,"(14365, 16)"
4,Transformed data shape,"(14365, 16)"
5,Transformed train set shape,"(10055, 16)"
6,Transformed test set shape,"(4310, 16)"
7,Numeric features,15
8,Preprocess,True
9,Imputation type,simple


<pycaret.classification.oop.ClassificationExperiment at 0x78872b07d120>

In [23]:
compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
knn,K Neighbors Classifier,0.9973,0.4961,0.0,0.0,0.0,0.0,0.0,0.4
ridge,Ridge Classifier,0.9973,0.6209,0.0,0.0,0.0,0.0,0.0,0.078
rf,Random Forest Classifier,0.9973,0.7364,0.0,0.0,0.0,0.0,0.0,2.609
et,Extra Trees Classifier,0.9973,0.6536,0.0,0.0,0.0,0.0,0.0,0.487
dummy,Dummy Classifier,0.9973,0.5,0.0,0.0,0.0,0.0,0.0,0.045
lr,Logistic Regression,0.9972,0.6427,0.0,0.0,0.0,-0.0001,-0.0002,1.181
xgboost,Extreme Gradient Boosting,0.9972,0.7039,0.0,0.0,0.0,-0.0001,-0.0002,0.289
lightgbm,Light Gradient Boosting Machine,0.9972,0.7512,0.0,0.0,0.0,-0.0001,-0.0002,1.101
ada,Ada Boost Classifier,0.9967,0.6418,0.0,0.0,0.0,-0.0008,-0.0009,1.412
svm,SVM - Linear Kernel,0.9964,0.6015,0.0,0.0,0.0,-0.0011,-0.0012,0.072


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

In [24]:
best_model=create_model("knn")

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.997,0.497,0.0,0.0,0.0,0.0,0.0
1,0.997,0.4945,0.0,0.0,0.0,0.0,0.0
2,0.997,0.496,0.0,0.0,0.0,0.0,0.0
3,0.997,0.497,0.0,0.0,0.0,0.0,0.0
4,0.997,0.4975,0.0,0.0,0.0,0.0,0.0
5,0.998,0.4965,0.0,0.0,0.0,0.0,0.0
6,0.998,0.4975,0.0,0.0,0.0,0.0,0.0
7,0.998,0.4945,0.0,0.0,0.0,0.0,0.0
8,0.997,0.4945,0.0,0.0,0.0,0.0,0.0
9,0.997,0.4955,0.0,0.0,0.0,0.0,0.0


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

In [28]:
ids=test.id
test.drop("id",axis=1,inplace=True)
predictions=best_model.predict(test)
test["IsFraud"]=predictions

In [29]:
test["id"]=ids
test[["id","IsFraud"]].to_csv("submission.csv",index=False)

In [30]:
import pickle
with open("model.pkl","wb") as f:
    pickle.dump(best_model,f)

In [31]:
with open("model.pkl","rb") as f:
    model=pickle.load(f)
print(model)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=5, p=2,
                     weights='uniform')
