## Credit Crad Fraud Detection

The dataset contains transactions made by credit cards in September 2013 by European cardholders. This dataset presents transactions that occurred in two days, where we have 492 frauds out of 284,807 transactions. The dataset is highly unbalanced, the positive class (frauds) account for 0.172% of all transactions.

In [2]:
import pandas as pd
import warnings

warnings.filterwarnings("ignore")

#### Load Dataset

In [3]:
df=pd.read_csv('creditcard.csv')

In [4]:
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


#### Imbalanced Data

In [5]:
df['Class'].value_counts()

Class
0    284315
1       492
Name: count, dtype: int64

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

#### Drop Time Column

In [7]:
df.drop('Time',inplace=True,axis=1)

#### Drop duplicate row

In [8]:
df=df[df.duplicated()==False]

In [9]:
df['Class'].value_counts()

Class
0    275190
1       473
Name: count, dtype: int64

In [10]:
X=df.drop('Class',axis=1)
y=df.iloc[:,-1]

#### Train Test Split

In [11]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=42,test_size=0.2,stratify=y)

In [12]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((220530, 29), (55133, 29), (220530,), (55133,))

#### Model Training: RFC()

In [13]:
from sklearn.ensemble import RandomForestClassifier

In [14]:
clf=RandomForestClassifier()
clf.fit(X_train,y_train)

clf.score(X_test,y_test)

0.999419585366296

In [15]:
from sklearn.metrics import classification_report

y_pred=clf.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     55038
           1       0.92      0.73      0.81        95

    accuracy                           1.00     55133
   macro avg       0.96      0.86      0.91     55133
weighted avg       1.00      1.00      1.00     55133



#### Model Training:LogisticRegression()

In [16]:
def train_test_split_data(X,y):
    X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=42,test_size=0.2)
    return X_train,X_test,y_train,y_test

In [17]:
from sklearn.linear_model import LogisticRegression

def model_build(X_train,X_test,y_train,y_test):
    clf=LogisticRegression()
    clf.fit(X_train,y_train)

    return clf,clf.score(X_test,y_test)

X_train,X_test,y_train,y_test=train_test_split_data(X,y)
clf,score=model_build(X_train,X_test,y_train,y_test)

print(score)

0.9992200678359603


In [18]:
from sklearn.metrics import classification_report

def performance(X_test,y_test,clf):
    y_pred=clf.predict(X_test)
    print(classification_report(y_test,y_pred))
performance(X_test,y_test,clf)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     55042
           1       0.86      0.63      0.73        91

    accuracy                           1.00     55133
   macro avg       0.93      0.81      0.86     55133
weighted avg       1.00      1.00      1.00     55133



## Resampling Techniques:

#### Oversampling

In [19]:
from imblearn.over_sampling import RandomOverSampler

ros=RandomOverSampler()

X_resample,y_resample=ros.fit_resample(X,y)

print("Before resample")
print(y.value_counts())
print("After resample")
print(y_resample.value_counts())

Before resample
Class
0    275190
1       473
Name: count, dtype: int64
After resample
Class
0    275190
1    275190
Name: count, dtype: int64


In [20]:
X_resample[X_resample.duplicated()]

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
275663,0.269614,3.549755,-5.810353,5.809370,1.538808,-2.269219,-0.824203,0.351070,-3.759059,-4.592390,...,0.310525,0.371121,-0.322290,-0.549856,-0.520629,1.378210,0.564714,0.553255,0.402400,0.68
275664,1.140208,1.156431,-1.471578,2.076278,0.774809,-1.002532,0.264948,0.013162,0.248835,-2.100667,...,-0.125097,-0.387895,-0.866812,-0.121583,-0.356109,0.634573,-0.306311,0.094087,0.121065,1.00
275665,-5.100256,3.633442,-3.843919,0.183208,-1.183997,1.602139,-3.005953,-8.645038,1.285458,-3.717481,...,-2.806302,8.280439,-2.797150,1.090707,-0.159260,0.532156,-0.497126,0.943622,0.553581,261.22
275666,-2.303350,1.759247,-0.359745,2.330243,-0.821628,-0.075788,0.562320,-0.399147,-0.238253,-1.525412,...,-0.430022,-0.294166,-0.932391,0.172726,-0.087330,-0.156114,-0.542628,0.039566,-0.153029,239.93
275667,0.523820,1.531708,-4.176390,3.584615,-1.023954,-0.502471,-1.891966,0.878417,-1.541942,-2.649406,...,0.621804,0.851859,1.176927,0.453553,0.485211,-0.500687,-0.108284,0.269477,-0.063245,130.21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
550375,-5.766879,-8.402154,0.056543,6.950983,9.880564,-5.773192,-5.748879,0.721743,-1.076274,2.688670,...,2.493224,0.880395,-0.130436,2.241471,0.665346,-1.890041,-0.120803,0.073269,0.583799,0.00
550376,-1.125092,3.682876,-6.556168,4.016731,-0.425571,-2.031210,-2.650137,1.131249,-2.946890,-4.816401,...,0.452241,1.185580,1.348156,-0.053686,0.284122,-1.174469,-0.087832,0.718790,0.676216,0.76
550377,-1.739334,-1.304655,0.314103,0.053740,-0.058696,0.071260,0.694862,-0.313270,-0.649377,0.517568,...,-1.463994,-0.665172,-0.632078,-0.421176,-0.400774,-0.001640,-0.495162,0.031633,0.066280,320.00
550378,-2.830984,0.885657,1.199930,2.861292,0.321669,0.289966,1.767760,-2.451050,0.069736,3.245086,...,-1.016923,0.546589,0.334971,0.172106,0.623590,-0.527114,-0.079215,-2.532445,0.311177,104.81


In [21]:
X_train,X_test,y_train,y_test=train_test_split_data(X_resample,y_resample)
clf,score=model_build(X_train,X_test,y_train,y_test)
print(score)

0.9458646753152368


In [22]:
performance(X_test,y_test,clf)

              precision    recall  f1-score   support

           0       0.92      0.98      0.95     55073
           1       0.97      0.92      0.94     55003

    accuracy                           0.95    110076
   macro avg       0.95      0.95      0.95    110076
weighted avg       0.95      0.95      0.95    110076



#### SMOTE

In [23]:
from imblearn.over_sampling import SMOTE

smote=SMOTE()

X_resample,y_resample=smote.fit_resample(X,y)

print("Before resample")
print(y.value_counts())
print("After resample")
print(y_resample.value_counts())

Before resample
Class
0    275190
1       473
Name: count, dtype: int64
After resample
Class
0    275190
1    275190
Name: count, dtype: int64


In [24]:
X_train,X_test,y_train,y_test=train_test_split_data(X_resample,y_resample)
clf,score=model_build(X_train,X_test,y_train,y_test)
print(score)

0.9571568734329009


In [25]:
performance(X_test,y_test,clf)

              precision    recall  f1-score   support

           0       0.93      0.98      0.96     55073
           1       0.98      0.93      0.96     55003

    accuracy                           0.96    110076
   macro avg       0.96      0.96      0.96    110076
weighted avg       0.96      0.96      0.96    110076

