In [19]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 


In [96]:
import warnings
warnings.filterwarnings('ignore')

In [21]:
plt.rcParams['figure.figsize'] = 6,4
pd.set_option('display.max_columns', None)

In [22]:
fraud_df =  pd.read_csv('/content/card_transdata.csv')
fraud_df.columns

Index(['distance_from_home', 'distance_from_last_transaction',
       'ratio_to_median_purchase_price', 'repeat_retailer', 'used_chip',
       'used_pin_number', 'online_order', 'fraud'],
      dtype='object')

In [23]:
fraud_df.shape

(1000000, 8)

In [24]:
#First thing i look for in classification tasks is check the target distribution vs the size of the dataset 
fraud_df['fraud'].value_counts(True)

#Apparently our dataset is grossly unbalanced (91.2% : 8.8%) meaning if we were to get  a classification algo. with this data, our model would perform woefully in production
#Our options include : Resampling (oversampling , undersampling )

0.0    0.912597
1.0    0.087403
Name: fraud, dtype: float64

In [25]:
fraud_df.head()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
0,57.877857,0.31114,1.94594,1.0,1.0,0.0,0.0,0.0
1,10.829943,0.175592,1.294219,1.0,0.0,0.0,0.0,0.0
2,5.091079,0.805153,0.427715,1.0,0.0,0.0,1.0,0.0
3,2.247564,5.600044,0.362663,1.0,1.0,0.0,1.0,0.0
4,44.190936,0.566486,2.222767,1.0,1.0,0.0,1.0,0.0


In [26]:
#Lets check for duplicate rows
fraud_df.duplicated().sum()

#We have zero duplicate rows 

0

In [27]:
#Lets check the data details and info
fraud_df.describe()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
count,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0
mean,26.628792,5.036519,1.824182,0.881536,0.350399,0.100608,0.650552,0.087403
std,65.390784,25.843093,2.799589,0.323157,0.477095,0.300809,0.476796,0.282425
min,0.004874,0.000118,0.004399,0.0,0.0,0.0,0.0,0.0
25%,3.878008,0.296671,0.475673,1.0,0.0,0.0,0.0,0.0
50%,9.96776,0.99865,0.997717,1.0,0.0,0.0,1.0,0.0
75%,25.743985,3.355748,2.09637,1.0,1.0,0.0,1.0,0.0
max,10632.723672,11851.104565,267.802942,1.0,1.0,1.0,1.0,1.0


In [28]:
#Lets split the dataset into two , training and test  as we do not have any test data 
# We will keep 1000 entries from each class 
fraud_count, non_fraud_count =  fraud_df['fraud'].value_counts()

fraud  =  fraud_df[fraud_df['fraud'] == 1]
non_fraud =   fraud_df[fraud_df['fraud'] == 0]


In [29]:
#Taking the first 50 entries from each category dataframe for our predcitions later on 
pred_df  =  fraud.iloc[:50, :]
predn_df=  non_fraud.iloc[:50, :]
predict_df  = pd.concat([pred_df, predn_df], axis = 0 )

In [30]:
idx  = list(predict_df.index)
fraud_df.drop(index = idx , axis =  0 ,  inplace=True )

In [31]:
x = fraud_df.drop(columns = ['fraud'], axis = 1)
y =  fraud_df['fraud']

Now we have a training dataset and  data for prediction 

Now we do the data undersampling 

In [35]:
y.shape

(999900,)

In [60]:
from imblearn.under_sampling import NearMiss
from sklearn. linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

x_train,x_test, y_train, y_test =  train_test_split(x,y, test_size =0.3 , random_state = 42)
nr=  NearMiss()
x_train_r, y_train_r = nr.fit_resample(x_train, y_train)

#Create a Pipeline for the preprocessing 

classifier =  LogisticRegression()
steps =  [ 
           ('scaler',StandardScaler()),
           ('classifier', classifier)
         ]
pipe = Pipeline(steps)

#fit data
train_data  = pipe.fit(x_train_r,y_train_r)
predictions =  pipe.predict(x_test)
print(classification_report(predictions , y_test))




              precision    recall  f1-score   support

         0.0       0.96      0.99      0.97    266325
         1.0       0.86      0.67      0.75     33645

    accuracy                           0.95    299970
   macro avg       0.91      0.83      0.86    299970
weighted avg       0.95      0.95      0.95    299970



In [64]:
predict_df.head()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
13,2.131956,56.372401,6.358667,1.0,0.0,0.0,1.0,1.0
24,3.803057,67.241081,1.87295,1.0,0.0,0.0,1.0,1.0
29,15.694986,175.989182,0.855623,1.0,0.0,0.0,1.0,1.0
35,26.711462,1.552008,4.603601,1.0,1.0,0.0,1.0,1.0
36,10.664474,1.565769,4.886521,1.0,0.0,0.0,1.0,1.0


In [77]:
labels =  predict_df['labels ']
df =  predict_df.drop(columns =  ['fraud'] , inplace =  True)


13    1.0
24    1.0
29    1.0
35    1.0
36    1.0
     ... 
52    0.0
53    0.0
55    0.0
56    0.0
57    0.0
Name: fraud, Length: 100, dtype: float64

In [94]:
input = (predict_df.iloc[1, :].values)
input =  input.reshape(1,-1)

In [98]:
res =  pipe.predict(input)

if res[0] == 1:
  print('This transaction is a fraudulent transaction..Flagging Now ')
else:
  print('Valid Transcation ')

This transaction is a fraudulent transaction..Flagging Now 
