In [1]:
import pandas as pd  
import numpy as np  
import matplotlib.pyplot as plt  
%matplotlib inline

# Read the transaction log from the working directory into a DataFrame.
paysim = pd.read_csv(filepath_or_buffer="paysim.csv")

In [2]:
# Show (rows, columns) of the frame that was just loaded.
paysim.shape

(6362620, 11)

In [3]:
# Preview the first rows to see the available columns and their values.
paysim.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


### Create dummy variables

In [4]:
# One-hot encode the transaction `type` column: one indicator column per
# distinct type value, appended to `paysim`.
# The dtype is pinned to uint8 so the indicators are numeric 0/1 regardless of
# the installed pandas version (pandas >= 2.0 defaults to bool).
cat_list = pd.get_dummies(paysim['type'], dtype='uint8')
# Remove indicator columns left behind by an earlier run of this cell before
# joining; otherwise `join` raises on the overlapping column names and the
# cell cannot be re-executed.
paysim = paysim.drop(columns=cat_list.columns, errors='ignore').join(cat_list)

In [5]:
# Preview the frame again to check the indicator columns added by the join.
paysim.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,CASH_IN,CASH_OUT,DEBIT,PAYMENT,TRANSFER
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0,0,0,0,1,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0,0,0,0,1,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0,0,0,0,0,1
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0,0,1,0,0,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0,0,0,0,1,0


## Logistic regression: first iteration

In [22]:
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Features: every column except the label, the two account identifiers and
# the raw `type` string (its indicator columns stay in).
X = paysim.drop(columns=['isFraud', 'nameOrig', 'nameDest', 'type'])
y = paysim['isFraud']

# Hold out 30% of the rows for testing; the seed is fixed so the split repeats.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# Baseline model with default hyper-parameters, fitted on the training split.
logreg = LogisticRegression()
logreg.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [24]:
# Number of labels currently bound to `y`.
len(y)

6362620

In [8]:
# Hard class predictions for the held-out split, kept for the report cells.
y_pred = logreg.predict(X_test)

# Mean accuracy of the fitted classifier on the same split.
test_accuracy = logreg.score(X_test, y_test)
accuracy_template = 'Accuracy of logistic regression classifier on test set: {:.2f}'
print(accuracy_template.format(test_accuracy))

Accuracy of logistic regression classifier on test set: 1.00


In [9]:
from sklearn.metrics import classification_report, confusion_matrix

# Confusion matrix and per-class precision/recall/F1 for the baseline model
# on the held-out split.
# The result gets its own name so the imported `confusion_matrix` function is
# not overwritten by the array it returns.
baseline_cm = confusion_matrix(y_test, y_pred)
print(baseline_cm)
print(classification_report(y_test, y_pred))

[[1904527    1840]
 [   1384    1035]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1906367
           1       0.36      0.43      0.39      2419

   micro avg       1.00      1.00      1.00   1908786
   macro avg       0.68      0.71      0.70   1908786
weighted avg       1.00      1.00      1.00   1908786



In [10]:
# R-squared and adjusted R-squared of the hard class predictions against the
# test labels.
yhat = y_pred
# NOTE: this rebinds the notebook-level `y` to the test labels; later cells
# that read `y` see the test split until `y` is reassigned.
y = y_test

# Vectorised sums: the builtin `sum` walks every element in Python, which is
# very slow on a split of this size. Floating-point summation order differs
# slightly, so the last printed digits may change.
SS_Residual = ((y - yhat) ** 2).sum()
SS_Total = ((y - np.mean(y)) ** 2).sum()
r_squared = 1 - (float(SS_Residual)) / SS_Total

# Penalise by the number of predictors (columns of X).
adjusted_r_squared = 1 - (1 - r_squared) * (len(y) - 1) / (len(y) - X.shape[1] - 1)
print(r_squared, adjusted_r_squared)

-0.33447331627041144 -0.33448170578545344


**Fraction of fraud transactions**

In [11]:
# Share of all rows labelled isFraud == 1, counted from the boolean mask
# rather than by materialising the filtered frame.
n_fraud_rows = int((paysim['isFraud'] == 1).sum())
n_fraud_rows / paysim.shape[0]

0.001290820448180152

**Fraction of non-fraud transactions**

In [12]:
# Share of all rows labelled isFraud == 0, counted from the boolean mask
# rather than by materialising the filtered frame.
n_non_fraud_rows = int((paysim['isFraud'] == 0).sum())
n_non_fraud_rows / paysim.shape[0]

0.9987091795518198

### Over-sampling using SMOTE

### Note: oversampling is applied only to the training data. None of the information in the test data is used to create synthetic observations, so no information bleeds from the test data into model training.

In [21]:
# Sizes of the training labels, the test labels and whatever `y` is bound to,
# one per line.
print(len(y_train), len(y_test), len(y), sep='\n')


4453834
1908786
1908786


In [25]:
# Balance the training split with SMOTE and report the resulting class counts.
# The import is restored (it was commented out), so the cell runs on a fresh
# kernel instead of failing with NameError on `SMOTE`.
from imblearn.over_sampling import SMOTE

# Rebuild features/labels from the full frame: `y` may have been rebound to
# the test labels by an earlier cell.
X = paysim.drop(['isFraud','nameOrig','nameDest','type'], axis=1)
y = paysim['isFraud']
# NOTE: the name `os` is kept as in the original cell; it shadows the stdlib
# module of the same name if that is ever imported in this notebook.
os = SMOTE(random_state=0)
# Same test_size and seed as the baseline split.
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=0)
columns = X_train.columns
# Oversample the training split only. `fit_resample` is the supported name;
# `fit_sample` was deprecated in imbalanced-learn 0.4 and later removed.
# NOTE: `X` now holds the resampled features, not the full feature frame.
X,Y=os.fit_resample(X_train, y_train)
os_X= pd.DataFrame(data=X,columns=columns)
os_Y= pd.DataFrame(data=Y,columns=['isFraud'])

# Check the class balance of the oversampled data. The labels describe the
# fraud target of this dataset.
n_oversampled = len(os_X)
n_non_fraud = len(os_Y[os_Y['isFraud']==0])
n_fraud = len(os_Y[os_Y['isFraud']==1])
print("length of oversampled data is ",n_oversampled)
print("Number of non-fraud transactions in oversampled data",n_non_fraud)
print("Number of fraud transactions in oversampled data",n_fraud)
print("Proportion of non-fraud transactions in oversampled data is ",n_non_fraud/n_oversampled)
print("Proportion of fraud transactions in oversampled data is ",n_fraud/n_oversampled)

length of oversampled data is  8896080
Number of no subscription in oversampled data 4448040
Number of subscription 4448040
Proportion of no subscription data in oversampled data is  0.5
Proportion of subscription data in oversampled data is  0.5


## Recursive Feature Elimination (RFE)

In [27]:
from sklearn.feature_selection import RFE

# Recursive feature elimination with a default logistic regression as the
# ranking estimator, keeping 10 features.
logreg = LogisticRegression()
# `n_features_to_select` is passed by keyword: current scikit-learn releases
# reject it as a positional argument, and the keyword also works on older ones.
rfe = RFE(logreg, n_features_to_select=10)
# The label frame is flattened to 1-D, which is what the estimator expects.
rfe = rfe.fit(os_X, os_Y.values.ravel())
# support_: boolean mask of kept features; ranking_: 1 means kept, larger
# values were eliminated.
print(rfe.support_)
print(rfe.ranking_)



[ True  True  True  True  True  True False  True  True False  True  True]
[1 1 1 1 1 1 3 1 1 2 1 1]


In [28]:
# List the feature names in column order, to read the RFE mask/ranking against.
os_X.columns

Index(['step', 'amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest',
       'newbalanceDest', 'isFlaggedFraud', 'CASH_IN', 'CASH_OUT', 'DEBIT',
       'PAYMENT', 'TRANSFER'],
      dtype='object')

#### DEBIT and isFlaggedFraud are ranked 2 and 3 (the only two features RFE eliminated)

In [29]:
## P-values from the statsmodels logit fit

In [36]:
import statsmodels.api as sm

# Fit a statsmodels logit on the oversampled training data with every feature,
# to read coefficient p-values from the summary table.
X, y = os_X, os_Y
logit_model = sm.Logit(endog=y, exog=X)
result = logit_model.fit()
print(result.summary2())

  return 1/(1+np.exp(-X))


         Current function value: 0.099943
         Iterations: 35




                             Results: Logit
Model:                Logit              Pseudo R-squared:   0.856       
Dependent Variable:   isFraud            AIC:                1778217.0606
Date:                 2019-10-30 14:42   BIC:                1778385.0741
No. Observations:     8896080            Log-Likelihood:     -8.8910e+05 
Df Model:             11                 LL-Null:            -6.1663e+06 
Df Residuals:         8896068            LLR p-value:        0.0000      
Converged:            0.0000             Scale:              1.0000      
No. Iterations:       35.0000                                            
-------------------------------------------------------------------------
                Coef.    Std.Err.     z     P>|z|     [0.025     0.975]  
-------------------------------------------------------------------------
step             0.0030    0.0000  259.2702 0.0000      0.0030     0.0031
amount          -0.0000    0.0000 -478.1955 0.0000     -0.0000    -0

In [38]:
# Print the `summary()` rendering of the most recent `result`.
print(result.summary())

                           Logit Regression Results                           
Dep. Variable:                isFraud   No. Observations:              8896080
Model:                          Logit   Df Residuals:                  8896068
Method:                           MLE   Df Model:                           11
Date:                Wed, 30 Oct 2019   Pseudo R-squ.:                  0.8558
Time:                        14:42:39   Log-Likelihood:            -8.8910e+05
converged:                      False   LL-Null:                   -6.1663e+06
                                        LLR p-value:                     0.000
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
step               0.0030   1.17e-05    259.270      0.000       0.003       0.003
amount         -3.881e-05   8.12e-08   -478.196      0.000    -3.9e-05   -3.87e-05
oldbalanceOrg   5.322e-05    7.9e-08

#### DEBIT, isFlaggedFraud and CASH_IN are not significant (p-values > 0.05)

In [31]:
# Refit the statsmodels logit on the oversampled data after removing five
# columns, then print the summary table.
excluded_features = ['DEBIT','isFlaggedFraud','CASH_IN','PAYMENT','step']
X = os_X.drop(columns=excluded_features)
y = os_Y
logit_model = sm.Logit(endog=y, exog=X)
result = logit_model.fit()
print(result.summary2())

Optimization terminated successfully.
         Current function value: 0.265754
         Iterations 12
                          Results: Logit
Model:              Logit            Pseudo R-squared: 0.617       
Dependent Variable: isFraud          AIC:              4728354.9387
Date:               2019-10-30 14:29 BIC:              4728452.9465
No. Observations:   8896080          Log-Likelihood:   -2.3642e+06 
Df Model:           6                LL-Null:          -6.1663e+06 
Df Residuals:       8896073          LLR p-value:      0.0000      
Converged:          1.0000           Scale:            1.0000      
No. Iterations:     12.0000                                        
-------------------------------------------------------------------
                   Coef.  Std.Err.     z     P>|z|   [0.025  0.975]
-------------------------------------------------------------------
amount            -0.0000   0.0000 -154.7310 0.0000 -0.0000 -0.0000
oldbalanceOrg      0.0000   0.0000  930.

In [32]:
# Train a logistic regression on the oversampled data without the two features
# RFE eliminated, holding out 20% of the oversampled rows for validation.
X=os_X.drop(['DEBIT','isFlaggedFraud'],axis=1)
# NOTE: this rebinds X_train / y_train to the oversampled split; X_test and
# y_test (the original, non-oversampled hold-out) are left untouched.
X_train, X_val, y_train, y_val = train_test_split(X,os_Y, test_size=0.2, random_state=0)
logreg = LogisticRegression()
# y_train is a one-column DataFrame; flatten it to 1-D so scikit-learn does not
# emit a DataConversionWarning about a column-vector y.
logreg.fit(X_train,y_train.values.ravel())
y_pred_val = logreg.predict(X_val)
# The score is on the validation split, so the message says so.
print('Accuracy of logistic regression classifier on validation set: {:.2f}'.format(logreg.score(X_val,y_val)))

  y = column_or_1d(y, warn=True)


Accuracy of logistic regression classifier on test set: 0.92


In [33]:
from sklearn.metrics import classification_report, confusion_matrix

# Confusion matrix and per-class report for the validation split of the
# oversampled data.
# The result gets its own name so the imported `confusion_matrix` function is
# not overwritten by the array it returns.
validation_cm = confusion_matrix(y_val, y_pred_val)
print(validation_cm)
print(classification_report(y_val, y_pred_val))

[[825271  64428]
 [ 85151 804366]]
              precision    recall  f1-score   support

           0       0.91      0.93      0.92    889699
           1       0.93      0.90      0.91    889517

   micro avg       0.92      0.92      0.92   1779216
   macro avg       0.92      0.92      0.92   1779216
weighted avg       0.92      0.92      0.92   1779216



In [34]:
# Score the model on the original (non-oversampled) test split, using the same
# feature subset it was trained on.
X_test_befor_over_sample = X_test.drop(columns=['DEBIT','isFlaggedFraud'])
y_test_befor_over_sample = y_test
y_pred = logreg.predict(X_test_befor_over_sample)

holdout_accuracy = logreg.score(X_test_befor_over_sample, y_test_befor_over_sample)
accuracy_template = 'Accuracy of logistic regression classifier on test set: {:.2f}'
print(accuracy_template.format(holdout_accuracy))

Accuracy of logistic regression classifier on test set: 0.93


In [35]:
from sklearn.metrics import classification_report, confusion_matrix

# Confusion matrix and per-class report on the original (non-oversampled)
# test split.
# The result gets its own name so the imported `confusion_matrix` function is
# not overwritten by the array it returns.
test_cm = confusion_matrix(y_test_befor_over_sample, y_pred)
print(test_cm)
print(classification_report(y_test_befor_over_sample, y_pred))

[[1768532  137835]
 [    249    2170]]
              precision    recall  f1-score   support

           0       1.00      0.93      0.96   1906367
           1       0.02      0.90      0.03      2419

   micro avg       0.93      0.93      0.93   1908786
   macro avg       0.51      0.91      0.50   1908786
weighted avg       1.00      0.93      0.96   1908786

