In [1]:
# loading needed methods
import warnings
warnings.filterwarnings("ignore")


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.cm as cm
from random import seed,sample
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score, roc_curve, auc,\
precision_score
from xgboost.sklearn import XGBClassifier

# loading data

data = pd.read_csv("/Users/cosmos/Documents/AIML Dataset.csv")

In [2]:
# Subsetting data according to the conclusion above
# I don't have to subset for the fraud dataset because all of their transaction types are either TRANSFER or CASH_OUT

data_new = data.copy() 

# filling feature column
data_new.loc[data.nameOrig.str.contains('C') & data.nameDest.str.contains('C'),"type1"] = "CC" 
data_new.loc[data.nameOrig.str.contains('C') & data.nameDest.str.contains('M'),"type1"] = "CM"
data_new.loc[data.nameOrig.str.contains('M') & data.nameDest.str.contains('C'),"type1"] = "MC"
data_new.loc[data.nameOrig.str.contains('M') & data.nameDest.str.contains('M'),"type1"] = "MM"



fraud = data_new[data_new["isFraud"] == 1]
valid = data_new[data_new["isFraud"] == 0]



fraud = fraud.drop('type1', 1)
valid = valid.drop('type1',1)
data_new = data_new.drop('type1',1)




valid = valid[(valid["type"] == "CASH_OUT")| (valid["type"] == "TRANSFER")]
data_new = data_new[(data_new["type"] == "CASH_OUT") | (data_new["type"] == "TRANSFER")]

wrong_orig_bal = sum(data["oldbalanceOrg"] - data["amount"] != data["newbalanceOrig"])
wrong_dest_bal = sum(data["newbalanceDest"] + data["amount"] != data["newbalanceDest"])

In [3]:
## Calculating some quantities to justify or reject some assumptions

# flatten the subsetted dataframe of floats into an array of floats
relevant_cols = data[["amount","oldbalanceOrg","newbalanceOrig","oldbalanceDest","newbalanceDest"]].values.flatten()
# number of observations with negative numbers
num_neg_amt = sum(n < 0 for n in relevant_cols)
# number of observations where the amount given is greater than the amount that is in the giver's account
num_amt_oldgiver = sum(data["amount"] > data["oldbalanceOrg"]) 
# number of observations where the amount received is greater than the amount that is in the receiver's account
num_amt_newreceiver = sum(data["amount"] > data["newbalanceDest"]) 

print("number of observations with negative numbers: ", num_neg_amt)
print("number of observations where the amount given is greater than the amount that is in the giver's account: "
      , num_amt_oldgiver)
print("number of observations where the amount received is greater than the amount that is in the receiver's account: "
      , num_amt_newreceiver)

number of observations with negative numbers:  0
number of observations where the amount given is greater than the amount that is in the giver's account:  4079080
number of observations where the amount received is greater than the amount that is in the receiver's account:  2661141


In [4]:
num_wrong_bal = (data["oldbalanceOrg"] - data["amount"] != data["newbalanceOrig"]) | (data["newbalanceDest"] + data["amount"] != data["newbalanceDest"])

In [5]:
# adding features errorBalanceOrg, errorBalanceDest
data_new["errorBalanceOrg"] = data_new.newbalanceOrig + data_new.amount - data_new.oldbalanceOrg
data_new["errorBalanceDest"] = data_new.oldbalanceDest + data_new.amount - data_new.newbalanceDest

# Subsetting data into observations with fraud and valid transactions:
fraud = data_new[data_new["isFraud"] == 1]
valid = data_new[data_new["isFraud"] == 0]

In [6]:
# separating transfers and cashouts for fraud accounts

fraud_transfer = fraud[fraud["type"] == "TRANSFER"]
fraud_cashout = fraud[fraud["type"] == "CASH_OUT"]

# checking if the recipient account of a fraudulent transfer was used as a sending account for cashing out 
fraud_transfer.nameDest.isin(fraud_cashout.nameOrig).any()


False

In [7]:
# getting rid of nameDest column.
# names = ["nameDest"]
# fraud = fraud.drop(names, 1)
# valid = valid.drop(names,1)
# data_new = data_new.drop(names,1)

In [8]:

flagged = data_new[data_new["isFlaggedFraud"] == 1]
flagged_correctly = sum(flagged["isFraud"] == 1)
flagged_wrongly = len(flagged) - flagged_correctly
total = flagged_correctly + flagged_wrongly
print(flagged_correctly," observations were flagged correctly and ", flagged_wrongly, \
      " observations were flagged wrongly for a total of ", total, " flagged observations.")

# how many observations where the transaction is fraudulent, the transaction is a transfer and the amount is greater 
# than 200, 000 are in the dataset
should_be_flagged = fraud[(fraud["amount"] > 200000) & (fraud["type"] == "TRANSFER")]
print("number of observations that should be flagged: ",len(should_be_flagged))

16  observations were flagged correctly and  0  observations were flagged wrongly for a total of  16  flagged observations.
number of observations that should be flagged:  2740


In [9]:
#droppin is flagged fraud
# fraud = fraud.drop("isFlaggedFraud",1)
# valid = valid.drop("isFlaggedFraud",1)
# data_new = data_new.drop("isFlaggedFraud",1)

In [10]:
dataset1 = data_new.copy()


# adding feature HourOfDay to Dataset1 
dataset1["HourOfDay"] = np.nan # initializing feature column
dataset1.HourOfDay = data_new.step % 24

In [11]:
dataset = dataset1.copy() # unchanged dataset1

In [12]:
#dataset = pd.get_dummies(dataset,prefix=['type'])

In [18]:
dataset1.head(3)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,errorBalanceOrg,errorBalanceDest,HourOfDay
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0,0.0,9839.64,1
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0,0.0,1864.28,1
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0,0.0,181.0,1


In [42]:
import pandas as pd
data = pd.read_csv('/Users/cosmos/Desktop/final-db-all.csv')

In [43]:
data.head(2)

Unnamed: 0.1,Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,errorBalanceOrg,errorBalanceDest,HourOfDay
0,0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0,0.0,9839.64,1
1,1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0,0.0,1864.28,1


In [48]:
x = data.sample(frac=0.01)

In [50]:
x.to_csv('s.csv')

In [7]:
data = data[(data["type"] == "CASH_OUT")| (data["type"] == "TRANSFER")]
data = data[(data["type"] == "CASH_OUT") | (data["type"] == "TRANSFER")]

fraud = data[data["isFraud"] == 1]
valid = data[data["isFraud"] == 0]



In [14]:
m=valid.head(8231) #8231

In [16]:
fraud

Unnamed: 0.1,Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,errorBalanceOrg,errorBalanceDest,HourOfDay
2,2,1,TRANSFER,181.00,C1305486145,181.00,0.0,C553264065,0.00,0.00,1,0,0.0,1.810000e+02,1
3,3,1,CASH_OUT,181.00,C840083671,181.00,0.0,C38997010,21182.00,0.00,1,0,0.0,2.136300e+04,1
251,251,1,TRANSFER,2806.00,C1420196421,2806.00,0.0,C972765878,0.00,0.00,1,0,0.0,2.806000e+03,1
252,252,1,CASH_OUT,2806.00,C2101527076,2806.00,0.0,C1007251739,26202.00,0.00,1,0,0.0,2.900800e+04,1
680,680,1,TRANSFER,20128.00,C137533655,20128.00,0.0,C1848415041,0.00,0.00,1,0,0.0,2.012800e+04,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6362615,6362615,743,CASH_OUT,339682.13,C786484425,339682.13,0.0,C776919290,0.00,339682.13,1,0,0.0,0.000000e+00,23
6362616,6362616,743,TRANSFER,6311409.28,C1529008245,6311409.28,0.0,C1881841831,0.00,0.00,1,0,0.0,6.311409e+06,23
6362617,6362617,743,CASH_OUT,6311409.28,C1162922333,6311409.28,0.0,C1365125890,68488.84,6379898.11,1,0,0.0,1.000000e-02,23
6362618,6362618,743,TRANSFER,850002.52,C1685995037,850002.52,0.0,C2080388513,0.00,0.00,1,0,0.0,8.500025e+05,23


In [22]:
frame = ZZ
k = pd.concat(frame)

In [24]:
k.to_csv('final 16k - test.csv')

MODEL

In [64]:
import pandas as pd
data = pd.read_csv('final 16k - test.csv')

In [65]:
RandomState = 42
seed(21)


# 42 is used often due to Hitchhiker's Guide to the Galaxy, I will use a number that a far smaller group may understand.
# Not that the actual number doesn't matter and is only used to make sure results are reproducible.
# creating training and testing sets
X = data.drop(columns=["Unnamed: 0.1","Unnamed: 0","type","nameDest","isFlaggedFraud"],axis=1)
y = data.isFraud
X_train, X_test, y_train, y_test = train_test_split(X, y)
    
#Normalizing data so that all variables follow the same scale (0 to 1)
scaler = StandardScaler()

# Fit only to the training data
scaler.fit(X_train)

# Now apply the transformations to the data:
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [69]:
y=X[X["isFraud"] == 0]

In [70]:
y

Unnamed: 0,step,amount,nameOrig,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,errorBalanceOrg,errorBalanceDest,HourOfDay
8213,1,229133.94,C905080434,15325.00,0.00,5083.00,51513.44,0,213808.94,182703.50,1
8214,1,215310.30,C1670993182,705.00,0.00,22425.00,0.00,0,214605.30,237735.30,1
8215,1,311685.89,C1984094095,10835.00,0.00,6267.00,2719172.89,0,300850.89,-2401220.00,1
8216,1,110414.71,C768216420,26845.41,0.00,288800.00,2415.16,0,83569.30,396799.55,1
8217,1,56953.90,C1570470538,1942.02,0.00,70253.00,64106.18,0,55011.88,63100.72,1
...,...,...,...,...,...,...,...,...,...,...,...
16439,8,32988.98,C871614488,179408.63,146419.66,1471332.54,1795303.57,0,0.01,-290982.05,8
16440,8,31705.26,C1821329589,146419.66,114714.40,116079.59,147784.85,0,0.00,0.00,8
16441,8,321234.39,C1570596602,114714.40,0.00,4540239.71,4871037.90,0,206519.99,-9563.80,8
16442,8,135616.97,C392764495,0.00,0.00,735093.20,1165471.56,0,135616.97,-294761.39,8


In [56]:
l=X.head(1).values

In [57]:
l

array([[  1., 181., 181.,   0.,   0.,   0.,   0., 181.,   1.]])

In [38]:
X.columns

Index(['step', 'amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest',
       'newbalanceDest', 'errorBalanceOrg', 'errorBalanceDest', 'HourOfDay'],
      dtype='object')

In [39]:
# Trainning model

weights = (y == 0).sum() / (1.0 * (y == 1).sum()) # for unbalanced datasets, these weights are recommended
parametersXGB = {'max_depth':3,'scale_pos_weight': weights,'n_jobs':-1,\
                 'random_state':RandomState,'learning_rate':0.1}
XGB = XGBClassifier(**parametersXGB)
    
fitted_vals = XGB.fit(X_train, y_train)
 
# Predict on testing set
predictionsXGB = XGB.predict(X_test)
 
     
# # Evaluating model
# CM_XGB = confusion_matrix(y_test,predictionsXGB)
# CR_XGB = classification_report(y_test,predictionsXGB)
# fprXGB, recallXGB, thresholds_XGB = roc_curve(y_test, predictionsXGB)
# AUC_XGB = auc(fprXGB, recallXGB)
# resultsXGB = {"Confusion Matrix":CM_XGB,"Classification Report":CR_XGB,"Area Under Curve":AUC_XGB}



In [40]:
import pickle
pickle.dump(fitted_vals, open('xgboostfinal.pkl','wb'))
#loaded_model = pickle.load(open('/Users/cosmos/Desktop/I-ll_think_about_it_later/models/xgboost-full.sav', 'rb'))

In [51]:
loaded_model = pickle.load(open('/Users/cosmos/Desktop/I-ll_think_about_it_later/models/xgboostfinal.pkl', 'rb'))

In [52]:
r=loaded_model.predict(X_test)
print(r)

[0 1 1 ... 1 0 1]


In [41]:
for measure in resultsXGB:
    print(measure,": \n",resultsXGB[measure],"\n")

Confusion Matrix : 
 [[2008    0]
 [   1 2102]] 

Classification Report : 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      2008
           1       1.00      1.00      1.00      2103

    accuracy                           1.00      4111
   macro avg       1.00      1.00      1.00      4111
weighted avg       1.00      1.00      1.00      4111
 

Area Under Curve : 
 0.9997622444127436 

