<a href="https://colab.research.google.com/github/cseprasadpawar/CyberSecurity/blob/master/Fraud_Detection_Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from catboost import CatBoostClassifier

In [5]:
from zipfile import ZipFile

# Keep the archive handle in `f`; later cells read the CSV members through it.
f = ZipFile("ieee-fraud-detection.zip", mode="r")
f.infolist()

[<ZipInfo filename='test_identity.csv' compress_type=deflate file_size=25797161 compress_size=3114539>,
 <ZipInfo filename='test_transaction.csv' compress_type=deflate file_size=613194934 compress_size=49628814>,
 <ZipInfo filename='train_identity.csv' compress_type=deflate file_size=26529680 compress_size=3165987>,
 <ZipInfo filename='sample_submission.csv' compress_type=deflate file_size=6080314 compress_size=1192962>,
 <ZipInfo filename='train_transaction.csv' compress_type=deflate file_size=683351067 compress_size=55058592>]

**Read csv files**

`f.open` is passed to `read_csv` to read each file, because the .csv files are members of the zip archive.

In [0]:
print('Reading input data : ')

# Read each CSV member straight from the zip archive opened as `f`.
# The `with` blocks close the member handles returned by f.open() once
# pandas has consumed them; pandas does not close caller-supplied handles.
with f.open("train_transaction.csv") as member:
    X_tr = pd.read_csv(member)
with f.open("train_identity.csv") as member:
    X_id = pd.read_csv(member)


**Print and check the values of the input files**


In [14]:
# Shapes first (transactions, then identity), one per line.
print(*(frame.shape for frame in (X_tr, X_id)), sep='\n')

# Then five random rows from each frame, in the same order.
print(*(frame.sample(5) for frame in (X_tr, X_id)), sep='\n')

(590540, 394)
(144233, 41)
        TransactionID  isFraud  TransactionDT  ...  V337 V338  V339
131123        3118123        0        2593987  ...   NaN  NaN   NaN
476940        3463940        0       12345068  ...   NaN  NaN   NaN
181884        3168884        0        4030794  ...   NaN  NaN   NaN
116510        3103510        0        2250333  ...   NaN  NaN   NaN
104324        3091324        0        2072146  ...   NaN  NaN   NaN

[5 rows x 394 columns]
        TransactionID  id_01     id_02  ...  id_38  DeviceType             DeviceInfo
104632        3378353   -5.0  245969.0  ...      F     desktop                Windows
134506        3531130   -5.0  421938.0  ...      F      mobile  SM-G531H Build/LMY48B
86661         3273875   -5.0  365414.0  ...      F      mobile      Z981 Build/MMB29M
83320         3258785  -20.0  214128.0  ...      F      mobile   LG-K530 Build/MMB29M
108983        3395955  -20.0   98824.0  ...      F      mobile   SM-G955U Build/R16NW

[5 rows x 41 columns]


In [16]:
print('Merging training data : ')

# Left join on TransactionID: every transaction row is kept, and the identity
# columns are NaN for transactions without a matching identity row.
X = X_tr.merge(X_id, how='left', on='TransactionID')
del X_tr, X_id

target = 'isFraud'
indexCol = 'TransactionID'
remove_features = [target]
# Every merged column except the ones listed in remove_features.
features = [name for name in X.columns if name not in remove_features]

# Split the label off from the feature frame.
y = X[target]
X = X[features]

print('Done.')

Merging training data : 
Done.


In [0]:
# Preprocessing helper functions

def convertToType(dfCol, npType): # Convert to numpy type
    """Return ``dfCol`` cast to ``npType``.

    ``astype`` does not modify the object it is called on, so the converted
    column is returned. Callers must assign the result back for the cast to
    take effect, e.g. ``df[col] = convertToType(df[col], np.float32)``.

    Parameters
    ----------
    dfCol : object exposing ``astype`` (e.g. a pandas Series)
        Column to convert.
    npType : dtype
        Target dtype passed straight to ``astype``.

    Returns
    -------
    The converted column, as produced by ``dfCol.astype(npType)``.
    """
    return dfCol.astype(npType)

def convertToFloat32(df): # Convert numeric data to float32 or int32
    """Cast the numeric columns of ``df`` to float32, modifying ``df`` in place.

    Columns already stored as int32 and non-numeric columns (e.g. strings)
    are left untouched. The converted column is assigned back to the frame;
    calling ``astype`` without storing its result would change nothing.

    Note that float32 represents integers exactly only up to 2**24, so wide
    integer columns can lose precision.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are converted in place.

    Returns
    -------
    None
    """
    for col in df.columns:
        column = df[col]
        # Dtype is classified through pandas: the `np.object` alias used for
        # this check was removed in NumPy 1.24.
        if pd.api.types.is_numeric_dtype(column) and column.dtype != np.int32:
            df[col] = column.astype(np.float32)
            
def fillCatNan(df, filler): # Fill missing string values
    """Replace missing values in the string-valued columns of ``df``, in place.

    A column is treated as string-valued when pandas reports an object or a
    string dtype for it; all other columns are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify.
    filler : object
        Value written wherever a string-valued column holds a missing value.

    Returns
    -------
    None
    """
    for col in df.columns:
        column = df[col]
        # `np.object` was removed in NumPy 1.24, so classify via pandas.
        if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
            # Assign the filled column back: `df[col].fillna(..., inplace=True)`
            # operates on a temporary and does not update `df` under
            # pandas Copy-on-Write.
            df[col] = column.fillna(filler)
            
def fillValNan(df, filler): # Fill missing numeric values
    """Replace missing values in the numeric columns of ``df``, in place.

    Only columns that pandas classifies as numeric are touched; string-valued
    columns keep their missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify.
    filler : scalar
        Value written wherever a numeric column holds a missing value.

    Returns
    -------
    None
    """
    for col in df.columns:
        column = df[col]
        # `np.object` was removed in NumPy 1.24, so classify via pandas.
        if pd.api.types.is_numeric_dtype(column):
            # Assign the filled column back: `df[col].fillna(..., inplace=True)`
            # operates on a temporary and does not update `df` under
            # pandas Copy-on-Write.
            df[col] = column.fillna(filler)
            
def scaleVals(df, target): # Scale values
    """Scale every numeric feature column of ``df`` with ``RobustScaler``, in place.

    The column named ``target`` and the column named by the module-level
    ``indexCol`` are skipped, as are non-numeric columns.

    A new scaler is fitted per column on every call, so frames scaled in
    separate calls are each scaled with their own statistics.
    NOTE(review): confirm that scaling the training and test frames
    independently is intended.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify.
    target : str
        Name of a column that must not be scaled.

    Returns
    -------
    None
    """
    if len(df) == 0:
        # Nothing to fit on; the scaler rejects zero-row input.
        return
    for col in df.columns:
        if col == target or col == indexCol:
            continue
        # `np.object` was removed in NumPy 1.24, so classify via pandas.
        if not pd.api.types.is_numeric_dtype(df[col]):
            continue
        values = df[col].values.reshape(-1, 1)
        # The scaler works on an (n, 1) matrix; flatten before storing the column.
        df[col] = RobustScaler().fit_transform(values).ravel()

In [0]:
# Preprocessing applied to both the training and the test frame:
#   * the index column is stored as int32,
#   * empty string values are replaced with '<empty>',
#   * empty numeric values are replaced with 0.0,
#   * numeric feature columns are scaled.
# NOTE(review): numeric columns are not converted to float32 here, although
# convertToFloat32 exists; confirm whether that call was intended.

def preprocess(df):
    """Clean and scale ``df`` in place.

    Reads the module-level names ``indexCol`` and ``target``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ``indexCol`` column; it is modified in place.

    Returns
    -------
    None
    """
    # Assign the cast back to the frame; calling astype without storing its
    # result leaves the column unchanged.
    # Assumes every id fits in int32 -- TODO confirm for other datasets.
    df[indexCol] = df[indexCol].astype(np.int32)
    fillCatNan(df ,'<empty>')
    fillValNan(df , 0.0)
    scaleVals(df, target)

In [19]:
print('Preprocessing input data : ')

# Cleans and scales the training feature frame X in place; nothing is returned.
preprocess(X)

print('Done.')

Preprocessing input data : 
Done.


In [20]:
# Train using CatBoost Classifier.

print('Training...')

# Hold out 20% of the rows for a quick check; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Positions of the string-valued columns, passed to CatBoost as categorical features.
# `np.object` was removed in NumPy 1.24, so dtypes are classified via pandas.
is_categorical = np.array(
    [
        pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype)
        for dtype in X.dtypes
    ],
    dtype=bool,
)
cat_features_indices = np.where(is_categorical)[0]
model = CatBoostClassifier(iterations=500, depth=3, learning_rate=0.1)

model.fit(X_train, y_train, cat_features=cat_features_indices)

# NOTE(review): presumably model.score is plain accuracy -- confirm it is the
# metric wanted here.
print('Done. Score:', model.score(X_test, y_test) * 100.0)

# Free the training data; this cell cannot be re-run without re-running the cells above.
del X, X_train, X_test, y, y_train, y_test

Training...
0:	learn: 0.5406557	total: 2.26s	remaining: 18m 49s
1:	learn: 0.4332794	total: 4.25s	remaining: 17m 38s
2:	learn: 0.3520278	total: 6.43s	remaining: 17m 45s
3:	learn: 0.2952991	total: 8.2s	remaining: 16m 56s
4:	learn: 0.2521361	total: 10s	remaining: 16m 31s
5:	learn: 0.2183849	total: 11.9s	remaining: 16m 20s
6:	learn: 0.1955711	total: 13.7s	remaining: 16m 7s
7:	learn: 0.1773181	total: 15.9s	remaining: 16m 15s
8:	learn: 0.1649672	total: 17.8s	remaining: 16m 11s
9:	learn: 0.1551873	total: 19.7s	remaining: 16m 6s
10:	learn: 0.1476523	total: 21.6s	remaining: 16m
11:	learn: 0.1416188	total: 23.6s	remaining: 15m 58s
12:	learn: 0.1365326	total: 25.4s	remaining: 15m 52s
13:	learn: 0.1327219	total: 27.2s	remaining: 15m 43s
14:	learn: 0.1292893	total: 29s	remaining: 15m 39s
15:	learn: 0.1267453	total: 31s	remaining: 15m 38s
16:	learn: 0.1250234	total: 32.9s	remaining: 15m 35s
17:	learn: 0.1230396	total: 34.8s	remaining: 15m 32s
18:	learn: 0.1209757	total: 37s	remaining: 15m 36s
19:	le

In [22]:
# Read test data set 

print('Reading testing data :')

# Read each CSV member straight from the zip archive opened as `f`.
# The `with` blocks close the member handles returned by f.open() once
# pandas has consumed them; pandas does not close caller-supplied handles.
with f.open("test_transaction.csv") as member:
    P_tr = pd.read_csv(member)
with f.open("test_identity.csv") as member:
    P_id = pd.read_csv(member)

# Same left join as for the training data, restricted to the training feature columns.
P = pd.merge(P_tr, P_id, on='TransactionID', how='left')[features]
del P_tr
del P_id

# NOTE(review): preprocess fits new scalers on this frame, so the test features
# are scaled with test-set statistics rather than the training ones -- confirm
# this is intended.
preprocess(P)

print('Done.')

Reading testing data :
Done.


In [23]:
# Calculate our predictions...
print('Calculating predictions : ')

params = {'prediction_type': 'Probability'}
chunksize = 10000

# Score the test frame in consecutive chunks of `chunksize` rows and collect
# one output row per test row.
y_pred = []
for start in range(0, len(P), chunksize):
    batch = P.iloc[start:start + chunksize]
    y_pred.extend(model.predict(batch, **params))

y_pred = np.array(y_pred)
# Drop column 0; the remaining column is kept as the isFraud==1 probability.
y_pred = np.delete(y_pred, 0, axis=1).flatten()

print('Done.')

Calculating predictions : 
Done.


In [24]:
# Save Output to submission.csv

print('Saving output...')

# Two-column frame: the test TransactionID next to its predicted fraud probability.
submission = P[['TransactionID']].assign(isFraud=y_pred)
submission.to_csv('submission.csv', index=False)
print(submission.head())

# Release the large objects; the cells above must be re-run to use them again.
del P, y_pred, submission, model

print('Done.')

Saving output...
   TransactionID   isFraud
0        3663549  0.006679
1        3663550  0.009110
2        3663551  0.009990
3        3663552  0.006348
4        3663553  0.012377
Done.
