<a href="https://colab.research.google.com/github/cseprasadpawar/Stock-Prediction-using-Regression/blob/master/Fraud_Detection_Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from catboost import CatBoostClassifier

In [17]:
from zipfile import ZipFile

# Open the competition archive once. The handle `f` is deliberately left open
# because later cells read individual CSV members straight out of it.
f = ZipFile("ieee-fraud-detection.zip", mode="r")

# Last expression of the cell: shows every archive member with its sizes.
f.infolist()

[<ZipInfo filename='test_identity.csv' compress_type=deflate file_size=25797161 compress_size=3114539>,
 <ZipInfo filename='test_transaction.csv' compress_type=deflate file_size=613194934 compress_size=49628814>,
 <ZipInfo filename='train_identity.csv' compress_type=deflate file_size=26529680 compress_size=3165987>,
 <ZipInfo filename='sample_submission.csv' compress_type=deflate file_size=6080314 compress_size=1192962>,
 <ZipInfo filename='train_transaction.csv' compress_type=deflate file_size=683351067 compress_size=55058592>]

In [0]:
# Load the two raw training tables straight from the open archive `f`.
# NOTE(review): neither frame is referenced by the later cells visible in this
# notebook (the training cell reads the same members again into `X`) --
# confirm whether this cell is still needed.
transaction_df, identify_df = (
    pd.read_csv(f.open(member_name))
    for member_name in ("train_transaction.csv", "train_identity.csv")
)

In [22]:
print('Importing training data...')

# Column roles used by this cell and by the preprocessing helpers.
target = 'isFraud'
indexCol = 'TransactionID'
remove_features = [target]

# Left-join the identity table onto the transaction table so that every
# transaction row is kept, whether or not it has a matching identity row.
X = pd.merge(
    pd.read_csv(f.open("train_transaction.csv")),
    pd.read_csv(f.open("train_identity.csv")),
    on='TransactionID',
    how='left',
)

# Everything except the label is a model input (the index column included).
features = [name for name in X.columns if name not in remove_features]

# Split the merged frame into label vector and feature matrix.
y, X = X[target], X[features]

print('Done.')

Importing training data...
Done.


In [0]:
# We'll set up some preprocessing helpers.

def convertToType(dfCol, npType):
    """Return ``dfCol`` cast to the NumPy type ``npType``.

    ``Series.astype`` builds a new series and leaves its input untouched, so
    the converted column is returned and the caller has to store it, e.g.
    ``df[col] = convertToType(df[col], np.float32)``. The earlier version
    only rebound its local parameter name and returned ``None``, which made
    every call a no-op.

    Parameters
    ----------
    dfCol : pandas.Series
        Column to convert.
    npType : type or numpy.dtype
        Target dtype, anything accepted by ``Series.astype``.

    Returns
    -------
    pandas.Series
        The converted column.
    """
    return dfCol.astype(npType)

def convertToFloat32(df):
    """Cast the numeric columns of ``df`` to float32, in place.

    Columns whose dtype is ``object`` (strings) or already ``int32`` are left
    alone; every other column is replaced by its float32 version.

    Two defects of the earlier version are fixed here:

    * ``np.object`` was a deprecated alias of the builtin ``object`` and no
      longer exists in current NumPy releases; the builtin is used instead.
    * The converted column was never written back to the frame, so nothing
      was actually converted.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.

    Returns
    -------
    None
    """
    for col in df.columns:
        col_dtype = df[col].dtype
        if col_dtype != object and col_dtype != np.int32:
            # astype returns a new series, so it must be assigned back.
            df[col] = df[col].astype(np.float32)
            
def fillCatNan(df, filler):
    """Replace missing values in the string (``object`` dtype) columns of ``df``.

    The frame is modified in place; columns of any other dtype are untouched.

    Fixes relative to the earlier version:

    * ``np.object`` (an alias removed from current NumPy) is replaced by the
      builtin ``object``.
    * ``df[col].fillna(..., inplace=True)`` operated on the intermediate
      object returned by ``df[col]``; pandas deprecates that chained pattern
      and, with copy-on-write enabled, it does not update ``df`` at all. The
      filled column is now assigned back explicitly.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    filler : object
        Value written wherever an ``object`` column holds a missing value.

    Returns
    -------
    None
    """
    for col in df.columns:
        if df[col].dtype == object:
            df[col] = df[col].fillna(filler)
            
def fillValNan(df, filler):
    """Replace missing values in the non-string columns of ``df``.

    Every column whose dtype is not ``object`` has its missing values
    replaced by ``filler``; the frame is modified in place.

    Fixes relative to the earlier version:

    * ``filler`` was ignored and ``0.0`` was always written; the argument is
      now honoured (passing ``0.0`` gives the previous result).
    * ``np.object`` (an alias removed from current NumPy) is replaced by the
      builtin ``object``.
    * The chained ``df[col].fillna(..., inplace=True)`` is replaced by an
      explicit assignment, since pandas deprecates the chained form and it
      does not update ``df`` when copy-on-write is enabled.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    filler : scalar
        Value written wherever a non-``object`` column holds a missing value.

    Returns
    -------
    None
    """
    for col in df.columns:
        if df[col].dtype != object:
            df[col] = df[col].fillna(filler)
            
def scaleVals(df, target):
    """Robust-scale the numeric feature columns of ``df``, in place.

    A column is skipped when it is the ``target`` column, the module-level
    ``indexCol`` column, or has ``object`` dtype. Every remaining column is
    replaced by the output of a ``RobustScaler`` fitted on that column alone.

    Changes relative to the earlier version:

    * ``np.object`` (an alias removed from current NumPy) is replaced by the
      builtin ``object``.
    * Fit and transform are done in one ``fit_transform`` call, and the
      ``(n, 1)`` result is flattened before being stored as a column.

    NOTE(review): a fresh scaler is fitted on whatever frame is passed in, so
    two frames processed by separate calls are scaled with different
    statistics -- confirm that this is acceptable for the model being used.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    target : str
        Name of the label column, which is never scaled.

    Returns
    -------
    None
    """
    for col in df.columns:
        if col == target or col == indexCol or df[col].dtype == object:
            continue
        # RobustScaler expects a 2-D array: one sample per row, one feature.
        column_2d = df[col].values.reshape(-1, 1)
        df[col] = RobustScaler().fit_transform(column_2d).ravel()

In [0]:
# For the sake of simplicity, numeric columns end up as float32 and the
# index column as int32.
# Empty string values will be replaced with '<empty>'.
# Empty numeric values will be replaced with 0.0.

def preprocess(df):
    """Run the whole preprocessing pipeline on ``df``, in place.

    Steps, in order:

    1. Cast the ``indexCol`` column to int32. The cast is assigned back
       directly; the earlier ``convertToType(df[indexCol], np.int32)`` call
       discarded its result, so the column was never converted.
    2. Fill missing string values with ``'<empty>'``.
    3. Fill missing numeric values with ``0.0``.
    4. Scale the numeric feature columns (``target`` and ``indexCol`` are
       skipped by ``scaleVals``).
    5. Call ``convertToFloat32``. This is the conversion announced in the
       comments above, which was never invoked before. It runs last so that
       the dtype produced by the scaling step does not override it.

    Relies on the module-level names ``indexCol`` and ``target``.

    Parameters
    ----------
    df : pandas.DataFrame
        Merged transaction/identity frame; modified in place.

    Returns
    -------
    None
    """
    df[indexCol] = df[indexCol].astype(np.int32)
    fillCatNan(df, '<empty>')
    fillValNan(df, 0.0)
    scaleVals(df, target)
    convertToFloat32(df)

In [25]:
print('Converting to float32...')

# Apply the preprocessing pipeline to the training features.
# The helper works on the frame it is given and returns nothing.
preprocess(df=X)

print('Done.')

Converting to float32...
Done.


In [29]:
# We'll fit on a basic train/validation split with a stock CatBoost Classifier.

print('Training...')

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Positions of the string columns, which CatBoost must be told to treat as
# categorical. `np.object` was an alias of the builtin `object` and has been
# removed from current NumPy, so the builtin is compared against directly.
cat_features_indices = np.where(X.dtypes == object)[0]

model = CatBoostClassifier(iterations=100, depth=3, learning_rate=0.1)
model.fit(X_train, y_train, cat_features=cat_features_indices)

# NOTE(review): score() is the classifier's built-in metric, presumably plain
# accuracy -- confirm it is the intended measure for this label.
print('Done. Score:', model.score(X_test, y_test) * 100.0)

# Free the training frames; only `model` and `features` are needed afterwards.
del X, X_train, X_test, y, y_train, y_test

Training...
0:	learn: 0.5453201	total: 1.32s	remaining: 2m 10s
1:	learn: 0.4348084	total: 2.62s	remaining: 2m 8s
2:	learn: 0.3540927	total: 3.98s	remaining: 2m 8s
3:	learn: 0.2959035	total: 5.05s	remaining: 2m 1s
4:	learn: 0.2513983	total: 6.17s	remaining: 1m 57s
5:	learn: 0.2197692	total: 7.23s	remaining: 1m 53s
6:	learn: 0.1971286	total: 8.28s	remaining: 1m 50s
7:	learn: 0.1800250	total: 9.4s	remaining: 1m 48s
8:	learn: 0.1656034	total: 10.5s	remaining: 1m 46s
9:	learn: 0.1549794	total: 11.6s	remaining: 1m 44s
10:	learn: 0.1470837	total: 12.7s	remaining: 1m 42s
11:	learn: 0.1408929	total: 13.8s	remaining: 1m 40s
12:	learn: 0.1360189	total: 15s	remaining: 1m 40s
13:	learn: 0.1320396	total: 16.1s	remaining: 1m 39s
14:	learn: 0.1288704	total: 17.1s	remaining: 1m 37s
15:	learn: 0.1262271	total: 18.2s	remaining: 1m 35s
16:	learn: 0.1242645	total: 19.2s	remaining: 1m 33s
17:	learn: 0.1225019	total: 20.2s	remaining: 1m 32s
18:	learn: 0.1206418	total: 21.3s	remaining: 1m 30s
19:	learn: 0.119

In [30]:
# Close enough! Now let's bring in the test data set.

print('Importing testing data...')

# Same left join as for the training data, then restrict the result to the
# exact feature columns (and column order) the model was trained on.
P = pd.merge(
    pd.read_csv(f.open("test_transaction.csv")),
    pd.read_csv(f.open("test_identity.csv")),
    on='TransactionID',
    how='left',
)[features]

# Run the same preprocessing pipeline that was applied to the training frame.
preprocess(P)

print('Done.')

Importing testing data...
Done.


In [31]:
# Calculate our predictions, one fixed-size slice of rows at a time.
print('Calculating predictions...')

params = {'prediction_type': 'Probability'}
chunksize = 10000

y_pred = []
for idx in range(0, len(P), chunksize):
    # Rows idx .. idx + chunksize - 1 (the final slice may be shorter).
    batch = P[idx:idx + chunksize]
    y_pred.extend(model.predict(batch, **params))

# Stack the per-row probability vectors, then drop column 0 so that only the
# probability of isFraud==1 is kept, as a flat 1-D array.
y_pred = np.delete(np.array(y_pred), 0, axis=1).flatten()

print('Done.')

Calculating predictions...
Done.


In [32]:
# Write the predictions to submission.csv: one row per test transaction,
# holding its TransactionID and the predicted isFraud probability.

print('Saving output...')

submission = P[['TransactionID']].copy()
submission['isFraud'] = y_pred
submission.to_csv('submission.csv', index=False)
print(submission.head())

# Drop the large objects now that the file has been written.
del P, y_pred, submission, model

print('Done.')

Saving final output...
   TransactionID   isFraud
0        3663549  0.008330
1        3663550  0.013188
2        3663551  0.011800
3        3663552  0.013767
4        3663553  0.012975
Done.
