In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/its-fraud/train.csv
/kaggle/input/its-fraud/test.csv
/kaggle/input/its-fraud/mock_submission.csv
/kaggle/input/sample-submissions/sample_submission.csv
/kaggle/input/submission/sample_submission.csv


In [2]:
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.preprocessing import minmax_scale
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from hyperopt import hp, tpe
from hyperopt.fmin import fmin

In [3]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [4]:
train_full = pd.read_csv("../input/its-fraud/train.csv")

test_full = pd.read_csv("../input/its-fraud/test.csv")

train_full = reduce_mem_usage(train_full)

Mem. usage decreased to 484.48 Mb (67.0% reduction)


## Feature Selection

In the baseline model, we learned that the information poor features such as TransactionDT was among the most important features. We will try Boruta, a method of eliminating these features by only capture the important, interesting features that have high degree of influence on the target variable.

In [5]:
# log transformation on TransactionAMT variable 
train_full['TransactionAmt'] = np.log(train_full['TransactionAmt'])
test_full['TransactionAmt'] = np.log(test_full['TransactionAmt'])

In [6]:
# create lists to store categorical variables
v_features = ["V"+str(i) for i in np.arange(1, 340, 1)]
C_cols = ["C"+str(i) for i in np.arange(1, 15, 1)]
card_cols = ["card"+str(i) for i in np.arange(1, 7, 1)]
D_cols = ["D"+str(i) for i in np.arange(1, 16, 1)]
addr_cols = ["addr"+str(i) for i in np.arange(1, 3, 1)]
id_cols = ["id_"+str(i) for i in np.arange(12, 39, 1)]

In [7]:
# create a function to perform pca transformation to reduce the number of variables
def PCA_transform(df, cols,prefix, n_features):
    pca = PCA(n_components = n_features, random_state=101)
    pca_model = pca.fit_transform(df[cols])
    pca_df = pd.DataFrame(pca_model)
    df.drop(cols, axis=1, inplace=True)
    pca_df.rename(columns=lambda x: str(prefix)+str(x), inplace=True)
    df = pd.concat([df, pca_df], axis=1)
    return df

In [8]:
# since pca does not accept NA values, we will fill na with -1 
# before pca transformation the data need to be scaled from 0 to 1 
def fill_na_features (df,features):
    for col in features:
        df[col] = df[col].fillna((df[col].min() - 1))
        df[col] = (minmax_scale(df[col], feature_range=(0,1)))
    return df

In [9]:
merged_train_df = train_full
merged_test_df = test_full

In [10]:
merged_train_df = fill_na_features(merged_train_df,v_features)
merged_test_df = fill_na_features(merged_test_df,v_features)

In [11]:
# perform pca transformation which holds 95% of variance of v_features
merged_train_df = PCA_transform(merged_train_df, v_features, 'PCA_V',20)
merged_test_df = PCA_transform(merged_test_df, v_features, 'PCA_V',20)

In [12]:
# create list to store M1-M9 variable names
M_cols = ["M"+str(i) for i in np.arange(1, 10, 1)]

In [13]:
cat_cols1 = [card_cols,addr_cols,M_cols,id_cols]
cat_cols2 = ['ProductCD','P_emaildomain','R_emaildomain','DeviceType','DeviceInfo']

In [14]:
# create a function to convert the categorical variable's categories into numbers
def convert_cat_label1(df):
    for i in range(len(cat_cols1)):
        for col in cat_cols1[i]:
            # avoid nan
            if df[col].dtype=='object':
                le = preprocessing.LabelEncoder()
                le.fit(list(df[col].values) + list(df[col].values))
                df[col] = le.transform(list(df[col].values))
    return df

In [15]:
# create a function to convert the categorical variable's categories into numbers
def convert_cat_label2(df):
    for col in cat_cols2:
        if col in df.columns:
            le = preprocessing.LabelEncoder()
            le.fit(list(df[col].values) + list(df[col].values))
            df[col] = le.transform(list(df[col].values))
    return df

In [16]:
# convert categorical variables's categories into numbers 
merged_train_df = convert_cat_label1(merged_train_df)
merged_train_df = convert_cat_label2(merged_train_df)
merged_test_df = convert_cat_label1(merged_test_df)
merged_test_df = convert_cat_label2(merged_test_df)

## Prediction Model

In [17]:
# assign indedepnt variables to X, and depdent variable isFraud to y
X= merged_train_df.drop(['TransactionID','TransactionDT','isFraud'],axis=1)
y=merged_train_df['isFraud']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
X_submission = merged_test_df.drop(['TransactionID','TransactionDT'],axis=1)


## XGboost

In [18]:
""" # use xgboost to classify whether each transaction is fraud or not
import xgboost as xgb
clf = xgb.XGBClassifier( n_estimators=500,
    max_depth=7,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=101)"""

' # use xgboost to classify whether each transaction is fraud or not\nimport xgboost as xgb\nclf = xgb.XGBClassifier( n_estimators=500,\n    max_depth=7,\n    learning_rate=0.05,\n    subsample=0.9,\n    colsample_bytree=0.9,\n    random_state=101)'

In [19]:
"""# fit the model with x label and y label and predict X_test
clf.fit(X_train,y_train)
y_preds = clf.predict_proba(X_test)"""

'# fit the model with x label and y label and predict X_test\nclf.fit(X_train,y_train)\ny_preds = clf.predict_proba(X_test)'

In [20]:
"""# accuracy on y_test 
auc = roc_auc_score(y_test, y_preds[:,1])
print('AUC: %.3f' % auc)"""

"# accuracy on y_test \nauc = roc_auc_score(y_test, y_preds[:,1])\nprint('AUC: %.3f' % auc)"

In [21]:
"""# predict the probability of each transaction is fraud or not
y_preds = clf.predict_proba(X_submission)"""

'# predict the probability of each transaction is fraud or not\ny_preds = clf.predict_proba(X_submission)'

In [22]:
"""# merge prediction results with test transactions
sample_submission = pd.read_csv("../input/sample-submissions/sample_submission.csv")
sample_submission['isFraud']=y_preds[:,1]
sample_submission.head(10)
sample_submission.to_csv('Final_submission.csv', index=False)"""

'# merge prediction results with test transactions\nsample_submission = pd.read_csv("../input/sample-submissions/sample_submission.csv")\nsample_submission[\'isFraud\']=y_preds[:,1]\nsample_submission.head(10)\nsample_submission.to_csv(\'Final_submission.csv\', index=False)'

In [23]:
"""# display the first 10 rows
sample_submission.head(10)"""

'# display the first 10 rows\nsample_submission.head(10)'

In [24]:
from sklearn.impute import SimpleImputer

df = pd.DataFrame(X_train)
df.replace([np.inf, -np.inf], np.nan, inplace=True)
X_train = df.to_numpy()

In [25]:
from sklearn.impute import SimpleImputer

df = pd.DataFrame(X_test)
df.replace([np.inf, -np.inf], np.nan, inplace=True)
X_test = df.to_numpy()

In [26]:
""" sklearn.impute import SimpleImputer

df = pd.DataFrame(y_test)
df.replace([np.inf, -np.inf], np.nan, inplace=True)
y_test = df.to_numpy()"""

' sklearn.impute import SimpleImputer\n\ndf = pd.DataFrame(y_test)\ndf.replace([np.inf, -np.inf], np.nan, inplace=True)\ny_test = df.to_numpy()'

In [27]:
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_mean.fit(X_test)
X_test = imp_mean.transform(X_test)

In [28]:
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_mean.fit(X_train)
X_train = imp_mean.transform(X_train)
#print(data)

## Random Forest

In [29]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=10, max_depth=6, n_jobs=1, verbose=2)


clf.fit(X_train, y_train)


 
y_preds = clf.predict(X_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


building tree 1 of 10


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s remaining:    0.0s


building tree 2 of 10
building tree 3 of 10
building tree 4 of 10
building tree 5 of 10
building tree 6 of 10
building tree 7 of 10
building tree 8 of 10
building tree 9 of 10
building tree 10 of 10


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    5.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.1s finished


In [30]:
# merge prediction results with test transactions
sample_submission = pd.read_csv("../input/submission/sample_submission.csv")
sample_submission['isFraud']=y_preds
sample_submission.head(10)
sample_submission.to_csv('Random_forest.csv', index=False)
sample_submission.head(10)

Unnamed: 0,Id,isFraud
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
5,5,0
6,6,0
7,7,0
8,8,0
9,9,0


In [31]:
# accuracy on y_test 
auc = roc_auc_score(y_test, y_preds)
print('AUC: %.3f' % auc)

AUC: 0.583


## Naive Bayes

In [32]:
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB(priors=[0.5,0.5]) 
nb.fit(X_train, y_train)
nb_test_proba = nb.predict_proba(X_test)[:,1]


In [33]:
# merge prediction results with test transactions
sample_submission = pd.read_csv("../input/submission/sample_submission.csv")
sample_submission['isFraud']=nb_test_proba 
sample_submission.head(10)
sample_submission.to_csv('Naive_Bayes.csv', index=False)
sample_submission.head(10)

Unnamed: 0,Id,isFraud
0,0,1.415841e-16
1,1,1.829081e-09
2,2,1.0
3,3,5.1172780000000006e-17
4,4,1.865773e-07
5,5,1.083386e-18
6,6,2.238385e-13
7,7,0.4939403
8,8,2.255268e-07
9,9,0.9999937


In [34]:
# accuracy on y_test 
auc = roc_auc_score(y_test, nb_test_proba )
print('AUC: %.3f' % auc)

AUC: 0.710


## Logistic Regression

In [35]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(n_jobs = -1, class_weight = 'balanced', random_state = 3) 
lr.fit(X_train, y_train)

lr_test_proba = lr.predict_proba(X_test)[:,1]

In [36]:
# accuracy on y_test 
auc = roc_auc_score(y_test, lr_test_proba )
print('AUC: %.3f' % auc)

AUC: 0.746


In [37]:
# merge prediction results with test transactions
sample_submission = pd.read_csv("../input/submission/sample_submission.csv")
sample_submission['isFraud']=lr_test_proba 
sample_submission.head(10)
sample_submission.to_csv('Logistic_Regression.csv', index=False)
sample_submission.head(10)

Unnamed: 0,Id,isFraud
0,0,0.196235
1,1,0.520155
2,2,0.328302
3,3,0.199621
4,4,0.412012
5,5,0.271088
6,6,0.183502
7,7,0.597227
8,8,0.428828
9,9,0.015352


## Decision Tree

In [38]:
from sklearn.tree import DecisionTreeClassifier
dt =  DecisionTreeClassifier(random_state=3, class_weight='balanced')
dt.fit(X_train, y_train)
DecisionTreeClassifier(class_weight='balanced', random_state=3)
dt_test_proba = dt.predict_proba(X_test)[:,1]

In [39]:
# accuracy on y_test 
auc = roc_auc_score(y_test, dt_test_proba)
print('AUC: %.3f' % auc)

AUC: 0.743


In [40]:
# merge prediction results with test transactions
sample_submission = pd.read_csv("../input/submission/sample_submission.csv")
sample_submission['isFraud']=lr_test_proba 
sample_submission.head(10)
sample_submission.to_csv('Decision_Tree.csv', index=False)
sample_submission.head(10)

Unnamed: 0,Id,isFraud
0,0,0.196235
1,1,0.520155
2,2,0.328302
3,3,0.199621
4,4,0.412012
5,5,0.271088
6,6,0.183502
7,7,0.597227
8,8,0.428828
9,9,0.015352
