In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_file_paths = (
    os.path.join(root_dir, file_name)
    for root_dir, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.preprocessing import minmax_scale
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from hyperopt import hp, tpe
from hyperopt.fmin import fmin

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that fits.

    Each int16/int32/int64/float16/float32/float64 column is converted to the
    smallest integer (int8..int64) or float (float16, float32, else float64)
    type whose representable range contains the column's min and max. The
    bounds are inclusive, so a column whose extreme value equals a type limit
    (e.g. 127 for int8) still gets that type. Other columns are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are reassigned in place.
    verbose : bool, default True
        When true, print the final memory footprint and the relative saving.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    float16 keeps only about three significant decimal digits, so float
    columns that fit its range are stored with reduced precision.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Take the first (narrowest) integer type that covers the range.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Nothing narrower fits (or the column is entirely NaN).
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-sized starting frame.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:

# Load the raw training and test tables from the Kaggle input directory.
train_full = pd.read_csv("../input/its-fraud/train.csv")

test_full = pd.read_csv("../input/its-fraud/test.csv")

# Downcast the training frame's numeric columns to save memory.
# NOTE(review): test_full is not passed through reduce_mem_usage, so the two
# frames can end up with different dtypes for the same column - confirm that
# this is intended.
train_full = reduce_mem_usage(train_full)

## Feature Selection

In the baseline model, we learned that information-poor features such as TransactionDT were among the most important features. We will try Boruta, a feature-selection method that eliminates such features by keeping only the important, interesting features that have a high degree of influence on the target variable.

In [None]:
# Apply a natural-log transformation to the TransactionAmt column of both the
# training and the test frame.
for amount_frame in (train_full, test_full):
    amount_frame['TransactionAmt'] = np.log(amount_frame['TransactionAmt'])

In [None]:
# Build the column-name lists for each numbered feature family:
# V1..V339, C1..C14, card1..card6, D1..D15, addr1..addr2 and id_12..id_38.
v_features = [f"V{i}" for i in range(1, 340)]
C_cols = [f"C{i}" for i in range(1, 15)]
card_cols = [f"card{i}" for i in range(1, 7)]
D_cols = [f"D{i}" for i in range(1, 16)]
addr_cols = [f"addr{i}" for i in range(1, 3)]
id_cols = [f"id_{i}" for i in range(12, 39)]

In [None]:
# Replace a group of columns by a fixed number of principal components.
def PCA_transform(df, cols, prefix, n_features):
    """Swap the columns ``cols`` of ``df`` for their first PCA components.

    A new PCA (``random_state=101``) is fitted on ``df[cols]`` on every call,
    so the components produced for two different frames are not expressed in
    the same basis.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. The columns in ``cols`` are dropped from it IN PLACE.
    cols : list of str
        Columns to project; they must not contain NaN.
    prefix : str
        Prefix of the generated column names (``prefix0``, ``prefix1``, ...).
    n_features : int
        Number of principal components to keep.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the remaining columns of ``df`` followed by the
        component columns.
    """
    pca = PCA(n_components=n_features, random_state=101)
    components = pca.fit_transform(df[cols])
    # Reuse df's index: with a default RangeIndex here, concat(axis=1) would
    # align on labels and produce NaN-padded rows for any non-default index.
    pca_df = pd.DataFrame(
        components,
        index=df.index,
        columns=[str(prefix) + str(i) for i in range(components.shape[1])],
    )
    df.drop(cols, axis=1, inplace=True)
    return pd.concat([df, pca_df], axis=1)

In [None]:
# PCA does not accept missing values, so every listed column has its NaNs
# replaced by (column minimum - 1) and is then min-max scaled into [0, 1].
def fill_na_features (df,features):
    """Fill NaNs and rescale the given columns of ``df`` in place.

    For each column in ``features`` the missing values are replaced by the
    column's own minimum minus one, after which the column is scaled to the
    range [0, 1] with ``minmax_scale``. The scaling uses the minimum and
    maximum of the frame that is passed in. Returns the same ``df`` object.
    """
    for name in features:
        column = df[name]
        filled = column.fillna(column.min() - 1)
        df[name] = minmax_scale(filled, feature_range=(0, 1))
    return df

In [None]:
# These are plain name bindings, not copies: merged_train_df / merged_test_df
# refer to the very same DataFrame objects as train_full / test_full, so any
# in-place change made through one name is visible through the other.
merged_train_df = train_full
merged_test_df = test_full

In [None]:
# Fill the missing values of the V columns and scale them to [0, 1].
# NOTE(review): fill_na_features derives the fill value and the scaling range
# from the frame it receives, so train and test are scaled independently of
# each other - confirm that this is acceptable.
merged_train_df = fill_na_features(merged_train_df,v_features)
merged_test_df = fill_na_features(merged_test_df,v_features)

In [None]:
# Replace the V features by their first 20 principal components.
# The projection is fitted on the training rows only and then reused for the
# test rows, so PCA_V0..PCA_V19 are expressed in the same basis in both
# frames. (Fitting a separate PCA per frame yields components that are not
# comparable between train and test.)
v_pca = PCA(n_components=20, random_state=101).fit(merged_train_df[v_features])


def _replace_v_with_pca(frame):
    """Return ``frame`` with the V columns swapped for their PCA_V projections."""
    projected = pd.DataFrame(
        v_pca.transform(frame[v_features]),
        index=frame.index,
        columns=['PCA_V' + str(i) for i in range(v_pca.n_components_)],
    )
    # Drop in place so the wide V block is released from the source frame too.
    frame.drop(v_features, axis=1, inplace=True)
    return pd.concat([frame, projected], axis=1)


merged_train_df = _replace_v_with_pca(merged_train_df)
merged_test_df = _replace_v_with_pca(merged_test_df)

In [None]:
# Column names M1 .. M9.
M_cols = [f"M{i}" for i in range(1, 10)]

In [None]:
# Groups of column-name lists (a list of lists); convert_cat_label1 walks
# them group by group.
cat_cols1 = [
    card_cols,
    addr_cols,
    M_cols,
    id_cols,
]
# Individual column names handled by convert_cat_label2.
cat_cols2 = [
    'ProductCD',
    'P_emaildomain',
    'R_emaildomain',
    'DeviceType',
    'DeviceInfo',
]

In [None]:
# Label-encode the object-typed columns listed in the cat_cols1 groups.
def convert_cat_label1(df):
    """Replace string categories by integer codes for the ``cat_cols1`` columns.

    Every column named in one of the ``cat_cols1`` groups whose dtype is
    ``object`` is encoded in place with a fresh ``LabelEncoder``; columns of
    any other dtype are left unchanged. Because a new encoder is fitted on
    each call, the codes produced for two different frames are not guaranteed
    to match. Returns the same ``df`` object.
    """
    for group in cat_cols1:
        for col in group:
            # Only object (string-like) columns need encoding.
            if df[col].dtype == 'object':
                values = list(df[col].values)
                le = preprocessing.LabelEncoder()
                # Fit on the column once; fitting on the list concatenated
                # with itself gave the same classes at twice the memory.
                le.fit(values)
                df[col] = le.transform(values)
    return df

In [None]:
# Label-encode the columns listed in cat_cols2.
def convert_cat_label2(df):
    """Replace the categories of the ``cat_cols2`` columns by integer codes.

    Every ``cat_cols2`` column that is present in ``df`` is encoded in place
    with a fresh ``LabelEncoder`` regardless of its dtype; absent columns are
    skipped. Because a new encoder is fitted on each call, the codes produced
    for two different frames are not guaranteed to match. Returns the same
    ``df`` object.
    """
    for col in cat_cols2:
        if col in df.columns:
            values = list(df[col].values)
            le = preprocessing.LabelEncoder()
            # Fit on the column once; fitting on the list concatenated with
            # itself gave the same classes at twice the memory.
            le.fit(values)
            df[col] = le.transform(values)
    return df

In [None]:
# Convert the categorical columns into integer codes.
# Each encoder is fitted on the values of BOTH frames, so a given category is
# mapped to the same code in train and test. (Fitting one encoder per frame
# assigns codes by each frame's own sorted category set, which shifts the
# codes whenever the two sets differ.)
def _label_encode_together(frames, col):
    """Encode column ``col`` of every frame in ``frames`` with one shared encoder."""
    le = preprocessing.LabelEncoder()
    le.fit([value for frame in frames for value in frame[col].values])
    for frame in frames:
        frame[col] = le.transform(list(frame[col].values))


_frames_to_encode = (merged_train_df, merged_test_df)

# cat_cols1 groups: only object-typed columns are encoded.
for _group in cat_cols1:
    for _col in _group:
        _targets = [f for f in _frames_to_encode if f[_col].dtype == 'object']
        if _targets:
            _label_encode_together(_targets, _col)

# cat_cols2: encoded wherever the column exists, whatever its dtype.
for _col in cat_cols2:
    _targets = [f for f in _frames_to_encode if _col in f.columns]
    if _targets:
        _label_encode_together(_targets, _col)

In [None]:
# Assign the independent variables to X and the dependent variable isFraud
# to y; the identifier and timestamp columns are dropped from the features.
non_feature_cols = ['TransactionID', 'TransactionDT']

X_train = merged_train_df.drop(columns=non_feature_cols + ['isFraud'])
y_train = merged_train_df['isFraud']
X_test = merged_test_df.drop(columns=non_feature_cols)

In [None]:
from sklearn.impute import SimpleImputer

# Replace +/-inf by NaN and fill every NaN with the TRAINING column mean, for
# both X_train and X_test. The results stay DataFrames with their column
# names: turning only X_train into an unnamed array leaves the two feature
# matrices inconsistent (names vs. no names, imputed vs. still holding
# NaN/inf) for the models fitted and applied further down.
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_mean.fit(X_train.replace([np.inf, -np.inf], np.nan).to_numpy())

# Columns that are entirely NaN in the training data have a NaN statistic and
# are dropped by the imputer's transform; keep the column labels in step.
kept_columns = ~np.isnan(imp_mean.statistics_)


def _clean_and_impute(frame):
    """Return ``frame`` with inf/NaN replaced by the training column means."""
    values = frame.replace([np.inf, -np.inf], np.nan).to_numpy()
    return pd.DataFrame(
        imp_mean.transform(values),
        index=frame.index,
        columns=frame.columns[kept_columns],
    )


X_train = _clean_and_impute(X_train)
X_test = _clean_and_impute(X_test)

In [None]:
# Save the preprocessed training frame to the working directory.
# to_csv is called without index=False, so the row index is written as an
# extra first column.
df = pd.DataFrame(merged_train_df)
df.to_csv('preprocessed_train.csv')

# Prediction Model

## XGBoost with Hyper-parameter Tuning

### GridsearchCV code 

In [None]:
class XGBGridSearch:
    """
    Exhaustive grid search over XGBoost parameters, scored by mean ROC AUC
    across K folds.

    Source:
    https://www.kaggle.com/xhlulu/ieee-fraud-efficient-grid-search-with-xgboost

    The fold indices are computed once per ``fit`` call and reused for every
    parameter combination, so all combinations are compared on the same splits.

    Parameters
    ----------
    param_grid : dict
        Maps each parameter name to the list of values to try.
    cv : int, default 3
        Number of folds.
    verbose : int, default 0
        When falsy, the progress bar is disabled.
    shuffle : bool, default False
        Whether the rows are shuffled before being split into folds.
    random_state : int, default 2019
        Seed for the fold shuffling; only used when ``shuffle`` is true.

    Attributes set by ``fit``
    -------------------------
    param_list : list of dict
        Every parameter combination, in evaluation order.
    average_scores : list of float
        Mean validation AUC per combination.
    scores : list of list of float
        Per-fold validation AUCs per combination.
    best_score_, best_params_
        Highest mean AUC and the combination that achieved it.
    """
    def __init__(self, param_grid, cv=3, verbose=0, shuffle=False, random_state=2019):
        self.param_grid = param_grid
        self.cv = cv
        self.random_state = random_state
        self.verbose = verbose
        self.shuffle = shuffle
        self.average_scores = []
        self.scores = []

    def fit(self, X, y):
        """
        Evaluate every parameter combination and record the best one.

        ``X`` and ``y`` are indexed with ``.iloc``, so they must be pandas
        objects. Returns ``self``.
        """
        self._expand_params()
        self._split_data(X, y)

        # Start from empty results so that a second fit() does not append to
        # the previous run and desynchronise the scores from param_list.
        self.average_scores = []
        self.scores = []

        for params in tqdm(self.param_list, disable=not self.verbose):
            avg_score, score = self._run_cv(X, y, params)
            self.average_scores.append(avg_score)
            self.scores.append(score)

        self._compute_best()
        return self

    def _run_cv(self, X, y, params):
        """
        Perform KFold CV on a single set of parameters.

        Returns ``(mean AUC, list of per-fold AUCs)``.
        """
        scores = []

        for train_idx, val_idx in self.splits:
            clf = xgb.XGBClassifier(**params)
            X_train, X_val = X.iloc[train_idx, :], X.iloc[val_idx, :]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

            clf.fit(X_train, y_train)

            # AUC is computed on the predicted probability of class 1.
            y_val_pred = clf.predict_proba(X_val)[:, 1]

            score = roc_auc_score(y_val, y_val_pred)
            scores.append(score)

            # Release the fold's model and slices before the next fit.
            gc.collect()

        avg_score = sum(scores) / len(scores)
        return avg_score, scores

    def _split_data(self, X, y):
        """Compute the fold indices once and store them in ``self.splits``."""
        # KFold only accepts a random_state together with shuffle=True.
        kf = KFold(
            n_splits=self.cv,
            shuffle=self.shuffle,
            random_state=self.random_state if self.shuffle else None,
        )
        self.splits = list(kf.split(X, y))

    def _compute_best(self):
        """
        Compute best params and its corresponding score
        """
        idx_best = np.argmax(self.average_scores)
        self.best_score_ = self.average_scores[idx_best]
        self.best_params_ = self.param_list[idx_best]

    def _expand_params(self):
        """
        This method expands a dictionary of lists into
        a list of dictionaries (each dictionary is a single
        valid params that can be input to XGBoost)
        """
        keys, values = zip(*self.param_grid.items())
        self.param_list = [
            dict(zip(keys, v))
            for v in itertools.product(*values)
        ]

In [None]:
import os
import gc
import itertools

import numpy as np
import pandas as pd
from sklearn import preprocessing
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from pprint import pprint
from tqdm import tqdm

In [None]:
# XGBGridSearch indexes its inputs with .iloc, so X_train has to be a
# DataFrame here (a plain NumPy array gets default integer column labels).
X_train = pd.DataFrame(X_train)

In [None]:
# Search grid: five parameters have two candidate values each and the rest
# are fixed, which gives 2**5 = 32 combinations; with cv=4 that is 128 fits.
# NOTE(review): 'tree_method': 'gpu_hist' presumably requires a GPU-enabled
# XGBoost build and runtime - confirm before running on CPU.
# NOTE(review): 'missing': -999 marks -999 as the missing-value sentinel;
# no step in the visible preprocessing writes -999 - confirm it is intended.
param_grid = {
    'n_estimators': [500],
    'missing': [-999],
    'random_state': [2019],
    'n_jobs': [1],
    'tree_method': ['gpu_hist'],
    'max_depth': [9],
    'learning_rate': [0.048, 0.05],
    'subsample': [0.85, 0.9],
    'colsample_bytree': [0.85, 0.9],
    'reg_alpha': [0, 0.1],
    'reg_lambda': [1, 0.9]
}

# Run the 4-fold search with the progress bar enabled.
grid = XGBGridSearch(param_grid, cv=4, verbose=1)
grid.fit(X_train, y_train)

# Report the best mean validation AUC and the parameters that produced it.
print("Best Score:", grid.best_score_)
print("Best Params:", grid.best_params_)

In [None]:
# Refit XGBoost on the full training set with the best parameters found by
# the grid search.
clf = xgb.XGBClassifier(**grid.best_params_)
clf.fit(X_train, y_train)

sample_submission = pd.read_csv('../input/sample-submissions/sample_submission.csv')
# Submit the predicted probability of the second class (clf.classes_[1],
# i.e. isFraud == 1 for 0/1 labels): the parameters were selected by ROC AUC
# on probabilities, and hard 0/1 labels discard the ranking AUC measures.
sample_submission['isFraud'] = clf.predict_proba(X_test)[:, 1]
# index=False keeps the row index out of the file, like the other submissions.
sample_submission.to_csv('simple_xgboost.csv', index=False)

## XGBoost without Hyper-parameter Tuning

In [None]:
# Baseline XGBoost classifier with fixed (untuned) hyper-parameters, used to
# classify whether each transaction is fraud or not.
import xgboost as xgb

baseline_xgb_params = {
    'n_estimators': 500,
    'max_depth': 7,
    'learning_rate': 0.05,
    'subsample': 0.9,
    'colsample_bytree': 0.9,
    'random_state': 101,
}
clf = xgb.XGBClassifier(**baseline_xgb_params)

In [None]:
# Fit the model on the training features and labels, then score X_test.
clf.fit(X_train, y_train)
# Keep the predicted probability of the second class (clf.classes_[1], i.e.
# isFraud == 1 for 0/1 labels) rather than hard labels: the notebook scores
# models by ROC AUC, which needs a continuous score.
y_preds = clf.predict_proba(X_test)[:, 1]

In [None]:
# Put the predictions into the sample submission's isFraud column, write the
# file and show the first rows.
submission_template = "../input/sample-submissions/sample_submission.csv"
sample_submission = pd.read_csv(submission_template).assign(isFraud=y_preds)
sample_submission.to_csv('Final_submission_new.csv', index=False)
sample_submission.head(10)

### Imputing the column mean into NULL, NaN or infinite cells

In [None]:
from sklearn.impute import SimpleImputer

# Replace +/-inf in X_train by NaN and convert the result to a NumPy array.
# SimpleImputer is imported here but not used in this cell.
df = pd.DataFrame(X_train)
df.replace([np.inf, -np.inf], np.nan, inplace=True)
X_train = df.to_numpy()

In [None]:
from sklearn.impute import SimpleImputer

# Replace +/-inf in X_test by NaN and convert the result to a NumPy array.
# SimpleImputer is imported here but not used in this cell.
df = pd.DataFrame(X_test)
df.replace([np.inf, -np.inf], np.nan, inplace=True)
X_test = df.to_numpy()

In [None]:
# Fill the NaNs of X_test with the TRAINING column means. Preprocessing
# statistics should come from the training data only; fitting the imputer on
# the test set fills it with its own means and silently drops any column that
# is entirely NaN in the test set, changing its shape relative to X_train.
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_mean.fit(X_train)
X_test = imp_mean.transform(X_test)

In [None]:
# Fit a mean imputer on X_train and fill its NaNs with the column means.
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_mean.fit(X_train)
X_train = imp_mean.transform(X_train)

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# random_state pins the bootstrap samples and feature sub-sampling so that the
# forest, and therefore the submission, is reproducible between runs.
clf = RandomForestClassifier(n_estimators=100, max_depth=6, n_jobs=1, verbose=2, random_state=101)

clf.fit(X_train, y_train)

# Keep the predicted probability of the second class (clf.classes_[1], i.e.
# isFraud == 1 for 0/1 labels) rather than hard labels: the notebook scores
# models by ROC AUC, which needs a continuous score.
y_preds = clf.predict_proba(X_test)[:, 1]

In [None]:
# Put the predictions into the sample submission's isFraud column, write the
# file and show the first rows.
submission_template = "../input/sample-submissions/sample_submission.csv"
sample_submission = pd.read_csv(submission_template).assign(isFraud=y_preds)
sample_submission.to_csv('Random_forest.csv', index=False)
sample_submission.head(10)

In [None]:
# Disabled evaluation snippet: it is wrapped in a bare string literal, so it
# is never executed.
# NOTE(review): `y_test` is not defined anywhere in the visible notebook, so
# the snippet would fail if it were re-enabled as-is.
"""# accuracy on y_test 
auc = roc_auc_score(y_test, y_preds)
print('AUC: %.3f' % auc)"""

## Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with equal class priors instead of the empirical ones.
nb = GaussianNB(priors=[0.5,0.5])
nb.fit(X_train, y_train)
# Store what the variable name promises: the predicted probability of the
# second class (nb.classes_[1], i.e. isFraud == 1 for 0/1 labels), not the
# hard labels returned by predict().
nb_test_proba = nb.predict_proba(X_test)[:, 1]


In [None]:
# Put the predictions into the sample submission's isFraud column, write the
# file and show the first rows.
submission_template = "../input/sample-submissions/sample_submission.csv"
sample_submission = pd.read_csv(submission_template).assign(isFraud=nb_test_proba)
sample_submission.to_csv('Naive_Bayes.csv', index=False)
sample_submission.head(10)

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# class_weight='balanced' reweights the classes inversely to their frequency.
lr = LogisticRegression(n_jobs = -1, class_weight = 'balanced', random_state = 3)
lr.fit(X_train, y_train)

# Store what the variable name promises: the predicted probability of the
# second class (lr.classes_[1], i.e. isFraud == 1 for 0/1 labels), not the
# hard labels returned by predict().
lr_test_proba = lr.predict_proba(X_test)[:, 1]

In [None]:
# Put the predictions into the sample submission's isFraud column, write the
# file and show the first rows.
submission_template = "../input/sample-submissions/sample_submission.csv"
sample_submission = pd.read_csv(submission_template).assign(isFraud=lr_test_proba)
sample_submission.to_csv('Logistic_Regression.csv', index=False)
sample_submission.head(10)

## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with a fixed seed and class weights inversely proportional to
# the class frequencies.
dt =  DecisionTreeClassifier(random_state=3, class_weight='balanced')
dt.fit(X_train, y_train)
# (A stray pasted repr line that built and discarded a second, unfitted
# classifier was removed here.)
# Store what the variable name promises: the predicted probability of the
# second class (dt.classes_[1], i.e. isFraud == 1 for 0/1 labels), not the
# hard labels returned by predict().
dt_test_proba = dt.predict_proba(X_test)[:, 1]

In [None]:
# Put the decision tree predictions into the sample submission's isFraud
# column, write the file and show the first rows.
sample_submission = pd.read_csv("../input/sample-submissions/sample_submission.csv")
# Use the decision tree's own output; this cell previously wrote the logistic
# regression predictions (lr_test_proba) into Decision_Tree.csv.
sample_submission['isFraud']=dt_test_proba
sample_submission.to_csv('Decision_Tree.csv', index=False)
sample_submission.head(10)