<a href="https://colab.research.google.com/github/cigamod/fraud_detection/blob/master/fraud_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import pandas as pd
import multiprocessing
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import gc
from time import time
import datetime
from tqdm import tqdm_notebook
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, TimeSeriesSplit
from sklearn.metrics import roc_auc_score
# Suppress every warning for the rest of the session (this also hides
# deprecation and convergence warnings raised by later cells).
warnings.simplefilter('ignore')
# Apply seaborn's default plot styling.
sns.set()
# Render matplotlib figures inline in the notebook output.
%matplotlib inline


In [0]:
# Mount Google Drive at /content/drive (Colab-only API); the dataset CSVs are
# read from this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
# Read the identity and transaction tables for the train and test splits
# from the dataset folder on the mounted Drive.
dataset_dir = "/content/drive/My Drive/fraud_detection_dataset/"
train_identity = pd.read_csv(dataset_dir + "train_identity.csv")
train_transaction = pd.read_csv(dataset_dir + "train_transaction.csv")
test_identity = pd.read_csv(dataset_dir + "test_identity.csv")
test_transaction = pd.read_csv(dataset_dir + "test_transaction.csv")

In [0]:
# Reduce memory usage by downcasting:
# convert each numeric column to a smaller dtype so the data fits in RAM
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the smallest dtype that holds their range.

    Columns whose dtype is int16/int32/int64 or float16/float32/float64 are
    inspected; every other column is left untouched. A column is converted to
    the first candidate dtype (int8 -> int16 -> int32, or float16 -> float32)
    whose representable range contains both the column minimum and maximum.
    Range limits are inclusive, so a column whose maximum is exactly 127 still
    fits int8. A column is never converted to a wider dtype than it already has.

    NOTE(review): the float check is range-based only. float16/float32 carry
    fewer significant digits than float64, so downcast values may be rounded.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the final memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    downcastable = {'int16', 'int32', 'int64', 'float16', 'float32', 'float64'}
    int_candidates = (np.int8, np.int16, np.int32)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        type_name = str(df[col].dtype)
        if type_name not in downcastable:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if type_name.startswith('int'):
            candidates, limits_of = int_candidates, np.iinfo
        else:
            candidates, limits_of = float_candidates, np.finfo
        for candidate in candidates:
            limits = limits_of(candidate)
            # NaN min/max (empty or all-NaN column) and +/-inf fail every
            # comparison, so such columns keep their current dtype.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-byte starting footprint.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [0]:
# Left-join the identity table onto the transaction table by TransactionID,
# keeping every transaction row even when it has no identity record.
train = train_transaction.merge(train_identity, how="left", on="TransactionID")
test = test_transaction.merge(test_identity, how="left", on="TransactionID")

In [0]:
# Downcast the numeric columns of both merged frames (train first, then test).
train, test = reduce_mem_usage(train), reduce_mem_usage(test)

In [0]:
# Release the four source tables now that they are no longer referenced.
del train_identity, test_identity, train_transaction, test_transaction

In [0]:
# Preview the first five rows (head() defaults to n=5).
train.head()

In [0]:
# Names of the columns treated as categorical, assembled group by group.
identity_cols = ['id_{}'.format(i) for i in range(12, 39)]
device_card_addr_cols = [
    'DeviceType', 'DeviceInfo', 'ProductCD', 'card4', 'card6', 'M4', 'P_emaildomain',
    'R_emaildomain', 'card1', 'card2', 'card3', 'card5', 'addr1', 'addr2',
]
match_cols = ['M{}'.format(i) for i in (1, 2, 3, 5, 6, 7, 8, 9)]
email_part_cols = [
    '{}_emaildomain_{}'.format(prefix, part)
    for prefix in ('P', 'R')
    for part in (1, 2, 3)
]
category_column = identity_cols + device_card_addr_cols + match_cols + email_part_cols
print("no of categorical column:", len(category_column))

In [0]:
# Per-column count of missing values for both frames.
# Only the last expression of a cell is displayed automatically, so the train
# counts must be printed explicitly or they are silently discarded.
print("Train data")
print(train.isna().sum())
print("Test data")
print(test.isna().sum())

In [0]:
#EDA
def _mostly_missing(frame):
    """Columns of ``frame`` in which at least 90% of the rows are NaN."""
    return [
        name for name in frame.columns
        if frame[name].isna().sum() / frame.shape[0] >= 0.90
    ]


def _dominated_by_one_value(frame):
    """Columns of ``frame`` whose most frequent value (NaN included) covers at least 90% of rows."""
    return [
        name for name in frame.columns
        if frame[name].value_counts(dropna=False, normalize=True).values[0] >= 0.90
    ]


many_na_train = _mostly_missing(train)
many_na_test = _mostly_missing(test)
more_than_90_NA_or_same_value_train = _dominated_by_one_value(train)
more_than_90_NA_or_same_value_test = _dominated_by_one_value(test)

In [0]:
# Store the columns to be dropped separately for train and test
# (deduplicated union of the "mostly NaN" and "mostly one value" lists).
cols_drop_at_train=list(set(more_than_90_NA_or_same_value_train+many_na_train))
cols_drop_at_test=list(set(more_than_90_NA_or_same_value_test+many_na_test))
print("Columns to be dropped in train",len(cols_drop_at_train))
print("Columns to be dropped in test",len(cols_drop_at_test))
print("columns are @ train:",cols_drop_at_train)
# Fixed: this line previously printed the train list under the test label.
print("columns are @ test:", cols_drop_at_test)

In [0]:
# Union of the train and test drop lists, without duplicates.
total_drop_cols = list({*cols_drop_at_train, *cols_drop_at_test})
print("Total number of columns to be deleted to increase your model performance", len(total_drop_cols))
print("They are:", total_drop_cols)

In [0]:
# Remove the target column isFraud from the drop list.
# Filtering (instead of list.remove) keeps this cell safe to re-run: it does not
# raise ValueError when 'isFraud' has already been removed or was never flagged.
total_drop_cols = [col for col in total_drop_cols if col != 'isFraud']
print("You can check that column is removed:",total_drop_cols)

In [0]:
# Report every drop candidate that is absent from the train or the test frame.
frames_to_check = (
    ("Missing drop column in train", train),
    ("Missing drop columns in test", test),
)
for candidate in total_drop_cols:
    for message, frame in frames_to_check:
        if candidate not in frame.columns:
            print(message, candidate)

In [0]:
# Compare the size of the drop list with how many of its entries exist in train.
print("len", len(total_drop_cols))
n = sum(1 for name in train.columns if name in total_drop_cols)
print(n)

In [0]:
# Drop the flagged columns, then report how many columns remain.
# The drop list was computed above but never applied, so the count printed here
# used to be the unchanged column count.
# The target and the ID/time columns are protected because later cells read them.
# errors='ignore' covers columns that exist in only one frame and makes a re-run a no-op.
protected_cols = {'isFraud', 'TransactionID', 'TransactionDT'}
cols_to_drop = [col for col in total_drop_cols if col not in protected_cols]
train = train.drop(columns=cols_to_drop, errors='ignore')
test = test.drop(columns=cols_to_drop, errors='ignore')
print("Total number of columns we have now",len(train.columns))

In [0]:
# Overlay the TransactionDT distributions of train and test to compare the
# ranges the two sets cover.
# Each series gets a label so the legend tells them apart.
sns.distplot(train['TransactionDT'], hist=True, kde=True, bins=40, label='train')
sns.distplot(test['TransactionDT'], hist=True, kde=True, bins=40, label='test')
plt.title('Density Plot of TransactionDT in training and test data')
plt.xlabel('TransactionDT')
# With kde=True the histogram is normalised, so the y axis is a density, not a count.
plt.ylabel('Density')
plt.legend();

In [0]:
# Label-encode every object-dtype column of train, applying the same mapping to test.
from sklearn  import preprocessing
object_cols = [name for name in train.columns if train[name].dtype == 'object']
for name in object_cols:
    train_values = train[name].tolist()
    test_values = test[name].tolist()
    # Fit on train and test values together so both frames share one mapping
    # and no test value is unseen at transform time.
    encoder = preprocessing.LabelEncoder()
    encoder.fit(train_values + test_values)
    train[name] = encoder.transform(train_values)
    test[name] = encoder.transform(test_values)


In [0]:
def clean_inf_nan(df):
    """Return a new frame in which +inf and -inf are replaced by NaN."""
    infinities = [np.inf, -np.inf]
    return df.replace(to_replace=infinities, value=np.nan)

# Cleaning infinite values to NaN so they are imputed together with the
# missing values below.
train = clean_inf_nan(train)
test = clean_inf_nan(test)

# Fill the remaining NaNs with each column's own median.
# The result is assigned back to the column: `df[col].fillna(..., inplace=True)`
# is chained assignment on a temporary Series, which does not update the frame
# under pandas Copy-on-Write (and the warning about it is suppressed here).
# NOTE(review): test is imputed with test medians, as before; consider reusing
# the train medians so both sets are transformed identically.
for col in train.columns:
    train[col] = train[col].fillna(train[col].median())

for col in test.columns:
    test[col] = test[col].fillna(test[col].median())


In [0]:
# Now we can split the data and train our model.
# The target and the ID/time columns are excluded from the feature matrix.
X = train.drop(columns=['isFraud', 'TransactionDT', 'TransactionID'])
y = train['isFraud']
# Select the test features by X's column names so both matrices have the same
# columns in the same order; a feature missing from test raises a KeyError here
# instead of producing misaligned predictions later.
X_test = test[X.columns]
# Keep only the IDs for the submission. .copy() makes this an independent frame
# so that adding columns to it later is not an assignment on a slice.
test = test[['TransactionID']].copy()

In [0]:
# Submission skeleton: one row per test TransactionID with a placeholder score of 0.
submission = pd.DataFrame({'TransactionID': test['TransactionID'], 'isFraud': 0})

In [0]:
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegressionCV

In [0]:
# Logistic regression with built-in 5-fold cross-validation and a fixed seed.
logreg_cv = LogisticRegressionCV(cv=5, random_state=0)
# fit() returns the fitted estimator itself.
clf = logreg_cv.fit(X, y)

In [0]:
# Predicted class probabilities for the training rows (one column per class).
train_res=clf.predict_proba(X)

In [0]:
# Predicted class probabilities for the test rows (one column per class).
test_res=clf.predict_proba(X_test)

In [0]:
# Wrap the probabilities in a DataFrame; its columns get the default integer labels.
train_res=pd.DataFrame(train_res)

In [0]:
# Append probability column 1 (all rows) to the training features, aligned on the index.
train_val = pd.concat([X, train_res.iloc[:, 1]], axis="columns")

In [0]:
# Append the target column next to the features and the predicted probability.
train_val = pd.concat([train_val, y], axis="columns")

In [0]:
# Wrap the probabilities in a DataFrame; its columns get the default integer labels.
test_res=pd.DataFrame(test_res)

In [0]:
# Append probability column 1 (all rows) to the test features, aligned on the index.
test_val = pd.concat([X_test, test_res.iloc[:, 1]], axis="columns")

In [0]:
# Build the submission: TransactionID plus the predicted probability stored in
# column 1 of test_val.
# An independent copy is used so that adding the score column does not also
# modify `test`; the previous empty-DataFrame assignment was dead code.
submission = test[['TransactionID']].copy()
submission['isFraud'] = test_val[1]
submission.head(5)

In [0]:
# Write only the TransactionID and isFraud columns; index=False keeps the
# DataFrame index from being written as an extra unnamed first column.
submission.to_csv("submission.csv", index=False)