## Problem Brief: Given online transaction data, determine whether each transaction is fraudulent or legitimate.

In [None]:
#import necessary libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import gc
import random

In [None]:
# function definition that is used in this kernel

def changeDType(df,flag=False):
    """Downcast the numeric columns of ``df`` in place to reduce memory usage.

    Columns stored as int64/int32 are downcast to the smallest integer dtype
    that holds their values; columns stored as float64/float32 are downcast to
    the smallest float dtype ``pd.to_numeric`` offers. Every other column is
    left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    flag : bool, optional
        Unused. Kept only so existing calls such as ``changeDType(df, False)``
        keep working.

    Returns
    -------
    None
    """
    for col in df.columns:
        dtype = df[col].dtype
        if dtype == 'int64' or dtype == 'int32':
            df[col] = pd.to_numeric(df[col],downcast='integer')
        elif dtype == 'float64' or dtype == 'float32':
            # an integer column never becomes a float one after downcasting,
            # so the two cases are mutually exclusive
            df[col] = pd.to_numeric(df[col],downcast='float')
            

def showMaxRC():
    """Remove pandas' display limits so all rows and columns are rendered.

    Uses the full option names rather than relying on pandas' partial
    (pattern-based) option matching.
    """
    pd.set_option('display.max_columns',None)
    pd.set_option('display.max_rows',None)

def discreteGraph(dis_cols, data=None, ncols=2):
    """Draw one count plot per column in ``dis_cols`` on a shared figure.

    Parameters
    ----------
    dis_cols : iterable of str
        Names of the discrete columns to plot.
    data : pandas.DataFrame, optional
        Frame holding the columns. When omitted, the notebook-level ``train1``
        frame is used, which is what the function always read before this
        parameter existed.
    ncols : int, optional
        Number of plots per row (default 2, the original layout).

    Returns
    -------
    None
    """
    if data is None:
        # backward-compatible fallback to the global the original body used
        data = train1

    dis_cols = list(dis_cols)
    # keep the original 3-row layout, but add rows when more plots are
    # requested than a 3 x ncols grid can hold
    nrows = max(3, -(-len(dis_cols) // ncols))

    plt.figure(figsize = (20, 15))
    for index, col in enumerate(dis_cols, start=1):
        plt.subplot(nrows, ncols, index)
        sns.countplot(x=col, data=data)
    plt.tight_layout()
    plt.show()
    
def getCategoricalColumns(df):
    """Return the names of the object-dtype columns of ``df``.

    Names are returned in the same order as ``df.columns``. The previous
    ``insert(-1, ...)`` implementation placed every new name before the last
    element, which moved the first match to the end of the list.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    list of str
    """
    return [col for col in df.columns if df[col].dtype == object]

        
def getNumericalColumns(df):
    """Return the names of all columns of ``df`` that are not object dtype.

    Names are returned in the same order as ``df.columns``. The previous
    ``insert(-1, ...)`` implementation placed every new name before the last
    element, which moved the first match to the end of the list.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    list of str
    """
    return [col for col in df.columns if df[col].dtype != object]

In [None]:
# loading data
# NOTE(review): this reads test.csv, yet later cells index df['isFraud'] and the
# processed frame is saved as preprocessesTrain.csv - confirm whether the
# training file was the intended input.
df = pd.read_csv('../input/its-a-fraud/test.csv')

In [None]:
# reducing data size by changing data types  

gc.collect()
changeDType(df,False)
showMaxRC()

## Data exploration

In [None]:
# shape of the data
print(f'total {df.shape[0]} data points\ntotal {df.shape[1]} features')

In [None]:
# Bird's eye view of dataset 
df.sample(5)

**target column** <br/>1 - transaction is fraud<br/>0 - transaction is legitimate


In [None]:
df['isFraud'].value_counts()

In [None]:
sns.countplot(data=df,x='isFraud')
plt.title("Feature : isFraud")
plt.show()

In [None]:
totalFraud = (df['isFraud']==1).sum()
totalDPoints = len(df['isFraud'])

print(f'\nFraud % : {(totalFraud/totalDPoints)*100}\n\nNon Fraud % : {100 - (totalFraud/totalDPoints)*100}')

# almost 97% datapoints are Not a fraud,
# while only 3% datapoints is fraud

In [None]:
print("Almort 97% data points are legitimate transaction\nWhile only 3% datapoints are fraud\n\nData is higly imbalance")

In [None]:
# identify names of categorical (object dtype) and numerical columns,
# keeping both lists in the same order as df.columns
# (list.insert(-1, x) would put each new name *before* the last element)

categorical_cols = [col for col in df.columns if df[col].dtype == object]

# every column that is not categorical is treated as numerical
numerical_cols = [col for col in df.columns if col not in categorical_cols]

print(f'\nTotal categorical features : {len(categorical_cols)}')
print(f'\nTotal numerical features : {len(numerical_cols)}')

In [None]:
gc.collect()

**Plotting graphs for categorical data**

In [None]:
# bar chart of the value counts of every low-cardinality categorical feature

MAX_CATEGORIES = 8   # features with this many distinct values or more are skipped

# compute the value counts once per feature and keep only the plottable ones
low_card_counts = {}
for col in categorical_cols:
    counts = df[col].value_counts()
    if len(counts) < MAX_CATEGORIES:
        low_card_counts[col] = counts

if low_card_counts:
    # one axis per plotted feature (8 inches tall each, as with 25 axes on a
    # 200-inch figure); squeeze=False keeps `axes` 2-D even for a single plot
    fig,axes = plt.subplots(len(low_card_counts),figsize=(10,8*len(low_card_counts)),squeeze=False)
    for ax, counts in zip(axes[:, 0], low_card_counts.values()):
        sns.barplot(x=counts.index,y=counts,ax=ax)



**All the features have too many Missing values :(**

In [None]:
# null values in categorical data
df[categorical_cols].isnull().sum()

In [None]:
for i in categorical_cols:
    print('\n',i,'=====\n\nnull : ',df[i].isnull().sum())
    print(df[i].value_counts())

**The features named M1–M9 all take values such as (T / F / null) or (M0, M1, M2).**
            
**All of them also have more than 49% null values, so they are dropped, because I do not even know what these features represent.**
            

**Almost all features have too many null values, and some features are not relevant to our task at all, for example:**
- id_33, which is the screen resolution
- the operating system the device uses
- the device type with which the payment was made, etc.
            

In [None]:
gc.collect()

## Data distribution analysis and relationship inference

In [None]:
# distribution of the transaction amount, followed by its summary statistics
plt.figure(figsize=(10,5))
plt.hist(data=df,x='TransactionAmt',bins=80)
plt.title("Transaction amount")
plt.xlabel("Amount")
plt.ylabel("Count")
plt.show()

amount_summary = df['TransactionAmt'].describe()
print(amount_summary,'\n\n')

In [None]:
# one 20-bin histogram per card feature, with the observed value range in the title
cards = ['card1','card3','card5','card2']

for feature in cards:
    plt.figure(figsize=(5,5))
    plt.hist(data=df,x=feature,bins=20)
    plt.title(f"feature : {feature} -- range[{df[feature].min()},{df[feature].max()}]")
    plt.xlabel("values")
    plt.ylabel("count")

- For some features, the data distribution is heavily skewed.

In [None]:

# one 20-bin histogram per address / distance feature, with the observed value range in the title
cards = ['addr1','addr2','dist1','dist2']

for feature in cards:
    plt.figure(figsize=(5,5))
    plt.hist(data=df,x=feature,bins=20)
    plt.title(f"feature : {feature} -- range[{df[feature].min()},{df[feature].max()}]")
    plt.xlabel("values")
    plt.ylabel("count")

plt.show()

In [None]:
# histogram of each of the features C1 .. C14, with the observed value range in the title
for feature in ["C"+str(k) for k in range(1,15)]:
    plt.figure(figsize=(5,5))
    plt.hist(data=df,x=feature)
    plt.title(f"feature : {feature} -- range[{df[feature].min()} , {df[feature].max()}]")
    plt.xlabel("values")
    plt.ylabel("count")
plt.show()

In [None]:
# histogram of each of the features D1 .. D14, with the observed value range in the title
for feature in ["D"+str(k) for k in range(1,15)]:
    plt.figure(figsize=(5,5))
    plt.hist(data=df,x=feature)
    plt.title(f"feature : {feature} -- range[{df[feature].min()} , {df[feature].max()}]")
    plt.xlabel("values")
    plt.ylabel("count")
plt.show()

## Visualizing the relationship between certain features and the target column

In [None]:
plt.figure(figsize=(5,5))
sns.countplot(data=df,x='ProductCD',hue='isFraud')
plt.show()

- **Product with code W is the most susceptible to fraud transaction**

In [None]:
# share of all fraud cases that falls into each ProductCD value (missing values first)
fraud_mask = df['isFraud']==1
total_fraud = fraud_mask.sum()   # loop-invariant denominator, computed once

print('Missing product code - fraud constitute out of total fraud % : ',100*(fraud_mask & df['ProductCD'].isnull()).sum()/total_fraud)
print('**************************************************')
for col in df['ProductCD'].value_counts().index:
    # combine boolean masks instead of materialising a filtered copy of df per category
    print(col,'- fraud constitute out of total fraud % : ',100*(fraud_mask & (df['ProductCD']==col)).sum()/total_fraud)
    print('**************************************************')



In [None]:
plt.figure(figsize=(5,5))
sns.countplot(data=df,x='card4',hue='isFraud')
plt.show()

- **Transaction made using visa card constitute majority fraud**

In [None]:
# share of all fraud cases that falls into each card4 (card company) value (missing values first)
fraud_mask = df['isFraud']==1
total_fraud = fraud_mask.sum()   # loop-invariant denominator, computed once

print('Missing card company - fraud constitute out of total fraud % : ',100*(fraud_mask & df['card4'].isnull()).sum()/total_fraud)
print('**************************************************')
for col in df['card4'].value_counts().index:
    # combine boolean masks instead of materialising a filtered copy of df per category
    print(col,'- fraud constitute out of total fraud % : ',100*(fraud_mask & (df['card4']==col)).sum()/total_fraud)
    print('**************************************************')


In [None]:
plt.figure(figsize=(5,5))
sns.countplot(data=df,x='card6',hue='isFraud')
plt.show()

In [None]:
# share of all fraud cases that falls into each card6 (card type) value (missing values first)
fraud_mask = df['isFraud']==1
total_fraud = fraud_mask.sum()   # loop-invariant denominator, computed once

print('Missing card type - fraud constitute out of total fraud % : ',100*(fraud_mask & df['card6'].isnull()).sum()/total_fraud)
print('**************************************************')
for col in df['card6'].value_counts().index:
    # combine boolean masks instead of materialising a filtered copy of df per category
    print(col,'- fraud constitute out of total fraud % : ',100*(fraud_mask & (df['card6']==col)).sum()/total_fraud)
    print('**************************************************')


In [None]:
plt.figure(figsize=(5,5))
sns.countplot(data=df,x='DeviceType',hue='isFraud')
plt.show()

In [None]:
# share of all fraud cases that falls into each DeviceType value (missing values first)
fraud_mask = df['isFraud']==1
total_fraud = fraud_mask.sum()   # loop-invariant denominator, computed once

print('Missing device type - fraud constitute out of total fraud % : ',100*(fraud_mask & df['DeviceType'].isnull()).sum()/total_fraud)
print('**************************************************')
for col in df['DeviceType'].value_counts().index:
    # combine boolean masks instead of materialising a filtered copy of df per category
    print(col,'- fraud constitute out of total fraud % : ',100*(fraud_mask & (df['DeviceType']==col)).sum()/total_fraud)
    print('**************************************************')




In [None]:

# fraud vs. legitimate counts for each frequently used purchaser email domain

MIN_DOMAIN_COUNT = 3044   # only domains occurring more often than this are plotted
GRID_COLS = 3

domain_counts = df['P_emaildomain'].value_counts()   # computed once instead of twice
cols = domain_counts[domain_counts > MIN_DOMAIN_COUNT].index

# keep the original 3 x 3 layout, but add rows when more than nine domains qualify
grid_rows = max(3, -(-len(cols) // GRID_COLS))
fig = plt.figure(figsize=(20,15))

for index, col in enumerate(cols, start=1):
    plt.subplot(grid_rows,GRID_COLS,index)
    sns.countplot(data=df[df['P_emaildomain']==col],x='P_emaildomain',hue='isFraud')

plt.tight_layout()
plt.show()



- **gmail.com seems to be the email domain most used by fraudsters, possibly to mask their activity**

In [None]:
# share of all fraud cases that falls into each purchaser email domain (missing values first)
fraud_mask = df['isFraud']==1
total_fraud = fraud_mask.sum()   # loop-invariant denominator, computed once

print('Missing purchaser email domain - fraud constitute out of total fraud % : ',100*(fraud_mask & df['P_emaildomain'].isnull()).sum()/total_fraud)
print('**************************************************')
for col in df['P_emaildomain'].value_counts().index:
    # combine boolean masks instead of materialising a filtered copy of df per domain
    print(col,'- fraud constitute out of total fraud % : ',100*(fraud_mask & (df['P_emaildomain']==col)).sum()/total_fraud)
    print('**************************************************')




## Data preprocessing, cleaning and feature engineering

In [None]:
# pairwise correlation of the numeric features only: recent pandas versions no
# longer skip object (string) columns silently and raise on them instead
fullCorr = df.select_dtypes(include=['number','bool']).corr()

In [None]:

# one-row frame with the correlation of every feature against the target;
# selected by label so it does not depend on 'isFraud' being the second column
corr = fullCorr.loc[['isFraud']]

In [None]:
# del dfCopy
gc.collect()

In [None]:
# features whose absolute correlation with the target (the single row of
# `corr`) is below the 0.01 threshold

least_corr = [name for name in corr.columns if abs(corr[name].values[0]) < 0.01]
        

In [None]:
# remove the target's row and column so only feature-to-feature correlations remain;
# errors='ignore' keeps the cell safe to re-run after the target is already gone
fullCorr = fullCorr.drop(index='isFraud',columns='isFraud',errors='ignore')

In [None]:
# keep only the strict upper triangle of the matrix (k=1 excludes the diagonal),
# so every feature pair appears once; all other cells become NaN
upper = fullCorr.where(np.triu(np.ones(fullCorr.shape), k=1).astype(bool))

In [None]:
# if two columns are highly correlated then drop one of them;
# the absolute value is used so strongly negative correlations count as well
to_drop = [column for column in upper.columns if (upper[column].abs() > 0.95).any()]

In [None]:
# columns of df that survive the removal of the highly correlated ones

res_list = [name for name in df.columns if name not in to_drop]

In [None]:
# from the surviving columns, also leave out those with the least correlation

res_list2 = [name for name in res_list if name not in least_corr]

In [None]:
df2 = df[res_list2].copy(deep=True)

In [None]:
df2.shape

**So far: dropped highly correlated (redundant) features and the features least correlated with the target; features with a large number of null values are dropped next**

In [None]:
df.shape

In [None]:
# features of the filtered frame in which more than 30% of the rows are null

null_limit = (30*len(df2))/100
nulls_per_column = df2.isnull().sum()
count_of_null = [name for name in df2.columns if nulls_per_column[name] > null_limit]

**Features with more than 30% NaN values will be dropped**

In [None]:
# dropping off columns with too many nulls (only those still present in df2)
sparse_cols = [name for name in df2.columns if name in count_of_null]
df2.drop(columns=sparse_cols,inplace=True)
        

In [None]:
df2.shape

In [None]:
df.sample(4)

In [None]:
# object-dtype columns of the filtered frame, in the same order as df2.columns
# (list.insert(-1, x) would put each new name *before* the last element)
categorical_cols = [col for col in df2.columns if df2[col].dtype == object]

In [None]:
df2[categorical_cols].isnull().sum()

In [None]:
# purchaser email domains that occur fewer than 5000 times
val_counts = df['P_emaildomain'].value_counts()

least_email_domain = [
    domain
    for domain in df2['P_emaildomain'].value_counts().index
    if val_counts[domain] < 5000
]



To reduce the number of categories in the email domains, domains that occur fewer than 5000 times are merged into a single category, **"other"**

In [None]:
# for least_email_domain define new category "other"
# a boolean mask replaces the per-row loop, which was slow and mixed positional
# access (.iloc[i]) with label-based assignment (.at[i])
rare_domain_rows = df2['P_emaildomain'].isin(least_email_domain)
df2.loc[rare_domain_rows,'P_emaildomain'] = 'other'
    



In [None]:
gc.collect()

In [None]:
plt.figure(figsize=(15,10))
plt.title("Purchaser's email")
sns.countplot(data=df2,x='P_emaildomain',hue='isFraud')
plt.show()

In [None]:
# share of all fraud cases that falls into each (merged) purchaser email domain of df2
fraud_mask = df2['isFraud']==1
total_fraud = fraud_mask.sum()   # loop-invariant denominator, computed once

print('Missing purchaser email domain - fraud constitute out of total fraud % : ',100*(fraud_mask & df2['P_emaildomain'].isnull()).sum()/total_fraud)
print('**************************************************')
for col in df2['P_emaildomain'].value_counts().index:
    # combine boolean masks instead of materialising a filtered copy of df2 per domain
    print(col,'- fraud constitute out of total fraud % : ',100*(fraud_mask & (df2['P_emaildomain']==col)).sum()/total_fraud)
    print('**************************************************')



### Filling missing categorical data by randomly selecting one of the available unique values

In [None]:
print(df2['M6'].value_counts(),"\n\n")
print(df2['M6'].isnull().sum())

In [None]:
# fill missing M6 values by drawing 'F' or 'T' at random with equal probability
M6_FILL_SEED = 42   # fixed seed so a fresh run reproduces the same fill

m6_missing = df2['M6'].isna()
if m6_missing.any():
    # a dedicated generator leaves the global `random` state untouched;
    # the mask avoids mixing positional (.iloc[i]) and label (.at[i]) access
    m6_rng = random.Random(M6_FILL_SEED)
    df2.loc[m6_missing,'M6'] = m6_rng.choices(['F','T'],k=int(m6_missing.sum()))
        

- **Filled the missing values of feature M6 with randomly chosen 'T' and 'F' labels**

In [None]:
print(df2['M6'].value_counts(),"\n\n")
print(df2['M6'].isnull().sum())

In [None]:
df2[categorical_cols].isnull().sum()

In [None]:
pemail_domains = df2['P_emaildomain'].value_counts().index

In [None]:
print(df2['P_emaildomain'].value_counts())
print(df2['P_emaildomain'].isnull().sum())

In [None]:
# fill missing purchaser email domains by sampling uniformly from the most frequent domains
EMAIL_FILL_SEED = 42     # fixed seed so a fresh run reproduces the same fill
TOP_DOMAIN_COUNT = 7     # random.randint(0,6) picked among the seven most frequent domains

email_missing = df2['P_emaildomain'].isna()
# slicing tolerates fewer than seven known domains, where indexing 0..6 would raise
fill_pool = list(pemail_domains[:TOP_DOMAIN_COUNT])

if email_missing.any() and fill_pool:
    # a dedicated generator leaves the global `random` state untouched;
    # the mask avoids mixing positional (.iloc[i]) and label (.at[i]) access
    email_rng = random.Random(EMAIL_FILL_SEED)
    df2.loc[email_missing,'P_emaildomain'] = email_rng.choices(fill_pool,k=int(email_missing.sum()))
        

In [None]:
print(df2['P_emaildomain'].value_counts())
print(df2['P_emaildomain'].isnull().sum())

- **Dropping the rows where card4 or card6 is null, because only a few values are missing**

In [None]:
# remove every row in which card4 or card6 is missing; the index is not reset afterwards
df2.dropna(subset=['card4','card6'],inplace=True)

In [None]:
df2[categorical_cols].isnull().sum()

In [None]:
df2.drop(axis=1,columns=['TransactionID'],inplace=True)

In [None]:
df2.drop(axis=1,columns=['addr2'],inplace=True)

In [None]:
gc.collect()

In [None]:
df2[categorical_cols].nunique()

In [None]:
# non-object columns of df2, in the same order as df2.columns
# (list.insert(-1, x) would put each new name *before* the last element)
numerical_cols = [col for col in df2.columns if df2[col].dtype != object]

In [None]:
# null count per numerical feature, plus the feature names it is indexed by
numeric_nulls = df2[numerical_cols].isnull().sum()
numeric_nulls_index = numeric_nulls.index

In [None]:
# keep only the rows that have at least 70 non-null values (`thresh` counts non-NA cells)
# NOTE(review): 70 is hard-coded; it matches "40 or more missing" only if df2 has
# 110 columns at this point - confirm against df2.shape[1]
df2.dropna(thresh=70,inplace=True)

- **Out of 110 features, any data point with 40 or more missing values is dropped**

In [None]:
df2.shape

**Filled missing values with the mode of each feature**

In [None]:
df.info()

In [None]:
# fill the remaining nulls of every numeric feature with that feature's mode
for col in numeric_nulls_index:
    column = df2[col]   # dtype is taken from df2 itself, the frame being filled
    if not pd.api.types.is_numeric_dtype(column) or not column.isnull().any():
        continue

    modes = column.mode()   # NaN is ignored by default; ties yield several sorted values
    if modes.empty:         # the column is entirely null, so there is nothing to fill with
        continue

    # take the first mode explicitly (int()/float() on a Series fails when modes tie)
    # and assign the result back, since fillna(inplace=True) on a column selection
    # may act on a temporary copy and leave df2 unchanged
    df2[col] = column.fillna(modes.iloc[0])
        

In [None]:
df2.shape

In [None]:
changeDType(df2)

**All the missing values handled**

**Scaling the features**

In [None]:
# numerical features whose skewness lies strictly between -1 and 1
least_skewed_cols = []
for name in numerical_cols:
    skewness = df2[name].skew()
    if -1 < skewness < 1:
        least_skewed_cols.append(name)

# for this columns i can use standardization to scale it

**Outlier Detection and removal**

In [None]:
# non-object columns of df2, in the same order as df2.columns
# (list.insert(-1, x) would put each new name *before* the last element)
numerical_cols = [col for col in df2.columns if df2[col].dtype != object]

In [None]:
# box plot of every numerical feature, laid out on a 4-column grid
train_cols = 4
# keep the original 26 rows, but add rows when there are more features than
# a 26 x 4 grid can hold (plt.subplot raises for an index beyond the grid)
train_rows = max(26, -(-len(numerical_cols) // train_cols))

plt.figure(figsize=(25,150))
for index, col in enumerate(numerical_cols, start=1):
    plt.subplot(train_rows,train_cols,index)
    sns.boxplot(x=df2[col])
plt.tight_layout(pad = 0.5,w_pad =0.7 , h_pad =5)

**Inference: All the features are heavily skewed (values are much denser in some ranges than in others).**
**<br/><br/>Hence the interquartile-range (IQR) proximity technique for outlier removal works best.**

## Approach to solve the problem
- Since the data has **too many outliers**, predictive models which assign weights to the features may give biased results
- I will be using the **predictive ML models 1. Support Vector Machine 2. Logistic regression**
- We will also be using **Random forest (a tree-based algorithm) as a second approach**
- To handle the imbalance in the data, **stratified K-fold cross-validation is used, which also helps mitigate overfitting**

In [None]:
## Testing

df2.shape

In [None]:
finalColumns = df2.columns

In [None]:
categoricals = getCategoricalColumns(df2)
numericals = getNumericalColumns(df2)

In [None]:
df2[categoricals].nunique()

In [None]:
df2 = pd.get_dummies(df2,columns=categoricals)

In [None]:
df2.head()

In [None]:
df2.to_csv('./preprocessesTrain.csv',index=False)

In [None]:
mydf = pd.read_csv("./preprocessesTrain.csv")

In [None]:
mydf.head()

In [None]:
gc.collect()