In [None]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

# Read the train and test CSV files from ../input/itsafraud and preview
# the first rows of the training frame.
fraud_train = pd.read_csv("../input/itsafraud/train.csv")
fraud_test = pd.read_csv("../input/itsafraud/test.csv")
fraud_train.head()


In [None]:
# (rows, columns) of the raw test frame.
fraud_test.shape

In [None]:
# (rows, columns) of the raw training frame.
fraud_train.shape

In [None]:
#Dropping TransactionID from both frames (in place), then previewing the training data
for frame in (fraud_train, fraud_test):
    frame.drop(columns='TransactionID', inplace=True)
fraud_train.head()

In [None]:
#Print each column name followed by its number of missing values.
#Some columns have a great many missing values whereas others have none.
missing_counts = fraud_train.isna().sum()
for col, n_missing in missing_counts.items():
    print(col)
    print(n_missing)

In [None]:
#Checking for duplicate rows: number of training rows flagged by DataFrame.duplicated()
fraud_train.duplicated().sum()

In [None]:
#Removing duplicate rows (not columns) from the training data only.
# The test frame is intentionally left untouched: the previous version tried to
# drop test rows using the *training* frame's duplicate mask, which was already
# empty at that point (so nothing was ever removed), and every test row is
# needed for prediction anyway.
fraud_train.drop_duplicates(inplace=True)
fraud_train.shape

In [None]:
#Checking the distribution of the target values.
#Author's observation from the plot: the data is extremely imbalanced.
sns.countplot(data=fraud_train, x='isFraud')

In [None]:
# Overlay the TransactionDT histograms of the two splits on one figure.
for frame, split_name in ((fraud_train, 'train'), (fraud_test, 'test')):
    plt.hist(frame['TransactionDT'], label=split_name)
plt.legend()
plt.title('Distribution of transactiond dates');

#Author's reading of the plot: the transaction dates of the test and train data are the same

In [None]:
#Histogram of the raw TransactionAmt values; the author notes the data is extremely skewed
fraud_train.hist(bins=30, column='TransactionAmt')

In [None]:
# Replace TransactionAmt with log10(TransactionAmt + 1) in both frames.
# Note: this overwrites the column, so running the cell twice applies the transform twice.
for frame in (fraud_train, fraud_test):
    frame['TransactionAmt'] = np.log10(frame['TransactionAmt'] + 1)

In [None]:
# Histogram (30 bins) of TransactionAmt as it stands now, i.e. after the log10(x + 1) cell above.
fraud_train.hist(column = 'TransactionAmt', bins=30)

In [None]:
# Transaction counts per ProductCD, split by fraud label; each bar is annotated
# with its height, placed slightly to the right of the bar's left edge and 50 units above it.
ax = sns.countplot(data=fraud_train, x='ProductCD', hue='isFraud')
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate('{:.1f}'.format(bar_height), (bar.get_x() + 0.1, bar_height + 50))

In [None]:
# Number of training rows for each distinct ProductCD value.
fraud_train['ProductCD'].value_counts()

In [None]:
#fraud_train.hist(column = 'card1', bins=60)
#sns.countplot(x='card1', data= fraud_train)
#sns.histplot(x='card1',hue='isFraud', data=fraud_train, kde=True)

In [None]:
#sns.histplot(x='card2',hue='isFraud', data=fraud_train, kde=True)


In [None]:
#sns.histplot(x='card3',hue='isFraud', data=fraud_train, kde=True)


In [None]:
# Transaction counts per card4 value, split by fraud label.
ax = sns.countplot(data=fraud_train, x='card4', hue='isFraud')


In [None]:
#sns.histplot(x='card5',hue='isFraud', data=fraud_train, kde=True)


In [None]:
#ax= sns.countplot(x='card6',hue='isFraud', data= fraud_train)

In [None]:
# Distribution of addr1 per fraud label, with a KDE curve overlaid.
sns.histplot(data=fraud_train, x='addr1', hue='isFraud', kde=True)

In [None]:
# Distribution of addr2 per fraud label, with a KDE curve overlaid.
#Author's observation: almost 99% of the transactions share the same addr2 value, which led to the
#conclusion that this feature corresponds to the country code and that most transactions come from one country.
sns.histplot(data=fraud_train, x='addr2', hue='isFraud', kde=True)

In [None]:
# Replace missing P_emaildomain values with the literal string 'NaN' in both frames,
# then plot the per-domain transaction counts split by fraud label with vertical tick labels.
for frame in (fraud_train, fraud_test):
    frame['P_emaildomain'] = frame['P_emaildomain'].fillna('NaN')
plt.figure(figsize=(15, 13))
chart = sns.countplot(data=fraud_train, x='P_emaildomain', hue='isFraud')
chart.set_xticklabels(chart.get_xticklabels(), rotation=90, ha='center')

In [None]:
# Replace missing R_emaildomain values with the literal string 'NaN' in both frames,
# then plot the per-domain transaction counts split by fraud label with vertical tick labels.
for frame in (fraud_train, fraud_test):
    frame['R_emaildomain'] = frame['R_emaildomain'].fillna('NaN')
plt.figure(figsize=(15, 13))
chart = sns.countplot(data=fraud_train, x='R_emaildomain', hue='isFraud')
chart.set_xticklabels(chart.get_xticklabels(), rotation=90, ha='center')
#Author's observation: the majority of the transactions have no R_emaildomain value, the reason being that not every
#transaction needed a transaction receipt and hence no information about the receiver was present.

In [None]:
# Preview the nine training columns at positions 45-53
# (presumably M1-M9, since a later cell stores this same slice as m_features — confirm).
fraud_train.iloc[:, 45:54].head()

In [None]:
# Preview the nine test columns at positions 44-52
# (presumably M1-M9, since a later cell stores this same slice as m_featuresTest — confirm).
fraud_test.iloc[:, 44:53].head()

In [None]:
# Print the distinct values (NaN included) found in each of the columns M1..M9.
for m_col in ('M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9'):
    print(fraud_train[m_col].unique())

In [None]:
# Collect the M1..M9 columns of the training frame by name rather than by position,
# so the selection does not silently shift when columns are added or removed upstream.
# .copy() makes m_features an independent frame: later cells assign into it, and
# assigning into a slice of fraud_train raises SettingWithCopyWarning.
m_cols = ['M{}'.format(i) for i in range(1, 10)]
m_features = fraud_train[m_cols].copy()
m_features.describe()

In [None]:
# Collect the M1..M9 columns of the test frame by name rather than by position,
# so the selection does not silently shift when columns are added or removed upstream.
# .copy() makes m_featuresTest an independent frame: later cells assign into it, and
# assigning into a slice of fraud_test raises SettingWithCopyWarning.
m_cols = ['M{}'.format(i) for i in range(1, 10)]
m_featuresTest = fraud_test[m_cols].copy()
m_featuresTest.describe()

In [None]:
#m_features['M1'] = m_features['M1'].fillna('Missing')
#sns.countplot(x='M1', data= m_features)

In [None]:
#m_features['M2'] = m_features['M2'].fillna('Missing')
#sns.countplot(x='M2', data= m_features)

In [None]:
# Label missing M3 values explicitly as 'Missing' in the train/test M-feature frames.
m_features['M3'] = m_features['M3'].fillna('Missing')
m_featuresTest['M3'] = m_featuresTest['M3'].fillna('Missing')
# Plot from the *filled* column. Previously the chart was drawn from the unfilled
# fraud_train column, so the 'Missing' category created above never appeared.
# assign() builds a temporary frame; fraud_train itself is not modified.
sns.countplot(x='M3', hue='isFraud', data=fraud_train.assign(M3=m_features['M3']))

In [None]:
# Label missing M4 values explicitly as 'Missing' in the train/test M-feature frames.
m_features['M4'] = m_features['M4'].fillna('Missing')
m_featuresTest['M4'] = m_featuresTest['M4'].fillna('Missing')
# Plot from the *filled* column. Previously the chart was drawn from the unfilled
# fraud_train column, so the 'Missing' category created above never appeared.
# assign() builds a temporary frame; fraud_train itself is not modified.
sns.countplot(x='M4', hue='isFraud', data=fraud_train.assign(M4=m_features['M4']))

In [None]:
#m_features['M5'] = m_features['M5'].fillna('Missing')
#sns.countplot(x='M5', data= m_features)

In [None]:
#m_features['M6'] = m_features['M6'].fillna('Missing')
#sns.countplot(x='M6', data= m_features)

In [None]:
#m_features['M7'] = m_features['M7'].fillna('Missing')
#sns.countplot(x='M7', data= m_features)

In [None]:
#m_features['M8'] = m_features['M8'].fillna('Missing')
#sns.countplot(x='M8', data= m_features)

In [None]:
#m_features['M9'] = m_features['M9'].fillna('Missing')
#sns.countplot(x='M9', data= m_features)

In [None]:
#M1 to M9 take the values T, F or NaN, except for M4, which takes values such as M0, M1 and M2. We can build aggregate features from these binary-valued columns.
#Features M2 and M6 have a lower percentage of missing values than the other M features.
#Except for M4, the missing-value category of every M feature has the highest fraud percentage.

In [None]:
#We checked the data in C-features and had an intuition that it might be related
#So we checked the correlation matrix
# Correlation heatmap of the C features.
# The columns C1..C14 are selected by name: the earlier positional slice
# (iloc[:, 15:29]) only matches them if they sit at exactly those positions, and
# picking up a neighbouring non-numeric column makes DataFrame.corr() fail on
# recent pandas versions.
plt.figure(figsize=(10,10))
c_cols = ['C{}'.format(i) for i in range(1, 15)]
c_features = fraud_train[c_cols]
corr_mat = c_features.corr()
# `linewidths` is the documented seaborn.heatmap parameter for the cell borders.
sns.heatmap(corr_mat, annot=True, linewidths=0.2)

In [None]:
# Names of the training columns whose fraction of missing values is below 75%.
fraud_train.columns[fraud_train.isnull().mean() < 0.75]

In [None]:
#D6, D7, D8, D9, D12, D13 and D14 have a lot of missing values and are hence dropped
#Most of the V features have more than 75% null values and will be removed
#DeviceType and DeviceInfo are also removed
#The id features are also removed

In [None]:
# Keep only the columns with fewer than 75% missing values.
# .copy() turns the selection into an independent frame; without it the result is a
# slice of the original frame and the in-place drops / column assignments in the
# following cells trigger SettingWithCopyWarning.
fraud_train = fraud_train[fraud_train.columns[fraud_train.isnull().mean() < 0.75]].copy()

In [None]:
# Remove the R_emaildomain column from the training frame (in place).
fraud_train.drop(columns='R_emaildomain', inplace=True)

In [None]:
# C columns to discard from the training frame (removed in place).
# Note: this rebinds c_features, which an earlier cell used for a DataFrame, to a list of names.
c_features = ['C2', 'C4', 'C6', 'C7', 'C8', 'C10', 'C11', 'C12', 'C14']
fraud_train.drop(columns=c_features, inplace=True)

In [None]:
# (rows, columns) of the training frame after the column drops above.
fraud_train.shape

In [None]:
# Columns treated as categorical.
catf = ['ProductCD', 'card4','card6', 'addr1', 'addr2', 'P_emaildomain', 'M1', 'M2','M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9']

# Every other training column is treated as numeric, in the frame's column order.
# The isFraud target is still part of the frame at this point, so it is included as well.
categorical_lookup = set(catf)
numf = [name for name in fraud_train.columns if name not in categorical_lookup]

In [None]:
# Summary statistics of the training frame's numeric columns.
fraud_train.describe()

In [None]:
# Numeric columns: replace missing values with each column's median.
median = fraud_train[numf].median()
fraud_train[numf] = fraud_train[numf].fillna(value=median)

# Categorical columns: replace missing values with the string "missing".
fraud_train[catf] = fraud_train[catf].fillna(value="missing")

# Total number of missing values still present in the training frame.
fraud_train.isna().sum().sum()

In [None]:
# fill numeric NAs in the test set with the medians learned on the TRAINING set
# (`median`, computed in the cell above). Re-computing medians on the test set
# imputes train and test with different values. Rebuilding `numf` from every test
# column also pulled in non-numeric columns that are dropped later, which makes
# DataFrame.median() fail on recent pandas versions.
# `numf` still holds the training columns here; the target is skipped because the
# test frame does not contain it.
test_numf = [f for f in numf if f in fraud_test.columns]
fraud_test[test_numf] = fraud_test[test_numf].fillna(median)

# fill categorical NAs with "missing"
fraud_test[catf] = fraud_test[catf].fillna("missing")

# Missing values left in the columns that are actually kept for modelling.
fraud_test[test_numf + catf].isna().sum().sum()

In [None]:
# Names of the training columns with dtype object; these are label-encoded in the cells below.
objlist = fraud_train.select_dtypes(include = "object").columns
objlist

In [None]:
# Preview the training frame after imputation, before label encoding.
fraud_train.head()

In [None]:
label_encoder = LabelEncoder()

for feat in objlist:
    fraud_train[feat] = label_encoder.fit_transform(fraud_train[feat].astype(str))

In [None]:
label_encoder = LabelEncoder()

for feat in objlist:
    fraud_test[feat] = label_encoder.fit_transform(fraud_test[feat].astype(str))

In [None]:
# Preview the training frame after label encoding.
fraud_train.head()

In [None]:
# List the names of the columns remaining in the training frame, one per line.
for column_name in fraud_train:
    print(column_name)

In [None]:
# Align the test frame with the training features.
kept_features = list(fraud_train)
# Test columns that were removed from the training frame (kept for inspection).
drop_features = [col for col in fraud_test.columns if col not in kept_features]
# Select by the training column list (minus the target) instead of only dropping the
# extras: this guarantees the same columns in the same ORDER as the training
# features, and raises a KeyError straight away if a training feature is missing
# from the test data.
feature_columns = [col for col in kept_features if col != 'isFraud']
fraud_test = fraud_test[feature_columns].copy()

In [None]:
# List the names of the columns remaining in the test frame, one per line.
for column_name in fraud_test:
    print(column_name)

In [None]:
# Separate the isFraud target from the predictors; the test frame is used as-is.
train_Y = fraud_train["isFraud"]
train_X = fraud_train.drop(columns="isFraud")

test_X = fraud_test

In [None]:
# (rows, columns) of the training predictors.
train_X.shape

In [None]:
# Length of the training target; should match the row count of train_X.
train_Y.shape

In [None]:
# (rows, columns) of the processed training frame (target column included).
fraud_train.shape

In [None]:
# (rows, columns) of the processed test frame.
fraud_test.shape

In [None]:

# Bind the processed test frame as the prediction input.
# NOTE(review): the train_X / train_Y cell above already makes this same assignment.
test_X = fraud_test

In [None]:
import sklearn
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import TimeSeriesSplit, train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from sklearn.calibration import CalibratedClassifierCV

In [None]:
def predict_and_save(prediction, name, ids=None):
    
    '''
        Utility Function to save the test data predictions locally.

        The output CSV has the columns 'Id' and 'isFraud', is sorted by 'Id'
        and contains no index column.

        Parameters
        ----------
        prediction : array-like
            One score per test row; flattened to 1-D before saving.
        name : str
            Path of the CSV file to write.
        ids : array-like, optional
            Row identifiers for the 'Id' column. When omitted, the
            notebook-level ``test_ids`` array is used (the previous behaviour).

        Raises
        ------
        ValueError
            If the number of ids differs from the number of predictions.
    '''

    # Fall back to the notebook-level ids so existing two-argument calls keep working.
    if ids is None:
        ids = test_ids

    # np.asarray lets plain lists / Series be passed as well as ndarrays.
    id_values = np.asarray(ids).reshape(-1)
    scores = np.asarray(prediction).reshape(-1)
    if id_values.shape[0] != scores.shape[0]:
        raise ValueError(
            'Got {} ids but {} predictions'.format(id_values.shape[0], scores.shape[0])
        )

    df = pd.DataFrame({'Id': id_values, 'isFraud': scores})
    df = df.sort_values('Id')
    df.to_csv(name, index=False)

In [None]:
# One sequential Id (0, 1, 2, ...) per test row. The count is taken from the test
# frame itself instead of a hard-coded row count, so the ids always match the
# number of predictions even if the input file changes.
ids = list(range(len(fraud_test)))
test_ids = np.array(ids)

In [None]:
# Number of ids generated; should equal the number of test rows.
test_ids.shape

In [None]:
# Baseline: Gaussian Naive Bayes with equal priors for the two classes.
nb = GaussianNB(priors=[0.5, 0.5])
nb.fit(train_X, train_Y)

# Keep the second probability column (index 1) and write it to disk.
nb_class_proba = nb.predict_proba(test_X)
nb_test_proba = nb_class_proba[:, 1]
predict_and_save(nb_test_proba, 'nb_pred_bl.csv')

In [None]:
# Baseline: logistic regression with balanced class weights and a fixed seed.
lr = LogisticRegression(
    n_jobs = -1,
    class_weight = 'balanced',
    random_state = 3,
)
lr.fit(train_X, train_Y)

# Keep the second probability column (index 1) and write it to disk.
lr_class_proba = lr.predict_proba(test_X)
lr_test_proba = lr_class_proba[:, 1]
predict_and_save(lr_test_proba, 'lr_pred_bl.csv')

In [None]:
# Baseline: random forest with balanced class weights and a fixed seed.
rf = RandomForestClassifier(
    n_jobs = -1,
    class_weight = 'balanced',
    random_state = 3,
)
rf.fit(train_X, train_Y)

# Keep the second probability column (index 1) and write it to disk.
rf_class_proba = rf.predict_proba(test_X)
rf_test_proba = rf_class_proba[:, 1]
predict_and_save(rf_test_proba, 'rf_pred_bl.csv')

In [None]:
# Baseline: XGBoost, with the positive class up-weighted by the ratio of
# label-0 rows to label-1 rows in the training target.
class_counts = train_Y.value_counts()
weight = class_counts[0] / class_counts[1]
# NOTE(review): tree_method='gpu_hist' presumably needs a GPU-enabled runtime — confirm before running elsewhere.
xgboost = XGBClassifier(
    scale_pos_weight = weight,
    objective = 'binary:logistic',
    eval_metric = 'auc',
    random_state = 3,
    tree_method = 'gpu_hist',
)
xgboost.fit(train_X, train_Y)

# Keep the second probability column (index 1) and write it to disk.
xgboost_class_proba = xgboost.predict_proba(test_X)
xgboost_test_proba = xgboost_class_proba[:, 1]
predict_and_save(xgboost_test_proba, 'xgboost_pred_bl.csv')