In [None]:
import os

import numpy as np
import pandas as pd
import seaborn as sn
from matplotlib import pyplot as plt
%matplotlib inline

# Load Data

# Local run: the competition CSVs sit next to the notebook. When they are not
# there (e.g. on Kaggle) this cell does nothing instead of raising
# FileNotFoundError, so "Run All" can continue to the Kaggle loading cell.
if os.path.exists('./train_identity.csv'):
    train_id = pd.read_csv('./train_identity.csv')
    test_id = pd.read_csv('./test_identity.csv')

    train_trans = pd.read_csv('./train_transaction.csv')
    test_trans = pd.read_csv('./test_transaction.csv')

# Load Data (Kaggle)

In [None]:
# Kaggle run: read the competition files from the read-only input mount.
# Skipped when that directory does not exist (local run), so "Run All" works
# in both environments.
if os.path.isdir('../input/ieee-fraud-detection'):
    train_id = pd.read_csv('../input/ieee-fraud-detection/train_identity.csv')
    test_id = pd.read_csv('../input/ieee-fraud-detection/test_identity.csv')

    train_trans = pd.read_csv('../input/ieee-fraud-detection/train_transaction.csv')
    test_trans = pd.read_csv('../input/ieee-fraud-detection/test_transaction.csv')
elif 'train_id' not in globals():
    # Neither loading cell found the data: fail here with a clear message
    # rather than with a NameError further down.
    raise FileNotFoundError('Competition CSVs not found in ./ or ../input/ieee-fraud-detection/')

In [None]:
# Working aliases (plain name bindings, no copy is made): `data_id` / `data_tr`
# point at the training frames, `test_tr` at the test transactions.
data_id = train_id
data_tr = train_trans

test_tr = test_trans

In [None]:
# First rows of the training identity table.
data_id.head()

In [None]:
# First rows of the training transaction table.
data_tr.head()

# Exploratory Data Analysis

In [None]:
# Number of transaction columns per dtype.
data_tr.dtypes.value_counts()

In [None]:
# Number of identity columns per dtype.
data_id.dtypes.value_counts()

In [None]:
# dtype of every identity column.
data_id.dtypes

In [None]:
# Class balance of the training transactions (isFraud is summed, so it is
# treated as a 0/1 flag).
n_transactions = len(data_tr)
n_frauds = data_tr.isFraud.sum()
fraud_perc = n_frauds / n_transactions

print('Transactions: {}'.format(n_transactions))
print('Frauds:        {}'.format(n_frauds))
print()
print('The {:.1f}% of transactions are fraud'.format(fraud_perc*100))

## Missing Values

In [None]:
def dropFeatures(data, threshold):
    """Return a copy of `data` without the columns that are mostly missing.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter; it is not modified.
    threshold : float
        Fraction of rows (0-1). Columns whose NaN fraction is strictly
        greater than this value are dropped.

    Returns
    -------
    pandas.DataFrame
        New frame with the remaining columns.
    """
    # Fraction of missing rows per column.
    null_data_perc = data.isnull().sum() / len(data)

    cols_drop = null_data_perc[null_data_perc > threshold].index

    # `threshold` is a fraction, so convert it before printing it as a
    # percentage (0.5 must read "50%", not "0.5%").
    print('Dropped {}/{} features with more than {:g}% of nan'.format(
        len(cols_drop), len(data.columns), threshold * 100))

    return data.drop(cols_drop, axis=1)

def checkMissingValues(data, plot=True, title='', figsize=(20,5), fontsize=20):
    """Compute the fraction of missing values per column, optionally plotting it.

    Note that `plot` is the second positional parameter, so a title has to be
    passed by keyword (`title=...`).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.
    plot : bool
        When truthy, show two bar charts: absolute NaN counts (titled with
        `title`) and NaN fractions (untitled).
    title, figsize, fontsize
        Forwarded to the pandas bar plots.

    Returns
    -------
    pandas.Series
        NaN fraction per column, sorted in descending order.
    """
    n_rows = len(data)
    missing_counts = data.isnull().sum()

    if plot:
        # Absolute counts first ...
        missing_counts.plot.bar(figsize=figsize, title=title, fontsize=fontsize)
        plt.show()

        # ... then the same information as a fraction of the rows.
        (missing_counts / n_rows).plot.bar(figsize=figsize, fontsize=fontsize)
        plt.show()

    missing_frac = missing_counts / n_rows
    return missing_frac.sort_values(ascending=False)

def checkValuesFeatures(series, title='', plot=True, figsize=(20,5), fontsize=20):
    """Relative frequency of each value of `series`, optionally as a bar chart.

    Frequencies are counts divided by the full length of the series, so NaN
    rows count in the denominator but get no bar of their own.

    Parameters
    ----------
    series : pandas.Series
        Column to summarise.
    title : str
        Plot title; a suffix is appended when the chart is truncated.
    plot : bool
        When truthy, draw the chart (at most the 30 most frequent values).
    figsize, fontsize
        Forwarded to the pandas bar plot.

    Returns
    -------
    pandas.Series
        Value frequencies sorted in descending order (never truncated).
    """
    max_bars = 30

    freq = (series.value_counts() / len(series)).sort_values(ascending=False)

    if not plot:
        return freq

    if len(freq) > max_bars:
        # Too many categories to read: show only the most frequent ones.
        shown = freq.iloc[:max_bars]
        plot_title = title + ' --- (over 30 different values)'
    else:
        shown = freq
        plot_title = title

    shown.plot.bar(title=plot_title, figsize=figsize, fontsize=fontsize)
    plt.show()

    return freq

In [None]:
# The title must go by keyword: the second positional parameter of
# checkMissingValues is `plot`, not `title`.
checkMissingValues(data_id, title='NaN IDs')

In [None]:
# The title must go by keyword: the second positional parameter of
# checkMissingValues is `plot`, not `title`.
checkMissingValues(data_tr, title='NaN Transactions')

### Dropping Features with a High Percentage of Missing Values

In [None]:
# Keep only the identity columns whose NaN fraction is at most 0.5.
ids = dropFeatures(data_id, threshold=0.5)

In [None]:
# NaN fraction per remaining identity column (title passed by keyword because
# the second positional parameter of checkMissingValues is `plot`).
nan_ids = checkMissingValues(ids, title='NaN IDs')

In [None]:
# Keep only the transaction columns whose NaN fraction is at most 0.5.
trs = dropFeatures(data_tr, threshold=0.5)

In [None]:
# NaN fraction per remaining transaction column (title passed by keyword
# because the second positional parameter of checkMissingValues is `plot`).
nan_trs = checkMissingValues(trs, title='NaN Transactions')

### Imputation of the rest

#### ID imputation

In [None]:
# The ten identity columns with the largest NaN fraction (nan_ids is sorted descending).
nan_ids[:10]

In [None]:
# Identity columns that still have more than 20% of missing values.
# Note: the name `threshold` is reassigned later in the notebook for the
# anomaly-score cutoff.
threshold = 0.2
cols_nan = nan_ids[nan_ids > threshold].index
ids[cols_nan].head()

In [None]:
# Show the value distribution of every identity column selected above,
# using the column name as the plot title.
for col in cols_nan:
    checkValuesFeatures(ids[col], str(col))

- Imputation

##### Numerical Features

In [None]:
# Numerical identity columns and their NaN fractions.
num_ids = ids.select_dtypes(include=['float', 'int'])
num_col_ids = num_ids.columns

# Title passed by keyword: the second positional parameter of
# checkMissingValues is `plot`, so a positional title would never be shown.
num_nan = checkMissingValues(num_ids, title='Numerical Ids')

In [None]:
# Fill the gaps of each numerical identity column with its mean,
# truncated to an integer.
nan_col = num_nan[num_nan > 0].index

for col in nan_col:
    fill_value = int(ids[col].mean())
    ids[col] = ids[col].fillna(fill_value)

In [None]:
# Identity table after numerical imputation.
ids.head()

##### Categorical Features

In [None]:
# Categorical (object-typed) identity columns and their NaN fractions.
cat_ids = ids.select_dtypes(include=['object'])
cat_col_ids = cat_ids.columns

# Title passed by keyword: the second positional parameter of
# checkMissingValues is `plot`, so a positional title would never be shown.
cat_nan = checkMissingValues(cat_ids, title='Categorical Ids')

In [None]:
# Fill the gaps of each categorical identity column with its most frequent
# value (checkValuesFeatures returns frequencies sorted descending, so the
# first index entry is the most common category).
nan_col = cat_nan[cat_nan > 0].index

for col in nan_col:
    most_frequent = checkValuesFeatures(ids[col], plot=False).index[0]
    ids[col] = ids[col].fillna(most_frequent)

In [None]:
# Identity table after categorical imputation.
ids.head()

#### Transactions Imputation

In [None]:
# The ten transaction columns with the largest NaN fraction (nan_trs is sorted descending).
nan_trs[0:10]

In [None]:
# Transaction columns that still have more than 20% of missing values.
# Note: the name `threshold` is reassigned later in the notebook for the
# anomaly-score cutoff.
threshold = 0.2
cols_nan = nan_trs[nan_trs > threshold].index
trs[cols_nan].tail(10)

##### Numerical Features

In [None]:
# Numerical transaction columns and their NaN fractions.
num_trs = trs.select_dtypes(include=['float', 'int'])
num_col_trs = num_trs.columns

# Title passed by keyword: the second positional parameter of
# checkMissingValues is `plot`, so a positional title would never be shown.
num_nan = checkMissingValues(num_trs, title='Numerical Transctions')

In [None]:
# Fill the gaps of each numerical transaction column with the training mean
# (truncated to an integer). The same training value is applied to the test
# transactions so both sets are imputed consistently.
nan_col = num_nan[num_nan > 0].index

for col in nan_col:
    fill_value = int(trs[col].mean())

    trs[col] = trs[col].fillna(fill_value)
    test_tr[col] = test_tr[col].fillna(fill_value)

##### Categorical Features

In [None]:
# Categorical (object-typed) transaction columns and their NaN fractions.
cat_trs = trs.select_dtypes(include=['object'])
cat_col_trs = cat_trs.columns

# Title passed by keyword: the second positional parameter of
# checkMissingValues is `plot`, so a positional title would never be shown.
cat_nan = checkMissingValues(cat_trs, title='Categorical Transactions')

In [None]:
# Fill the gaps of each categorical transaction column with its most frequent
# value (checkValuesFeatures returns frequencies sorted descending, so the
# first index entry is the most common category).
nan_col = cat_nan[cat_nan > 0].index

for col in nan_col:
    most_frequent = checkValuesFeatures(trs[col], plot=False).index[0]
    trs[col] = trs[col].fillna(most_frequent)

In [None]:
# Transaction table after imputation.
trs.head()

In [None]:
# Sum of the per-column NaN fractions left in `ids`; expected to be 0 if the
# imputation above covered every column.
nan = checkMissingValues(ids, plot=False)
nan.sum()

In [None]:
# Sum of the per-column NaN fractions left in `trs`; expected to be 0 if the
# imputation above covered every column.
nan = checkMissingValues(trs, plot=False)
nan.sum()

## Skewness

In [None]:
from scipy.stats import norm

In [None]:
def checkSkewness(data):
    """Skewness of every numeric column of `data`, most skewed first.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to analyse; non-numeric columns are ignored.

    Returns
    -------
    pandas.DataFrame
        Single column 'Skew', indexed by column name, sorted descending.
    """
    # numeric_only=True: the frames passed in still contain object columns,
    # and recent pandas versions raise instead of silently skipping them.
    skewness = data.skew(numeric_only=True).sort_values(ascending=False)

    return pd.DataFrame({'Skew':skewness})

### ID

In [None]:
# Skewness table for the identity features.
checkSkewness(ids)

In [None]:
#sn.distplot(ids['id_10'], fit=norm)

### Transactions

In [None]:
# Skewness table for the transaction features.
checkSkewness(trs)

In [None]:
#sn.distplot(trs['V305'], fit=norm)

# Anomaly Detection Model

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_curve

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from tqdm import tqdm
#from torchsummary import summary

## Model Functions

In [None]:
class AE_Dataset(Dataset):
    '''
        Dataset feeding the auto encoder.
        Wraps an indexable collection of numpy rows (e.g. a 2-D numpy array)
        and serves each row as a float32 PyTorch tensor. There are no targets:
        the auto encoder reconstructs its own input.
    '''

    def __init__(self, data):
        # Kept as-is; conversion to tensors happens lazily per item.
        self.x = data

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        '''
            Returns the example at position idx as a float32 tensor.
        '''
        row = self.x[idx]
        return torch.from_numpy(row).float()
    
def AE_Dataloader(data, batch_size=1, shuffle=True, drop_last=True, num_workers=8):
    '''
        Build a DataLoader over an AE_Dataset wrapping `data`.

        data:        indexable collection of numpy rows (see AE_Dataset)
        batch_size:  number of rows per batch
        shuffle:     reshuffle the rows at every epoch (default keeps the
                     original behaviour)
        drop_last:   drop the final incomplete batch (default keeps the
                     original behaviour)
        num_workers: loader worker processes; pass 0 to load in the main
                     process, e.g. on machines with few cores

        Returns the configured torch DataLoader.
    '''

    dataset = AE_Dataset(data)

    dataloader = DataLoader(dataset     = dataset,
                            batch_size  = batch_size,
                            drop_last   = drop_last,
                            shuffle     = shuffle,
                            num_workers = num_workers,
                            )

    return dataloader

In [None]:
class Encoder(nn.Module):
    '''
        Three fully connected layers, each followed by a ReLU, halving the
        width at every step (integer division):
        in_features -> //2 -> //4 -> //8.
    '''

    def __init__(self, in_features):

        super().__init__()

        # Layer widths, each half of the previous one.
        half = in_features // 2
        quarter = half // 2
        eighth = quarter // 2

        # Attribute names and creation order are kept so that state_dict keys
        # and parameter initialisation order do not change.
        self.layer1 = nn.Linear(in_features=in_features, out_features=half)
        self.relu1 = nn.ReLU()

        self.layer2 = nn.Linear(in_features=half, out_features=quarter)
        self.relu2 = nn.ReLU()

        self.layer3 = nn.Linear(in_features=quarter, out_features=eighth)
        self.relu3 = nn.ReLU()

    def forward(self, x):
        '''
            Encode x; the returned code has passed through a ReLU, so it is
            non-negative.
        '''
        h = self.relu1(self.layer1(x))
        h = self.relu2(self.layer2(h))
        return self.relu3(self.layer3(h))
    
class Decoder(nn.Module):
    '''
        Mirror of the encoder: two fully connected + ReLU layers that double
        the width each time, then a final linear layer mapping to
        `final_feature` outputs followed by a sigmoid.

        in_features:   size of the latent code
        final_feature: size of the reconstructed vector

        NOTE(review): the sigmoid bounds every output to (0, 1), while this
        notebook trains on StandardScaler-standardised features, which can be
        negative or larger than 1 — confirm whether a [0, 1] scaling or a
        linear output was intended.
    '''

    def __init__(self, in_features, final_feature):

        super().__init__()

        # Hidden widths: twice and four times the latent size.
        double = in_features * 2
        quadruple = double * 2

        # Attribute names and creation order are kept so that state_dict keys
        # and parameter initialisation order do not change.
        self.layer1 = nn.Linear(in_features=in_features, out_features=double)
        self.relu1 = nn.ReLU()

        self.layer2 = nn.Linear(in_features=double, out_features=quadruple)
        self.relu2 = nn.ReLU()

        # The last layer goes straight to the requested output size.
        self.layer3 = nn.Linear(in_features=quadruple, out_features=final_feature)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        '''
            Decode the latent code x into a vector of `final_feature` values.
        '''
        h = self.relu1(self.layer1(x))
        h = self.relu2(self.layer2(h))
        return self.sigmoid(self.layer3(h))

## Pre-processing

In [None]:
# The auto encoder is trained on legitimate transactions only, using the
# numerical columns without the label.
is_legit = trs['isFraud'] == 0
x_train = trs.loc[is_legit, num_col_trs].drop(columns=['isFraud'])

tr_cols = x_train.columns

# Standardise with statistics of the legitimate training rows only.
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)

# Same columns and scaler for the test set; NaN that are still present are set to 0.
test = scaler.transform(test_tr[tr_cols].fillna(0))



In [None]:
# Shuffled batches of 64 standardised training rows.
dataloader = AE_Dataloader(x_train, batch_size=64)

## Autoencoder Model

In [None]:
# AUTOENCODER

class Autoencoder():
    '''
        Training / scoring wrapper around an Encoder-Decoder pair.

        `params` is a dict with the keys:
            'data'    iterable of training batches (the DataLoader)
            'loss'    callable, invoked as loss(x, reconstruction)
            'in_feat' number of input features
            'z_size'  size of the latent code (must match the Encoder output)
            'optim'   optimizer class, called as optim(parameters, lr)
            'lr'      learning rate used when the optimizers are created
            'device'  optional torch device (or its name); defaults to CUDA
                      when available, otherwise CPU
    '''

    # MODEL SETUP

    def __init__(self, params):

        self.data = params['data']
        self.loss = params['loss']

        # Fall back to the CPU instead of crashing when no GPU is present.
        device = params.get('device')
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.device = torch.device(device)

        self.enc = Encoder(params['in_feat']).to(self.device)
        self.optim_enc = params['optim'](self.enc.parameters(), params['lr'])

        self.dec = Decoder(params['z_size'], params['in_feat']).to(self.device)
        self.optim_dec = params['optim'](self.dec.parameters(), params['lr'])

    def _to_tensor(self, x):
        '''Return x as a float32 tensor on the model device.'''
        return torch.as_tensor(x, dtype=torch.float32, device=self.device)

    def forward(self, x):
        '''Encode then decode x and return the reconstruction.'''

        z = self.enc(x)
        out = self.dec(z)

        return out

    def loss_function(self, x, out):
        '''Reconstruction loss between the input x and the reconstruction out.'''
        # Argument order (input first) is kept as the contract of params['loss'].
        return self.loss(x, out)

    def optimize(self, loss):
        '''One optimisation step of encoder and decoder for the given loss.'''

        self.optim_enc.zero_grad()
        self.optim_dec.zero_grad()

        loss.backward()

        self.optim_enc.step()
        self.optim_dec.step()


    # TRAINING

    def trainOneEpoch(self):
        '''Run one pass over self.data and return the mean batch loss.'''

        losses = []

        for x in self.data:

            x = self._to_tensor(x)

            # forward
            out = self.forward(x)

            # loss
            loss = self.loss_function(x, out)

            # backward
            self.optimize(loss)

            losses.append(loss.item())

        return np.mean(losses)


    def train(self, epochs, step=5):
        '''
            Train for `epochs` epochs, printing the loss every `step` epochs.
            Returns the list of mean losses, one per epoch.
        '''

        losses = []

        for epoch in tqdm(range(epochs)):

            loss = self.trainOneEpoch()

            if(epoch % step == 0):
                print('> Epoch {}/{}:  loss = {:.4f}'.format(epoch, epochs, loss))

            losses.append(loss)

        return losses

    # TESTING

    def predict(self, x):
        '''
            Anomaly score of x: the reconstruction loss, as a Python float.
            No gradients are tracked.
        '''

        x = self._to_tensor(x)

        with torch.no_grad():
            out = self.forward(x)
            score = self.loss_function(x, out)

        return score.item()

    def training_scores(self):
        '''
            Anomaly score of every training sample, in dataset order
            (one forward pass per sample).
        '''

        samples = self.data.dataset.x

        return [self.predict(samples[i]) for i in range(len(self.data.dataset))]

In [None]:
# Hyper-parameters and components handed to the Autoencoder wrapper.
lr = 0.001
in_features = len(tr_cols)

optim = torch.optim.Adam
loss = nn.MSELoss()

# The latent size matches the Encoder output (three successive halvings).
params = dict(in_feat=in_features,
              z_size=int(in_features//8),
              data=dataloader,
              optim=optim,
              lr=lr,
              loss=loss)

### Training

In [None]:
# Build the encoder, decoder and their optimizers from `params`.
ae_model = Autoencoder(params)

In [None]:
# The optimizers were created when `ae_model` was built, so changing
# params['lr'] alone has no effect on training: push the new rate into the
# existing optimizers as well.
params['lr'] = 0.0001
for optimizer in (ae_model.optim_enc, ae_model.optim_dec):
    for group in optimizer.param_groups:
        group['lr'] = params['lr']

losses = ae_model.train(20, step=1)
plt.plot(losses)
plt.show()

#### Thresholding

In [None]:
# Reconstruction loss of every training row (one forward pass per row).
training_scores = ae_model.training_scores()

In [None]:
# Density histogram of the training reconstruction losses.
hist = plt.hist(training_scores, bins=50, density=True)

In [None]:
# Anomaly cutoff: the alpha-quantile of the reconstruction losses of the
# (non-fraud) training rows.
alpha = 0.96
threshold = np.quantile(training_scores, alpha)
print(threshold)

# NOTE(review): `sn.distplot` is deprecated in recent seaborn releases —
# confirm the installed version still provides it.
sn.distplot(training_scores)
hist = plt.hist(training_scores, bins=1000, density=True, label='histogram')
# Vertical marker at the threshold; its height (0.05) is a fixed value, not
# derived from the histogram.
plt.plot([threshold, threshold], [0,0.05], label='threshold')
#plt.xlim(0, 5)
plt.legend()
plt.show()

### Testing

In [None]:
# anomalous samples: known frauds from the training set, prepared exactly like
# the training rows (same columns, same scaler)

y_train = trs[trs.isFraud==1]
y_train = y_train[tr_cols]
y_train = scaler.transform(y_train)

frauds = 0
non_frauds = 0


for i in range(len(y_train)):
    sample = y_train[i]

    score = ae_model.predict(sample)

    # A sample is flagged as fraud when its reconstruction error is NOT below
    # the threshold, the same rule as in the submission cell. The comparison
    # used to be inverted, counting well-reconstructed samples as frauds.
    if(score < threshold):
        non_frauds += 1
    else:
        frauds += 1

print('> Detection:\n')
print('> Detected Frauds:      {}'.format(frauds))
print('> Real Frauds:          {}'.format(len(y_train)))

In [None]:
# Summary of the previous cell. `y_train` holds only isFraud == 1 rows, so the
# ratio printed as "Accuracy" is frauds / number of known frauds, i.e. a
# detection rate on the fraud class rather than an overall accuracy.
print('> Detection:\n')
print('> Detected Frauds:           {}'.format(frauds))
#print('> Non-Frauds Detected:  {}'.format(non_frauds))
print('> Real Frauds:               {}'.format(len(y_train)))
print('')
print('> Accuracy: {:.3f}'.format(frauds/len(y_train)))


# Submission

In [None]:
# Prepare the test transactions like the training rows: same columns,
# remaining NaN set to 0, same scaler.
test = scaler.transform(test_tr[tr_cols].fillna(0))

# Reconstruction error of every test row, in order ...
scores = [ae_model.predict(sample) for sample in test]

# ... and the hard 0/1 decision: 0.0 below the threshold, 1.0 otherwise.
result = [0.0 if score < threshold else 1.0 for score in scores]

In [None]:
# Fill the sample submission with the raw anomaly scores (not the 0/1
# decisions in `result`) and write it to the working directory.
# NOTE(review): the sample-submission path is the Kaggle input mount; a local
# run needs the file at this location or an adjusted path.
sub = pd.read_csv('../input/ieee-fraud-detection/sample_submission.csv')

sub['isFraud'] = scores

sub.to_csv('submission_fraud_detection(auc-4).csv', index=False)
sub
