In [None]:
from datetime import datetime
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

# Location of the credit-card transactions CSV, relative to the working directory.
data_file = 'data/creditcard.csv'

# header=0: the first row of the file supplies the column names.
data = pd.read_csv(data_file, header=0)

# Quick sanity check: dtypes of the first five columns and the total row count.
print(data.dtypes.iloc[:5])
print('Number of records:', data.shape[0])

# Last expression of the cell, so the notebook renders the first rows as a table.
data.head()

## Data preprocessing

In [None]:
from sklearn.model_selection import train_test_split

# Every column except the last one is used as a model input
# (presumably the last column is the `Class` label - confirm against the CSV).
feature_names = data.columns.values[:-1]

# Keep only the rows labelled Class == 0 (presumably the non-fraud transactions),
# restricted to the feature columns. Both the training and the held-out set are
# drawn from this single class.
train_test_set = data.loc[data.Class == 0, feature_names]

# 80/20 split; random_state pins the shuffle so the split is repeatable.
train_set, test_set = train_test_split(
    train_test_set,
    test_size=0.2,
    random_state=42,
)

print('Whole data set:', data.shape[0])
print('Train and test set:', train_test_set.shape[0])
train_set.head()

### Data preprocessing with StandardScaler

In [None]:
# from sklearn.preprocessing import MinMaxScaler, StandardScaler
# import matplotlib.pyplot as plt
# %matplotlib inline
# scaler1=MinMaxScaler().fit(train_set[['Time']])
# train_set['Time']=scaler1.transform(train_set[['Time']])
# test_set['Time']=scaler1.transform(test_set[['Time']])
# scaler2=StandardScaler().fit(train_set[['Amount']])
# train_set['Amount']=scaler2.transform(train_set[['Amount']])
# test_set['Amount']=scaler2.transform(test_set[['Amount']])

# train_set.hist(column=['Time','Amount','V1','V2'],bins=100)
# train_set.head()

In [None]:
# train_set=train_set.to_numpy()
# test_set=test_set.to_numpy()

### Data preprocessing using MinMaxScaler

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import matplotlib.pyplot as plt
%matplotlib inline
scaler=MinMaxScaler().fit(train_set)
train_set=scaler.transform(train_set)
test_set=scaler.transform(test_set)

pd.Series(train_set[:,0]).hist(bins=100)
pd.Series(train_set[:,1]).hist(bins=100)
pd.Series(train_set[:,2]).hist(bins=100)
pd.Series(train_set[:,-1]).hist(bins=100)
train_set[:5]

In [None]:
class autoencoder(nn.Module):
    """Fully connected autoencoder: num_input -> 15 -> 7 -> 15 -> num_input.

    The encoder compresses each row to a 7-dimensional code; the decoder maps
    the code back to `num_input` values and ends in a Tanh, so every
    reconstructed value lies in [-1, 1].

    Args:
        num_input: number of features per input row (size of the last dim).
    """

    def __init__(self,num_input):
        super().__init__()
        hidden_units, code_units = 15, 7

        # Layers are created encoder-first so parameter initialisation order
        # (and therefore seeded initial weights) stays the same.
        encoder_layers = [
            nn.Linear(num_input, hidden_units),
            nn.ReLU(inplace=True),
            nn.Linear(hidden_units, code_units),
        ]
        decoder_layers = [
            nn.Linear(code_units, hidden_units),
            nn.ReLU(inplace=True),
            nn.Linear(hidden_units, num_input),
            nn.Tanh(),
        ]
        self.encoder = nn.Sequential(*encoder_layers)
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Encode `x` to the 7-dim code and return its reconstruction."""
        return self.decoder(self.encoder(x))

In [None]:
# Training hyper-parameters.
num_epochs = 300
batch_size = 256
lr = 0.01

# Seed torch so weight initialisation and batch shuffling are repeatable on a
# fresh "Restart & Run All" (same seed value as the train/test split).
torch.manual_seed(42)

# The scaled training matrix as one float32 tensor. TensorDataset yields
# 1-tuples, which is why the training loop unpacks each batch with `x, = ...`.
inputs = torch.tensor(train_set, dtype=torch.float32)
dataset = TensorDataset(inputs)

# shuffle=True: reshuffle every epoch instead of replaying the identical batch
#   sequence 300 times.
# num_workers=0: the whole dataset is already an in-memory tensor, so worker
#   processes only add start-up and inter-process copy overhead each epoch.
dataloader = DataLoader(dataset, batch_size=batch_size, num_workers=0, shuffle=True)

In [None]:
# Size the network from the number of feature columns in the training tensor.
model = autoencoder(inputs.size(1))

# Train on the GPU when one is present; otherwise the model stays on the CPU.
if torch.cuda.is_available():
    model = model.cuda()

# reduction='sum' returns the summed (not averaged) squared error, so code that
# reports a per-element loss has to divide by the element count itself.
criterion = nn.MSELoss(reduction='sum')

# Adam with a small L2 penalty (weight_decay).
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=lr,
    weight_decay=1e-5,
)


def test():
    """Return the per-element reconstruction loss of `model` on `test_set`.

    Reads the notebook globals `model`, `test_set` and `criterion`. The whole
    held-out set is pushed through the model in a single forward pass with
    gradients disabled; the criterion value is then divided by the number of
    scalar elements (rows * columns) in the held-out tensor.

    Side effect: leaves `model` in eval mode.

    Returns:
        float: criterion(outputs, inputs) / (rows * columns).
    """
    model.eval()

    with torch.no_grad():
        held_out = torch.tensor(test_set, dtype=torch.float32)
        # Same device rule as used for the model: GPU whenever CUDA is available.
        if torch.cuda.is_available():
            held_out = held_out.cuda()
        reconstruction = model(held_out)
        total_loss = criterion(reconstruction, held_out)

    n_rows, n_cols = held_out.shape[0], held_out.shape[1]
    return total_loss.item() / (n_rows * n_cols)


# Train the autoencoder to reproduce its own input; report progress every 5 epochs.
for epoch in range(num_epochs):
    # test() switches the model to eval mode, so re-enable train mode each epoch.
    model.train()
    loss_sum = 0.0  # criterion value accumulated over the epoch
    num = 0         # number of scalar elements seen in the epoch

    # The loop target is `batch`, not `inputs`, so the notebook-global `inputs`
    # (the full training tensor) is not overwritten by the last mini-batch.
    # The trailing comma unpacks the 1-tuple that TensorDataset yields.
    for batch, in dataloader:
        if torch.cuda.is_available():
            batch = batch.cuda()
        outputs = model(batch)
        loss = criterion(outputs, batch)

        loss_sum += loss.item()
        num += batch.shape[0] * batch.shape[1]
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if (epoch+1)%5 == 0:
        # Both figures are per-element: accumulated loss / element count for
        # training, and test() does its own division for the held-out set.
        print('{} epoch [{}/{}], loss:{:.6f}, test_set_loss:{:.6f}'
                .format(datetime.now(), epoch + 1, num_epochs, loss_sum/num, test()))

In [None]:
# model.eval()

# with torch.no_grad():
#     test_set2 = data[data.Class==1][feature_names]
#     test_set2['Time']=scaler1.transform(test_set2[['Time']])
#     test_set2['Amount']=scaler2.transform(test_set2[['Amount']])
#     inputs2=torch.tensor(test_set2.to_numpy(), dtype=torch.float32)
#     outputs2=model(inputs2)
#     loss2=torch.sum((inputs2-outputs2)**2,dim=1).sqrt().log()

#     test_set1=test_set.sample(n=len(loss2),random_state=42)
#     inputs1=torch.tensor(test_set1.to_numpy(), dtype=torch.float32)
#     outputs1=model(inputs1)
#     loss1=torch.sum((inputs1-outputs1)**2,dim=1).sqrt().log()

#     pd.Series(loss1.numpy()).hist(bins=100)
#     pd.Series(loss2.numpy()).hist(bins=100)
#     split_point=(loss1.max()+loss2.min())/2
#     print('Split point:',split_point)
#     print((loss1<split_point).sum().item()/float(len(loss1)))
#     print((loss2>split_point).sum().item()/float(len(loss2)))

In [None]:
# Compare reconstruction error on Class == 1 rows (presumably the fraud cases)
# against an equally sized random sample of the held-out Class == 0 rows.
model.eval()

# Build the evaluation tensors on the device the model actually lives on.
# The model is moved to CUDA when available, so CPU-only inputs would raise a
# device-mismatch error on GPU machines.
device = next(model.parameters()).device

with torch.no_grad():
    # Class == 1 rows, scaled with the scaler fitted on the training split.
    test_set2 = data[data.Class==1][feature_names]
    test_set2=scaler.transform(test_set2)
    inputs2=torch.tensor(test_set2, dtype=torch.float32, device=device)
    outputs2=model(inputs2)
    # Score per row: log of the Euclidean norm of the reconstruction residual.
    # .cpu() is a no-op on CPU and makes the later .numpy() calls valid on GPU.
    loss2=torch.sum((inputs2-outputs2)**2,dim=1).sqrt().log().cpu()

    # Same number of held-out Class == 0 rows, drawn without replacement.
    # Seeded so the histogram and the printed rates are repeatable across runs.
    rng=np.random.RandomState(42)
    test_set1=test_set[rng.choice(len(test_set),size=len(loss2),replace=False)]
    inputs1=torch.tensor(test_set1, dtype=torch.float32, device=device)
    outputs1=model(inputs1)
    loss1=torch.sum((inputs1-outputs1)**2,dim=1).sqrt().log().cpu()

    # Both score distributions are drawn on the same axes.
    pd.Series(loss1.numpy()).hist(bins=100)
    pd.Series(loss2.numpy()).hist(bins=100)

    # Threshold: midpoint between the largest Class-0 score and the smallest
    # Class-1 score.
    split_point=(loss1.max()+loss2.min())/2
    print('Split point:',split_point)
    # Fraction of sampled Class-0 rows below the threshold ...
    print((loss1<split_point).sum().item()/float(len(loss1)))
    # ... and fraction of Class-1 rows above it.
    print((loss2>split_point).sum().item()/float(len(loss2)))

In [None]:
# Persist only the learned weights (state_dict), not the whole module object.
# The path is relative to the working directory, like `data_file`, so the
# notebook is not tied to one user's home directory.
# NOTE: if the model was trained on a GPU the saved tensors are CUDA tensors;
# pass map_location to torch.load when loading on a CPU-only machine.
model_file = 'creditcard-fraud-2.model'
torch.save(model.state_dict(), model_file)