In [11]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
cc_data = pd.read_csv("creditcard.csv")

In [90]:
transactionData = cc_data.drop(['Time'], axis=1)
transactionData['Amount'] = StandardScaler().fit_transform(transactionData['Amount'].values.reshape(-1, 1))


X = transactionData.drop("Class", axis=1).values
y = transactionData['Class'].values
assert(len(X) == len(y))
print(X[0], y[2])
print(y)

[-1.35980713 -0.07278117  2.53634674  1.37815522 -0.33832077  0.46238778
  0.23959855  0.0986979   0.36378697  0.09079417 -0.55159953 -0.61780086
 -0.99138985 -0.31116935  1.46817697 -0.47040053  0.20797124  0.02579058
  0.40399296  0.2514121  -0.01830678  0.27783758 -0.11047391  0.06692807
  0.12853936 -0.18911484  0.13355838 -0.02105305  0.24496426] 0
[0 0 0 ... 0 0 0]


In [4]:
X_tensor = torch.as_tensor(X)
y_tensor = torch.as_tensor(y)

In [5]:
X_tensor.shape

torch.Size([284807, 29])

In [27]:
class binaryClassification(nn.Module):
    def __init__(self):
        super(binaryClassification, self).__init__()
        self.layer_1 = nn.Linear(29, 64) 
        self.layer_2 = nn.Linear(64, 64)
        self.layer_out = nn.Linear(64, 1) 
        
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.1)
        self.batchnorm1 = nn.BatchNorm1d(64)
        self.batchnorm2 = nn.BatchNorm1d(64)
        
    def forward(self, inputs):
        x = self.relu(self.layer_1(inputs))
        x = self.batchnorm1(x)
        x = self.relu(self.layer_2(x))
        x = self.batchnorm2(x)
        x = self.dropout(x)
        x = self.layer_out(x)
        
        return x



In [28]:
model = binaryClassification().double()

In [29]:
transactionData = cc_data.drop(['Time'], axis=1)
transactionData['Amount'] = StandardScaler().fit_transform(transactionData['Amount'].values.reshape(-1, 1))

56962


In [30]:
num_epochs = 100
minibatch_size = 32
learning_rate = 1e-3

In [57]:
from pymemcache.client import base
from pymemcache import serde
from torch.utils.data.sampler import Sampler
from torch.utils.data.dataset import Dataset
import random
'''class RemoteCache:
    def __init__(self):
        self.client =  base.Client(("local_host", 11211)) # client connection gets set up with default values for now
        
    def _query_server(self, data_id):
        pass
    def sampler(self):'''
        
class RemoteCacheSampler(Sampler):
    def __init__(self, dataset):
        # not efficient but keep copy of dataset in sampler
        self.dataset = dataset
        
        # create generator, which allows us to iterate over the dataset once and only once
        seed = int(torch.empty((), dtype=torch.int64).random_().item())
        self.generator=torch.Generator()
        self.generator.manual_seed(seed)
        
    def __iter__(self):
        # should return an iterator over dataset
        for g in torch.randperm(len(train_data), generator=self.generator).tolist():
            yield g#self.dataset[g]
            
    def __len__(self):
        # returns number of rows in dataframe
        return len(self.dataset)
    
class RemoteCacheDataset(Dataset):
    def __init__(self, *tensors):
        # set client for memcached
        # this sets the port to 11211 and also crucially adds a serializer
        self.client =  base.Client(("localhost", 11211), serde=serde.pickle_serde) # client connection gets set up with default values for now
        self.shadow_cache = set()
        self.tensors = tensors
        self.size = tensors[0].size(0)
        #print(tensors[])
        x = tuple(tensor[0] for tensor in self.tensors)
        print(x[0])
        # initially seed memcached server with X number of values
        for i in range(32):
            self._write_cache(i, [tensors[0][i].tolist(), tensors[1][i].tolist()])
            self.shadow_cache.add(i)
            break
    def __getitem__(self, index):
        return self._query_cache(index)
    
    def __len__(self):
        return self.size
    
    def _query_cache(self, index):
        result = self.client.get(str(index))
        
        if result is None:
            key_to_get = str(random.sample(self.shadow_cache, 1)[0])
            print(key_to_get)
            result = self.client.get(key_to_get)
            key_to_remove = random.sample(self.shadow_cache, 1)[0]
            print(key_to_remove)
            self.shadow_cache.remove(key_to_remove)
            self.client.delete(str(key_to_remove))
            self._write_cache(index, [self.tensors[0][index].tolist(), self.tensors[1][index].tolist()])
        # result should now be in the form of a list with data as first item and output as second
        print(result)
        item = tuple([torch.as_tensor(result[0]), torch.as_tensor(result[1])])
        print(item, len(item))
        return item
    
    def _write_cache(self, index, item):
        # update remote cache.  Ideally this should be done on the server
        # not the client, however that is a limitation of using memcached
        #print(index, item)
        self.client.set(str(index), item)

In [58]:
from torch.utils.data import TensorDataset, DataLoader

X_train, X_test, y_train, y_test = train_test_split(X_tensor, y_tensor, test_size=0.2, random_state=1)

train_data = RemoteCacheDataset(X_train, y_train)
#print(type(train_data))
#print(train_data[0], type(train_data[0]))
#print(len(train_data), len(train_data[0]))
test_data = TensorDataset(X_test)

#seed = int(torch.empty((), dtype=torch.int64).random_().item())
#print(seed, type(seed))
#generator=torch.Generator()
#generator.manual_seed(seed)
#a = torch.randperm(len(train_data), generator=generator).tolist()
#print(type(a))
#print(a[:3])
#print(train_data[a[0]])
train_sampler = RemoteCacheSampler(train_data)
train_loader = DataLoader(dataset=train_data, batch_size=minibatch_size, 
                          sampler=train_sampler)#shuffle=True)

test_loader = DataLoader(dataset=test_data, batch_size=1)
#print(train_loader, type(train_loader))
#count = 0
#for X_batch, y_batch in train_loader:
#    print(count)
#    count += 1
#    print(type(X_batch), len(X_batch), X_batch)
#    break
    #print("X batch",X_batch, type(X_batch))
    #print("Y batch",y_batch, type(y_batch))

tensor([ 1.3875, -0.5478,  0.0764, -0.7393, -0.5199, -0.0260, -0.6947, -0.0521,
         0.3782,  0.4882,  0.9959, -2.7565,  1.3141,  1.8052, -0.2951, -0.6310,
         0.0288,  2.1044, -0.2361, -0.4696, -0.6583, -1.3073, -0.0132, -0.9124,
         0.1632,  0.9210, -0.0966, -0.0148, -0.2279], dtype=torch.float64)


In [124]:
print(type(X_test))

<class 'torch.Tensor'>


In [33]:
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), learning_rate)

NameError: name 'optim' is not defined

In [48]:
history = {}
history['train_loss'] = []
history['test_loss'] = []

In [34]:
def binary_acc(y_pred, y_test):
    y_pred_tag = torch.round(torch.sigmoid(y_pred))

    correct_results_sum = (y_pred_tag == y_test).sum().float()
    acc = correct_results_sum/y_test.shape[0]
    acc = torch.round(acc * 100)
    
    return acc

In [59]:
model.train()
for e in range(1, num_epochs+1):
    epoch_loss = 0
    epoch_acc = 0
    for X_batch, y_batch in train_loader:
        #X_batch, y_batch = X_batch.to("cpu"), y_batch.to("cpu")
        optimizer.zero_grad()
        
        y_pred = model(X_batch)
        
        loss = criterion(y_pred, y_batch.unsqueeze(1).float())
        acc = binary_acc(y_pred, y_batch.unsqueeze(1))
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        

    print(f'Epoch {e+0:03}: | Loss: {epoch_loss/len(train_loader):.5f} | Acc: {epoch_acc/len(train_loader):.3f}')
    

#torch.save(model.state_dict(), './credit_card_model.pth')

0
0
[[1.3875249431227499, -0.547811115680964, 0.0764222561830824, -0.7393341155590071, -0.519887064137748, -0.0260074594821701, -0.694673494567611, -0.0520996497541208, 0.378186623015986, 0.488237981593603, 0.995917817719813, -2.75653789926991, 1.31405118196218, 1.80524583384508, -0.29509726139069503, -0.6309888923691421, 0.0287928947898136, 2.10443605584389, -0.236070484666988, -0.469630173196602, -0.6582985624507399, -1.30728583716026, -0.0132205226013258, -0.9124152991486489, 0.163178421017443, 0.920996811517071, -0.0965601596551647, -0.0148461215358327, -0.22788939078686865], 0]
(tensor([ 1.3875, -0.5478,  0.0764, -0.7393, -0.5199, -0.0260, -0.6947, -0.0521,
         0.3782,  0.4882,  0.9959, -2.7565,  1.3141,  1.8052, -0.2951, -0.6310,
         0.0288,  2.1044, -0.2361, -0.4696, -0.6583, -1.3073, -0.0132, -0.9124,
         0.1632,  0.9210, -0.0966, -0.0148, -0.2279]), tensor(0)) 2


ValueError: Sample larger than population or is negative

AttributeError: 'DataLoader' object has no attribute 'labels'

In [132]:
y_pred_list = []
model.eval()
with torch.no_grad():
    for X_batch in test_loader:
        #print(X_batch)
        y_test_pred = model(X_batch[0])
        y_test_pred = torch.sigmoid(y_test_pred)
        y_pred_tag = torch.round(y_test_pred)
        y_pred_list.append(y_pred_tag.cpu().numpy())
y_pred_list = [a.squeeze().tolist() for a in y_pred_list]

In [136]:
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test, y_pred_list))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56875
           1       0.93      0.62      0.74        87

    accuracy                           1.00     56962
   macro avg       0.97      0.81      0.87     56962
weighted avg       1.00      1.00      1.00     56962



In [137]:
confusion_matrix(y_test, y_pred_list)

array([[56871,     4],
       [   33,    54]])

In [140]:
y_test[y_test == 1]

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])