In [7]:
import heapq
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
cc_data = pd.read_csv("creditcard.csv")

In [6]:
transactionData = cc_data.drop(['Time'], axis=1)
transactionData['Amount'] = StandardScaler().fit_transform(transactionData['Amount'].values.reshape(-1, 1))


X = transactionData.drop("Class", axis=1).values
y = transactionData['Class'].values
assert(len(X) == len(y))
print(X[0], y[2])
#print(y)

np.var(X)

[-1.35980713 -0.07278117  2.53634674  1.37815522 -0.33832077  0.46238778
  0.23959855  0.0986979   0.36378697  0.09079417 -0.55159953 -0.61780086
 -0.99138985 -0.31116935  1.46817697 -0.47040053  0.20797124  0.02579058
  0.40399296  0.2514121  -0.01830678  0.27783758 -0.11047391  0.06692807
  0.12853936 -0.18911484  0.13355838 -0.02105305  0.24496426] 0


1.0941979355526301

In [12]:
X_tensor = torch.as_tensor(X)
y_tensor = torch.as_tensor(y)

In [13]:
X_tensor.shape

torch.Size([284807, 29])

In [14]:
class binaryClassification(nn.Module):
    def __init__(self):
        super(binaryClassification, self).__init__()
        self.layer_1 = nn.Linear(29, 64) 
        self.layer_2 = nn.Linear(64, 64)
        self.layer_out = nn.Linear(64, 1) 
        
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.1)
        self.batchnorm1 = nn.BatchNorm1d(64)
        self.batchnorm2 = nn.BatchNorm1d(64)
        
    def forward(self, inputs):
        x = self.relu(self.layer_1(inputs))
        x = self.batchnorm1(x)
        x = self.relu(self.layer_2(x))
        x = self.batchnorm2(x)
        x = self.dropout(x)
        x = self.layer_out(x)
        
        return x



In [18]:
model = binaryClassification().double()

In [17]:
transactionData = cc_data.drop(['Time'], axis=1)
transactionData['Amount'] = StandardScaler().fit_transform(transactionData['Amount'].values.reshape(-1, 1))

56962


In [15]:
num_epochs = 100
minibatch_size = 32
learning_rate = 1e-3

In [41]:
from pymemcache.client import base
from pymemcache import serde
from torch.utils.data.sampler import Sampler
from torch.utils.data.dataset import Dataset
import random
import heapq
        
class RemoteCacheSampler(Sampler):
    def __init__(self, dataset):
        # not efficient but keep copy of dataset in sampler
        self.dataset = dataset
        
        # create generator, which allows us to iterate over the dataset once and only once
        seed = int(torch.empty((), dtype=torch.int64).random_().item())
        self.generator=torch.Generator()
        self.generator.manual_seed(seed)
        
    def __iter__(self):
        # should return an iterator over dataset
        for g in torch.randperm(len(train_data), generator=self.generator).tolist():
            yield g
            
    def __len__(self):
        # returns number of rows in dataframe
        return len(self.dataset)
    
class RemoteCacheDataset(Dataset):
    def __init__(self, *tensors, data_variance):
        # set client for memcached
        # this sets the port to 11211 and also crucially adds a serializer
        self.client =  base.Client(("localhost", 11211), serde=serde.pickle_serde) # client connection gets set up with default values for now

        # shadow cache is a heap with the following format:
        #    [<count of uses>, <item index in original dataset>]
        self.shadow_cache = []
        self.cache_size = 32
        # this keeps the overall data variance and is used for scaling determination
        # cache variance keeps track of the current variance of the cache
        self._data_variance = data_variance
        self._cache_variance = 0
        self.tensors = tensors
        self.size = tensors[0].size(0)
        
        # initially seed memcached server with X number of values
        for i in range(self.cache_size):
            self._write_cache(i, [tensors[0][i].tolist(), tensors[1][i].tolist()])
            heapq.heappush(self.shadow_cache, [1, i])
            
        # compute cache variance - used for scaling sample from cache
        self._compute_cache_variance()
        

    def __getitem__(self, index):
        return self._query_cache(index)
    
    def __len__(self):
        return self.size
    
    def _query_cache(self, index):
        result = self.client.get(str(index))
        
        if result is None:
            # if we get a cache miss we need to perform the following operations:
            #    1) select a random index in the queue
            #    2) update the counter of the selected index to be the largest in the queue
            #    3) evict LRU item and fetch the new item - should also increase its index so it won't
            #       immediately get evicted from the cache
            #    4) compute updated cache variance
            substitue_index = random.randint(0, len(self.shadow_cache) - 1)
            
            largest_value = self.shadow_cache[self.cache_size-1][0]
            self.shadow_cache[substitue_index][0] = largest_value + 1
            # re-heapify in case we selected the least recently used item
            heapq.heapify(self.shadow_cache)
            
            # get actual data from cache and scale - how to scale??
            key_to_get = str(self.shadow_cache[substitue_index][1])
            result = self.client.get(key_to_get)
            # scale result by simple multiplication: this scale factor will likely be small given the initial
            # variance of the overall dataset
            print(result)
            result = [(np.array(result[0]) * self._cache_variance/self._data_variance).tolist(), result[1]]
            
            #key_to_remove = random.sample(self.shadow_cache, 1)[0]
            # according to heapq, this is more efficient than explicitly doing a push/pop operation
            heapq.heappushpop(self.shadow_cache, [largest_value+2, index])
            self._write_cache(index, [self.tensors[0][index].tolist(), self.tensors[1][index].tolist()])
            
            # lastly recompute updated cache variance
            self._compute_cache_variance()

        # result should now be in the form of a list with data as first item and output as second, 
        # convert to the format the model exepcts
        item = tuple([torch.as_tensor(result[0]), torch.as_tensor(result[1])])
        
        return item
    
    def _write_cache(self, index, item):
        # update remote cache.  Ideally this should be done on the server
        # not the client, however that is a limitation of using memcached
        self.client.set(str(index), item)
        
    def _compute_cache_variance(self):
        # compute the current variance of the cache
        # ideally this is something the cache server would do, however
        # again due to limitations of using memcached, we perform this on the client side
        temp_data = []
        # slow and inefficient way to compute variance, but also easist to proof out
        for i in range(self.cache_size):
            temp_data.append(self.tensors[0][i].tolist())

        temp_data = np.array(temp_data)
        self._cache_variance = np.var(temp_data)

In [42]:
from torch.utils.data import TensorDataset, DataLoader

X_train, X_test, y_train, y_test = train_test_split(X_tensor, y_tensor, test_size=0.2, random_state=1)

print(np.var(X))
train_data = RemoteCacheDataset(X_train, y_train, data_variance=np.var(X))
#print(type(train_data))
#print(train_data[0], type(train_data[0]))
#print(len(train_data), len(train_data[0]))
test_data = TensorDataset(X_test)

#seed = int(torch.empty((), dtype=torch.int64).random_().item())
#print(seed, type(seed))
#generator=torch.Generator()
#generator.manual_seed(seed)
#a = torch.randperm(len(train_data), generator=generator).tolist()
#print(type(a))
#print(a[:3])
#print(train_data[a[0]])
train_sampler = RemoteCacheSampler(train_data)
train_loader = DataLoader(dataset=train_data, batch_size=minibatch_size, 
                          sampler=train_sampler)#shuffle=True)

test_loader = DataLoader(dataset=test_data, batch_size=1)
#print(train_loader, type(train_loader))
#count = 0
#for X_batch, y_batch in train_loader:
#    print(count)
#    count += 1
#    print(type(X_batch), len(X_batch), X_batch)
#    break
    #print("X batch",X_batch, type(X_batch))
    #print("Y batch",y_batch, type(y_batch))

1.0941979355526301


In [124]:
print(type(X_test))

<class 'torch.Tensor'>


In [33]:
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), learning_rate)

NameError: name 'optim' is not defined

In [48]:
history = {}
history['train_loss'] = []
history['test_loss'] = []

In [34]:
def binary_acc(y_pred, y_test):
    y_pred_tag = torch.round(torch.sigmoid(y_pred))

    correct_results_sum = (y_pred_tag == y_test).sum().float()
    acc = correct_results_sum/y_test.shape[0]
    acc = torch.round(acc * 100)
    
    return acc

In [43]:
model.train()
for e in range(1, num_epochs+1):
    epoch_loss = 0
    epoch_acc = 0
    for X_batch, y_batch in train_loader:
        #X_batch, y_batch = X_batch.to("cpu"), y_batch.to("cpu")
        optimizer.zero_grad()
        
        y_pred = model(X_batch)
        
        loss = criterion(y_pred, y_batch.unsqueeze(1).float())
        acc = binary_acc(y_pred, y_batch.unsqueeze(1))
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        

    print(f'Epoch {e+0:03}: | Loss: {epoch_loss/len(train_loader):.5f} | Acc: {epoch_acc/len(train_loader):.3f}')
    

#torch.save(model.state_dict(), './credit_card_model.pth')

[[0.0951049024200784, 0.957347058540245, -0.40883612266111796, -0.527176458477784, 0.9986340243259141, -0.8789404695889549, 0.914890166727169, -0.10177371573570099, -0.0376394047285783, -0.825499383097851, -0.714938831897266, -0.12436000518308801, 0.00370072740397579, -1.01344993760901, -0.233583579856413, 0.33699672022899896, 0.359413426547255, -0.20505670341311, -0.21852384390943103, 0.0368714625895726, -0.33767732538022494, -0.844743851886843, 0.10468184860850699, 0.5586217772858979, -0.415097596707234, 0.11380693472701801, 0.21988299283034698, 0.0855951636932077, -0.3460728186796746], 0]
[[-2.17323594869766, 0.59825691485958, 1.20957245587291, -0.328127172205285, 0.331068175451033, 1.09334617972661, 0.307360892645115, 0.651999205536677, 0.460821898200986, -0.25293244504983803, 0.34015908581545695, 0.46573647869003104, -0.7340638975475551, -0.127522231322176, 0.4541596770373621, -1.39917399492748, 1.1729441962775, -2.67690342462987, -1.99040381957445, -0.374292242466957, -0.20396196

NameError: name 'optimizer' is not defined

AttributeError: 'DataLoader' object has no attribute 'labels'

In [132]:
y_pred_list = []
model.eval()
with torch.no_grad():
    for X_batch in test_loader:
        #print(X_batch)
        y_test_pred = model(X_batch[0])
        y_test_pred = torch.sigmoid(y_test_pred)
        y_pred_tag = torch.round(y_test_pred)
        y_pred_list.append(y_pred_tag.cpu().numpy())
y_pred_list = [a.squeeze().tolist() for a in y_pred_list]

In [136]:
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test, y_pred_list))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56875
           1       0.93      0.62      0.74        87

    accuracy                           1.00     56962
   macro avg       0.97      0.81      0.87     56962
weighted avg       1.00      1.00      1.00     56962



In [137]:
confusion_matrix(y_test, y_pred_list)

array([[56871,     4],
       [   33,    54]])

In [140]:
y_test[y_test == 1]

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])