In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

# Render floats with three decimal places in pandas output.
pd.options.display.float_format = lambda value: '%.3f' % value

# Relative paths of the input CSV files.
DATA_PATH = "../data/curated/dataset_scaled.csv"
BAG_META_PATH = "../data/raw/bag_meta.csv"

In [2]:
# Load the scaled dataset; the first CSV column is a saved index, so it is dropped.
data = (
    pd.read_csv(DATA_PATH)
    .iloc[:, 1:]
)

# Take 80% of positive labels, and an equal number of negative labels
There are 499574 instances and 5475 bags with label 1

Due to computational complexity, I downsampled the data to 50000 instances each from the positive and negative bags.

In [3]:
# Count the rows labelled 1, then take half of that count as the positive training size.
N_POSITIVE = len(data[data["label"] == 1])
N_POSITIVE_TRAIN = int(0.5 * N_POSITIVE)
print(N_POSITIVE_TRAIN)

249787


In [4]:
# Load the per-bag metadata; the first CSV column is a saved index, so it is dropped.
bag_data = (
    pd.read_csv(BAG_META_PATH)
    .iloc[:, 1:]
)

In [5]:
# Bag ids to keep: the first 625 label-1 bags and the first 660 label-0 bags,
# in the order they appear in bag_data.
positive_bag_ids = list(bag_data.loc[bag_data.label == 1].bag_id)[:625]
negative_bag_ids = list(bag_data.loc[bag_data.label == 0].bag_id)[:660]

# Stack the matching instances (positive bags first), order them by bag,
# and renumber the rows from zero.
train_data = (
    pd.concat(
        [
            data.loc[data.bag_id.isin(positive_bag_ids)],
            data.loc[data.bag_id.isin(negative_bag_ids)],
        ],
        axis=0,
    )
    .sort_values(by="bag_id")
    .reset_index(drop=True)
)

In [6]:
def euclidean(p1, p2):
    """Euclidean (L2) distance between two numpy vectors.

    p1, p2 -- numpy vectors to compare.
    Returns the norm of their element-wise difference.
    """
    offset = p1 - p2
    return np.linalg.norm(offset)

In [7]:
def bag_instance_dist(b_ind, i_ind):
    """Return the smallest Euclidean distance between one instance and the instances of a bag.

    Reads the module-level ``train_data`` frame; its columns from position 2
    onwards are used as the feature vector.

    Parameters
    ----------
    b_ind
        Value compared against ``train_data.bag_id`` to select the bag's rows.
    i_ind : int
        Positional row index into ``train_data`` of the query instance.

    Returns
    -------
    The minimum of the row-wise Euclidean distances between the query
    instance and every instance in the bag.

    Raises
    ------
    IndexError
        If no row of ``train_data`` has ``bag_id == b_ind``.
    """
    bag = train_data.loc[train_data.bag_id == b_ind]
    if bag.empty:
        # Fail with a readable message instead of an opaque positional-indexing error.
        raise IndexError("no instances found for bag_id %r" % (b_ind,))

    # Cast explicitly: selecting a single row from a mixed-dtype frame can yield an
    # object-dtype array, which np.linalg.norm cannot reduce.
    bag_features = bag.iloc[:, 2:].to_numpy(dtype=np.float64)
    instance = train_data.iloc[i_ind, 2:].to_numpy(dtype=np.float64)

    # Broadcasting subtracts the instance from every bag row at once.
    return np.min(np.linalg.norm(bag_features - instance, axis=1))

In [8]:
# Dimensions of the bag-by-instance distance matrix.
N_BAGS = len(pd.unique(train_data["bag_id"]))
N_INSTANCES = len(train_data)
# Pre-fill with 3: with 9 normalized features, the maximum (Euclidean) distance between two points is 3
distances = np.full((N_BAGS, N_INSTANCES), 3.0)

In [9]:
# for b, b_ind in (enumerate(list(train_data.bag_id.unique()))):
#     print("Starting iteration", b+1, "out of", len(train_data.bag_id.unique()))
#     for i_ind in trange(train_data.shape[0]):
#         distance = bag_instance_dist(b_ind, i_ind)
#         distances[b, i_ind] = distance

In [10]:
# Feature matrix: every column from position 2 onwards, as a float32 numpy array.
features = train_data.iloc[:, 2:].to_numpy().astype(np.float32)

In [None]:
# distances = {}
# for i in trange(len(features)):
#     for j in range(i, len(features)):
#         distances[(i, j)] = np.linalg.norm(features[i] - features[j])

In [None]:
# features[:25000]

In [None]:
# np.linalg.norm(_[:, np.newaxis, :] - _[np.newaxis, :, :], axis = 2)

In [None]:
# Number of complete 25000-row chunks that fit into 100196 rows.
100196 // 25000

In [None]:
# Pairwise Euclidean distances among the first 25000 feature rows, saved as text.
# NOTE(review): the original cell saved IPython's `_` (the previous cell's output),
# which is undefined after Restart & Run All. The commented-out cells above suggest
# `_` held exactly this matrix -- confirm.
# NOTE: the broadcasted difference is an (n, n, n_features) array, so peak memory
# is much larger than the saved (n, n) matrix.
block = features[:25000]
temp = np.linalg.norm(block[:, np.newaxis, :] - block[np.newaxis, :, :], axis=2)
# '%.6f' keeps the fractional part; '%d' truncated every distance to an integer.
np.savetxt('250000_250000.txt', temp, fmt='%.6f')
del temp, block

In [None]:
# Pairwise Euclidean distances among feature rows 25000-49999, saved as text.
# NOTE: the broadcasted difference is an (n, n, n_features) array, so peak memory
# is much larger than the saved (n, n) matrix.
block = features[25000:50000]
temp = np.linalg.norm(block[:, np.newaxis, :] - block[np.newaxis, :, :], axis=2)
# '%.6f' keeps the fractional part; '%d' truncated every distance to an integer.
np.savetxt('25000-50000_25000-50000.txt', temp, fmt='%.6f')
del temp, block

In [None]:
# Pairwise Euclidean distances among feature rows 50000-74999, saved as text.
# NOTE: the broadcasted difference is an (n, n, n_features) array, so peak memory
# is much larger than the saved (n, n) matrix.
block = features[50000:75000]
temp = np.linalg.norm(block[:, np.newaxis, :] - block[np.newaxis, :, :], axis=2)
# '%.6f' keeps the fractional part; '%d' truncated every distance to an integer.
np.savetxt('50000-75000_50000-75000.txt', temp, fmt='%.6f')
del temp, block

In [None]:
# Pairwise Euclidean distances among feature rows 75000 onwards, saved as text.
# The slice is open-ended to match the open-ended file name; stopping at 100000
# silently dropped any rows beyond that index. When there are at most 100000 rows
# the two slices are identical.
# NOTE: the broadcasted difference is an (n, n, n_features) array, so peak memory
# is much larger than the saved (n, n) matrix.
block = features[75000:]
temp = np.linalg.norm(block[:, np.newaxis, :] - block[np.newaxis, :, :], axis=2)
# '%.6f' keeps the fractional part; '%d' truncated every distance to an integer.
np.savetxt('75000-_75000-.txt', temp, fmt='%.6f')
del temp, block

In [None]:
# Distances from every feature row at index 25000 or later (matrix rows) to each of
# the first 25000 feature rows (matrix columns), saved as text.
# NOTE: the broadcasted difference is a (rows, 25000, n_features) array, so peak
# memory is much larger than the saved matrix.
temp = np.linalg.norm(
    features[25000:][:, np.newaxis, :] - features[:25000][np.newaxis, :, :],
    axis=2,
)
# '%.6f' keeps the fractional part; '%d' truncated every distance to an integer.
np.savetxt('500000_250000.txt', temp, fmt='%.6f')
del temp

In [None]:
# Full pairwise Euclidean distance matrix over every feature row (displayed, not stored).
np.linalg.norm(
    features[:, None, :] - features[None, :, :],
    axis=-1,
)

In [None]:
# Number of distinct bags present in the training subset.
len(pd.unique(train_data["bag_id"]))

In [None]:
# Run an empty loop over 50000*49999//2 iterations -- the number of unordered pairs
# among 50000 items -- presumably to gauge how long a pure-Python pairwise pass takes.
# `tqdm(range(...))` replaces `trange(...)`: only `tqdm` is imported in this notebook,
# so `trange` raised NameError.
for i in tqdm(range(50000*49999//2)):
    i + 1
print("done")