In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm, trange
import sklearn as skl

pd.set_option('display.float_format', lambda x: '%.3f' % x)

DATA_PATH = "../data/curated/dataset_scaled.csv"
BAG_META_PATH = "../data/raw/bag_meta.csv"

In [2]:
data = pd.read_csv(DATA_PATH).iloc[:, 1:] # exclude the first column as it is the index

# Take 80% of positive labels, and an equal number of negative labels
There are 499574 instances and 5475 bags with label 1

Edit: Due to computational complexity, I downsampled 20051 instances from positive and negative bags (around 50/50 proportion)

In [3]:
bag_data = pd.read_csv(BAG_META_PATH).iloc[:, 1:] # exclude the first column as it is the index

In [4]:
train_data = data.loc[data.bag_id.isin(list(bag_data.loc[bag_data.label == 1].bag_id)[:120])]
train_data = pd.concat([train_data, data.loc[data.bag_id.isin(list(bag_data.loc[bag_data.label == 0].bag_id)[:130])]], axis = 0)
train_data = train_data.sort_values(by="bag_id")
train_data = train_data.reset_index(drop=True)
train_data

Unnamed: 0,bag_id,label,left_dwell,left_std,left_mean,mid_dwell,mid_std,mid_mean,right_dwell,right_std,right_mean
0,0,0,0.015,0.008,0.173,0.020,0.009,0.293,0.010,0.004,0.379
1,0,0,0.042,0.015,0.179,0.085,0.011,0.269,0.059,0.009,0.389
2,0,0,0.016,0.009,0.153,0.071,0.011,0.264,0.090,0.011,0.393
3,0,0,0.045,0.012,0.187,0.049,0.011,0.254,0.016,0.008,0.382
4,0,0,0.098,0.011,0.145,0.040,0.011,0.279,0.100,0.010,0.389
...,...,...,...,...,...,...,...,...,...,...,...
13656,2776,1,0.056,0.029,0.361,0.039,0.032,0.715,0.125,0.029,0.398
13657,2776,1,0.020,0.018,0.424,0.037,0.010,0.578,0.087,0.012,0.343
13658,2776,1,0.053,0.024,0.361,0.053,0.018,0.653,0.072,0.012,0.350
13659,2776,1,0.019,0.045,0.524,0.023,0.030,0.653,0.030,0.010,0.359


In [5]:
features = train_data.iloc[:,2:].values.astype(np.float32)
distances = np.linalg.norm(features[:, np.newaxis, :] - features[np.newaxis, :, :], axis = 2)
np.savetxt("./../data/curated/distances_10000.txt", distances)

# distances = np.loadtxt("./../data/curated/distances_10000.txt")

In [6]:
N_BAGS = len(train_data.bag_id.unique())
N_INSTANCES = train_data.shape[0]

In [7]:
bag_features = np.ones((N_BAGS, N_INSTANCES)) * 3
for i, bag_id in tqdm(enumerate(list(train_data.bag_id.unique()))):
    for j in range(train_data.shape[0]):
        bag = train_data.loc[train_data.bag_id == bag_id]
        for k in bag.index:
            bag_features[i][j] = min(bag_features[i][j], distances[k][j])
np.savetxt("./../data/curated/bag_features.txt", bag_features)

# bag_features = np.loadtxt("./../data/curated/bag_features.txt")
# print(bag_features.shape)
# print(bag_features)

250it [16:28,  3.95s/it]


# Feature Selection with LinearSVC
Documentation:

* https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html
* https://scikit-learn.org/stable/modules/feature_selection.html

In [8]:
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel

In [9]:
# Getting the training labels for each bag
Y = train_data.loc[:, ["bag_id", "label"]].value_counts(ascending=True).reset_index().sort_values(by="bag_id").label

In [33]:
# We use the dual=False here because the dual=True has not been implemented in this package for L1 penalties
np.random.seed(40)
penalized_svm = LinearSVC(C=0.5, penalty="l1", dual=False, max_iter = 10000).fit(bag_features, Y)
model = SelectFromModel(estimator = penalized_svm, prefit=True)
selected_features = model.transform(bag_features)
print(selected_features.shape)
selected_features

(250, 13)


array([[0.23526356, 0.28061926, 0.39882559, ..., 0.42617714, 0.39967412,
        0.45504135],
       [0.10113101, 0.14165424, 0.22722259, ..., 0.39267564, 0.28088722,
        0.37729684],
       [0.24330905, 0.32073817, 0.40636903, ..., 0.45735732, 0.43132034,
        0.49946529],
       ...,
       [0.2427081 , 0.22664119, 0.19587411, ..., 0.51954061, 0.33173871,
        0.44068825],
       [0.26835781, 0.23929757, 0.05008517, ..., 0.30608976, 0.11694902,
        0.21721114],
       [0.38632751, 0.3529903 , 0.15921737, ..., 0.14272821, 0.08474394,
        0.09589792]])

In [34]:
print("New # Features:", selected_features.shape[1])
print("Was:", bag_features.shape[1])

New # Features: 13
Was: 13661


In [35]:
feature_indices = []
for j in range(selected_features.shape[1]):
    for i in range(bag_features.shape[1]):
        p = bag_features[:,i]
        if np.linalg.norm(p - selected_features[:,j]) <= 0.00001:
            feature_indices.append(i)
            break

In [36]:
max_diverse_density_points = train_data.iloc[feature_indices, :]
max_diverse_density_points

Unnamed: 0,bag_id,label,left_dwell,left_std,left_mean,mid_dwell,mid_std,mid_mean,right_dwell,right_std,right_mean
2006,50,0,0.098,0.011,0.424,0.112,0.015,0.213,0.047,0.017,0.395
4551,93,0,0.06,0.032,0.486,0.216,0.012,0.252,0.075,0.012,0.323
6610,212,1,0.085,0.024,0.549,0.012,0.032,0.442,0.066,0.02,0.288
6734,302,1,0.005,0.034,0.662,0.005,0.007,0.33,0.056,0.01,0.339
8442,602,1,0.092,0.032,0.561,0.009,0.018,0.467,0.106,0.022,0.282
9231,787,1,0.07,0.014,0.499,0.005,0.017,0.429,0.046,0.032,0.209
10111,1004,1,0.062,0.02,0.561,0.005,0.016,0.454,0.015,0.009,0.294
10215,1096,1,0.106,0.03,0.536,0.027,0.034,0.467,0.098,0.015,0.301
10367,1148,1,0.022,0.033,0.624,0.015,0.015,0.342,0.311,0.013,0.39
10778,1265,1,0.048,0.018,0.624,0.0,0.071,0.516,0.298,0.018,0.339


In [14]:
max_diverse_density_points.to_csv("./../data/curated/max_diverse_density_points.csv")

In [44]:
np.sum(penalized_svm.predict(bag_features) == Y)

184

In [45]:
len(Y)

250

# Replicating Code from Original Paper
Source: https://github.com/johnvorsten/MILES/blob/57656bb29ccfec644351e18a316c03627769f895/src/tests/l1_svm_demo.py#L88

In [17]:
# # Globals
# N_POSITIVE_BAGS = 100
# N_NEGATIVE_BAGS = 25
# INSTANCE_SPACE = 2
# BAG_SIZE = 9
# SIGMA_EMBEDDING = 3  # Regularizer, Embedding
# GAMMA_SVC = 'scale'
# PENALTY = 'l1'  # L1 loss penalization
# LOSS = 'squared_hinge'  # Loss function
# C = 1.0  # SVM regularization, inversely proportional

In [18]:
# # Define SVM
# svmc_l1 = skl.svm.LinearSVC(
#     loss=LOSS, penalty=PENALTY, C=C, dual=False, max_iter=5000)

# # SVC Using LibSVM uses the squared l2 loss
# svmc = skl.svm.SVC(kernel='rbf', gamma=GAMMA_SVC, C=C)

# # Define grid search parameters
# params_l1svc = {'C': [0.5, 1, 2],
#                 }
# params_svc = {'C': [0.5, 1, 2],
#               'kernel': ['rbf', 'poly']}

# # Grid search
# svmc_l1_gs = skl.model_selection.GridSearchCV(estimator=svmc_l1,
#                                               param_grid=params_l1svc,
#                                               scoring=['accuracy',
#                                                        'precision', 'recall'],
#                                               refit=False,
#                                               n_jobs=6,
#                                               cv=None,  # Default 5-fold validation
#                                               )


# svmc_gs = skl.model_selection.GridSearchCV(estimator=svmc,
#                                            param_grid=params_svc,
#                                            scoring=['accuracy',
#                                                     'precision', 'recall'],
#                                            n_jobs=6,
#                                            refit=False,
#                                            cv=None,  # Default 5-fold validation
#                                            )

In [19]:
# # Estimators expect (instance,features). embedded bags are encoded where
# # features are along axis=1
# svmc_l1_gs.fit(bag_features, Y)
# svmc_gs.fit(bag_features, Y)

In [20]:
# # Print results
# print("L1 SVM Results:")
# print(svmc_l1_gs.cv_results_)
# print()
# print("rbf, polynomial SVM Results: ")
# print(svmc_gs.cv_results_)

# Archived Code

In [21]:
# def euclidean(p1, p2):
#     """Takes in two numpy vectors p1 and p2, and returns the euclidean distance between them"""
#     return np.linalg.norm(p1 - p2)

In [22]:
# def bag_instance_dist(b_ind, i_ind):
#     bag = train_data.loc[train_data.bag_id == b_ind]
#     label = bag.iloc[0, 1] #Y
#     features = bag.iloc[:,2:].values #X
#     instance = train_data.iloc[i_ind, 2:].values
#     distances = np.linalg.norm(features - instance, axis=1)
#     return np.min(distances)

In [23]:
# N_BAGS = len(train_data.bag_id.unique())
# N_INSTANCES = train_data.shape[0]
# distances = np.ones((N_BAGS, N_INSTANCES)) * 3 # With 9 normalized features, the maximum (Euclidean) distance between two points is 3

In [24]:
# for b, b_ind in (enumerate(list(train_data.bag_id.unique()))):
#     print("Starting iteration", b+1, "out of", len(train_data.bag_id.unique()))
#     for i_ind in trange(train_data.shape[0]):
#         distance = bag_instance_dist(b_ind, i_ind)
#         distances[b, i_ind] = distance

In [25]:
# distances = {}
# for i in trange(len(features)):
#     for j in range(i, len(features)):
#         distances[(i, j)] = np.linalg.norm(features[i] - features[j])

In [26]:
# features[:25000]

In [27]:
# np.linalg.norm(_[:, np.newaxis, :] - _[np.newaxis, :, :], axis = 2)

In [28]:
# np.savetxt('250000_250000.txt', _, fmt='%d')

In [29]:
# temp = np.linalg.norm(features[25000:50000][:, np.newaxis, :] - features[25000:50000][np.newaxis, :, :], axis = 2)
# np.savetxt('25000-50000_25000-50000.txt',
#            temp,
#            fmt = '%d'
#           )
# del temp

In [30]:
# temp = np.linalg.norm(features[50000:75000][:, np.newaxis, :] - features[50000:75000][np.newaxis, :, :], axis = 2)
# np.savetxt('50000-75000_50000-75000.txt',
#            temp,
#            fmt = '%d'
#           )
# del temp

In [31]:
# temp = np.linalg.norm(features[75000:100000][:, np.newaxis, :] - features[75000:100000][np.newaxis, :, :], axis = 2)
# np.savetxt('75000-100000_75000-100000.txt',
#            temp,
#            fmt = '%d'
#           )
# del temp

In [32]:
# np.savetxt('500000_250000.txt',
#            np.linalg.norm(features[25000:][:, np.newaxis, :] - features[:25000][np.newaxis, :, :], axis = 2),
#            fmt = '%d'
#           )