In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm, trange
import sklearn as skl
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel

# Input CSV locations, given relative to the notebook's working directory.
# CSV holding the max-diverse-density points that bags are measured against.
DIVERSE_DENSITY_POINTS_FILEPATH = "./../data/curated/max_diverse_density_points.csv"
# CSV holding the instance-level data (bag_id, label and feature columns).
TEST_DATA_PATH = "./../data/curated/dataset_scaled.csv"
# CSV holding the bag-level metadata; its bag_id and label columns are used below.
BAG_META_PATH = "../data/raw/bag_meta.csv"

# Performing Bag Embedding

I did not do a train-test split because the training data makes up only 20,000 of the 11,000,000 data points, so we simply predict on everything.

In [2]:
# Instance-level data: drop the first CSV column (presumably a saved index --
# TODO confirm) and order the rows by bag so each bag's instances are contiguous.
test_data_instance_level = (
    pd.read_csv(TEST_DATA_PATH)
    .iloc[:, 1:]
    .sort_values(by="bag_id")
)

# Max-diverse-density points as a plain 2-D array; the first three CSV columns
# are skipped (assumed to be non-feature metadata -- verify against the file).
max_diverse_density_points = (
    pd.read_csv(DIVERSE_DENSITY_POINTS_FILEPATH)
    .iloc[:, 3:]
    .to_numpy()
)

# Bag-level metadata, again without the first CSV column.
bag_meta = pd.read_csv(BAG_META_PATH).iloc[:, 1:]

In [3]:
# Preview the instance-level frame, sorted by bag_id (pandas truncates the display).
test_data_instance_level

Unnamed: 0,bag_id,label,left_dwell,left_std,left_mean,mid_dwell,mid_std,mid_mean,right_dwell,right_std,right_mean
6591702,0,0,0.098361,0.011491,0.145363,0.040414,0.010517,0.279156,0.100059,0.009703,0.389024
6591703,0,0,0.044871,0.011879,0.186717,0.048702,0.011439,0.254342,0.016381,0.007908,0.381707
6591705,0,0,0.042082,0.015425,0.179198,0.085375,0.010517,0.269231,0.059305,0.008506,0.389024
6591701,0,0,0.041744,0.009451,0.164160,0.068505,0.014158,0.259305,0.199724,0.010138,0.407317
6591700,0,0,0.015126,0.008043,0.172932,0.020170,0.008575,0.292804,0.009868,0.004362,0.379268
...,...,...,...,...,...,...,...,...,...,...,...
11025768,121837,0,0.019605,0.007023,0.313283,0.136717,0.009157,0.291563,0.016381,0.006929,0.350000
11025769,121837,0,0.028055,0.005857,0.241855,0.021930,0.013333,0.299007,0.068778,0.013619,0.386585
11025770,121837,0,0.053321,0.008917,0.259398,0.049875,0.011585,0.330025,0.037004,0.008724,0.396341
11025733,121837,0,0.036505,0.007314,0.288221,0.046281,0.013381,0.317618,0.027728,0.009540,0.402439


In [4]:
def bag_instance_dist(b_ind, p):
    """Return the smallest Euclidean distance between point `p` and bag `b_ind`.

    The bag's instances are looked up in the notebook-level frame
    `test_data_instance_level`; every column after the first two is treated
    as a feature.

    Parameters
    ----------
    b_ind : value compared against the `bag_id` column to select the bag.
    p : array-like broadcastable against the bag's feature rows.

    Returns
    -------
    The minimum over the bag's instances of the L2 norm of (instance - p).
    """
    in_bag = test_data_instance_level["bag_id"] == b_ind
    bag_features = test_data_instance_level[in_bag].iloc[:, 2:].to_numpy()
    # One distance per instance (row); the bag's distance is the closest one.
    return np.linalg.norm(bag_features - p, axis=1).min()

In [5]:
# Embedding dataset: start from one row per bag_meta entry, keeping only the
# bag id and its label. The embedding loop below adds one "p<j>" distance
# column per max-diverse-density point.
test_set = bag_meta.loc[:, ["bag_id", "label"]]

In [6]:
feature_names = [pos + "_" + stat for pos in ["left", "mid", "right"] for stat in ["dwell", "std", "mean"]]

# Pull the bag ids and each feature column out as NumPy arrays once, outside
# the loop: every iteration is then plain array arithmetic, with no string
# building / eval and no per-iteration concat of the full instance frame.
instance_bag_ids = test_data_instance_level["bag_id"].to_numpy()
instance_features = [test_data_instance_level[name].to_numpy() for name in feature_names]

# Bag embedding: feature p<j> of a bag is the minimum Euclidean distance from
# any of the bag's instances to the j-th max-diverse-density point.
embedding_columns = {}
for j in trange(len(max_diverse_density_points)):
    x = max_diverse_density_points[j, :]

    # Accumulate squared differences one feature at a time so the temporaries
    # stay one-column sized instead of materialising an (instances x features)
    # difference matrix.
    squared_distances = np.zeros(len(instance_bag_ids))
    for i, feature_values in enumerate(instance_features):
        squared_distances += (feature_values - x[i]) ** 2
    distances = np.sqrt(squared_distances)  # distance between every instance and x

    # Reduce to one value per bag, then attach it to test_set by *bag_id*
    # rather than by row position, so the result stays correct even if
    # bag_meta is not sorted by bag_id or does not have a default index.
    # NOTE(review): the previous version grouped by (bag_id, label); this
    # assumes each bag carries a single label -- confirm.
    min_distance_by_bag = pd.Series(distances).groupby(instance_bag_ids).min()
    embedding_columns[f"p{j}"] = test_set["bag_id"].map(min_distance_by_bag)

# Add all embedding columns in one step; re-running the cell overwrites the
# same p<j> columns, so it stays idempotent.
test_set = test_set.assign(**embedding_columns)


100%|██████████████████████████████████████████████████████████████████████████████████| 39/39 [01:38<00:00,  2.53s/it]


In [7]:
# Inspect the embedded bag-level dataset: bag_id, label and one p<j> column
# per max-diverse-density point.
test_set

Unnamed: 0,bag_id,label,p0,p1,p2,p3,p4,p5,p6,p7,...,p29,p30,p31,p32,p33,p34,p35,p36,p37,p38
0,0,0,0.271662,0.435573,0.311283,0.583093,0.545268,0.355944,0.501498,0.477741,...,0.542194,0.456449,0.399198,0.421277,0.431298,0.426177,0.416957,0.552836,0.524126,0.455041
1,1,0,0.142827,0.324119,0.111936,0.466192,0.347558,0.210668,0.385540,0.416273,...,0.402169,0.301454,0.231317,0.361316,0.308517,0.392676,0.347223,0.379180,0.406348,0.377297
2,2,0,0.261013,0.439264,0.261875,0.620451,0.522659,0.375327,0.487647,0.499055,...,0.546323,0.476110,0.381911,0.465840,0.462737,0.457357,0.454822,0.532831,0.559887,0.499465
3,3,0,0.309778,0.378574,0.270067,0.610070,0.515984,0.377568,0.429252,0.479326,...,0.547904,0.466583,0.336760,0.436672,0.442321,0.445165,0.446034,0.518008,0.538045,0.467484
4,4,0,0.239075,0.428717,0.226388,0.605785,0.519715,0.359208,0.475191,0.506880,...,0.538473,0.479122,0.352771,0.473864,0.455319,0.456473,0.458268,0.535884,0.572169,0.502860
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121833,121833,1,0.222654,0.053405,0.221843,0.307363,0.141251,0.300842,0.134699,0.193200,...,0.180049,0.117380,0.086907,0.183919,0.139318,0.249594,0.233241,0.152331,0.203170,0.179738
121834,121834,0,0.237136,0.038692,0.208411,0.332359,0.196599,0.348656,0.124921,0.240416,...,0.304265,0.231611,0.068902,0.222897,0.151068,0.223566,0.238266,0.169524,0.276922,0.198176
121835,121835,1,0.297714,0.143273,0.319549,0.298110,0.195007,0.349392,0.228576,0.247237,...,0.199053,0.113111,0.189713,0.198477,0.200882,0.261869,0.212954,0.207116,0.118847,0.157109
121836,121836,0,0.120391,0.321219,0.128294,0.392451,0.299622,0.142560,0.359546,0.425597,...,0.293409,0.196268,0.191309,0.388445,0.267256,0.457666,0.410211,0.337628,0.387855,0.386015
