In [30]:
import numpy as np
from scipy.spatial.distance import hamming

In [62]:
nbits = 4  # number of hyperplanes and binary vals to produce
d = 3  # vector dimensions

In [63]:
# create a set of 4 hyperplanes, with 2 dimensions
plane_norms = np.random.rand(nbits, d) - .5
plane_norms

array([[ 0.18442156,  0.16350865, -0.25483982],
       [-0.3847968 , -0.34725841, -0.00256127],
       [-0.45834603, -0.40843465,  0.0030024 ],
       [ 0.10384659,  0.11345144,  0.15915523]])

In [64]:
a = np.asarray([1, 2, 3])
b = np.asarray([12, 11, 15])
c = np.asarray([3, 10, 14])

In [65]:
# calculate the dot product for each of these
a_dot = np.dot(a, plane_norms.T)
b_dot = np.dot(b, plane_norms.T)
c_dot = np.dot(c, plane_norms.T)
print(a_dot)
print(b_dot)
print(c_dot)

[-0.25308057 -1.08699742 -1.26620814  0.80821516]
[ 0.18905673 -8.47582309 -9.94789756  4.8814534 ]
[-1.37940618 -4.66283225 -5.41735104  3.6742274 ]


In [66]:
# we know that a positive dot product == +ve side of hyperplane
# and negative dot product == -ve side of hyperplane
#a_dot = a_dot > 0
#b_dot = b_dot > 0
#c_dot = c_dot > 0
print(a_dot)
print(b_dot)
print(c_dot)

[-0.25308057 -1.08699742 -1.26620814  0.80821516]
[ 0.18905673 -8.47582309 -9.94789756  4.8814534 ]
[-1.37940618 -4.66283225 -5.41735104  3.6742274 ]


In [67]:
# convert our boolean arrays to int arrays to make bucketing
# easier (although is okay to use boolean for Hamming distance)
a_dot = a_dot.astype(int)
b_dot = b_dot.astype(int)
c_dot = c_dot.astype(int)
print(a_dot)
print(b_dot)
print(c_dot)

[ 0 -1 -1  0]
[ 0 -8 -9  4]
[-1 -4 -5  3]


In [68]:
vectors = [a_dot, b_dot, c_dot]
buckets = {}
i = 0

for i in range(len(vectors)):
    # convert from array to string
    hash_str = ''.join(vectors[i].astype(str))
    # create bucket if it doesn't exist
    if hash_str not in buckets.keys():
        buckets[hash_str] = []
    # add vector position to bucket
    buckets[hash_str].append(i)

print(buckets)

{'0-1-10': [0], '0-8-94': [1], '-1-4-53': [2]}


In [29]:
d = np.asarray([-1, -5])
d_dot = np.dot(d, plane_norms.T)
print(d_dot)
d_dot = d_dot.astype(int)
print(d_dot)

[ 1.83380459 -0.66170986  0.71631142 -2.4362924 ]
[ 1  0  0 -2]


In [48]:
for k in buckets.keys():
    k = [int(i) for i in k]
    print(f'Distance {d_dot} vs {k}: [{hamming(d_dot, k)}]')

Distance [ 1  0  0 -2] vs [0, 1, 0, 1]: [0.75]
Distance [ 1  0  0 -2] vs [0, 0, 0, 1]: [0.5]


## LSH with RANDOM PROJECTION ##

In [17]:
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
import random
import time

def generate_synthetic_data(n_samples, n_features):
    data = {
        'int_col': np.random.randint(0, 100, size=n_samples),
        'float_col': np.random.rand(n_samples),
        'date_col': pd.date_range(start='2023-01-01', periods=n_samples, freq='D'),
        'bool_col': np.random.choice([True, False], size=n_samples),
        'string_col': [''.join(np.random.choice(list('ABCDEFGHIJKLMNOPQRSTUVWXYZ'), 5)) for _ in range(n_samples)],
    }

    for col in range(6, n_features + 6):
        if col % 3 == 0:
            data[f'col{col}'] = np.random.randn(n_samples)
        elif col % 3 == 1:
            data[f'col{col}'] = np.random.choice([1, 2, 3], size=n_samples)
        else:
            data[f'col{col}'] = np.random.choice(['A', 'B', 'C'], size=n_samples)

    df = pd.DataFrame(data)
    return df

def pad_or_trim_data(original_data, n_components):
    if n_components < original_data.shape[0]:
        # Trim the data to the desired number of components
        padded_data = original_data[:n_components]
    elif n_components > original_data.shape[0]:
        # Calculate statistics from the original data
        mean = np.mean(original_data)
        std = np.std(original_data)
        min_val = np.min(original_data)
        max_val = np.max(original_data)

        # Number of dimensions to pad
        padding_dims = n_components - original_data.shape[0]

        # Generate synthetic data with similar distribution
        synthetic_data = []
        for i in range(padding_dims):
            # Generate synthetic values based on original data's statistics
            synthetic_values = np.random.normal(loc=mean, scale=std)
            synthetic_values = np.clip(synthetic_values, min_val, max_val)  # Ensure values are within original range
            synthetic_data.append(synthetic_values)

        # Combine original and synthetic data
        padded_data = np.hstack((original_data, np.array(synthetic_data)))
    else:
        # No padding or trimming needed
        padded_data = original_data

    return padded_data

def get_sparsity(column):
    missing_values = column.isnull().sum()

    # Calculate the total number of values in the column
    total_values = column.count() + missing_values

    # Calculate the sparsity
    sparsity = (missing_values / total_values) * 100
    return sparsity

def extract_metadata(in_column):
    metadata_length = 10
    metadata = [0] * metadata_length

    column = in_column.dropna()

    # meta-features
    if column.dtype in ['int32', 'float32', 'int64', 'float64']:
        metadata[0] = column.min()
        metadata[1] = column.max()
        metadata[2] = column.mean()
        metadata[3] = column.median()
        metadata[4] = column.std()
        metadata[5] = column.skew()
        metadata[6] = get_sparsity(column)
    elif column.dtype in ['string', 'object', 'category']:
        metadata[7] = len(column.unique())
    elif column.dtype == 'datetime64':
        metadata[8] = column.min().timestamp()
        metadata[9] = column.max().timestamp()
    elif column.dtype == 'timedelta64':
        metadata[8] = column.min().total_seconds()
        metadata[9] = column.max().total_seconds()
    elif column.dtype == 'bool':
        metadata[2] = column.mean()

    # Convert the metadata list to a numpy array
    metadata = np.array(metadata).reshape(-1, 1)
    metadata = np.nan_to_num(metadata, nan=0.0)

    # Scale the metadata using MinMaxScaler
    #scaler = MinMaxScaler()
    scaler = StandardScaler()
    metadata = np.array(metadata).reshape(-1, 1)
    metadata = scaler.fit_transform(metadata)
    metadata = metadata.flatten().tolist()

    return metadata

def get_dataset_metadata(dataset):
    dataset_metadata = []
    for col in dataset.columns:
        column = dataset[col]
        metadata = extract_metadata(column)
        dataset_metadata.extend(metadata)

    return np.array(dataset_metadata)

def get_largest_shape(dataframes):
    largest_shape = 0
    for df in dataframes:
        shape_0 = df.shape[0]
        if shape_0 > largest_shape:
            largest_shape = shape_0
    return largest_shape

def get_best_nbits(n_samples, buckets=[2, 4, 8, 16, 24, 32, 64]):
    best_nbits = 0
    prev_samples_buckets = np.inf
    for nbits in buckets:
        buckets = 1 << nbits
        samples_buckets = n_samples/buckets
        print(f"nbits == {nbits}")
        print(f"{n_samples} / {buckets} = {samples_buckets}, samples_buckets: {samples_buckets}, prev_samples_buckets:{prev_samples_buckets}")
        if samples_buckets > 0.1 and samples_buckets < prev_samples_buckets:
            prev_samples_buckets = samples_buckets
            best_nbits = nbits
        else:
            break
        
    return best_nbits

def simulate(n_data, min_rows, max_rows, min_cols, max_cols):
    dfs_metadata = []
    for i in range(n_data):
        rows = random.randint(min_rows, max_rows)
        cols = random.randint(min_cols, max_cols)
        
        df = generate_synthetic_data(rows, cols)
        df_metadata = get_dataset_metadata(df)
        dfs_metadata.append(df_metadata)
    
    largest_shape = get_largest_shape(dfs_metadata)

    d = largest_shape
    nbits = get_best_nbits(len(dfs_metadata))
    print(f"best_nbits:{nbits}")
    plane_norms = np.random.rand(nbits, d) - .5
    
    vectors = []
    padded_metadata = []
    for df_metadata in dfs_metadata:
        padded_meta = pad_or_trim_data(df_metadata, largest_shape)
        dot_data = np.dot(padded_meta, plane_norms.T)
        dot_data = dot_data > 0
        dot_data = dot_data.astype(int)
        vectors.append(dot_data)
        padded_metadata.append(padded_meta)
    
    buckets = {}
    i = 0
    for i in range(len(vectors)):
        # convert from array to string
        hash_str = ''.join(vectors[i].astype(str))
        # create bucket if it doesn't exist
        if hash_str not in buckets.keys():
            buckets[hash_str] = []
        # add vector position to bucket
        buckets[hash_str].append(i)

    return largest_shape, plane_norms, buckets, dfs_metadata, padded_metadata

def generate_hash(df, nbits, largest_shape, plane_norms):
    df_metadata = get_dataset_metadata(df)
    padded_metadata = pad_or_trim_data(df_metadata, largest_shape)
    dot_data = np.dot(padded_metadata, plane_norms.T)
    dot_data = dot_data > 0
    dot_data = dot_data.astype(int)
    hash_str = ''.join(dot_data.astype(str))
    return hash_str

def get_dist(n_samples, buckets=[2, 4, 8, 16, 24, 32, 64]):
    best_nbits = get_best_nbits(n_samples)
    print(f"best_nbits:{best_nbits}")
    for nbits in buckets:
        print(f"nbits: {nbits}, buckets: {1 << nbits}")

### Usage: ###

In [18]:
s_t = time.strftime("%H:%M:%S", time.localtime())
start_time = time.perf_counter()
print(f"Program started at {s_t}")

nsamples = 50
largest_shape, plane_norms, buckets, dfs_metadata, padded_metadata = simulate(nsamples, 1, 10, 5, 10)
#print(buckets)

finish_time = time.perf_counter()
e_t = time.strftime("%H:%M:%S", time.localtime())
print(f"Program finished at {e_t} in {(finish_time-start_time)/60} minutes")

Program started at 06:14:06
nbits == 2
50 / 4 = 12.5, samples_buckets: 12.5, prev_samples_buckets:inf
nbits == 4
50 / 16 = 3.125, samples_buckets: 3.125, prev_samples_buckets:12.5
nbits == 8
50 / 256 = 0.1953125, samples_buckets: 0.1953125, prev_samples_buckets:3.125
nbits == 16
50 / 65536 = 0.000762939453125, samples_buckets: 0.000762939453125, prev_samples_buckets:0.1953125
best_nbits:8
Program finished at 06:14:07 in 0.019301051666665594 minutes


In [14]:
get_dist(nsamples)

nbits == 2
50 / 4 = 12.5, samples_buckets: 12.5, prev_samples_buckets:inf
nbits == 4
50 / 16 = 3.125, samples_buckets: 3.125, prev_samples_buckets:12.5
nbits == 8
50 / 256 = 0.1953125, samples_buckets: 0.1953125, prev_samples_buckets:3.125
nbits == 16
50 / 65536 = 0.000762939453125, samples_buckets: 0.000762939453125, prev_samples_buckets:0.1953125
best_nbits:8
nbits: 2, buckets: 4
nbits: 4, buckets: 16
nbits: 8, buckets: 256
nbits: 16, buckets: 65536
nbits: 24, buckets: 16777216
nbits: 32, buckets: 4294967296
nbits: 64, buckets: 18446744073709551616


In [19]:
class Node:
    def __init__(self, name):
        self.name = name
        
class Edge:
    def __init__(self, node_left, node_right, edge_weight=0):
        self.node_left = node_left
        self.node_right = node_right
        self.edge_weight = edge_weight
        
    def getLeft(self):
        return self.node_left
    
    def getWeight(self):
        return self.edge_weight
    
    def setWeight(self, weight):
        self.edge_weight = weight
    
    def getRight(self):
        return self.node_right 
    
    def __str__(self):
        return f"Node[{self.node_left.name}]-Weight[{self.edge_weight}]-Node[{self.node_right.name}]"

from itertools import combinations
from sklearn.metrics.pairwise import cosine_similarity

edges = []
for k, v in buckets.items():
    combs = list(combinations(v, 2))
    #print(combs)
    for (l, r) in combs:
        left = dfs_metadata[l].reshape(1, -1)
        right = dfs_metadata[r].reshape(1, -1)
        #print(f"left.shape:{left},right.shape:{right}")
        padded_left = padded_metadata[l].reshape(1, -1)
        padded_right = padded_metadata[r].reshape(1, -1)
        #print(f"padded.left.shape:{padded_left},padded.right.shape:{padded_right}")
        score = cosine_similarity(padded_left, padded_right)
        if score > 0.5:
            edges.append(Edge(Node(l), Node(r), score[0, 0]))

for edge in edges:
    print(edge)

Node[0]-Weight[0.5515950267949342]-Node[5]
Node[0]-Weight[0.6980515092158206]-Node[10]
Node[0]-Weight[0.8014273364837243]-Node[23]
Node[0]-Weight[0.7639569741695945]-Node[27]
Node[0]-Weight[0.7198104739571907]-Node[30]
Node[0]-Weight[0.7492537624773558]-Node[44]
Node[0]-Weight[0.8155106282637982]-Node[45]
Node[5]-Weight[0.5606434566022221]-Node[23]
Node[5]-Weight[0.5208887965411048]-Node[30]
Node[5]-Weight[0.5115873207100843]-Node[44]
Node[5]-Weight[0.5111077239416677]-Node[45]
Node[10]-Weight[0.8020814217178334]-Node[23]
Node[10]-Weight[0.756561431792334]-Node[27]
Node[10]-Weight[0.811692930946754]-Node[30]
Node[10]-Weight[0.8433496723925208]-Node[44]
Node[10]-Weight[0.70986396751718]-Node[45]
Node[23]-Weight[0.8010804685412658]-Node[27]
Node[23]-Weight[0.807011788240516]-Node[30]
Node[23]-Weight[0.8411444708545557]-Node[44]
Node[23]-Weight[0.8053743509370538]-Node[45]
Node[27]-Weight[0.6559904221477747]-Node[30]
Node[27]-Weight[0.842255084724888]-Node[44]
Node[27]-Weight[0.7656906498

In [366]:
df = generate_synthetic_data(100, 10)
hash_str = generate_hash(df, nbits, largest_shape, plane_norms)
print(hash_str)
ids = buckets.get(hash_str)
print(ids)
for i in ids:
    print(dfs_metadata[i])

0111
[3, 5, 7, 8, 12, 13, 14, 17, 23, 25, 27, 28, 29, 31, 32, 33, 35, 36, 39, 40, 42, 43, 49]
[-0.40664105  1.9312138   1.02595421  1.31598884  0.1865727  -0.82693943
 -0.80653727 -0.80653727 -0.80653727 -0.80653727 -0.39580263  0.86530798
 -0.03053237 -0.20920083 -0.23987606  2.68415787 -0.66851349 -0.66851349
 -0.66851349 -0.66851349  0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
 -0.33333333 -0.33333333  3.         -0.33333333 -0.33333333 -0.33333333
 -0.33333333 -0.33333333 -0.33333333 -0.33333333 -0.33333333 -0.33333333
 -0.33333333 -0.33333333 -0.33333333 -0.33333333 -0.33333333  3.
 -0.33333333 -0.33333333 -2.00066183  1.39270513 -0.69867127 -0.87356715
  1.04034698  1.3102971  -0.04261224 -0.04261224 -0.04261224 -0.04261224
  0.11339233  2.02960706  1.0714997   1.0714997  -0.06242364 -0.84471503
 -0.84471503 -0.84471503 -0.84471503 -0.84471503 -0.33333333 -0.33333333
 -0.33333333 -0.33333333 -0.33333333 -0.33333333 -0.33

In [334]:
from itertools import combinations
print(list(combinations(['A', 'B', 'C', 'D', 'E'], 2)))

[('A', 'B'), ('A', 'C'), ('A', 'D'), ('A', 'E'), ('B', 'C'), ('B', 'D'), ('B', 'E'), ('C', 'D'), ('C', 'E'), ('D', 'E')]


In [22]:
def convert_keys_to_indices(data_dict):
    indexed_dict = {}
    for idx, (key, value) in enumerate(data_dict.items()):
        indexed_dict[idx] = value
    return indexed_dict

dfs_metadata = {'A': [1, 2, 3], 'B': [4, 5]}
indexed_metadata = convert_keys_to_indices(dfs_metadata)
print(indexed_metadata)
second_key = list(dfs_metadata.keys())[1]
print(second_key)

{0: [1, 2, 3], 1: [4, 5]}
B
