In [110]:
import warnings
warnings.filterwarnings("ignore")

import os
import time
import re

from random import shuffle
from random import sample
from itertools import combinations

In [12]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
all_stopwords = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\goshl\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [39]:
def cleandf(df):
    df = df.apply(lambda x: x.fillna(''))
    df = df.apply(lambda x: x.astype(str).str.lower())
    df = df.apply(lambda x: x.astype(str).str.replace('\W', ' ', regex=True))
    df.replace('-', ' ', inplace=True) #replace all '-' values with NaN
    df.dropna(how="all", axis=1, inplace=True) #drop columns where all values are NaN
    return df

def extract(filename, sep=','):
    df = pd.read_csv(filename, sep, engine='python', encoding = 'UTF-8')
    return cleandf(df)

def read_files(directory, sep=','):
    dfs = []
    for filename in os.listdir(directory):
        f = os.path.join(directory, filename)
        if os.path.isfile(f):
            df = extract(f, sep)
            dfs.append(df)
    return dfs

In [167]:
dfs = read_files('data/lsh_sample/in')
len(dfs)

5

In [169]:
dfs[4]

Unnamed: 0,location_1/human_address/zip,location_1/human_address/city,location_1/longitude
0,10026,new york,73 95561864
1,10026,new york,73 95334278
2,10027,new york,73 95514646
3,10027,new york,73 94510035
4,10029,new york,73 94945722
...,...,...,...
65,11434,queens,73 7753069
66,11434,queens,73 7753069
67,11691,queens,73 75231801
68,11691,queens,73 76182654


In [170]:
dfs[0].iloc[:,0].unique()

array(['police athletic league inc ', 'the dunley milbank center',
       'intermediate school 172',
       'neighborhood defender service of harlem',
       'central park east secondary school',
       'taft hope leadership academy', 'the brotherhood sister sol inc',
       'armory high school sports foundation',
       'gregorio luperon preparatory school', 'echo park',
       'the valley   park',
       'new york center for interpersonal development', 'garcia pal',
       'women s housing and economic development corporation  whedco ',
       'jhs beacon 117', 'middle school 331 ms 306',
       'little shepherds community services inc ',
       'bronx community college', 'university heights school',
       'bronx lesbian gay health resource consortium',
       'south bronx overall economic development corp ',
       'claremont neighborhood centers inc ', 'jane addams h s ',
       'bronx leadership academy high school   h s  525',
       'roosevelt education campus   h s  435',
    

In [171]:
def get_unique_col_values(vals, min_len=2):
    col_vals = set()
    for val in vals:
        val = re.sub('[^A-Z a-z0-9]+', '', val)
        text_tokens = word_tokenize(val)
        #print(val)
        val = []
        for word in text_tokens:
            if (not word in all_stopwords):
                s = re.sub(r'\b\w{1,4}\b', '', word)
                if len(s) > min_len:
                    val.append(s)
        col_vals.update(val)
    return col_vals

def sample_records(dataset, k=1000):
    unique_vals = set()
    for c in dataset.columns:
        #print(f'column name={c}')
        #print(f'column index={dataset.columns.get_loc(c)}')
        idx = dataset.columns.get_loc(c)
        vals = dataset.iloc[:, idx].unique()
        unique_vals.update(get_unique_col_values(vals))
    l = len(unique_vals)
    if (l > k):
        unique_vals = sample(unique_vals, k)
    return unique_vals

In [184]:
#sample each column to a max of k records
k = 1000
recs = {}
vocabs = set()
for idx, df in enumerate(dfs):
    rec = sample_records(df, k)
    recs[idx] = rec
    vocabs.update(rec)    
vocabs = list(vocabs)

In [173]:
len(vocabs)

356

In [185]:
vocabs

['10459',
 '74570245',
 '84330808',
 '9830974',
 '67202421',
 '181st',
 'shepherds',
 'community',
 '69721149',
 'foundation',
 '6456301',
 '60566236',
 '11434',
 '83300944',
 'munoz',
 '76862022',
 'consortium',
 '81541032',
 'programs',
 'caribbean',
 '94581266',
 '169th',
 '67280911',
 '79363399',
 'flatbush',
 '229th',
 'whedco',
 '10026',
 'council',
 'heights',
 'tansitional',
 'jacob',
 '10032',
 'bildersee',
 '10467',
 '01249723',
 'corporation',
 'white',
 'martin',
 '98746761',
 'america',
 '10035',
 '93205641',
 'neighborhood',
 'brotherhood',
 '85799663',
 'avenue',
 '99326547',
 '67080243',
 '94917852',
 '93365273',
 'terrace',
 'erasmus',
 'united',
 'tremont',
 'school',
 '88362186',
 '91342678',
 '11432',
 '81726054',
 '88110694',
 '91051712',
 '11208',
 '69810488',
 'chabad',
 'island',
 'economic',
 '9496587',
 '85278684',
 'schools',
 '8571065',
 '67694526',
 '8865911',
 '84861204',
 '85552693',
 '91641591',
 'layton',
 'health',
 'development',
 '10468',
 'police',


In [186]:
encodeds = {}
for idx, rec in recs.items():
    encodeds[idx] = [1 if x in rec else 0 for x in vocabs]

In [187]:
print(encodeds[0])

[0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 

In [188]:
#Combine all into one
def create_hash_func(size: int):
    # function for creating the hash vector/function
    hash_ex = list(range(1, size+1))
    shuffle(hash_ex)
    return hash_ex

def build_minhash_func(vocabs_size: int, nbits: int):
    # function for building multiple minhash vectors
    hashes = []
    for _ in range(nbits):
        hashes.append(create_hash_func(vocabs_size))
    return hashes

def create_hash(vector: list, vocabs: list):
    # use this function for creating our signatures (eg the matching)
    signature = []
    for func in minhash_func:
        for i in range(1, len(vocabs)+1):
            idx = func.index(i)
            signature_val = vector[idx]
            if signature_val == 1:
                signature.append(idx)
                break
    return signature

# we create 20 minhash vectors
minhash_func = build_minhash_func(len(vocabs), 120)

In [189]:
signs = {}
for idx, enc in encodeds.items():
    signs[idx] = create_hash(enc, vocabs)

In [190]:
print(signs[0])

[48, 10, 241, 17, 145, 352, 70, 8, 317, 285, 252, 299, 46, 319, 168, 218, 178, 44, 206, 239, 4, 237, 252, 225, 132, 255, 126, 355, 223, 109, 155, 313, 292, 33, 15, 327, 140, 221, 241, 73, 201, 55, 165, 221, 81, 104, 36, 200, 285, 87, 261, 5, 217, 329, 341, 317, 83, 123, 44, 158, 66, 292, 143, 319, 149, 194, 246, 76, 157, 108, 46, 234, 237, 98, 280, 194, 281, 156, 109, 150, 276, 74, 292, 284, 177, 299, 96, 123, 291, 120, 51, 316, 76, 140, 31, 293, 4, 136, 115, 214, 106, 201, 117, 9, 136, 151, 227, 120, 76, 334, 291, 166, 133, 241, 293, 255, 128, 295, 103, 232]


In [191]:
def split_vector(signature, b):
    assert len(signature) % b == 0
    r = int(len(signature) / b)
    # code splitting signature in b parts
    subvecs = []
    for i in range(0, len(signature), r):
        subvecs.append(signature[i : i+r])
    return subvecs

In [192]:
bands = {}
for idx, sign in signs.items():
    bands[idx] = split_vector(sign, 40)

bands[0]

[[48, 10, 241],
 [17, 145, 352],
 [70, 8, 317],
 [285, 252, 299],
 [46, 319, 168],
 [218, 178, 44],
 [206, 239, 4],
 [237, 252, 225],
 [132, 255, 126],
 [355, 223, 109],
 [155, 313, 292],
 [33, 15, 327],
 [140, 221, 241],
 [73, 201, 55],
 [165, 221, 81],
 [104, 36, 200],
 [285, 87, 261],
 [5, 217, 329],
 [341, 317, 83],
 [123, 44, 158],
 [66, 292, 143],
 [319, 149, 194],
 [246, 76, 157],
 [108, 46, 234],
 [237, 98, 280],
 [194, 281, 156],
 [109, 150, 276],
 [74, 292, 284],
 [177, 299, 96],
 [123, 291, 120],
 [51, 316, 76],
 [140, 31, 293],
 [4, 136, 115],
 [214, 106, 201],
 [117, 9, 136],
 [151, 227, 120],
 [76, 334, 291],
 [166, 133, 241],
 [293, 255, 128],
 [295, 103, 232]]

In [193]:
zips = list(combinations(bands.items(), 2))

for z in zips:
    print(z)

((0, [[48, 10, 241], [17, 145, 352], [70, 8, 317], [285, 252, 299], [46, 319, 168], [218, 178, 44], [206, 239, 4], [237, 252, 225], [132, 255, 126], [355, 223, 109], [155, 313, 292], [33, 15, 327], [140, 221, 241], [73, 201, 55], [165, 221, 81], [104, 36, 200], [285, 87, 261], [5, 217, 329], [341, 317, 83], [123, 44, 158], [66, 292, 143], [319, 149, 194], [246, 76, 157], [108, 46, 234], [237, 98, 280], [194, 281, 156], [109, 150, 276], [74, 292, 284], [177, 299, 96], [123, 291, 120], [51, 316, 76], [140, 31, 293], [4, 136, 115], [214, 106, 201], [117, 9, 136], [151, 227, 120], [76, 334, 291], [166, 133, 241], [293, 255, 128], [295, 103, 232]]), (1, [[119, 79, 321], [180, 278, 333], [270, 310, 112], [300, 0, 79], [58, 154, 241], [321, 348, 89], [297, 249, 321], [180, 350, 118], [309, 170, 350], [0, 332, 111], [32, 238, 107], [0, 348, 107], [282, 118, 241], [137, 328, 324], [238, 111, 321], [198, 32, 310], [253, 209, 309], [89, 217, 198], [105, 62, 242], [278, 137, 278], [278, 12, 118], 

In [195]:
for a_band, b_band in zips:
    ka, va = a_band
    kb, vb = b_band
    print(f"Checking: ({ka}) vs ({kb})")
    for a_rows, b_rows in zip(va, vb):
        #print(f"a_rows({a_rows}), b_rows({b_rows})")
        #if sorted(a_rows) == sorted(b_rows):
        if a_rows == b_rows:
            print(f"Candidate pair: ({ka}){a_rows} == ({kb}){b_rows}")

Checking: (0) vs (1)
Checking: (0) vs (2)
Candidate pair: (0)[48, 10, 241] == (2)[48, 10, 241]
Candidate pair: (0)[17, 145, 352] == (2)[17, 145, 352]
Candidate pair: (0)[70, 8, 317] == (2)[70, 8, 317]
Candidate pair: (0)[165, 221, 81] == (2)[165, 221, 81]
Candidate pair: (0)[285, 87, 261] == (2)[285, 87, 261]
Candidate pair: (0)[341, 317, 83] == (2)[341, 317, 83]
Candidate pair: (0)[108, 46, 234] == (2)[108, 46, 234]
Candidate pair: (0)[194, 281, 156] == (2)[194, 281, 156]
Candidate pair: (0)[51, 316, 76] == (2)[51, 316, 76]
Candidate pair: (0)[4, 136, 115] == (2)[4, 136, 115]
Candidate pair: (0)[166, 133, 241] == (2)[166, 133, 241]
Candidate pair: (0)[293, 255, 128] == (2)[293, 255, 128]
Checking: (0) vs (3)
Checking: (0) vs (4)
Checking: (1) vs (2)
Checking: (1) vs (3)
Candidate pair: (1)[170, 332, 111] == (3)[170, 332, 111]
Checking: (1) vs (4)
Candidate pair: (1)[238, 111, 321] == (4)[238, 111, 321]
Candidate pair: (1)[170, 332, 111] == (4)[170, 332, 111]
Checking: (2) vs (3)
Check