## Import Required Libraries 

In [1]:
import json
import time
import os
import csv
import json
import numpy as np
import pandas as pd
import random
from itertools import combinations
import math
import pickle

## Load Signature Matrix

In [2]:
# Load the pre-computed signature matrix from disk.
# Each CSV row is stored as one row of sig_mat; the array shape is fixed at
# 10 rows x 2,262,292 columns, so a file with more than 10 non-empty rows
# raises an IndexError and a row of a different length raises a ValueError.
sig_mat = np.empty([10, 2262292], dtype=int)

# Loop through each row of signature matrix file
t0 = time.time()
count = 0

# `with` guarantees the file is closed even if a row fails to parse;
# newline='' is the mode the csv module expects for files it reads.
with open('../data/signature_matrix_10_hashes.csv', 'r', newline='') as fr:
    reader = csv.reader(fr)
    for row in reader:

        # csv.reader yields [] for blank lines (e.g. a trailing newline);
        # skip them instead of failing on the row assignment below
        if not row:
            continue

        # Convert string of signature row to integers
        sig_mat[count] = [int(ele) for ele in row]

        # Display status
        count = count + 1
        if count % 100 == 0:
            print('Row', count, '-', time.time() - t0, 'sec')

The signature matrix is loaded here instead of calculated in place due to memory constraints.

## Convert Signature Matrix to LSH Matrix 

In [3]:
# Transpose signature matrix so that each row holds the signature of one track.
# NOTE: this rebinds sig_mat, so re-running this cell without reloading the
# matrix transposes it back again; run the notebook from top to bottom.
sig_mat = sig_mat.T

# Set number of rows and bands
bands = 10
num_rows = 1

# Every band needs num_rows signature values; fail early with a clear message
# rather than part-way through writing the output file
if sig_mat.shape[1] < bands * num_rows:
    raise ValueError(
        'Signature length ' + str(sig_mat.shape[1]) +
        ' is too short for ' + str(bands) + ' bands of ' + str(num_rows) + ' rows'
    )

# Loop through each track in signature matrix
t0 = time.time()

# `with` guarantees the output file is flushed and closed even on error
with open('../data/LSH_matrix_1rows_10bands.csv', 'w', newline='') as fw:
    writer = csv.writer(fw, delimiter=',')

    for track, signature in enumerate(sig_mat):

        # Hash each band by joining its row values into one string.
        # The '_' separator keeps distinct row tuples distinct when
        # num_rows > 1 (plain concatenation would map (1, 23) and (12, 3)
        # to the same string). With num_rows == 1 the output is unchanged.
        LSH_per_track = [
            '_'.join(str(val) for val in signature[num_rows*band:num_rows*(band+1)])
            for band in range(bands)
        ]

        # Write hash values to file
        writer.writerow(LSH_per_track)

        # Display status
        if track % 1000000 == 0:
            print('Track', track, '-', time.time()-t0, 'sec')

# Clear out memory
LSH_val = []
LSH_per_track = []

Track 0 - 0.0010559558868408203 sec
Track 1000000 - 14.83893370628357 sec
Track 2000000 - 29.999577045440674 sec


Values within a band are hashed together simply by concatenating the row values together as strings.

## Get Candidate Pairs

In [4]:
# Group tracks into buckets: for every band, tracks that share the same LSH
# value end up in the same list.
bands = 10
num_rows = 1

# hash_value[band] maps an LSH string to a compact integer ID;
# can_pairs[band] maps that integer ID to the list of track numbers that
# produced the LSH string in this band.
hash_value = {band: {} for band in range(bands)}
can_pairs = {band: {} for band in range(bands)}

# count[band] is the next unused integer ID for that band
count = [0]*bands
track_num = 0
t0 = time.time()

# `with` closes the file when the loop finishes (or fails); the handle was
# previously left open for the lifetime of the kernel
with open('../data/LSH_matrix_1rows_10bands.csv', 'r', newline='') as fr:
    reader = csv.reader(fr, delimiter=',')

    # Loop through each track in LSH file (one CSV row per track)
    for track in reader:

        # Loop through each LSH value of the track
        for band, lsh_val in enumerate(track):

            # None means this LSH value has not been seen in this band yet
            ID = hash_value[band].get(lsh_val)

            if ID is None:
                # Register the new value and start its bucket
                hash_value[band][lsh_val] = count[band]
                can_pairs[band][count[band]] = [track_num]
                count[band] = count[band] + 1
            else:
                # Add the track to the existing bucket in place
                can_pairs[band][ID].append(track_num)

        # Display Progress
        track_num = track_num + 1
        if track_num % 100000 == 0:
            print('Track', track_num, '-', time.time()-t0, 'sec')


Track 100000 - 1.3220610618591309 sec
Track 200000 - 2.5837528705596924 sec
Track 300000 - 4.077269792556763 sec
Track 400000 - 5.4822328090667725 sec
Track 500000 - 6.625729560852051 sec
Track 600000 - 8.103623151779175 sec
Track 700000 - 9.237990140914917 sec
Track 800000 - 10.766067028045654 sec
Track 900000 - 11.90809941291809 sec
Track 1000000 - 13.064009189605713 sec
Track 1100000 - 14.15366268157959 sec
Track 1200000 - 15.757372856140137 sec
Track 1300000 - 16.84849500656128 sec
Track 1400000 - 17.93740177154541 sec
Track 1500000 - 19.00943922996521 sec
Track 1600000 - 20.065755367279053 sec
Track 1700000 - 21.809739351272583 sec
Track 1800000 - 22.83500385284424 sec
Track 1900000 - 23.876986980438232 sec
Track 2000000 - 24.886589288711548 sec
Track 2100000 - 25.896191596984863 sec
Track 2200000 - 26.921202898025513 sec


This first identifies the unique hash values for each band. Those hash values are mapped to integer values to cut down on the memory needed to store them. Tracks containing the same hash value for a band are grouped together in a dictionary under their corresponding integer hash value.

In [5]:
# Build the de-duplicated set of candidate pairs across all bands.
# Keys of cp_dict are '<smaller track> <larger track>' strings; the value 1 is
# a placeholder, only the keys matter.
cp_dict = dict()

# Loop through each band of LSH matrix
t0 = time.time()
for band in range(0, bands):

    # Loop through each unique LSH value (bucket) of the band
    count = 0
    for value in can_pairs[band].values():

        # If LSH value has more than one track, create candidate pairs
        if len(value) > 1:

            # Iterate the combinations lazily: building the full list first
            # needs memory quadratic in the bucket size for no benefit
            for track_a, track_b in combinations(value, 2):

                # Store each pair smallest-first so (a, b) and (b, a)
                # collapse onto the same key
                if track_a > track_b:
                    track_a, track_b = track_b, track_a

                # Add candidate pair to dictionary so that duplicates are discarded
                cp_dict[str(track_a) + ' ' + str(track_b)] = 1

        # Display status (count is the number of buckets processed in this band)
        count = count + 1
        if count % 100000 == 0:
            print('Band:', band, 'Pair:', count, 'Elapsed Time:', time.time()-t0)


Band: 0 Pair: 100000 Elapsed Time: 30.43355631828308
Band: 0 Pair: 200000 Elapsed Time: 38.3123893737793
Band: 0 Pair: 300000 Elapsed Time: 41.36923098564148
Band: 2 Pair: 100000 Elapsed Time: 192.26182293891907
Band: 2 Pair: 200000 Elapsed Time: 205.5378818511963
Band: 3 Pair: 100000 Elapsed Time: 404.07649302482605
Band: 4 Pair: 100000 Elapsed Time: 616.5628888607025
Band: 5 Pair: 100000 Elapsed Time: 800.0707890987396
Band: 5 Pair: 200000 Elapsed Time: 834.2893526554108
Band: 6 Pair: 100000 Elapsed Time: 1010.415447473526
Band: 6 Pair: 200000 Elapsed Time: 1052.432700395584
Band: 6 Pair: 300000 Elapsed Time: 1068.1772487163544
Band: 7 Pair: 100000 Elapsed Time: 1290.4585177898407
Band: 7 Pair: 200000 Elapsed Time: 1334.1056187152863
Band: 7 Pair: 300000 Elapsed Time: 1346.0175111293793
Band: 8 Pair: 100000 Elapsed Time: 1614.2044532299042
Band: 8 Pair: 200000 Elapsed Time: 1660.2299053668976
Band: 9 Pair: 100000 Elapsed Time: 2099.1339468955994


This step goes through each hash value for each band and produces every combination of candidate pairs from the tracks binned under that value. The combinations are then added to a dictionary to remove all duplicate candidate pairs.

In [7]:
# Allocate two rows per candidate pair: one for each ordering of the pair
cand_pair_array = np.empty([len(cp_dict)*2, 2], dtype=int)

# Walk the dictionary keys, numbering the pairs from 1
count = 0
t0 = time.time()
for count, pair_key in enumerate(cp_dict, start=1):

    # Each key holds two track numbers separated by whitespace
    key_parts = pair_key.split()
    first_track = int(key_parts[0])
    second_track = int(key_parts[1])

    # Rows 2k and 2k+1 hold the k-th pair in forward and reverse order, so a
    # later lookup only has to search the first column
    forward_row = 2 * (count - 1)
    cand_pair_array[forward_row] = [first_track, second_track]
    cand_pair_array[forward_row + 1] = [second_track, first_track]

    # Display status
    if count % 25000000 == 0:
        print('Candidate Pair:', count, 'Elapsed Time:', time.time()-t0) 
    

Candidate Pair: 25000000 Elapsed Time: 113.16815376281738
Candidate Pair: 50000000 Elapsed Time: 225.45030426979065
Candidate Pair: 75000000 Elapsed Time: 347.2551169395447
Candidate Pair: 100000000 Elapsed Time: 480.99227643013
Candidate Pair: 125000000 Elapsed Time: 611.557375907898
Candidate Pair: 150000000 Elapsed Time: 727.8755617141724
Candidate Pair: 175000000 Elapsed Time: 852.7852604389191
Candidate Pair: 200000000 Elapsed Time: 974.5959830284119
Candidate Pair: 225000000 Elapsed Time: 1101.9907069206238
Candidate Pair: 250000000 Elapsed Time: 1228.560359954834
Candidate Pair: 275000000 Elapsed Time: 1354.701090335846
Candidate Pair: 300000000 Elapsed Time: 1460.0194356441498
Candidate Pair: 325000000 Elapsed Time: 1569.8908081054688


The candidate pair dictionary is now converted to a numpy array because the array takes less storage. Each candidate pair is added to the array twice: once in order (0, 1) and once in order (1, 0). Later, when searching for similar songs, it will be easier to search just the first column of the candidate pairs array instead of both columns.

In [8]:
# Persist the candidate pair array to disk as a pickle file and time the write
t0 = time.time()
pickle_file = open('../data/candidate_pair_1rows_10bands.pickle', 'wb')
try:
    pickle.dump(cand_pair_array, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)
finally:
    # Always release the file handle, matching the behaviour of a with-block
    pickle_file.close()

# Report how long the write took
elapsed_sec = time.time() - t0
print('Elapsed time:', elapsed_sec)

Elapsed time: 32.90532469749451
