## Import Required Libraries 

In [1]:
import pandas as pd
import numpy as np
import time
import os
import csv
import json
import random
from itertools import combinations

## Read CSV File Mapping Playlist to Track

In [2]:
# Load Playlist to Track ID data.
# Only the two columns used below are read: the CSV also carries a leftover
# 'Unnamed: 0' column that nothing downstream references, and skipping it
# avoids holding an extra integer column in memory for every row.
df_track = pd.read_csv(
    '../data/track_w_playlist_id.csv',
    usecols=['Playlist ID', 'Track ID'],
)
df_track

Unnamed: 0,Playlist ID,Track ID
0,0,0
1,0,1
2,0,2
3,0,3
4,0,4
...,...,...
65464771,999999,466732
65464772,999999,466734
65464773,999999,582308
65464774,999999,582311


## Construct Utility Matrix

In [3]:
# Pull both columns out as plain Python lists (row i of each list belongs
# to the same playlist/track pair)
pl_sort = df_track['Playlist ID'].to_list()
track_sort = df_track['Track ID'].to_list()

# Inverted index: track ID -> list of playlist IDs containing that track,
# in the order the rows appear in the file
tracks_to_playlist = {}

# Walk the rows once, pairing each track with its playlist
for idx, (song, pl) in enumerate(zip(track_sort, pl_sort)):

    # Fetch this track's playlist list, creating and registering an empty
    # one the first time the track is seen, then record the playlist
    playlist_list = tracks_to_playlist.setdefault(song, [])
    playlist_list.append(pl)

# Clear variables no longer needed so the large frame and lists can be freed
df_track, track_sort, pl_sort, playlist_list = [], [], [], []

## Construct Signature Matrix

In [4]:
# Our random hash function will take the form of
#   h(x) = (a * x + b) % c
# where 'x' is the input value, 'a' and 'b' are random coefficients,
# and 'c' is a prime number just greater than maxShingleID.

def pickRandomCoeffs(k, max_size):
  """Return a list of `k` distinct random integers from [0, max_size].

  Values are drawn with random.randint (both ends inclusive); a value that
  was already picked is drawn again, so no coefficient appears twice.

  Args:
    k: number of coefficients to generate. k <= 0 yields an empty list.
    max_size: inclusive upper bound of each coefficient.

  Returns:
    A list of `k` unique ints, in the order they were drawn.

  Raises:
    ValueError: if k > max_size + 1, i.e. the range does not contain enough
      distinct values (previously this case looped forever).
  """
  if k > max_size + 1:
    raise ValueError(
        'cannot pick %d unique values from the range [0, %d]' % (k, max_size))

  randList = []
  # Set mirror of randList so the uniqueness check is O(1) per draw.
  seen = set()

  while len(randList) < k:
    candidate = random.randint(0, max_size)

    # Ensure that each random number is unique: skip duplicates and re-draw.
    if candidate in seen:
      continue

    seen.add(candidate)
    randList.append(candidate)

  return randList

In [5]:
# Set parameters to be used in minHash
num_hash = 10        # number of hash functions (rows written to the signature matrix)
max_value = 1000000  # size of the hashed ID range; also the inclusive upper bound for coefficients

# Seed the generator so the coefficients, and therefore the signature file
# written below, are the same on every Restart & Run All.
random.seed(42)

# One (a, b) coefficient pair per hash function; values within each list are unique
coeffA = pickRandomCoeffs(num_hash, max_value)
coeffB = pickRandomCoeffs(num_hash, max_value)

In [6]:
# Build the MinHash signature matrix and stream it to disk:
# one CSV row per hash function, one column per track ID.

def next_prime_above(n):
    """Return the smallest prime strictly greater than n (trial division)."""
    candidate = max(n + 1, 2)
    while any(candidate % d == 0 for d in range(2, int(candidate ** 0.5) + 1)):
        candidate += 1
    return candidate

# Modulus of h(x) = (a * x + b) % c. A prime just above the ID range keeps
# h from collapsing distinct IDs whenever 'a' shares a factor with the
# modulus (which happens with max_value itself, e.g. for any even 'a').
# Hash values therefore lie in [0, hash_prime).
hash_prime = next_prime_above(max_value)

# Every ID that gets hashed. int64 is requested explicitly because a * x
# reaches ~1e12 and the platform default integer can be 32-bit.
# Playlist IDs must lie in [0, max_value) to index hash_value below.
playlist_ids = np.arange(max_value, dtype=np.int64)

# NOTE(review): the inner loop assumes track IDs are exactly 0..N-1; a gap
# in the IDs would raise KeyError -- confirm against the data preparation.
num_tracks = len(tracks_to_playlist)

t0 = time.time()

# The with-block guarantees the file is closed even if a hash pass fails
with open('../data/signature_matrix_10_hashes.csv', 'w', newline='') as f:
    writer = csv.writer(f)

    # Loop through each hash function
    for hash_idx in range(num_hash):

        # Calculate hash function value for all playlist IDs
        hash_value = (coeffA[hash_idx] * playlist_ids + coeffB[hash_idx]) % hash_prime

        # For each track, the signature entry is the minimum hash value over
        # the playlists that contain the track
        sig_list = [
            hash_value[tracks_to_playlist[song]].min()
            for song in range(num_tracks)
        ]

        # Write row to signature matrix
        writer.writerow(sig_list)

        # Output status (elapsed seconds since the first hash started)
        print('Index ', hash_idx, ': ', (time.time()-t0))

Index  0 :  21.412058115005493
Index  1 :  40.10647368431091
Index  2 :  58.611759424209595
Index  3 :  77.16710424423218
Index  4 :  95.71180391311646
Index  5 :  114.34823083877563
Index  6 :  133.03998923301697
Index  7 :  151.60100173950195
Index  8 :  170.2222285270691
Index  9 :  188.86503505706787
