In [None]:
import json
import os
import zipfile

import numpy as np
import pandas as pd
from redis import Redis
from redis.commands.search.field import VectorField, TextField, NumericField
from redis.commands.search.indexDefinition import IndexDefinition, IndexType
from redis.commands.search.query import Query
from redis.exceptions import ResponseError
from tqdm import tqdm

# Import Data

In [None]:
# Unzip the bundled datasets, skipping any archive whose CSV is already on disk.
# os.path.exists() does not expand wildcards, so the guard tests for the concrete
# CSV files that the loading cell reads rather than a "<stem>.*" pattern.
for csv_path, zip_path, target_dir in [
    ("../data/fma-metadata/tracks_small.csv", "../data/fma-metadata/tracks_small.zip", "../data/fma-metadata/"),
    ("../data/vectors_test_data.csv", "../data/vectors_test_data.zip", "../data/"),
]:
    if not os.path.exists(csv_path):
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(target_dir)

In [None]:
# Load the track metadata and the feature vectors. Both CSVs are read with a
# two-row header (MultiIndex columns) and their first column as the index.
metadata = pd.read_csv(
    "../data/fma-metadata/tracks_small.csv",
    header=[0, 1],
    index_col=0,
)
vectors = pd.read_csv(
    "../data/vectors_test_data.csv",
    sep=",",
    header=[0, 1],
    index_col=0,
)

# Join the two frames on their shared index (pandas' default inner join).
df = metadata.merge(vectors, left_index=True, right_index=True)
df.head()

In [None]:
# Dry-run parse of every feature vector; nothing is stored.
# The parsed array is bound to `parsed_vector` rather than `feature_vector`,
# because the field-definition cell uses `feature_vector` for the VectorField
# object and running this cell afterwards would otherwise overwrite it.
# `total` lets tqdm render a real progress bar for the row generator.
for track_id, row in tqdm(df.iterrows(), total=len(df)):

    # Replace pd.NaT entries with the string "null".
    row = row.replace({pd.NaT: "null"})

    feature_vector_str = row["feature", "vector"]
    parsed_vector = np.array(json.loads(feature_vector_str))

# print(feature_vector_str)

# Redis Connection

In [None]:
# Host and password are taken from the environment (defaults: localhost, no
# password); the port is fixed at 6379.
redis_conn = Redis(
    host=os.getenv('REDIS_ADDRESS', 'localhost'),
    port=6379,
    password=os.getenv('REDIS_PASSWORD'),
)

# Redis Database Definition

### Definition of Fields

In [None]:
# Name of the search index; the same string is used as the key prefix of the
# indexed hashes.
index_name = "audiosimilarity"
distance_metric: str = "COSINE"
# Declared dimensionality of the indexed vectors.
# NOTE(review): presumably matches the length of the vectors in the CSV; confirm.
DIM = 1000

# Text fields
track_title = TextField("track_title")
album_title = TextField("album_title")
artist_name = TextField("artist_name")
track_publisher = TextField("track_publisher")
genre_top = TextField("genre_top")
language_code = TextField("language_code")
album_date_released = TextField("album_date_released")
feature_vector_text = TextField("feature_vector_text")

# Numeric fields
album_tracks = NumericField("album_tracks")
bit_rate = NumericField("bit_rate")
duration = NumericField("duration")

# HNSW vector field holding FLOAT32 vectors of length DIM.
feature_vector = VectorField(
    "feature_vector",
    "HNSW",
    {
        "TYPE": "FLOAT32",
        "DIM": DIM,
        "DISTANCE_METRIC": distance_metric,
        "INITIAL_CAP": 10000,
    },
)

### Create index

In [None]:
# Create the search index over every hash whose key starts with `index_name`.
# Redis state outlives the notebook kernel, so an unconditional create fails on
# the second "Restart & Run All". Probe with info() first and create the index
# only when Redis reports that it does not exist.
# Note: an existing index keeps its old schema; drop it manually to apply
# changed field definitions.
search_index = redis_conn.ft(index_name)
try:
    search_index.info()
except ResponseError:
    search_index.create_index(
        fields=[
            track_title,
            album_title,
            artist_name,
            track_publisher,
            album_tracks,
            bit_rate,
            duration,
            genre_top,
            language_code,
            album_date_released,
            feature_vector_text,
            feature_vector,
        ],
        definition=IndexDefinition(prefix=[index_name], index_type=IndexType.HASH),
    )

# Populate Database

In [None]:
# Write one Redis hash per track under the key "<index_name>:<track_id>".
# The per-row vector values use their own names (`vector_json`, `vector_array`):
# `feature_vector_text` and `feature_vector` already name the index field
# objects, and rebinding them here would break any re-run of the create-index
# cell. `total` lets tqdm render a real progress bar for the row generator.
for track_id, row in tqdm(df.iterrows(), total=len(df)):

    # Replace pd.NaT entries with the string "null" before writing the hash.
    row = row.replace({pd.NaT: "null"})

    vector_json = row["feature", "vector"]
    vector_array = np.array(json.loads(vector_json))

    redis_conn.hset(
        f"{index_name}:{track_id}",
        mapping={
            "track_title": row["track", "title"],
            "album_title": row["album", "title"],
            "artist_name": row["artist", "name"],
            "track_publisher": row["track", "publisher"],
            "album_tracks":  row["album", "tracks"],
            "bit_rate": row["track", "bit_rate"],
            "duration": row["track", "duration"],
            "genre_top": row["track", "genre_top"],
            "language_code": row["track", "language_code"],
            "album_date_released": row["album", "date_released"],
            # The vector is stored twice: as its original JSON text and as raw
            # float32 bytes for the vector field.
            "feature_vector_text": vector_json,
            "feature_vector": vector_array.astype(dtype=np.float32).tobytes()
        }
    )

In [None]:
# redis_conn.keys()

# Test Query

In [None]:
# Inspect one stored hash; the key is built from `index_name` so it follows the
# same "<index_name>:<track_id>" scheme used when populating the database.
redis_conn.hgetall(f"{index_name}:37423")

In [None]:
def base_query(number_of_results=20):
    """Run a match-all query against the index and return the hits as a table.

    Args:
        number_of_results: Page size, i.e. the maximum number of documents
            requested from the index.

    Returns:
        A pandas DataFrame with one row per returned document, built from each
        document's attribute dictionary.
    """
    match_all = Query('*').paging(0, number_of_results).dialect(2)
    hits = redis_conn.ft(index_name).search(match_all)
    return pd.DataFrame([doc.__dict__ for doc in hits.docs])

In [None]:
# Fetch five documents with the match-all query and display them.
results = base_query(number_of_results=5)
results

In [None]:
def vector_similarity(np_vector:np.ndarray, search_type:str="KNN", number_of_results:int=10, vector_field_name:str="feature_vector"):
    """Return the documents whose stored vectors are closest to `np_vector`.

    Args:
        np_vector: Query vector (array-like); it is converted to float32 bytes
            before being sent as the `$vec_param` query parameter.
        search_type: Vector search clause keyword placed in the query string.
        number_of_results: Number of neighbours requested and page size.
        vector_field_name: Name of the indexed vector field to search.

    Returns:
        A pandas DataFrame with one row per returned document (built from each
        document's attribute dictionary), ordered from most to least similar.
    """
    knn_query = f'* =>[ {search_type} {number_of_results} @{vector_field_name} $vec_param AS vector_score]'

    # `vector_score` is the distance to the query vector, so smaller means more
    # similar: sort ascending to list the best match first.
    query = (
        Query(knn_query)
        .sort_by("vector_score", asc=True)
        .paging(0, number_of_results)
        .dialect(2)
    )

    # np.asarray also accepts plain lists; the bytes are identical for ndarrays.
    params_dict = {"vec_param": np.asarray(np_vector, dtype=np.float32).tobytes()}

    results = redis_conn.ft(index_name).search(query, params_dict)

    return pd.DataFrame([doc.__dict__ for doc in results.docs])

In [None]:
# Query the index with a random vector. A seeded generator keeps the query
# vector, and therefore the displayed neighbours, the same on every run.
rng = np.random.default_rng(42)
vec = rng.random(DIM)
results = vector_similarity(vec, number_of_results=10)
results

## Convert the stored vector string back to a Python list

In [None]:
# Decode the JSON text of the first result's vector and show its type and value.
first_vector_text = results.loc[0, "feature_vector_text"]
features = json.loads(first_vector_text)
type(features), features