In [1]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from redis import Redis
from redis.commands.search.query import Query
from redis.commands.search.indexDefinition import IndexDefinition, IndexType
from redis.commands.search.field import VectorField, TagField, TextField, NumericField

# Import Data

In [8]:
# df = pd.read_csv("../fma-metadata/tracks_small.csv", index_col=0, header=[0,1])
df = pd.read_csv("../fma-metadata/tracks.csv", index_col=0, header=[0,1])

df = df.head(10)
df

Unnamed: 0_level_0,album,album,album,album,album,album,album,album,album,album,...,track,track,track,track,track,track,track,track,track,track
Unnamed: 0_level_1,comments,date_created,date_released,engineer,favorites,id,information,listens,producer,tags,...,information,interest,language_code,license,listens,lyricist,number,publisher,tags,title
track_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2,0,2008-11-26 01:44:45,2009-01-05 00:00:00,,4,1,<p></p>,6073,,[],...,,4656,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,1293,,3,,[],Food
3,0,2008-11-26 01:44:45,2009-01-05 00:00:00,,4,1,<p></p>,6073,,[],...,,1470,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,514,,4,,[],Electric Ave
5,0,2008-11-26 01:44:45,2009-01-05 00:00:00,,4,1,<p></p>,6073,,[],...,,1933,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,1151,,6,,[],This World
10,0,2008-11-26 01:45:08,2008-02-06 00:00:00,,4,6,,47632,,[],...,,54881,en,Attribution-NonCommercial-NoDerivatives (aka M...,50135,,1,,[],Freeway
20,0,2008-11-26 01:45:05,2009-01-06 00:00:00,,2,4,"<p> ""spiritual songs"" from Nicky Cook</p>",2710,,[],...,,978,en,Attribution-NonCommercial-NoDerivatives (aka M...,361,,3,,[],Spiritual Level
26,0,2008-11-26 01:45:05,2009-01-06 00:00:00,,2,4,"<p> ""spiritual songs"" from Nicky Cook</p>",2710,,[],...,,1060,en,Attribution-NonCommercial-NoDerivatives (aka M...,193,,4,,[],Where is your Love?
30,0,2008-11-26 01:45:05,2009-01-06 00:00:00,,2,4,"<p> ""spiritual songs"" from Nicky Cook</p>",2710,,[],...,,718,en,Attribution-NonCommercial-NoDerivatives (aka M...,612,,5,,[],Too Happy
46,0,2008-11-26 01:45:05,2009-01-06 00:00:00,,2,4,"<p> ""spiritual songs"" from Nicky Cook</p>",2710,,[],...,,252,en,Attribution-NonCommercial-NoDerivatives (aka M...,171,,8,,[],Yosemite
48,0,2008-11-26 01:45:05,2009-01-06 00:00:00,,2,4,"<p> ""spiritual songs"" from Nicky Cook</p>",2710,,[],...,,247,en,Attribution-NonCommercial-NoDerivatives (aka M...,173,,9,,[],Light of Light
134,0,2008-11-26 01:44:45,2009-01-05 00:00:00,,4,1,<p></p>,6073,,[],...,,1126,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,943,,5,,[],Street Music


# Redis Connection

In [3]:
redis_conn = Redis(host=os.environ.get('REDIS_ADDRESS', 'localhost'), port=6379, password=os.environ.get('REDIS_PASSWORD', None))

# Redis Database Definition

### Definition of Fields

In [37]:
index_name = "audiosimilarityv2"
distance_metric:str="COSINE"
DIM = 100

track_title = TextField(name="track_title")
album_title = TextField(name="album_title")
artist_name = TextField(name="artist_name")
track_publisher = TextField(name="track_publisher")

album_tracks = NumericField(name="album_tracks")
bit_rate = NumericField(name="bit_rate")
duration = NumericField(name="duration")
genre_top = TextField(name="genre_top")

language_code = TextField(name="language_code")
album_date_released = TextField(name="album_date_released")
feature_vector_text = TextField(name="feature_vector_text")

feature_vector = VectorField("feature_vector",
            "HNSW", {
                "TYPE": "FLOAT32",
                "DIM": DIM,
                "DISTANCE_METRIC": distance_metric,
                "INITIAL_CAP": 10000,
            })

### Create index

In [38]:
redis_conn.ft(index_name).create_index(
    fields = [track_title, album_title, artist_name, track_publisher, album_tracks, bit_rate, duration, genre_top, language_code, album_date_released, feature_vector_text, feature_vector],
    definition = IndexDefinition(prefix=[index_name], index_type=IndexType.HASH)
)

b'OK'

In [39]:
random_vector = np.random.rand(DIM)
random_vector.astype(dtype=np.float32)

import json

test_str = json.dumps(list(random_vector))

# Populate Database

In [41]:
for track_id, row in tqdm(df.iloc[:,:].iterrows()):

    row = row.replace({pd.NaT: "null"})
    
    random_vector = np.random.rand(DIM).astype(dtype=np.float32).tobytes()
    random_vector = np.random.rand(DIM)
    random_vector.astype(dtype=np.float32)
    test_str = json.dumps(list(random_vector))
    

    redis_conn.hset(
        f"{index_name}:{track_id}",
        mapping={
            "track_title": row["track", "title"],
            "album_title": row["album", "title"],
            "artist_name": row["artist", "name"],
            "track_publisher": row["track", "publisher"],
            "album_tracks":  row["album", "tracks"],
            "bit_rate": row["track", "bit_rate"],
            "duration": row["track", "duration"],
            "genre_top": row["track", "genre_top"],
            "language_code": row["track", "language_code"],
            "album_date_released": row["album", "date_released"],
            "feature_vector_text": test_str,
            "feature_vector": random_vector.astype(dtype=np.float32).tobytes()
        }
    )

10it [00:00, 233.97it/s]


# Test Query

In [52]:
# redis_conn.keys()
redis_conn.hgetall("audiosimilarityv2:2")

{b'feature_vector_text': b'[0.5768994975483053, 0.499887054418759, 0.34117725684216627, 0.1284318612579738, 0.47140404420324544, 0.2643568226944891, 0.18187905653417968, 0.2364914411171214, 0.4533100693078047, 0.5298069719964643, 0.8344059153135176, 0.9125326115999195, 0.20855144766115774, 0.2433748646466941, 0.9292930903454681, 0.2719190391056211, 0.9483168187220316, 0.8451843023644683, 0.12044669957649579, 0.23993147641778556, 0.970157177848113, 0.6905532140820911, 0.013467213844582937, 0.4951587367903001, 0.6694830821176782, 0.1125631218434332, 0.4344613262242938, 0.26121732840183565, 0.9745950962661815, 0.7165156270940535, 0.39467619316730673, 0.13180765996704036, 0.5791365577401926, 0.8233399379487828, 0.3071288103358596, 0.38064611315579955, 0.27472045276671186, 0.6806422756235433, 0.5761520676660212, 0.7358260896926306, 0.01529724452793979, 0.3851722331208519, 0.16506981865920534, 0.993855056054978, 0.7786405522846099, 0.05602190951862507, 0.6093590819278644, 0.47266766185776943

In [55]:
def base_query(number_of_results=20):
    base_query = f'*'
    query = Query(base_query)\
        .paging(0, number_of_results)\
        .dialect(2)
    
    results = redis_conn.ft(index_name).search(query)

    if results.docs:
        return pd.DataFrame(list(map(lambda x: {'id': x.id, 'track' : x.track_title, 'album': x.album_title, 'artist': x.artist_name, "feature_vector_text": x.feature_vector_text}, results.docs))).sort_values(by='id')
        # return results.docs
    else:
        return pd.DataFrame()

In [60]:
base_query(5)["feature_vector_text"][0]

'[0.5768994975483053, 0.499887054418759, 0.34117725684216627, 0.1284318612579738, 0.47140404420324544, 0.2643568226944891, 0.18187905653417968, 0.2364914411171214, 0.4533100693078047, 0.5298069719964643, 0.8344059153135176, 0.9125326115999195, 0.20855144766115774, 0.2433748646466941, 0.9292930903454681, 0.2719190391056211, 0.9483168187220316, 0.8451843023644683, 0.12044669957649579, 0.23993147641778556, 0.970157177848113, 0.6905532140820911, 0.013467213844582937, 0.4951587367903001, 0.6694830821176782, 0.1125631218434332, 0.4344613262242938, 0.26121732840183565, 0.9745950962661815, 0.7165156270940535, 0.39467619316730673, 0.13180765996704036, 0.5791365577401926, 0.8233399379487828, 0.3071288103358596, 0.38064611315579955, 0.27472045276671186, 0.6806422756235433, 0.5761520676660212, 0.7358260896926306, 0.01529724452793979, 0.3851722331208519, 0.16506981865920534, 0.993855056054978, 0.7786405522846099, 0.05602190951862507, 0.6093590819278644, 0.47266766185776943, 0.5243206852606367, 0.16

In [62]:
features = json.loads(base_query(5)["feature_vector_text"][0])
print(type(features))
print(features)

<class 'list'>
[0.5768994975483053, 0.499887054418759, 0.34117725684216627, 0.1284318612579738, 0.47140404420324544, 0.2643568226944891, 0.18187905653417968, 0.2364914411171214, 0.4533100693078047, 0.5298069719964643, 0.8344059153135176, 0.9125326115999195, 0.20855144766115774, 0.2433748646466941, 0.9292930903454681, 0.2719190391056211, 0.9483168187220316, 0.8451843023644683, 0.12044669957649579, 0.23993147641778556, 0.970157177848113, 0.6905532140820911, 0.013467213844582937, 0.4951587367903001, 0.6694830821176782, 0.1125631218434332, 0.4344613262242938, 0.26121732840183565, 0.9745950962661815, 0.7165156270940535, 0.39467619316730673, 0.13180765996704036, 0.5791365577401926, 0.8233399379487828, 0.3071288103358596, 0.38064611315579955, 0.27472045276671186, 0.6806422756235433, 0.5761520676660212, 0.7358260896926306, 0.01529724452793979, 0.3851722331208519, 0.16506981865920534, 0.993855056054978, 0.7786405522846099, 0.05602190951862507, 0.6093590819278644, 0.47266766185776943, 0.52432068

In [45]:
def vector_similarity(np_vector: np.array, return_fields: list=[], search_type: str="KNN", number_of_results: int=10, vector_field_name: str="feature_vector"):
    base_query = f'* =>[ {search_type} {number_of_results} @{vector_field_name} $vec_param AS vector_score]'

    query = Query(base_query)\
        .sort_by("vector_score", asc=False)\
        .dialect(2)

    params_dict = {"vec_param": np_vector.astype(dtype=np.float32).tobytes()}

    results = redis_conn.ft(index_name).search(query, params_dict)
    
    # return pd.DataFrame(list(map(lambda x: {'id' : x.id, 'track_title': x.album_title, 'vector_score': x.vector_score}, results.docs)))
    # return pd.DataFrame(list(map(lambda x: x.__dict__, results.docs)))
    return results.docs

In [46]:
vec = np.random.rand(DIM)
res = vector_similarity(vec)
res

[Document {'id': 'audiosimilarityv2:26', 'payload': None, 'vector_score': '0.312813401222', 'feature_vector_text': '[0.3513382296275811, 0.5289473199054504, 0.9291444784865808, 0.41570477363217706, 0.02944758139747994, 0.9023698198926223, 0.07967038414586403, 0.025869847801512802, 0.986304112135726, 0.1529961846582919, 0.0808310549811746, 0.8291525647791447, 0.3426400908295608, 0.7994623711600878, 0.6970434413788307, 0.42504917370761064, 0.46778514963826423, 0.08878900009824642, 0.6122569367645093, 0.13970253560603274, 0.0288985184517474, 0.5266501090908671, 0.5060405910047818, 0.6798985484257973, 0.8813188680325288, 0.17605514312604953, 0.8422485925715145, 0.6913059431208656, 0.5987920205216115, 0.8323137967254458, 0.585935333995726, 0.9873550628239496, 0.9470582697990902, 0.19904962372187784, 0.8032837781796124, 0.6351396351517911, 0.7862869449931191, 0.3251791111295732, 0.024105913412282076, 0.9185862817904883, 0.21777647624184104, 0.6836940593028678, 0.49342754835253666, 0.38692785

## Convert (byte)string back to array

In [47]:
vector_bytes_str = res[0]["feature_vector"]
vector_bytes_str

'>\x17i\x07?jm?B>\x0e<<\x01g?:*=\x01<m~|?\x08\x1c>=XCT?n>L?pq2?\x0c>>\x02\u05f5=\u07fc\x1c?.\x0e\x0f><\x06?\x01?\r.?\x1da?G4>W?m0?oJ\x19?\x12U?\x15?M|?irr?K>\x01M?"?\x1aJI?}>y<x(k?\x00_>\x06/?>h\x1b>Y]?i<?\t>>*]?ajR?w\x18>>&\x17?0>)]?\x184?>\x08>GV>{>k\x16<p=㬋>\x0f\\?\x00\x1d?#>>JB?y?x[?4V\'?<\x14v?>\x1b??M>?>\x1f?{=>}.?\x0b\x7fD?\x11\x7f?f?$b>|>\n>>\x15\\>>\x12֮>\x19\x13?Or>$r?Td\x0e?B\'Y>\x07=?2\x17?ݴ:>JT$?W>'

In [48]:
vector_bytes_str_enc = vector_bytes_str.encode()
type(vector_bytes_str_enc), len(vector_bytes_str_enc)

(bytes, 267)

In [49]:
bytes_np_dec = vector_bytes_str_enc.decode('unicode-escape').encode('ISO-8859-1')[2:-1]
type(bytes_np_dec), len(bytes_np_dec)

  bytes_np_dec = vector_bytes_str_enc.decode('unicode-escape').encode('ISO-8859-1')[2:-1]


(bytes, 264)

In [50]:
np.frombuffer(bytes_np_dec, dtype=np.float32)

array([ 5.7734956e+25,  1.8969508e-01,  3.4573234e-38,  1.6542116e-13,
        3.6365424e+27,  5.7623312e-34,  8.3229782e+14,  1.4803376e+28,
        2.3681523e+29,  1.4729284e-31, -1.4320384e+14, -2.7251104e-02,
        2.1477542e-30,  3.5404464e-35,  5.8857919e-31,  1.8129321e+20,
        1.7605303e-01,  8.6310109e-10,  1.0465621e-23,  8.3230966e-01,
        4.2628012e+36,  4.8014469e+30,  3.4951475e-38,  6.3377839e-01,
        7.8628695e-01,  1.5212652e-02,  9.1858625e-01,  3.5804823e-35,
        3.5936607e+24,  9.7837369e+17,  7.3598093e-01,  1.6896913e-13,
        6.8076884e+25,  3.1955970e-24,  5.3716054e-25,  4.2230322e-14,
        1.4179155e-07,  1.3304995e-01,  9.8828632e+35,  9.1808420e-03,
       -6.4585490e-12,  1.6127875e+17,  6.1328501e-01,  3.1169368e+06,
        9.7362149e-01,  1.7821537e-07,  1.1667093e-02,  1.8697387e-01,
        2.0053650e+08,  4.0286378e-20,  1.8504046e-01,  3.6820236e-32,
        1.5088350e-28,  8.9940637e-01,  3.9541077e+36,  1.8558595e-01,
      

In [51]:
# https://stackoverflow.com/questions/66663132/valueerror-buffer-size-must-be-a-multiple-of-element-size-when-converting-from
vector_bytes = vector_np.tobytes()
vector_bytes_str = str(vector_bytes)
vector_bytes_str_enc = vector_bytes_str.encode()
bytes_np_dec = vector_bytes_str_enc.decode('unicode-escape').encode('ISO-8859-1')[2:-1]
np.frombuffer(bytes_np_dec, dtype=np.float64)

NameError: name 'vector_np' is not defined

In [None]:
# hier funktionierts, vl mcaht redis zwischendurch noch was kaputt
test = np.random.rand(DIM)
testb = test.astype(dtype=np.float32).tobytes()
testbstr = str(testb)
test_enc = testbstr.encode()
test_dec = test_enc.decode('unicode-escape').encode('ISO-8859-1')[2:-1]
np.frombuffer(test_dec, dtype=np.float32)