In [20]:
from annoy import AnnoyIndex
import pandas as pd
import numpy as np

def build_index(csv_path="./data/profile_embedding.csv", n_trees=10, metric='euclidean'):
    """Build an Annoy nearest-neighbour index from a CSV of profile embeddings.

    The CSV is read with pandas and must provide an ``id`` column and an
    ``embedding`` column. Each embedding cell is text holding the vector's
    numbers, optionally wrapped in square brackets and separated by
    whitespace (line breaks included) and/or commas.

    Rows with an empty ``embedding`` cell are skipped. Items are added under
    consecutive integer positions (0, 1, 2, ...) in file order.

    Side effects:
        On success, rebinds the module-level names ``df`` (the parsed
        DataFrame) and ``id_map`` (Annoy item number -> profile id).
        ``top_n_similar`` reads both as globals, so it cannot work unless
        they exist at module level.

    Args:
        csv_path: Location of the embeddings CSV.
        n_trees: Number of trees passed to ``AnnoyIndex.build``.
        metric: Distance metric name passed to ``AnnoyIndex``.

    Returns:
        The built ``AnnoyIndex``.

    Raises:
        IndexError: If no row with an embedding is left to size the index.
        ValueError: If an embedding cell contains a non-numeric token.
    """
    global df, id_map

    def str_to_array(s):
        """Parse text such as ``"[0.1 0.2\\n 0.3]"`` into a 1-D float array."""
        if not isinstance(s, str):
            # pandas represents an empty cell as float NaN, not None.
            return None
        s = s.strip()
        if s.startswith('[') and s.endswith(']'):
            s = s[1:-1]
        # str.split() with no argument splits on any run of whitespace,
        # newlines included, so no regex normalisation is needed.
        return np.array(s.replace(',', ' ').split(), dtype=float)

    profiles = pd.read_csv(csv_path)
    # Skip rows without an embedding instead of crashing while parsing them.
    profiles = profiles.dropna(subset=['embedding']).reset_index(drop=True)
    profiles['embedding'] = profiles['embedding'].apply(str_to_array)

    # The first remaining row fixes the dimensionality of the index.
    vector_dim = len(profiles.iloc[0]['embedding'])
    annoy_index = AnnoyIndex(vector_dim, metric)

    mapping = {}
    for idx, row in enumerate(profiles.itertuples()):
        annoy_index.add_item(idx, row.embedding)
        mapping[idx] = row.id

    annoy_index.build(n_trees)

    # Publish only after everything succeeded, so a failed build never leaves
    # a half-populated ``df`` / ``id_map`` behind.
    df, id_map = profiles, mapping
    return annoy_index

def top_n_similar(annoy_index, query_id, n=5, seen_ids=None):
    """Return up to ``n`` profile ids nearest to ``query_id``, skipping seen ones.

    Reads two module-level names: ``df`` (a DataFrame with ``id`` and
    ``embedding`` columns) and ``id_map`` (Annoy item number -> profile id).
    Both must describe the same data that ``annoy_index`` was built from.

    Args:
        annoy_index: Object exposing ``get_nns_by_vector(vector, k)``.
        query_id: Id of the profile to find neighbours for.
        n: Maximum number of ids to return; ``n <= 0`` yields an empty list.
        seen_ids: Optional iterable of ids to exclude from the result.

    Returns:
        A list of at most ``n`` ids in the order the index returned them,
        never containing ``query_id`` or any id from ``seen_ids``.

    Raises:
        IndexError: If ``query_id`` has no row in ``df``.
    """
    if n <= 0:
        return []

    matches = df.loc[df['id'] == query_id, 'embedding'].values
    if len(matches) == 0:
        raise IndexError(
            "query_id {!r} not found in profile embeddings".format(query_id)
        )
    embedding = matches[0]

    # A set gives O(1) membership tests; the query itself is always excluded.
    excluded = set(seen_ids) if seen_ids is not None else set()
    excluded.add(query_id)

    # Every excluded id may appear among the nearest neighbours, so the
    # over-fetch has to grow with the exclusion list. A fixed margin would
    # return too few results as soon as a viewer has seen more profiles than
    # the margin allows. The extra 10 keeps the original safety margin.
    candidates = annoy_index.get_nns_by_vector(embedding, n + len(excluded) + 10)

    result_ids = []
    for i in candidates:
        candidate_id = id_map[i]
        if candidate_id in excluded:
            continue
        result_ids.append(candidate_id)
        if len(result_ids) >= n:
            break
    return result_ids


def already_seen(query_id, path="./data/interactions.csv"):
    """Return the profile ids that viewer ``query_id`` has interacted with.

    The interactions CSV is read without a header row; its columns are taken
    to be, in order: datetime, viewer_id, viewer_name, profile_id,
    profile_name, status, score.

    A stored ``viewer_id`` is matched on the text after its last ``-``, so
    ``"user-abc"`` matches ``query_id == "abc"``. Rows from every raw viewer
    id that reduces to the same key are combined.

    Args:
        query_id: Viewer key to look up (the part after the last ``-``).
        path: Location of the interactions CSV.

    Returns:
        A list of ``profile_id`` values in file order, or ``None`` when the
        viewer has no recorded interactions.
    """
    interactions = pd.read_csv(
        path,
        names=['datetime', 'viewer_id', 'viewer_name', 'profile_id',
               'profile_name', 'status', 'score'],
    )
    # Rows without a viewer cannot belong to anyone.
    interactions = interactions.dropna(subset=['viewer_id'])

    # Normalise the key BEFORE matching. Casting to str keeps purely numeric
    # ids from raising on the split.
    viewer_keys = interactions['viewer_id'].astype(str).str.split('-').str[-1]

    # NOTE(review): profile_id values are returned exactly as stored; confirm
    # they use the same id format as the ids held in the similarity index.
    seen = interactions.loc[viewer_keys == query_id, 'profile_id'].tolist()
    return seen if seen else None


# Demo run: recommend profiles for one viewer, leaving out anything that
# viewer has already interacted with.
query_id = 'd8334dc0'

# Build the similarity index from the stored profile embeddings.
annoy_index = build_index()

# Profiles this viewer has interacted with (None when there are none).
seen_ids = already_seen(query_id)

results = top_n_similar(annoy_index, query_id, seen_ids=seen_ids, n=5)
print("Top similar, unseen IDs:", results)



Top similar, unseen IDs: ['3222c13e']
