In [1]:
import os
import json
from pathlib import Path
import copy
import warnings
from collections import defaultdict
from collections import OrderedDict
from collections import Counter
import datetime

import numpy as np
import pandas as pd

import faiss


## Data Import

In [3]:
# Data import: load the embedded ticket table and turn the embedding lists into float32 arrays

data_path = 'Data/embedded_ticket_df.json'
embedding_cols = ['Query embedding', 'Answer embedding']

# Text columns are read as str; every embedding column is requested as list
dtypes = {'Query' : str, 'Answer' : str, **{name : list for name in embedding_cols}}

ticket_embedded_df = pd.read_json(
    data_path,
    orient = 'index',
    typ = 'frame',
    dtype = dtypes,
    precise_float = True,
)


def _as_float32_vector(values):
    """Return `values` as a float32 NumPy array."""
    return np.array(values, dtype = np.float32)


# Convert specific columns from lists to NumPy arrays

for col in embedding_cols:
    ticket_embedded_df[col] = ticket_embedded_df[col].apply(_as_float32_vector)

ticket_embedded_df.head()


Unnamed: 0,Query,Answer,Query embedding,Answer embedding
t6UJ9A00GHH8,Ticket metadata:\n\n Ticket ID: t6UJ9A00GHH8\...,Solution metadata:\n \n Urgency code: 3\n Re...,"[0.090852164, -0.07308326, -0.015302544, 0.060...","[0.030940523, -0.114069894, -0.019030234, 0.01..."
t6UJ9A00GFTK,Ticket metadata:\n\n Ticket ID: t6UJ9A00GFTK\...,Solution metadata:\n \n Urgency code: 3\n Re...,"[0.007294609, -0.0129725095, 0.0022708774, -0....","[-0.052718893, -0.018210715, -0.06322326, 0.01..."
t6UJ9A00GGSG,Ticket metadata:\n\n Ticket ID: t6UJ9A00GGSG\...,Solution metadata:\n \n Urgency code: 3\n Re...,"[0.07374815, 0.014548237, 0.0688222, -0.010902...","[0.032509133, -0.071108855, -0.039874528, 0.03..."
t6UJ9A00GHDL,Ticket metadata:\n\n Ticket ID: t6UJ9A00GHDL\...,Solution metadata:\n \n Urgency code: 1\n Re...,"[0.064611726, -0.011558312, 0.083025865, 0.069...","[-0.020230183, 0.016113998, 0.048159372, 0.058..."
t6UJ9A00GHHG,Ticket metadata:\n\n Ticket ID: t6UJ9A00GHHG\...,Solution metadata:\n \n Urgency code: 3\n Re...,"[-0.029977296, 0.06212735, -0.015313174, -0.03...","[-0.045093242, -0.022921918, 0.025591739, -0.0..."


## Vector Storage

In [4]:
# Define a custom FAISS class for dual search and custom IDs

class FAISSdb:
    """Pair of FAISS indices (query and answer embeddings) addressed by custom ids.

    Both indices are filled in lock-step, so row ``i`` of the query index and
    row ``i`` of the answer index belong to the same document. ``id_map`` maps
    that row position to the caller-supplied id.

    Vectors are L2-normalised before they are added or searched, so with the
    inner-product indices created by :meth:`read` the returned scores are
    cosine similarities.
    """

    def __init__(self, index_q, index_a, id_map, dataset = None):
        """Wrap two existing indices and their row-position -> custom-id map.

        Args:
            index_q: FAISS index holding the query embeddings.
            index_a: FAISS index holding the answer embeddings.
            id_map: dict mapping integer row positions to custom ids.
            dataset: optional pandas DataFrame indexed by custom id, used by
                :meth:`get_metadata`.
        """
        self.index_q = index_q
        self.index_a = index_a
        self.id_map = id_map
        # Highest row position in use; -1 means "nothing stored yet"
        self.counter = max(self.id_map.keys()) if len(self.id_map) > 0 else -1

        self.dataset = dataset

    def add(self, vectors_q, vectors_a, custom_ids):
        """Append paired query/answer vectors and register their custom ids.

        The inputs are copied before normalisation, so the caller's arrays are
        left untouched.

        Args:
            vectors_q: 2-D array-like of query embeddings, one row per document.
            vectors_a: 2-D array-like of answer embeddings, one row per document.
            custom_ids: iterable of ids, one per row.

        Raises:
            ValueError: if the three inputs do not have the same number of rows.
        """
        custom_ids = list(custom_ids)

        # np.array copies by default; normalize_L2 works in place on the copy
        vectors_q = np.array(vectors_q, dtype = 'float32', order = 'C')
        vectors_a = np.array(vectors_a, dtype = 'float32', order = 'C')

        # Validate before touching any state so the indices and the map stay aligned
        if not (len(vectors_q) == len(vectors_a) == len(custom_ids)):
            raise ValueError(
                'vectors_q, vectors_a and custom_ids must have the same length, got '
                f'{len(vectors_q)}, {len(vectors_a)} and {len(custom_ids)}'
            )

        if not custom_ids:
            return

        faiss.normalize_L2(vectors_q)
        faiss.normalize_L2(vectors_a)

        self.index_q.add(vectors_q)
        self.index_a.add(vectors_a)

        for _id in custom_ids:
            self.counter += 1
            self.id_map[self.counter] = _id

    def _label_hits(self, sims, rows):
        """Turn raw FAISS output into per-query lists of ``(custom_id, similarity)``.

        FAISS pads missing neighbours with row id -1; those entries are dropped.
        """
        return [[(self.id_map[docid], docsim) for docsim, docid in zip(row_sims, row_ids) if docid != -1]
                for row_sims, row_ids in zip(sims, rows)]

    def search(self, query, k = 5):
        """Search both indices with the same query vectors.

        Args:
            query: one vector or a 2-D array-like of vectors (one row per query).
            k: number of neighbours requested from each index.

        Returns:
            ``(results_q, results_a)``: for each index, one list per query row
            holding ``(custom_id, similarity)`` tuples in FAISS order.
        """
        # Fresh float32 C-contiguous copy; ndmin = 2 lets a single vector act as one row
        query = np.array(query, dtype = 'float32', order = 'C', ndmin = 2)
        faiss.normalize_L2(query)

        sim_q, ids_q = self.index_q.search(query, k)
        sim_a, ids_a = self.index_a.search(query, k)

        return self._label_hits(sim_q, ids_q), self._label_hits(sim_a, ids_a)

    def rank(self, query, query_sim_weight = 0.5, k = 6, global_k = 100):
        """Rank documents by a weighted mix of query- and answer-index similarity.

        The ``global_k`` nearest neighbours are fetched from both indices. Only
        documents returned by the query index are ranked; one that is missing
        from the answer-index results gets an answer similarity of -1.

        Args:
            query: one vector or a 2-D array-like of vectors (one row per query).
            query_sim_weight: weight of the query-index similarity; the
                answer-index similarity is weighted by ``1 - query_sim_weight``.
            k: controls the output length; up to ``k + 1`` entries are returned
                per query.
                NOTE(review): presumably the extra slot leaves room for a
                self-match plus k neighbours - confirm.
            global_k: candidate pool size, capped at the number of stored vectors.

        Returns:
            ``(id_list, sim_list)``: per query row, the ranked custom ids and
            their weighted similarities, best first. Rows without candidates
            yield empty lists.
        """
        global_k = min(global_k, self.ntotal())

        # An empty store has nothing to rank, and FAISS cannot be asked for k = 0
        if global_k <= 0:
            n_queries = np.array(query, ndmin = 2).shape[0]
            return [[] for _ in range(n_queries)], [[] for _ in range(n_queries)]

        results_q, results_a = self.search(query, k = global_k)

        id_list = []
        sim_list = []

        for hits_q, hits_a in zip(results_q, results_a):

            if not hits_q:
                id_list.append([])
                sim_list.append([])
                continue

            qids = [doc_id for doc_id, _ in hits_q]
            qsims = [doc_sim for _, doc_sim in hits_q]

            # Answer similarity per document, aligned to the query-index order below
            asim_by_id = {doc_id : doc_sim for doc_id, doc_sim in hits_a}
            aligned_asims = [asim_by_id.get(doc_id, -1) for doc_id in qids]

            weighted_sim = query_sim_weight * np.array(qsims) + (1 - query_sim_weight) * np.array(aligned_asims)

            # Descending order of the combined score
            sort_array = np.argsort(weighted_sim)[::-1]

            id_list.append(list(np.array(qids)[sort_array][:k + 1]))
            sim_list.append(list(weighted_sim[sort_array][:k + 1]))

        return id_list, sim_list

    def get_metadata(self, ids: list):
        """Return the ``dataset`` rows for ``ids``, or ``[]`` when no DataFrame is attached."""
        if isinstance(self.dataset, pd.DataFrame):
            return self.dataset.loc[ids, :]
        else:
            print('No metadata to retrieve')
            return []

    def create_context_df(self, df, query_embed_col, query_sim_weight = 0.5, k = 6):
        """Append ranked context ids and similarities to ``df``.

        Args:
            df: DataFrame whose ``query_embed_col`` column holds one embedding per row.
            query_embed_col: name of the embedding column used as the query.
            query_sim_weight: forwarded to :meth:`rank`.
            k: forwarded to :meth:`rank`.

        Returns:
            A new DataFrame: ``df`` plus the columns ``context_ids`` and
            ``context_sims``.
        """
        query_embeds = np.ascontiguousarray(np.array(df[query_embed_col].tolist()).astype('float32'))
        # search() normalises again on its own copy; kept so the scores stay unchanged
        faiss.normalize_L2(query_embeds)

        id_list, sim_list = self.rank(query_embeds, query_sim_weight = query_sim_weight, k = k)

        context_df = pd.DataFrame(
            {'context_ids' : id_list,
             'context_sims' : sim_list},
            index = df.index.tolist())

        return pd.concat([df, context_df], axis = 1)

    def ntotal(self):
        """Return the number of stored documents, derived from the id counter."""
        return self.counter + 1

    def write(self, path, file_q, file_a, file_map):
        """Persist both indices and the id map under ``path``.

        Args:
            path: target directory (str or Path); created if missing.
            file_q: file name for the query index.
            file_a: file name for the answer index.
            file_map: file name for the JSON id map.
        """
        path = Path(path)
        path.mkdir(parents = True, exist_ok = True)

        faiss.write_index(self.index_q, str(path.joinpath(file_q)))
        faiss.write_index(self.index_a, str(path.joinpath(file_a)))

        # JSON turns the integer keys into strings; read() converts them back
        with open(path.joinpath(file_map), 'w', encoding = 'utf-8') as file:
            json.dump(self.id_map, file, indent = 2)

    @staticmethod
    def read(path, file_q, file_a, file_map, reinitialize_vector_index = False, embed_q_dim = 384, embed_a_dim = 384):
        """Load the stored indices and id map, or create empty ones.

        Empty inner-product indices and an empty map are returned when
        ``reinitialize_vector_index`` is true or any of the three files is
        missing from ``path``.

        Args:
            path: directory holding the files (str or Path); created if missing.
            file_q: file name of the query index.
            file_a: file name of the answer index.
            file_map: file name of the JSON id map.
            reinitialize_vector_index: ignore stored files and start empty.
            embed_q_dim: dimension of a newly created query index.
            embed_a_dim: dimension of a newly created answer index.

        Returns:
            ``(index_q, index_a, id_map)``, ready to pass to ``FAISSdb(...)``.
        """
        path = Path(path)
        path.mkdir(parents = True, exist_ok = True)

        stored_files = [path.joinpath(name) for name in (file_q, file_a, file_map)]

        if reinitialize_vector_index or not all(stored.is_file() for stored in stored_files):
            return faiss.IndexFlatIP(embed_q_dim), faiss.IndexFlatIP(embed_a_dim), {}

        index_q = faiss.read_index(str(path.joinpath(file_q)))
        index_a = faiss.read_index(str(path.joinpath(file_a)))

        def json_import_dtypes(d, key_fn = int, value_fn = str):
            # Restore the integer row positions that JSON stored as strings
            return {key_fn(k) if k.isdigit() else k: value_fn(v) for k, v in d.items()}

        with open(path.joinpath(file_map), 'r', encoding = 'utf-8') as file:
            index_db_id_map = json.load(file, object_hook = json_import_dtypes)

        return index_q, index_a, index_db_id_map
        

In [10]:
# FAISS index files

reinitialize_vector_index = False
add_new_vectors = False

faiss_subdir = Path.cwd().joinpath('FAISS')
q_index_file_name = 'ticket_query_embeddings.index'
a_index_file_name = 'ticket_answer_embeddings.index'
index_db_id_map_file_name = 'index_id_db_map.json'

# Instantiate FAISS vector search index

ids = np.array(ticket_embedded_df.index.tolist())

embeddings_q = np.array(ticket_embedded_df['Query embedding'].tolist()).astype('float32')
embeddings_a = np.array(ticket_embedded_df['Answer embedding'].tolist()).astype('float32')

# Load or create exact similarity indices

index_q, index_a, index_db_id_map = FAISSdb.read(
    faiss_subdir,
    q_index_file_name,
    a_index_file_name,
    index_db_id_map_file_name,
    reinitialize_vector_index = reinitialize_vector_index,
    embed_q_dim = embeddings_q.shape[-1], embed_a_dim = embeddings_a.shape[-1]
)

faiss_db = FAISSdb(index_q, index_a, index_db_id_map)

# Add vectors
# Only tickets that are not in the id map yet are added, so re-running this cell
# against an index loaded from disk does not store the same ticket twice.

if add_new_vectors:
    known_ids = set(faiss_db.id_map.values())
    is_new = np.array([ticket_id not in known_ids for ticket_id in ids], dtype = bool)

    if is_new.any():
        faiss_db.add(embeddings_q[is_new], embeddings_a[is_new], ids[is_new])

print('Stored vectors:', faiss_db.ntotal())


Stored vectors: 5200


In [6]:
# Save a current db state: both FAISS indices plus the row-position -> ticket-id map

faiss_db.write(
    path = faiss_subdir,
    file_q = q_index_file_name,
    file_a = a_index_file_name,
    file_map = index_db_id_map_file_name,
)


In [7]:
# Create and save dataframe, mapping context indices to query vectors

context_embedded_df = faiss_db.create_context_df(
    df = ticket_embedded_df,
    query_embed_col = 'Query embedding',
    query_sim_weight = 0.6,
    k = 6,
)

# Persist the tickets together with their ranked context ids and similarities
context_embedded_df.to_json(
    path_or_buf = 'Data/embedded_context_df.json',
    orient = 'index',
    double_precision = 15,
    index = True,
)


In [21]:
# Preview the first rows, including the new context_ids / context_sims columns
context_embedded_df.head(n = 5)

Unnamed: 0,Query,Answer,Query embedding,Answer embedding,context_ids,context_sims
t6UJ9A00GHH8,Ticket metadata:\n\n Ticket ID: t6UJ9A00GHH8\...,Solution metadata:\n \n Urgency code: 3\n Re...,"[0.090852164, -0.07308326, -0.015302544, 0.060...","[0.030940523, -0.114069894, -0.019030234, 0.01...","[t6UJ9A00GHH8, t6UJ9A00GB3Q, t6UJ9A00FW8D, t6U...","[0.8623855948448181, 0.7062625050544739, 0.699..."
t6UJ9A00GFTK,Ticket metadata:\n\n Ticket ID: t6UJ9A00GFTK\...,Solution metadata:\n \n Urgency code: 3\n Re...,"[0.007294609, -0.0129725095, 0.0022708774, -0....","[-0.052718893, -0.018210715, -0.06322326, 0.01...","[t6UJ9A00GFTK, t6UJ9A00G71J, t6UJ9A00FZLY, t6U...","[0.8002088308334351, 0.540742039680481, 0.5401..."
t6UJ9A00GGSG,Ticket metadata:\n\n Ticket ID: t6UJ9A00GGSG\...,Solution metadata:\n \n Urgency code: 3\n Re...,"[0.07374815, 0.014548237, 0.0688222, -0.010902...","[0.032509133, -0.071108855, -0.039874528, 0.03...","[t6UJ9A00GGSG, t6UJ9A00F52X, t6UJ9A00FZTT, t6U...","[0.7953567147254944, 0.5295654296875, 0.529249..."
t6UJ9A00GHDL,Ticket metadata:\n\n Ticket ID: t6UJ9A00GHDL\...,Solution metadata:\n \n Urgency code: 1\n Re...,"[0.064611726, -0.011558312, 0.083025865, 0.069...","[-0.020230183, 0.016113998, 0.048159372, 0.058...","[t6UJ9A00GHDL, t6UJ9A00FLGJ, t6UJ9A00F6PA, t6U...","[0.8399720907211303, 0.5847500264644623, 0.561..."
t6UJ9A00GHHG,Ticket metadata:\n\n Ticket ID: t6UJ9A00GHHG\...,Solution metadata:\n \n Urgency code: 3\n Re...,"[-0.029977296, 0.06212735, -0.015313174, -0.03...","[-0.045093242, -0.022921918, 0.025591739, -0.0...","[t6UJ9A00GHHG, t6UJ9A00FUTI, t6UJ9A00G0SP, t6U...","[0.8045737266540527, 0.7800474405288697, 0.755..."
