In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [56]:
vi_data_df = pd.read_csv("./vi_text_retrieval.csv")
context = vi_data_df['text']
context = [doc.lower() for doc in context]

tfidf_vectorizer = TfidfVectorizer()
context_embedded = tfidf_vectorizer.fit_transform(context)
context_embedded.toarray()[7][0]

0.31126580760710637

In [73]:
def tfidf_search(question, context_embedded, tfidf_vectorizer, top_d=5):
    # lowercasing before encoding
    query_embedded = tfidf_vectorizer.transform([question])
    
    # Cosine similarity between the query and all documents
    cosine_scores = cosine_similarity(query_embedded, context_embedded).flatten()
    
    # Get top k cosine score and index its
    results = []
    for idx in cosine_scores.argsort()[-top_d:][::-1]:
        doc_score = {
            'id': idx,
            'cosine_score': cosine_scores[idx]
        }
        results.append(doc_score)
    return results

# Example usage
question = vi_data_df.iloc[0]['question']
results = tfidf_search(question, context_embedded, tfidf_vectorizer, top_d=5)
results[0]['cosine_score']

0.6279910475266974

In [76]:
def corr_search(question, context_embedded, tfidf_vectorizer, top_d=5):
    # lowercasing before encoding
    query_embedded = tfidf_vectorizer.transform([question])
    
    # Calculate cosine similarity between the query and all documents
    corr_scores = cosine_similarity(query_embedded, context_embedded)
    
    # Flatten the correlation scores array and exclude the first element (self-similarity)
    corr_scores = corr_scores[0]
    
    # Get top k correlation scores and their indices
    results = []
    for idx in corr_scores.argsort()[-top_d:][::-1]:
        doc = {
            'id': idx,
            'corr_score': corr_scores[idx]
        }
        results.append(doc)
    
    return results

# Example usage
question = vi_data_df.iloc[0]['question']
results = corr_search(question, context_embedded, tfidf_vectorizer, top_d=5)
print(results[1]['corr_score'])  # Prints the correlation score of the second result


0.2114579586225163
