In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

vi_data_df = pd.read_csv("./vi_text_retrieval.csv")
context = vi_data_df['text']
context = [doc.lower() for doc in context]

tfidf_vectorizer = TfidfVectorizer()
context_embedded = tfidf_vectorizer.fit_transform(context)
context_embedded.toarray()[7][0]


0.3112658076071063

In [6]:
def tfidf_search(question, tfidf_vectorizer, top_d=5):
    # lowercasing before encoding
    query_embedded = tfidf_vectorizer.transform([question.lower()])
    cosine_scores = cosine_similarity(query_embedded, context_embedded).flatten()
    
    # Get top k cosine score and index its
    results = []
    for idx in cosine_scores.argsort()[-top_d:][::-1]:
        doc_score = {
            'id': idx,
            'cosine_score': cosine_scores[idx]
        }
        results.append(doc_score)
    return results

question = vi_data_df.iloc[0]['question']
results = tfidf_search(question, tfidf_vectorizer, top_d=5)
results[0]['cosine_score']


0.6279910475266973

In [38]:
from scipy.stats import pearsonr

def corr_search(question, tfidf_vectorizer, top_d=5):
    # lowercasing before encoding
    query_embedded_array = tfidf_vectorizer.transform([question.lower()]).toarray().flatten()
    context_embedded_array = context_embedded.toarray()
    #print(type(query_embedded_array),query_embedded_array.shape , type(context_embedded_array), context_embedded_array[0].shape)
    corr_scores = [pearsonr(query_embedded_array, context)[0] for context in context_embedded_array]
   
    results = []
    for idx, score in enumerate(sorted(corr_scores,reverse=True)):
        doc = {
            'id': idx,
            'corr_score': score
        }
        results.append(doc)
    print(results)
    return results

question = vi_data_df.iloc[0]['question']
results = corr_search(question, tfidf_vectorizer, top_d=5)
results[1]['corr_score']


[{'id': 0, 'corr_score': 0.6259599752568744}, {'id': 1, 'corr_score': 0.20734246471972953}, {'id': 2, 'corr_score': 0.17124615520165098}, {'id': 3, 'corr_score': 0.1552034605469979}, {'id': 4, 'corr_score': 0.154445319966633}, {'id': 5, 'corr_score': 0.13898841948520851}, {'id': 6, 'corr_score': 0.09171846431384703}, {'id': 7, 'corr_score': 0.0889458142432999}, {'id': 8, 'corr_score': 0.08699546869427933}, {'id': 9, 'corr_score': 0.08508517160933857}, {'id': 10, 'corr_score': 0.08413713631572638}, {'id': 11, 'corr_score': 0.08271000029285719}, {'id': 12, 'corr_score': 0.08268629739103933}, {'id': 13, 'corr_score': 0.08136911428312044}, {'id': 14, 'corr_score': 0.07970448448979572}, {'id': 15, 'corr_score': 0.07880299833911276}, {'id': 16, 'corr_score': 0.07618938374014121}, {'id': 17, 'corr_score': 0.07465708131417546}, {'id': 18, 'corr_score': 0.07241473505523968}, {'id': 19, 'corr_score': 0.06847759084860283}, {'id': 20, 'corr_score': 0.06847759084860283}, {'id': 21, 'corr_score': 0.

0.20734246471972953