In [None]:
import pandas as pd
from nltk.tokenize import sent_tokenize
import os
from pathlib import Path
import re
import spacy
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sentence_transformers import SentenceTransformer

# Fetch the small English spaCy pipeline only when it is not installed yet, so
# re-running the notebook does not invoke the downloader on every execution.
# Shell equivalent: python -m spacy download en_core_web_sm
if not spacy.util.is_package("en_core_web_sm"):
    spacy.cli.download("en_core_web_sm")

# Sentence-embedding model; the same instance encodes the corpus sentences
# and the search queries later in the notebook.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [5]:
# Load the small English pipeline with the "ner" and "parser" components
# disabled; only sentence splitting is needed from spaCy in this notebook.
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])
# Add the rule-based sentencizer so that doc.sents (used when building the
# dataset) has sentence boundaries even though the parser is disabled.
nlp.add_pipe("sentencizer")

<spacy.pipeline.sentencizer.Sentencizer at 0x17070a6d0>

In [6]:
DATA_DIR = Path.cwd() / "aajonus_data"

DF_DIR = Path.cwd() / "aajonus_saved_dfs"
DF_DIR.mkdir(exist_ok=True)

df_path = DF_DIR / "dataframe.csv"

# The sentence-level dataset is cached as a CSV file:
#   - if the cache exists, load it;
#   - otherwise split every .txt file in DATA_DIR into sentences and save
#     the result for the next run.
if df_path.exists():
    print("Loading dataset from CSV...")
    # keep_default_na=False: sentences such as "NA", "null" or "None" must
    # stay strings instead of being parsed as NaN, otherwise they can no
    # longer be encoded as text later on.
    df = pd.read_csv(df_path, keep_default_na=False)
else:
    data = []

    # os.listdir returns entries in arbitrary order. Sorting makes the row
    # order reproducible, which matters because the cached embeddings are
    # matched to the dataframe rows by position.
    for filename in sorted(os.listdir(DATA_DIR)):
        if not filename.endswith(".txt"):
            continue
        print(filename)

        # Read the whole file first so the handle is closed before the
        # (comparatively slow) spaCy processing starts.
        file_path = DATA_DIR / filename
        content = file_path.read_text(encoding="utf-8")

        # Use spaCy to split the content into sentences
        doc = nlp(content)
        for sent in doc.sents:
            sentence = sent.text.strip()
            # Skip whitespace-only sentences: they would be written as empty
            # CSV cells and carry no searchable content.
            if sentence:
                data.append({"filename": filename, "sentence": sentence})

    # Explicit columns keep the frame well-formed even when no sentence was
    # collected (e.g. an empty data directory).
    df = pd.DataFrame(data, columns=["filename", "sentence"])

    # Save DF
    df.to_csv(df_path, index=False)

print(df.head())

Loading dataset from CSV...
                                            filename  \
0  Needles_Of_Disease_and_Death_Continue_In_The_N...   
1  Needles_Of_Disease_and_Death_Continue_In_The_N...   
2  Needles_Of_Disease_and_Death_Continue_In_The_N...   
3  Needles_Of_Disease_and_Death_Continue_In_The_N...   
4  Needles_Of_Disease_and_Death_Continue_In_The_N...   

                                            sentence  
0  On Halloween, I received the most alarming ter...  
1  I received it\nin a letter from Care2 organiza...  
2  Most of us have\nnever witnessed the crippling...  
3  Polio is still endemic in three of the world's...  
4  This is the scary truth: levels of polio are a...  


In [8]:
import joblib

version = 1

EMBEDDING_DIR = Path.cwd() / "aajonus_embeddings"
EMBEDDING_DIR.mkdir(exist_ok=True)

embeddings_path = EMBEDDING_DIR / f'sentence_embeddings_v{version}.joblib'

# Load the cached embeddings when available, otherwise compute and cache them.
embeddings = None
if embeddings_path.exists():
    print("Loading embeddings from file...")
    # NOTE(security): joblib.load unpickles the file, which can execute
    # arbitrary code. Only load cache files produced by this notebook.
    embeddings = joblib.load(embeddings_path)
    # Embeddings are matched to dataframe rows by position, so a cache built
    # from a different dataset must not be reused.
    if len(embeddings) != len(df):
        print("Cached embeddings do not match the dataset; regenerating...")
        embeddings = None

if embeddings is None:
    print("Generating embeddings for the dataset...")
    embeddings = model.encode(df['sentence'].tolist(), show_progress_bar=True)
    joblib.dump(embeddings, embeddings_path)

Generating embeddings for the dataset...


Batches:   0%|          | 0/3727 [00:00<?, ?it/s]

In [17]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(query, embeddings, df, top_k=20):
    """Return the rows of ``df`` most similar to ``query``, best match first.

    Args:
        query: Query text; it is encoded with the notebook-level ``model``.
        embeddings: Sentence embeddings compared against the query embedding.
            Row ``i`` is used to select ``df.iloc[i]``, so it must be aligned
            with ``df`` by position.
        df: DataFrame from which the matching rows are taken.
        top_k: Maximum number of results to return (default 20, the value
            that was previously hard-coded). Values <= 0 yield no results.

    Returns:
        Tuple ``(top_docs, top_scores)``: the selected rows of ``df`` and the
        cosine similarities of those rows, both ordered from highest to
        lowest similarity.
    """
    # Nothing to compare against: return empty results instead of letting
    # cosine_similarity fail on an empty array.
    if len(embeddings) == 0:
        return df.iloc[0:0], np.empty(0)

    # Encode the query to get its embedding
    query_embedding = model.encode([query])

    # Cosine similarity between the query embedding and all sentence embeddings
    similarities = cosine_similarity(query_embedding, embeddings)[0]

    # argsort is ascending, so reverse it to rank the best match first and
    # then keep the first top_k positions.
    limit = max(int(top_k), 0)
    top_indices = np.argsort(similarities)[::-1][:limit]

    # Retrieve the corresponding rows from the DataFrame
    top_docs = df.iloc[top_indices]
    top_scores = similarities[top_indices]

    return top_docs, top_scores

In [18]:
import time

test_set_columns=["Query", "Result", "Cosine", "Filename", "Date"]

def search_main(query, embeddings, df, test_set):
    """Run ``search`` for one query and append its results to ``test_set``.

    Args:
        query: Query text passed to ``search``.
        embeddings: Sentence embeddings passed to ``search``.
        df: DataFrame with ``sentence`` and ``filename`` columns, passed to
            ``search``.
        test_set: DataFrame of results accumulated so far (may be empty).

    Returns:
        A DataFrame containing the rows of ``test_set`` followed by one row
        per search result, with the columns listed in ``test_set_columns``.
        When the search returns nothing, ``test_set`` is returned unchanged.
    """
    top_docs, top_scores = search(query, embeddings, df)

    if top_docs.empty:
        print("No documents found for this query.")
        return test_set

    # One timestamp per query, so all rows of the same search share a Date.
    searched_at = pd.Timestamp('now')

    # Iterate the two needed columns directly instead of using iterrows.
    new_rows = [
        {
            "Query": query,
            "Result": sentence,
            "Cosine": score,
            "Filename": filename,
            "Date": searched_at,
        }
        for sentence, filename, score in zip(
            top_docs['sentence'], top_docs['filename'], top_scores
        )
    ]
    new_rows_df = pd.DataFrame(new_rows, columns=test_set_columns)

    # Concatenating with an empty frame is deprecated in recent pandas
    # (FutureWarning about empty entries), so skip it on the first query.
    if test_set is None or test_set.empty:
        return new_rows_df

    return pd.concat([test_set, new_rows_df], ignore_index=True)

In [19]:
def generate_test_set_from_queries(query_file_path, embeddings, df):
    """Build a test set by running every query listed in a text file.

    Args:
        query_file_path: Path to a UTF-8 text file with one query per line.
        embeddings: Sentence embeddings forwarded to ``search_main``.
        df: Sentence DataFrame forwarded to ``search_main``.

    Returns:
        DataFrame with the columns in ``test_set_columns`` holding the
        results of all queries, in file order.
    """
    test_set = pd.DataFrame(columns=test_set_columns)

    # Explicit encoding: the corpus files are read as UTF-8 as well, and the
    # platform default encoding is not guaranteed to match.
    with open(query_file_path, 'r', encoding='utf-8') as file:
        # Strip each line so the stored query text matches the stripped
        # query names used when the results are evaluated.
        queries = [line.strip() for line in file.read().splitlines()]

    for query in queries:
        # Blank lines are separators/noise, not queries to be searched.
        if not query:
            continue
        test_set = search_main(query, embeddings, df, test_set)

    return test_set

# Folder holding the query list and the generated test sets.
TEST_SET_DIR = Path.cwd().joinpath("aajonus_test_sets")
TEST_SET_DIR.mkdir(exist_ok=True)

# Input: one query per line. Output: versioned CSV with the search results.
query_file_path = TEST_SET_DIR.joinpath("queries.txt")
test_set_path = TEST_SET_DIR.joinpath(f"test_set_v{version}_embedding.csv")

# Run all queries against the embeddings and persist the collected results.
test_set = generate_test_set_from_queries(query_file_path, embeddings, df)
test_set.to_csv(test_set_path, index=False)

In [20]:
import re

def normalize_sentence(sentence):
    """Return a canonical form of ``sentence`` for equality comparison.

    Every character that is neither a word character nor whitespace is
    dropped, runs of whitespace become a single space, surrounding
    whitespace is trimmed and the text is lower-cased.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', sentence)
    single_spaced = re.sub(r'\s+', ' ', without_punctuation)
    trimmed = single_spaced.strip()
    return trimmed.lower()

def compute_evaluations(test_set, relevant_results):
    """Count, per query group, how many retrieved sentences are relevant.

    A query group is one value of the ``Query`` column of
    ``relevant_results``; it may name several comma-separated queries that
    share the same set of relevant sentences. Sentences are compared after
    ``normalize_sentence``.

    Args:
        test_set: DataFrame with ``Query`` and ``Result`` columns holding the
            retrieved sentences for each individual query.
        relevant_results: DataFrame with ``Query`` (query group) and
            ``Result`` (relevant sentence) columns.

    Returns:
        DataFrame with one row per query group, in order of first appearance,
        and the columns ``Query Group``, ``Total Hits`` and
        ``Matching Sentences`` (normalized matches joined by ``', '``).
        Duplicated retrieved sentences are counted every time they occur.
    """
    evaluation_columns = ['Query Group', 'Total Hits', 'Matching Sentences']

    # Normalise the retrieved sentences once up front instead of once per
    # query. Rows without a Result (empty CSV cells load as NaN) cannot match
    # anything and would break the text normalisation, so they are dropped.
    retrieved = test_set.dropna(subset=['Result'])
    normalized_results = retrieved['Result'].astype(str).map(normalize_sentence)

    evaluation_data = []

    # sort=False keeps the groups in order of first appearance; groupby also
    # skips rows whose Query is missing, which could not be split anyway.
    for query_group, group_rows in relevant_results.groupby('Query', sort=False):
        relevant_set = {
            normalize_sentence(sentence)
            for sentence in group_rows['Result'].dropna().astype(str)
        }

        matching_sentences = []

        for query in str(query_group).split(','):
            query = query.strip()
            # A trailing or doubled comma produces an empty name: not a query.
            if not query:
                continue

            query_results = normalized_results[retrieved['Query'] == query]

            # Keep every matching sentence, including duplicates
            matching_sentences.extend(
                sentence for sentence in query_results if sentence in relevant_set
            )

        evaluation_data.append({
            'Query Group': query_group,
            'Total Hits': len(matching_sentences),
            'Matching Sentences': ', '.join(matching_sentences)
        })

    # Explicit columns keep the result well-formed when there are no groups.
    eval_df = pd.DataFrame(evaluation_data, columns=evaluation_columns)

    return eval_df

# Folder where the evaluation summaries are written.
EVAL_DIR = Path.cwd().joinpath("aajonus_evaluations")
EVAL_DIR.mkdir(exist_ok=True)

# Inputs: the relevant results per query group and the saved test set for
# this version. Output: the evaluation CSV for the same version.
relevant_results_path = TEST_SET_DIR.joinpath("relevant_query_results.csv")
test_set_path = TEST_SET_DIR.joinpath(f"test_set_v{version}_embedding.csv")
eval_path = EVAL_DIR.joinpath(f"evaluation_v{version}_embedding.csv")

# Both inputs are read back from disk, so the evaluation uses the test set
# exactly as it was saved.
relevant_results = pd.read_csv(relevant_results_path)
test_set = pd.read_csv(test_set_path)

eval_df = compute_evaluations(test_set, relevant_results)
eval_df.to_csv(eval_path, index=False)

print(eval_df.head())

                                         Query Group  Total Hits  \
0  Is salt unhealthy, Salt damages cells,\n Why i...           9   
1    What are signs of intelligence, Genius and diet           3   
2                        How to gain weight quickly            0   
3     What is arthritis, What is arthritis caused by           1   
4          What does high meat do, Why eat high meat           2   

                                  Matching Sentences  
0  some people are very allergic to salt, some pe...  
1  being rational does not make you intelligent o...  
2                                                     
3  ten percent of arthritis is from other toxins ...  
4  so thats what high meat can do for you, so tha...  
