In [1]:
import pandas as pd
from nltk.tokenize import sent_tokenize
import os
from pathlib import Path
import re
import spacy
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

spacy.cli.download("en_core_web_sm")
# python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m23.4 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m0:01[0m01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [2]:
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])
nlp.add_pipe("sentencizer")

<spacy.pipeline.sentencizer.Sentencizer at 0x164e69f10>

In [6]:
DATA_DIR = Path.cwd() / "aajonus_data"

DF_DIR = Path.cwd() / "aajonus_saved_dfs"
DF_DIR.mkdir(exist_ok=True)

df_path = DF_DIR / "dataframe.csv"

# Conditional that checks whether we saved the dfs as csv files
# If yes, then reinitialise these as dfs
# If not, then create the dfs and save them in csv format for next run
if df_path.exists():
    print("Loading dataset from CSV...")
    df = pd.read_csv(df_path)
else:
    data = []

    for filename in os.listdir(DATA_DIR):
        if filename.endswith(".txt"):
            print(filename)

            # Create the filepath
            file_path = DATA_DIR / filename
            with open(file_path, "r", encoding="utf-8") as file:
                content = file.read()
                # Use spaCy to tokenize the content into sentences
                doc = nlp(content)
                sentences = [sent.text.strip() for sent in doc.sents]
                # Append each sentence to your data list, along with the filename
                for sentence in sentences:
                    data.append({"filename": filename, "sentence": sentence})


    df = pd.DataFrame(data)

    # Save DF
    df.to_csv(df_path, index=False)

print(df.head())

Needles_Of_Disease_and_Death_Continue_In_The_Name_Of_Saving_Children.txt
Diarrhea-based_Detoxification_Hotel_By_Medical_Doctors.txt
The_FDA_Approved_5_Viruses_for_Food_Treatment.txt
Genius_Children.txt
Dr._Stanley_S._Bass_Interview.txt
Q&A_Of_September_13,_2009.txt
Causes_For_Most_Intestinal_Disease.txt
Are_Raw_Miso_And_Shoyu_Healthy_Sauces?.txt
Safe_Cutting_Boards.txt
Multiple_Lacerations_Healed_Without_Medical_Help.txt
Cholesterol,_LDL_and_HDL.txt
Primal_Diet_Workshop_+_Q&A_Of_May_6,_2000.txt
Can_We_Preserve_Raw_Chicken_In_Vinegar_Or_Lemon_Juice?.txt
Abrasions,_Fractures_and_Breaks.txt
Is_Raw_Chocolate_Made_From_Whole_Raw_Cocoa_Beans_Addictive_Or_Harmful?.txt
What_Is_Constipation_And_How_Do_We_Resolve_It?.txt
Our_Ubiquitous_Microbial_Friends.txt
Quinton.txt
Q&A_Of_December_14,_2008.txt
Q&A_Of_October_14,_2012.txt
My_Survival_Kit.txt
Medical_Propaganda_about_Inflammatory_Breast_Cancer.txt
How_Are_Nutrients_Delivered_To_Our_Cells?.txt
Q&A_Of_August_24,_2008.txt
Vaccines_Ruin_Your_Healt

With_Mercury_Found_In_Wild_Animals,_Do_We_Need_To_Be_Extra_Careful?.txt
Q&A_Of_September_26,_2010.txt
Do_You_Buy_Chicken_While_Traveling?.txt
Cancer_Convention_September_2000.txt
Q&A_Of_November_7,_1999.txt
Q&A_Of_November_26,_2006.txt
How_Bad_Are_MRIs?.txt
Arsenic_In_Poultry_Meat_And_Eggs.txt
Joanne_Unleahsed_Interview.txt
Declaring_Our_Rights_To_Our_Body.txt
We_Want_To_Live.txt
Soy_Toxicity_In_Poultry_Meat_And_Eggs.txt
Hot_Tub_Therapy.txt
Bacteria_and_Other_Microbes_Are_Responsible_for_Vibrant_Health.txt
Gum_And_Tooth_Disease.txt
Rae_Bradbury_Interview_2.txt
                                            filename  \
0  Needles_Of_Disease_and_Death_Continue_In_The_N...   
1  Needles_Of_Disease_and_Death_Continue_In_The_N...   
2  Needles_Of_Disease_and_Death_Continue_In_The_N...   
3  Needles_Of_Disease_and_Death_Continue_In_The_N...   
4  Needles_Of_Disease_and_Death_Continue_In_The_N...   

                                            sentence  
0  On Halloween, I received the most alar

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib

JOBLIB_DIR = Path.cwd() / "aajonus_joblibs"
JOBLIB_DIR.mkdir(exist_ok=True)

vectorizer_path = JOBLIB_DIR / 'tfidf_vectorizer.joblib'
matrix_path = JOBLIB_DIR / 'tfidf_matrix.joblib'

max_df = 0.7
min_df = 0.00
ngram_range = (1, 1)

def custom_tokenizer(text):
    doc = nlp(text)
    return [token.lemma_ for token in doc]

# Check if parameters have changed and files exist
params_changed = False
if vectorizer_path.exists():
    existing_vectorizer = joblib.load(vectorizer_path)
    if (existing_vectorizer.max_df != max_df or 
        existing_vectorizer.min_df != min_df or 
        existing_vectorizer.ngram_range != ngram_range):
        params_changed = True
        os.remove(vectorizer_path)
        os.remove(matrix_path)

if not matrix_path.exists() or params_changed:
    print("Fitting TF-IDF vectorizer to the dataset...")
    vectorizer = TfidfVectorizer(tokenizer=custom_tokenizer, max_df=max_df, min_df=min_df, ngram_range=ngram_range)
    tfidf_matrix = vectorizer.fit_transform(df['sentence'])
    joblib.dump(vectorizer, vectorizer_path)
    joblib.dump(tfidf_matrix, matrix_path)
else:
    print("Loading fitted TF-IDF vectorizer and matrix dataset...")
    vectorizer = joblib.load(vectorizer_path)
    tfidf_matrix = joblib.load(matrix_path)

Fitting TF-IDF vectorizer to the dataset...




In [54]:
from sklearn.metrics.pairwise import cosine_similarity

def search(query, vectorizer, tfidf_matrix, df):
    query_vector = vectorizer.transform([query])  # Preprocessing is handled by vectorizer
    similarities = cosine_similarity(query_vector, tfidf_matrix)
    top_indices = similarities.argsort()[0][-3:]

    # Retrieve the corresponding rows from the DataFrame
    top_docs = df.iloc[top_indices]
    top_scores = similarities[0][top_indices]

    return top_docs, top_scores

In [68]:
import time

test_set_columns=["Query", "Result", "Cosine", "Relevance Score", "Filename", "Date", "Max DF", "Min DF", "Ngram Range"]

def search_main(query, vectorizer, tfidf_matrix, df, max_df, min_df, ngram_range, test_set):
    top_docs, top_scores = search(query, vectorizer, tfidf_matrix, df)
    
    if top_docs.empty:
        print("No documents found for this query.")
        return test_set

    new_rows = []
    for (index, row), score in zip(top_docs.iterrows(), top_scores):
        new_row = {
            "Query": query,
            "Result": row['sentence'],
            "Cosine": score,
            "Filename": row['filename'],
            "Relevance Score": 0,  # Placeholder for manual scoring
            "Date": pd.Timestamp('now'),
            "Max DF": max_df,
            "Min DF": min_df,
            "Ngram Range": ngram_range
        }
        new_rows.append(new_row)
    
    # Create a DataFrame from the new_rows list
    new_rows_df = pd.DataFrame(new_rows)
    
    # Append the new_rows_df to the test_set
    test_set = pd.concat([test_set, new_rows_df], ignore_index=True)
    
    return test_set

In [69]:
def generate_test_set_from_queries(query_file_path, vectorizer, tfidf_matrix, df, max_df, min_df, ngram_range):
    test_set = pd.DataFrame(columns=test_set_columns)
    with open(query_file_path, 'r') as file:
        queries = file.read().splitlines()
    
    for query in queries:
        test_set = search_main(query, vectorizer, tfidf_matrix, df, max_df, min_df, ngram_range, test_set)
    
    return test_set

TEST_SET_DIR = Path.cwd() / "aajonus_test_sets"
TEST_SET_DIR.mkdir(exist_ok=True)
test_set_path = TEST_SET_DIR / "test_set_v1.csv"
query_file_path = TEST_SET_DIR / "queries.txt"

test_set = generate_test_set_from_queries(query_file_path, vectorizer, tfidf_matrix, df, max_df, min_df, ngram_range)

test_set.to_csv(test_set_path, index=False)

Read 36 queries from file.
Processing query: Is salt unhealthy
Processing query: Is salt unhealthy
New rows DataFrame for query 'Is salt unhealthy':
               Query                    Result    Cosine  \
0  Is salt unhealthy          A: Salt is salt.  0.549915   
1  Is salt unhealthy   All appeared unhealthy.  0.569086   
2  Is salt unhealthy  Is it unhealthy to take?  0.679475   

                                Filename  Relevance Score  \
0          Q&A_Of_September_26,_2010.txt                0   
1  Mercury_In_Fish;_Do_We_Absorb_It?.txt                0   
2               Question_And_Answers.txt                0   

                        Date  Max DF  Min DF Ngram Range  
0 2023-11-28 12:42:23.995510     0.7     0.0      (1, 1)  
1 2023-11-28 12:42:23.995628     0.7     0.0      (1, 1)  
2 2023-11-28 12:42:23.995724     0.7     0.0      (1, 1)  
Updated test set after processing query 'Is salt unhealthy':
               Query                    Result    Cosine Relevance S

New rows DataFrame for query 'What are signs of intelligence':
                            Query  \
0  What are signs of intelligence   
1  What are signs of intelligence   
2  What are signs of intelligence   

                                              Result    Cosine  \
0  He always has to use other elements of nature ...  0.457164   
1  Dietary patterns and intelligence\nin early an...  0.501877   
2           Cynicism is not a sign of\nintelligence.  0.731973   

                      Filename  Relevance Score                       Date  \
0       Q&A_Of_May_7,_2006.txt                0 2023-11-28 12:42:24.167431   
1  Eating_Out,_Is_It_Safe?.txt                0 2023-11-28 12:42:24.167548   
2  Q&A_Of_November_7,_1999.txt                0 2023-11-28 12:42:24.167645   

   Max DF  Min DF Ngram Range  
0     0.7     0.0      (1, 1)  
1     0.7     0.0      (1, 1)  
2     0.7     0.0      (1, 1)  
Updated test set after processing query 'What are signs of intelligence':
        

New rows DataFrame for query 'Mercury naturally occurring in fish':
                                 Query    Result    Cosine  \
0  Mercury naturally occurring in fish  Mercury.  0.448403   
1  Mercury naturally occurring in fish  Mercury.  0.448403   
2  Mercury naturally occurring in fish  Mercury.  0.448403   

                            Filename  Relevance Score  \
0  Primal_Diet_Workshop_(Part_1).txt                0   
1  Primal_Diet_Workshop_(Part_1).txt                0   
2           Q&A_Of_June_16,_2013.txt                0   

                        Date  Max DF  Min DF Ngram Range  
0 2023-11-28 12:42:24.414996     0.7     0.0      (1, 1)  
1 2023-11-28 12:42:24.415114     0.7     0.0      (1, 1)  
2 2023-11-28 12:42:24.415211     0.7     0.0      (1, 1)  
Updated test set after processing query 'Mercury naturally occurring in fish':
                                  Query  \
0                     Is salt unhealthy   
1                     Is salt unhealthy   
2         

New rows DataFrame for query 'Hot bath temperatures':
                   Query           Result    Cosine  \
0  Hot bath temperatures  Hot, hot baths.  0.731974   
1  Hot bath temperatures       Hot baths.  0.782285   
2  Hot bath temperatures       Hot baths.  0.782285   

                       Filename  Relevance Score                       Date  \
0      Q&A_Of_July_10,_2011.txt                0 2023-11-28 12:42:24.677086   
1     Beneficial_Home_Baths.txt                0 2023-11-28 12:42:24.677201   
2  Q&A_Of_February_22,_2009.txt                0 2023-11-28 12:42:24.677299   

   Max DF  Min DF Ngram Range  
0     0.7     0.0      (1, 1)  
1     0.7     0.0      (1, 1)  
2     0.7     0.0      (1, 1)  
Updated test set after processing query 'Hot bath temperatures':
                                       Query  \
0                          Is salt unhealthy   
1                          Is salt unhealthy   
2                          Is salt unhealthy   
3                      

New rows DataFrame for query 'What is the best food':
                   Query                                             Result  \
0  What is the best food  So I'm asking for\nhim, what would be the best...   
1  What is the best food                                What is best to do?   
2  What is the best food                                         What food?   

     Cosine                     Filename  Relevance Score  \
0  0.620695  Q&A_Of_February_3,_2013.txt                0   
1  0.634047     Question_And_Answers.txt                0   
2  0.647610     Question_And_Answers.txt                0   

                        Date  Max DF  Min DF Ngram Range  
0 2023-11-28 12:42:24.912033     0.7     0.0      (1, 1)  
1 2023-11-28 12:42:24.912141     0.7     0.0      (1, 1)  
2 2023-11-28 12:42:24.912231     0.7     0.0      (1, 1)  
Updated test set after processing query 'What is the best food':
                                       Query  \
0                          Is salt 

New rows DataFrame for query 'What does high meat do to the body':
                                Query  \
0  What does high meat do to the body   
1  What does high meat do to the body   
2  What does high meat do to the body   

                                              Result    Cosine  \
0                                  What did they do.  0.584407   
1                   That's what it does in the body.  0.589716   
2  That's why when you eat high meat, does everyb...  0.595627   

                                       Filename  Relevance Score  \
0                      Q&A_Of_July_24,_2005.txt                0   
1                  Q&A_Of_November_14,_2004.txt                0   
2  Q&A_Of_June_10,_2007_&_September_9,_2007.txt                0   

                        Date  Max DF  Min DF Ngram Range  
0 2023-11-28 12:42:25.168480     0.7     0.0      (1, 1)  
1 2023-11-28 12:42:25.168593     0.7     0.0      (1, 1)  
2 2023-11-28 12:42:25.168686     0.7     0.0      (1,

New rows DataFrame for query 'Cleaning formula':
              Query             Result    Cosine  \
0  Cleaning formula    of the formula?  0.557777   
1  Cleaning formula  It's not cleaned.  0.596392   
2  Cleaning formula   in the formulas.  0.610974   

                                       Filename  Relevance Score  \
0                     Q&A_Of_March_26,_2000.txt                0   
1  Q&A_Of_June_10,_2007_&_September_9,_2007.txt                0   
2                      Question_And_Answers.txt                0   

                        Date  Max DF  Min DF Ngram Range  
0 2023-11-28 12:42:25.424001     0.7     0.0      (1, 1)  
1 2023-11-28 12:42:25.424112     0.7     0.0      (1, 1)  
2 2023-11-28 12:42:25.424204     0.7     0.0      (1, 1)  
Updated test set after processing query 'Cleaning formula':
                                         Query  \
0                            Is salt unhealthy   
1                            Is salt unhealthy   
2                      

New rows DataFrame for query 'Does the lubrication formula need to be heated':
                                            Query  \
0  Does the lubrication formula need to be heated   
1  Does the lubrication formula need to be heated   
2  Does the lubrication formula need to be heated   

                                   Result    Cosine                  Filename  \
0       The lubrication formula and fish.  0.625475  Q&A_Of_April_6,_2008.txt   
1  Lubrication formula is the best thing.  0.627190   Q&A_Of_July_8,_2001.txt   
2  So, you need that lubrication formula.  0.740771    Q&A_Of_May_7,_2006.txt   

   Relevance Score                       Date  Max DF  Min DF Ngram Range  
0                0 2023-11-28 12:42:25.667499     0.7     0.0      (1, 1)  
1                0 2023-11-28 12:42:25.667624     0.7     0.0      (1, 1)  
2                0 2023-11-28 12:42:25.667722     0.7     0.0      (1, 1)  
Updated test set after processing query 'Does the lubrication formula need to b

New rows DataFrame for query 'Should I get amalgam fillings removed':
                                   Query  \
0  Should I get amalgam fillings removed   
1  Should I get amalgam fillings removed   
2  Should I get amalgam fillings removed   

                                              Result    Cosine  \
0  The vapor will crystal, say like you had mercu...  0.565650   
1            Q: What about mercury amalgam fillings?  0.602370   
2           It's in mouths within\namalgam fillings.  0.607736   

                                Filename  Relevance Score  \
0              Q&A_Of_April_14,_2002.txt                0   
1          Q&A_Of_September_13,_2009.txt                0   
2  How_Toxic_is_Our_Civilized_World?.txt                0   

                        Date  Max DF  Min DF Ngram Range  
0 2023-11-28 12:42:25.914584     0.7     0.0      (1, 1)  
1 2023-11-28 12:42:25.914704     0.7     0.0      (1, 1)  
2 2023-11-28 12:42:25.914802     0.7     0.0      (1, 1)  
Updated

New rows DataFrame for query 'what % of each vegetable is the juice made of':
                                           Query  \
0  what % of each vegetable is the juice made of   
1  what % of each vegetable is the juice made of   
2  what % of each vegetable is the juice made of   

                                              Result    Cosine  \
0  May be 60% of the meat\nand less than that of ...  0.497506   
1                                   Vegetable juice.  0.542953   
2                                           of each.  0.552941   

                                        Filename  Relevance Score  \
0                    Q&A_Of_January_27,_2013.txt                0   
1  Primal_Diet_Workshop_+_Q&A_Of_May_6,_2000.txt                0   
2                        Q&A_Of_May_29,_2011.txt                0   

                        Date  Max DF  Min DF Ngram Range  
0 2023-11-28 12:42:26.158663     0.7     0.0      (1, 1)  
1 2023-11-28 12:42:26.158780     0.7     0.0      (1,

In [70]:
# After creating the test_set and evaluating the relevance of the top 5, we now need to compute some metrics. 

from sklearn.metrics import precision_score, recall_score, f1_score

def compute_evaluations(test_set, threshold=0.5):
    evaluation_data = []
    
    # This loops over each unique column name
    for query in test_set['Query'].unique():
        # Then we create a new df which filters the test_set on the query col
        # This df will be 5 rows
        current_query_data = test_set[test_set['Query'] == query]
        
        # We then extract out the columns and create numpy arrays from the respective values 
        true_relevance = current_query_data['Relevance Score'].to_numpy()
        cosine_scores = current_query_data['Cosine'].to_numpy()
        
        # Convert cosine scores to binary predictions
        predicted_relevance = (cosine_scores >= threshold).astype(int)
        
        # print(f"True Relevance: {true_relevance}")
        # print(f"Cosine scores: {cosine_scores}")
        # print(f"Predicted Relevance: {predicted_relevance}")
        
        # Calculate precision, recall, and F1-score with zero_division parameter
        # Precison: True Pos / (True Pos + False Pos)
        # Recall: True Pos / (True Pos + False Neg)
        # Harmonic mean: 2 * (Prec x Rec / (Prec + Rec))
        precision = precision_score(true_relevance, predicted_relevance, zero_division=0)
        recall = recall_score(true_relevance, predicted_relevance, zero_division=0)
        f1 = f1_score(true_relevance, predicted_relevance, zero_division=0)
        
        # print(f"Precision: {precision}")
        # print(f"Recall: {recall}")
        # print(f"F1: {f1}")

        evaluation_data.append({
            'Query': query,
            'Precision': precision,
            'Recall': recall,
            'F1-Score': f1
        })
    
    eval_df = pd.DataFrame(evaluation_data)
    
    return eval_df

EVAL_DIR = Path.cwd() / "aajonus_evaluations"
EVAL_DIR.mkdir(exist_ok=True)

test_set_path = TEST_SET_DIR / "test_set_v1_scored.csv"
test_set = pd.read_csv(test_set_path)

eval_df = compute_evaluations(test_set)
eval_path = EVAL_DIR / "evaluation_v1.csv"
eval_df.to_csv(eval_path, index=False)

print(eval_df.head())

                            Query  Precision  Recall  F1-Score
0               Is salt unhealthy        0.0     0.0  0.000000
1              Salt damages cells        0.0     0.0  0.000000
2               Why is salt toxic        0.0     0.0  0.000000
3  What are signs of intelligence        0.5     1.0  0.666667
4         Memory and intelligence        0.5     1.0  0.666667
