In [1]:
import pandas as pd
from nltk.tokenize import sent_tokenize
import os
from pathlib import Path
import re
import spacy
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

spacy.cli.download("en_core_web_sm")
# python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [2]:
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])
nlp.add_pipe("sentencizer")

<spacy.pipeline.sentencizer.Sentencizer at 0x16a6fead0>

In [8]:
DATA_DIR = Path.cwd() / "aajonus_data"

DF_DIR = Path.cwd() / "aajonus_saved_dfs"
DF_DIR.mkdir(exist_ok=True)

full_df_path = DF_DIR / "full_dataframe.csv"

# Conditional that checks whether we saved the dfs as csv files
# If yes, then reinitialise these as dfs
# If not, then create the dfs and save them in csv format for next run
if full_df_path.exists():
    print("Loading full dataset from CSV...")
    df = pd.read_csv(full_df_path)
else:
    data = []

    for filename in os.listdir(DATA_DIR):
        if filename.endswith(".txt"):
            print(filename)

            # Create the full filepath
            file_path = os.path.join(DATA_DIR, filename)
            with open(file_path, "r", encoding="utf-8") as file:
                content = file.read()
                # Use spaCy to tokenize the content into sentences
                doc = nlp(content)
                sentences = [sent.text.strip() for sent in doc.sents]
                # Append each sentence to your data list, along with the filename
                for sentence in sentences:
                    data.append({"filename": filename, "sentence": sentence})


    df = pd.DataFrame(data)

    # Save DF
    df.to_csv(full_df_path, index=False)

print(df.head())

Needles_Of_Disease_and_Death_Continue_In_The_Name_Of_Saving_Children.txt
Diarrhea-based_Detoxification_Hotel_By_Medical_Doctors.txt
The_FDA_Approved_5_Viruses_for_Food_Treatment.txt
Genius_Children.txt
Dr._Stanley_S._Bass_Interview.txt
Q&A_Of_September_13,_2009.txt
Causes_For_Most_Intestinal_Disease.txt
Are_Raw_Miso_And_Shoyu_Healthy_Sauces?.txt
Safe_Cutting_Boards.txt
Multiple_Lacerations_Healed_Without_Medical_Help.txt
Cholesterol,_LDL_and_HDL.txt
Primal_Diet_Workshop_+_Q&A_Of_May_6,_2000.txt
Can_We_Preserve_Raw_Chicken_In_Vinegar_Or_Lemon_Juice?.txt
Abrasions,_Fractures_and_Breaks.txt
Is_Raw_Chocolate_Made_From_Whole_Raw_Cocoa_Beans_Addictive_Or_Harmful?.txt
What_Is_Constipation_And_How_Do_We_Resolve_It?.txt
Our_Ubiquitous_Microbial_Friends.txt
Quinton.txt
Q&A_Of_December_14,_2008.txt
Q&A_Of_October_14,_2012.txt
My_Survival_Kit.txt
Medical_Propaganda_about_Inflammatory_Breast_Cancer.txt
How_Are_Nutrients_Delivered_To_Our_Cells?.txt
Q&A_Of_August_24,_2008.txt
Vaccines_Ruin_Your_Healt

With_Mercury_Found_In_Wild_Animals,_Do_We_Need_To_Be_Extra_Careful?.txt
Q&A_Of_September_26,_2010.txt
Do_You_Buy_Chicken_While_Traveling?.txt
Cancer_Convention_September_2000.txt
Q&A_Of_November_7,_1999.txt
Q&A_Of_November_26,_2006.txt
How_Bad_Are_MRIs?.txt
Arsenic_In_Poultry_Meat_And_Eggs.txt
Joanne_Unleahsed_Interview.txt
Declaring_Our_Rights_To_Our_Body.txt
We_Want_To_Live.txt
Soy_Toxicity_In_Poultry_Meat_And_Eggs.txt
Hot_Tub_Therapy.txt
Bacteria_and_Other_Microbes_Are_Responsible_for_Vibrant_Health.txt
Gum_And_Tooth_Disease.txt
Rae_Bradbury_Interview_2.txt
                                            filename  \
0  Needles_Of_Disease_and_Death_Continue_In_The_N...   
1  Needles_Of_Disease_and_Death_Continue_In_The_N...   
2  Needles_Of_Disease_and_Death_Continue_In_The_N...   
3  Needles_Of_Disease_and_Death_Continue_In_The_N...   
4  Needles_Of_Disease_and_Death_Continue_In_The_N...   

                                            sentence  
0  On Halloween, I received the most alar

In [9]:
test_set_queries_path = Path.cwd() / "aajonus_test_set_data" / "aajonus_test_set_data.csv"
test_df_path = DF_DIR / "test_dataframe.csv"

labelled_queries_df = pd.read_csv(test_set_queries_path)

if test_df_path.exists():
    print("Loading test dataset from CSV...")
    test_set_df = pd.read_csv(test_df_path)
else:
    unique_filenames = labelled_queries_df['Filename'].unique()
    # Create a copy of the filtered DataFrame to ensure it's independent
    test_set_df = df[df['filename'].isin(unique_filenames)].copy()

    test_set_df.to_csv(test_df_path, index=False)

print(test_set_df.head())

                                           filename  \
2005  Primal_Diet_Workshop_+_Q&A_Of_May_6,_2000.txt   
2006  Primal_Diet_Workshop_+_Q&A_Of_May_6,_2000.txt   
2007  Primal_Diet_Workshop_+_Q&A_Of_May_6,_2000.txt   
2008  Primal_Diet_Workshop_+_Q&A_Of_May_6,_2000.txt   
2009  Primal_Diet_Workshop_+_Q&A_Of_May_6,_2000.txt   

                                               sentence  
2005  @Source\n\nTranscriber: Michael - Thank you, M...  
2006  Primal Diet Workshop in Nevada City, Californi...  
2007  He's here to talk to us\nabout raw food, about...  
2008  Aajonus came into our\nlives a couple of years...  
2009                                  Thank you Jill. [  


In [10]:
def spacy_lemmatize(text):
    text = text.lower()
    doc = nlp(text)
    return ' '.join([token.lemma_ for token in doc])

In [11]:
PROCESSED_DF_DIR = Path.cwd() / "processed_data"
PROCESSED_DF_DIR.mkdir(exist_ok=True)

full_processed_df_path = PROCESSED_DF_DIR / "full_dataset_preprocessed.csv"
test_processed_df_path = PROCESSED_DF_DIR / "test_set_preprocessed.csv"

# Check and preprocess full dataset
if full_processed_df_path.exists():
    print("Loading preprocessed full dataset from CSV...")
    full_df = pd.read_csv(full_processed_df_path)
else:
    df['expanded_lemmatized_text'] = df['sentence'].apply(spacy_lemmatize)
    df.to_csv(full_processed_df_path, index=False)
    full_df = df

# Check and preprocess test dataset
if test_processed_df_path.exists():
    print("Loading preprocessed test dataset from CSV...")
    test_set_df = pd.read_csv(test_processed_df_path)
else:
    test_set_df['expanded_lemmatized_text'] = test_set_df['sentence'].apply(spacy_lemmatize)
    test_set_df.to_csv(test_processed_df_path, index=False)

print(full_df.head(10))
print(test_set_df.head(10))

                                            filename  \
0  Needles_Of_Disease_and_Death_Continue_In_The_N...   
1  Needles_Of_Disease_and_Death_Continue_In_The_N...   
2  Needles_Of_Disease_and_Death_Continue_In_The_N...   
3  Needles_Of_Disease_and_Death_Continue_In_The_N...   
4  Needles_Of_Disease_and_Death_Continue_In_The_N...   
5  Needles_Of_Disease_and_Death_Continue_In_The_N...   
6  Needles_Of_Disease_and_Death_Continue_In_The_N...   
7  Needles_Of_Disease_and_Death_Continue_In_The_N...   
8  Needles_Of_Disease_and_Death_Continue_In_The_N...   
9  Needles_Of_Disease_and_Death_Continue_In_The_N...   

                                            sentence  \
0  On Halloween, I received the most alarming ter...   
1  I received it\nin a letter from Care2 organiza...   
2  Most of us have\nnever witnessed the crippling...   
3  Polio is still endemic in three of the world's...   
4  This is the scary truth: levels of polio are a...   
5  Eradication is\nwithin reach, but we need yo

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib

JOBLIB_DIR = Path.cwd() / "aajonus_joblibs"
JOBLIB_DIR.mkdir(exist_ok=True)

vectorizer_path = JOBLIB_DIR / 'tfidf_vectorizer.joblib'
full_matrix_path = JOBLIB_DIR / 'tfidf_full_matrix.joblib'
test_matrix_path = JOBLIB_DIR / 'tfidf_test_matrix.joblib'

max_df = 0.4
min_df = 0.00
ngram_range = (1, 4)


# Check if parameters have changed and files exist
params_changed = False
if vectorizer_path.exists():
    existing_vectorizer = joblib.load(vectorizer_path)
    if (existing_vectorizer.max_df != max_df or 
        existing_vectorizer.min_df != min_df or 
        existing_vectorizer.ngram_range != ngram_range):
        params_changed = True
        os.remove(vectorizer_path)
        os.remove(full_matrix_path)
        if test_matrix_path.exists():
            os.remove(test_matrix_path)

# Check if the TF-IDF matrix for the full dataset already exists
if not full_matrix_path.exists() or params_changed:
    print("Fitting TF-IDF vectorizer to the full dataset...")
    vectorizer = TfidfVectorizer(max_df=max_df, min_df=min_df, ngram_range=ngram_range)
    tfidf_full_matrix = vectorizer.fit_transform(full_df['expanded_lemmatized_text'])
    joblib.dump(vectorizer, vectorizer_path)
    joblib.dump(tfidf_full_matrix, full_matrix_path)
else:
    print("Loading fitted TF-IDF vectorizer and matrix for full dataset...")
    vectorizer = joblib.load(vectorizer_path)
    tfidf_full_matrix = joblib.load(full_matrix_path)

# Process the test dataset
if not test_matrix_path.exists() or params_changed:
    print("Transforming test dataset using fitted vectorizer...")
    tfidf_test_matrix = vectorizer.transform(test_set_df['expanded_lemmatized_text'])
    joblib.dump(tfidf_test_matrix, test_matrix_path)
else:
    print("Loading TF-IDF matrix for test dataset...")
    tfidf_test_matrix = joblib.load(test_matrix_path)

Fitting TF-IDF vectorizer to the full dataset...
Transforming test dataset using fitted vectorizer...


In [34]:
from sklearn.metrics.pairwise import cosine_similarity

def search(query, vectorizer, tfidf_matrix, df):
    preprocessed_query = spacy_lemmatize(query)
    
    query_vector = vectorizer.transform([preprocessed_query])
    
    # Compute cosine similarity between the query and the documents
    similarities = cosine_similarity(query_vector, tfidf_matrix)
    
    # Get the top 10 most similar document indices
    top_indices = similarities.argsort()[0][-10:]
    
    # Return the most similar documents and their similarity scores
    return df.iloc[top_indices], similarities[0][top_indices]

In [35]:
import time

total_execution_time = 0

# Check if the expected sentence is among the retrieved sentences
def is_relevant(retrieved_docs, relevant_doc):
    return relevant_doc in retrieved_docs

total_precision = total_recall = 0
num_queries = len(labelled_queries_df)

for index, row in labelled_queries_df.iterrows():
    query = row['Relevant Query']
    expected_sentence = row['Sentence']

    start_time = time.time()  
    top_docs, _ = search(query, vectorizer, tfidf_test_matrix, test_set_df)
    execution_time = time.time() - start_time  
    total_execution_time += execution_time
    
    retrieved_sentences = top_docs['sentence'].tolist()

    relevant = is_relevant(retrieved_sentences, expected_sentence)

    precision = 1 if relevant else 0
    recall = 1 if relevant else 0

    total_precision += precision
    total_recall += recall
    
    # Print statements for debugging
    print(f"Query: {query}")
    print(f"Expected Sentence: {expected_sentence}")
    print(f"Retrieved Sentences: {retrieved_sentences[:5]}")  # Print first 5 retrieved sentences
    print(f"Is Relevant: {relevant}")
    print(f"Query Precision: {precision}, Query Recall: {recall}")
    print(f"Query Execution Time: {execution_time}s\n")

average_precision = total_precision / num_queries
average_recall = total_recall / num_queries
average_f1_score = 2 * (average_precision * average_recall) / (average_precision + average_recall) if (average_precision + average_recall) != 0 else 0

print(f"Average Precision: {average_precision}")
print(f"Average Recall: {average_recall}")
print(f"Average F1-Score: {average_f1_score}")

Query: Genius children
Expected Sentence: All hyperactive children have potential genius, but unless they can utilize proteins or harness adrenaline, their genius may turn into antisocial behavior.
Retrieved Sentences: ['I had jaundice when I was a child.', "A: So there you've got a difficult child, you know that child's not going to be\nmindful.", 'I looked like a mad child.', 'They have child after child simply because they want to.', 'The\nchild repaired faster than any other child.']
Is Relevant: False
Query Precision: 0, Query Recall: 0
Query Execution Time: 0.0794973373413086s

Query: Lubrication formula recipe
Expected Sentence: 1 to 2 raw eggs 2 to 4 ounces unsalted raw butter or coconut cream 1 to 2 tablespoons lemon juice 1 to 2 teaspoons unheated honey
Retrieved Sentences: ["See acid/alkaline balance\nAmbrosia Coconut Cream Pie (recipe), 134–135\nAmbrosia Cream Pie (recipe), 136–137\namino acids, destruction of, 157\nanaphylaxis, 174\nanimals\nE. coli consumption by, 175\nef

Query: Genius children
Expected Sentence: In the children who do not get vaccinations, you got three geniuses out of every ten.
Retrieved Sentences: ['I had jaundice when I was a child.', "A: So there you've got a difficult child, you know that child's not going to be\nmindful.", 'I looked like a mad child.', 'They have child after child simply because they want to.', 'The\nchild repaired faster than any other child.']
Is Relevant: False
Query Precision: 0, Query Recall: 0
Query Execution Time: 0.0712425708770752s

Query: Vegetable juice recipe
Expected Sentence: I'm going to recommend that you have 10% carrot juice, 80% celery, and 10% parsley
Retrieved Sentences: ['Before or after vegetable juice.', "See nuts and seeds\nSexy Chicken (recipe), 102\nShrimp Passion (r ecipe), 110\nskin care, 145–146\nsleep and healing, 24\nsolvents\nfor cleansing, 31\nstored fats as, 38\nsoup recipes, 113–116\nChicken, 114\nChicken & Tomato, 113\nCr eam of Chicken, 114\nGrandma's Tomato, 115\nLentil, 11

In [32]:
PARAM_DIR = Path.cwd() / "aajonus_hyperparameter_table"
PARAM_DIR.mkdir(exist_ok=True)

def get_next_table_index(directory):
    existing_files = [f for f in directory.iterdir() if f.is_file()]
    return len(existing_files) + 1

table_index = get_next_table_index(PARAM_DIR)
full_param_path = PARAM_DIR / f"hyperparameter_table_{table_index}.csv"

# Add results to the hyperparameter table
hyperparameter_results = pd.DataFrame(columns=[
    "Date", 
    "Dataset Version", 
    "Max DF", 
    "Min DF", 
    "Ngram Range", 
    "Precision", 
    "Recall", 
    "F1-Score", 
    "Total Execution Time", 
    "Comments", 
])

hyperparameter_results.loc[len(hyperparameter_results)] = [
    pd.Timestamp('now'), 
    "v1",  
    max_df, 
    min_df, 
    ngram_range, 
    average_precision,  
    average_recall,     
    average_f1_score,   
    total_execution_time, 
    "Another good result, cant tell much difference from before",    
]

hyperparameter_results.to_csv(full_param_path, index=False)

print(hyperparameter_results.head())

                        Date Dataset Version  Max DF  Min DF Ngram Range  \
0 2023-11-18 15:16:34.653649              v1     0.4     0.0      (1, 1)   

   Precision  Recall  F1-Score  Total Execution Time  \
0        0.0     0.0         0              0.116675   

                                            Comments  
0  Another good result, cant tell much difference...  
