In [2]:
import pandas as pd
from nltk.tokenize import sent_tokenize
import os
from pathlib import Path
import re
import spacy
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

spacy.cli.download("en_core_web_sm")
# python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [3]:
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])
nlp.add_pipe("sentencizer")

<spacy.pipeline.sentencizer.Sentencizer at 0x168b92c10>

In [17]:
DATA_DIR = Path.cwd() / "aajonus_data"

DF_DIR = Path.cwd() / "aajonus_saved_dfs"
DF_DIR.mkdir(exist_ok=True)

full_df = DF_DIR / "full_dataframe.csv"

# Conditional that checks whether we saved the dfs as csv files
# If yes, then reinitialise these as dfs
# If not, then create the dfs and save them in csv format for next run
if full_df.exists():
    print("Loading full dataset from CSV...")
    df = pd.read_csv(full_df)
else:
    data = []

    for filename in os.listdir(DATA_DIR):
        if filename.endswith(".txt"):
            print(filename)

            # Create the full filepath
            file_path = os.path.join(DATA_DIR, filename)
            with open(file_path, "r", encoding="utf-8") as file:
                content = file.read()
                # Use spaCy to tokenize the content into sentences
                doc = nlp(content)
                sentences = [sent.text.strip() for sent in doc.sents]
                # Append each sentence to your data list, along with the filename
                for sentence in sentences:
                    data.append({"filename": filename, "sentence": sentence})


    df = pd.DataFrame(data)

    # Save DF
    df.to_csv(full_df, index=False)

print(df.head())

Needles_Of_Disease_and_Death_Continue_In_The_Name_Of_Saving_Children.txt
Diarrhea-based_Detoxification_Hotel_By_Medical_Doctors.txt
The_FDA_Approved_5_Viruses_for_Food_Treatment.txt
Genius_Children.txt
Dr._Stanley_S._Bass_Interview.txt
Q&A_Of_September_13,_2009.txt
Causes_For_Most_Intestinal_Disease.txt
Are_Raw_Miso_And_Shoyu_Healthy_Sauces?.txt
Safe_Cutting_Boards.txt
Multiple_Lacerations_Healed_Without_Medical_Help.txt
Cholesterol,_LDL_and_HDL.txt
Primal_Diet_Workshop_+_Q&A_Of_May_6,_2000.txt
Can_We_Preserve_Raw_Chicken_In_Vinegar_Or_Lemon_Juice?.txt
Abrasions,_Fractures_and_Breaks.txt
Is_Raw_Chocolate_Made_From_Whole_Raw_Cocoa_Beans_Addictive_Or_Harmful?.txt
What_Is_Constipation_And_How_Do_We_Resolve_It?.txt
Our_Ubiquitous_Microbial_Friends.txt
Quinton.txt
Q&A_Of_December_14,_2008.txt
Q&A_Of_October_14,_2012.txt
My_Survival_Kit.txt
Medical_Propaganda_about_Inflammatory_Breast_Cancer.txt
How_Are_Nutrients_Delivered_To_Our_Cells?.txt
Q&A_Of_August_24,_2008.txt
Vaccines_Ruin_Your_Healt

With_Mercury_Found_In_Wild_Animals,_Do_We_Need_To_Be_Extra_Careful?.txt
Q&A_Of_September_26,_2010.txt
Do_You_Buy_Chicken_While_Traveling?.txt
Cancer_Convention_September_2000.txt
Q&A_Of_November_7,_1999.txt
Q&A_Of_November_26,_2006.txt
How_Bad_Are_MRIs?.txt
Arsenic_In_Poultry_Meat_And_Eggs.txt
Joanne_Unleahsed_Interview.txt
Declaring_Our_Rights_To_Our_Body.txt
We_Want_To_Live.txt
Soy_Toxicity_In_Poultry_Meat_And_Eggs.txt
Hot_Tub_Therapy.txt
Bacteria_and_Other_Microbes_Are_Responsible_for_Vibrant_Health.txt
Gum_And_Tooth_Disease.txt
Rae_Bradbury_Interview_2.txt
                                            filename  \
0  Needles_Of_Disease_and_Death_Continue_In_The_N...   
1  Needles_Of_Disease_and_Death_Continue_In_The_N...   
2  Needles_Of_Disease_and_Death_Continue_In_The_N...   
3  Needles_Of_Disease_and_Death_Continue_In_The_N...   
4  Needles_Of_Disease_and_Death_Continue_In_The_N...   

                                            sentence  
0  On Halloween, I received the most alar

In [18]:
test_set_csv = Path.cwd() / "aajonus_test_set_data" / "aajonus_test_set_data.csv"
test_df = DF_DIR / "test_dataframe.csv"

if test_df.exists():
    print("Loading test dataset from CSV...")
    test_set_df = pd.read_csv(test_df)
else:
    test_set_df = pd.read_csv(test_set_csv)

    unique_filenames = test_set_df['Filename'].unique()
    test_set_df = df[df['filename'].isin(unique_filenames)]

    test_set_df.to_csv(test_df, index=False)

print(test_set_df.head())

                                           filename  \
2005  Primal_Diet_Workshop_+_Q&A_Of_May_6,_2000.txt   
2006  Primal_Diet_Workshop_+_Q&A_Of_May_6,_2000.txt   
2007  Primal_Diet_Workshop_+_Q&A_Of_May_6,_2000.txt   
2008  Primal_Diet_Workshop_+_Q&A_Of_May_6,_2000.txt   
2009  Primal_Diet_Workshop_+_Q&A_Of_May_6,_2000.txt   

                                               sentence  
2005  @Source\n\nTranscriber: Michael - Thank you, M...  
2006  Primal Diet Workshop in Nevada City, Californi...  
2007  He's here to talk to us\nabout raw food, about...  
2008  Aajonus came into our\nlives a couple of years...  
2009                                  Thank you Jill. [  


In [19]:
def spacy_lemmatize(text):
    text = text.lower()

    doc = nlp(text)
    
    lemmas = [token.lemma_ for token in doc]
    
    return ' '.join(lemmas)

In [None]:
df['expanded_lemmatized_text'] = df['sentence'].apply(spacy_lemmatize)
print(df.head(10))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the preprocessed text
tfidf_matrix = vectorizer.fit_transform(df['expanded_lemmatized_text'])

In [None]:
import joblib

joblib_dir = Path.cwd() / "aajonus_joblibs"
joblib_dir.mkdir(exist_ok=True)

# Define the full path for the vectorizer and matrix joblib files
vectorizer_path = joblib_dir / 'tfidf_vectorizer.joblib'
matrix_path = joblib_dir / 'tfidf_matrix.joblib'

# Save the vectorizer and matrix to disk in the specified directory
joblib.dump(vectorizer, vectorizer_path)
joblib.dump(tfidf_matrix, matrix_path)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def search(query, vectorizer, tfidf_matrix, df):
    # Preprocess the query
    preprocessed_query = spacy_lemmatize(query)
    
    # Vectorize the query
    query_vector = vectorizer.transform([preprocessed_query])
    
    # Compute cosine similarity between the query and the documents
    similarities = cosine_similarity(query_vector, tfidf_matrix)
    
    # Get the top 5 most similar document indices
    top_indices = similarities.argsort()[0][-10:]
    
    # Return the most similar documents and their similarity scores
    return df.iloc[top_indices], similarities[0][top_indices]

# Test the search with an example query
example_query = "high meat"
top_docs, scores = search(example_query, vectorizer, tfidf_matrix, df)
print(top_docs)
print(scores)

In [None]:
import pandas as pd

# Updated DataFrame with additional columns
hyperparameter_results = pd.DataFrame(columns=[
    "Date", 
    "Dataset Characteristics", 
    "Max DF", 
    "Min DF", 
    "Ngram Range", 
    "Precision", 
    "Recall", 
    "F1-Score", 
    "Execution Time", 
    "Threshold", 
    "Comments", 
    "Example Queries & Results"
])

In [None]:
# Example of how to add data to the DataFrame
# hyperparameter_results.loc[len(hyperparameter_results)] = ["2023-05-01", "Technical domain texts", 0.95, 0.01, (1,2), 0.8, 0.7, 0.77, "30s", 0.5, "First trial run", "query1 -> Doc A, B, C"]

# Saving the DataFrame as a CSV file
hyperparameter_results.to_csv("hyperparameter_results.csv", index=False)