In [1]:
import pandas as pd
from nltk.tokenize import sent_tokenize
import os
from pathlib import Path
import re
import spacy
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

spacy.cli.download("en_core_web_sm")
# python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [2]:
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])
nlp.add_pipe("sentencizer")

<spacy.pipeline.sentencizer.Sentencizer at 0x16042d510>

In [3]:
DATA_DIR = Path.cwd() / "aajonus_data"

# Initialize a list to store your data
data = []

# Initialize a list to store your data
data = []

file_count = 0

for filename in os.listdir(DATA_DIR):
    # Only process the first 20 files
    if file_count >= 50:
        break

    if filename.endswith(".txt"):
        file_count += 1

        print(filename)

        # Create the full filepath
        file_path = os.path.join(DATA_DIR, filename)
        with open(file_path, "r", encoding="utf-8") as file:
            content = file.read()
            # Use spaCy to tokenize the content into sentences
            doc = nlp(content)
            sentences = [sent.text.strip() for sent in doc.sents]
            # Append each sentence to your data list, along with the filename
            for sentence in sentences:
                data.append({"filename": filename, "sentence": sentence})

# Create a DataFrame
df = pd.DataFrame(data)

# Print the first 10 rows of the DataFrame
print(df.head(10))

Needles_Of_Disease_and_Death_Continue_In_The_Name_Of_Saving_Children.txt
Diarrhea-based_Detoxification_Hotel_By_Medical_Doctors.txt
The_FDA_Approved_5_Viruses_for_Food_Treatment.txt
Genius_Children.txt
Dr._Stanley_S._Bass_Interview.txt
Q&A_Of_September_13,_2009.txt
Causes_For_Most_Intestinal_Disease.txt
Are_Raw_Miso_And_Shoyu_Healthy_Sauces?.txt
Safe_Cutting_Boards.txt
Multiple_Lacerations_Healed_Without_Medical_Help.txt
Cholesterol,_LDL_and_HDL.txt
Primal_Diet_Workshop_+_Q&A_Of_May_6,_2000.txt
Can_We_Preserve_Raw_Chicken_In_Vinegar_Or_Lemon_Juice?.txt
Abrasions,_Fractures_and_Breaks.txt
Is_Raw_Chocolate_Made_From_Whole_Raw_Cocoa_Beans_Addictive_Or_Harmful?.txt
What_Is_Constipation_And_How_Do_We_Resolve_It?.txt
Our_Ubiquitous_Microbial_Friends.txt
Quinton.txt
Q&A_Of_December_14,_2008.txt
Q&A_Of_October_14,_2012.txt
My_Survival_Kit.txt
Medical_Propaganda_about_Inflammatory_Breast_Cancer.txt
How_Are_Nutrients_Delivered_To_Our_Cells?.txt
Q&A_Of_August_24,_2008.txt
Vaccines_Ruin_Your_Healt

In [4]:
def spacy_lemmatize(text):
    text = text.lower()

    doc = nlp(text)
    
    lemmas = [token.lemma_ for token in doc]
    
    return ' '.join(lemmas)

In [5]:
df['expanded_lemmatized_text'] = df['sentence'].apply(spacy_lemmatize)
print(df.head(10))

                                            filename  \
0  Needles_Of_Disease_and_Death_Continue_In_The_N...   
1  Needles_Of_Disease_and_Death_Continue_In_The_N...   
2  Needles_Of_Disease_and_Death_Continue_In_The_N...   
3  Needles_Of_Disease_and_Death_Continue_In_The_N...   
4  Needles_Of_Disease_and_Death_Continue_In_The_N...   
5  Needles_Of_Disease_and_Death_Continue_In_The_N...   
6  Needles_Of_Disease_and_Death_Continue_In_The_N...   
7  Needles_Of_Disease_and_Death_Continue_In_The_N...   
8  Needles_Of_Disease_and_Death_Continue_In_The_N...   
9  Needles_Of_Disease_and_Death_Continue_In_The_N...   

                                            sentence  \
0  On Halloween, I received the most alarming ter...   
1  I received it\nin a letter from Care2 organiza...   
2  Most of us have\nnever witnessed the crippling...   
3  Polio is still endemic in three of the world's...   
4  This is the scary truth: levels of polio are a...   
5  Eradication is\nwithin reach, but we need yo

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the preprocessed text
tfidf_matrix = vectorizer.fit_transform(df['expanded_lemmatized_text'])

In [7]:
import joblib

joblib_dir = Path.cwd() / "aajonus_joblibs"
joblib_dir.mkdir(exist_ok=True)

# Define the full path for the vectorizer and matrix joblib files
vectorizer_path = joblib_dir / 'tfidf_vectorizer.joblib'
matrix_path = joblib_dir / 'tfidf_matrix.joblib'

# Save the vectorizer and matrix to disk in the specified directory
joblib.dump(vectorizer, vectorizer_path)
joblib.dump(tfidf_matrix, matrix_path)

['/Users/ethancavill/Documents/notebooks/aajonus_joblibs/tfidf_matrix.joblib']

In [8]:
from sklearn.metrics.pairwise import cosine_similarity

def search(query, vectorizer, tfidf_matrix, df):
    # Preprocess the query
    preprocessed_query = spacy_lemmatize(query)
    
    # Vectorize the query
    query_vector = vectorizer.transform([preprocessed_query])
    
    # Compute cosine similarity between the query and the documents
    similarities = cosine_similarity(query_vector, tfidf_matrix)
    
    # Get the top 5 most similar document indices
    top_indices = similarities.argsort()[0][-10:]
    
    # Return the most similar documents and their similarity scores
    return df.iloc[top_indices], similarities[0][top_indices]

# Test the search with an example query
example_query = "high meat"
top_docs, scores = search(example_query, vectorizer, tfidf_matrix, df)
print(top_docs)
print(scores)

                                            filename  \
13688                        Q&A_Of_May_26,_2013.txt   
5294   Primal_Diet_Workshop_+_Q&A_Of_May_6,_2000.txt   
5770   Primal_Diet_Workshop_+_Q&A_Of_May_6,_2000.txt   
9111                     Q&A_Of_October_14,_2012.txt   
14971              Primal_Diet_Workshop_(Part_1).txt   
16618                  Q&A_Of_September_10,_2006.txt   
8356                    Q&A_Of_December_14,_2008.txt   
9529                     Q&A_Of_October_14,_2012.txt   
3691   Primal_Diet_Workshop_+_Q&A_Of_May_6,_2000.txt   
5293   Primal_Diet_Workshop_+_Q&A_Of_May_6,_2000.txt   

                                                sentence  \
13688  I've got people who only do high meat,\nonly d...   
5294   High meats, I would say, not for about 2.5 years.   
5770        I haven't really done much with high\nmeats.   
9111                 G: How do you make high meat juice?   
14971  Actually, Eskimos gave me the introduction, bu...   
16618  (49) High Meat V

In [None]:
import pandas as pd

# Updated DataFrame with additional columns
hyperparameter_results = pd.DataFrame(columns=[
    "Date", 
    "Dataset Characteristics", 
    "Max DF", 
    "Min DF", 
    "Ngram Range", 
    "Precision", 
    "Recall", 
    "F1-Score", 
    "Execution Time", 
    "Threshold", 
    "Comments", 
    "Example Queries & Results"
])

In [None]:
# Example of how to add data to the DataFrame
# hyperparameter_results.loc[len(hyperparameter_results)] = ["2023-05-01", "Technical domain texts", 0.95, 0.01, (1,2), 0.8, 0.7, 0.77, "30s", 0.5, "First trial run", "query1 -> Doc A, B, C"]

# Saving the DataFrame as a CSV file
hyperparameter_results.to_csv("hyperparameter_results.csv", index=False)