In [1]:
import pandas as pd
from nltk.tokenize import sent_tokenize
import os
from pathlib import Path
import re
import spacy
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

spacy.cli.download("en_core_web_sm")
# python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [2]:
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])
nlp.add_pipe("sentencizer")

<spacy.pipeline.sentencizer.Sentencizer at 0x165ac4b10>

In [3]:
DATA_DIR = Path.cwd() / "aajonus_data"

DF_DIR = Path.cwd() / "aajonus_saved_dfs"
DF_DIR.mkdir(exist_ok=True)

df_path = DF_DIR / "dataframe.csv"

# Conditional that checks whether we saved the dfs as csv files
# If yes, then reinitialise these as dfs
# If not, then create the dfs and save them in csv format for next run
if df_path.exists():
    print("Loading dataset from CSV...")
    df = pd.read_csv(df_path)
else:
    data = []

    for filename in os.listdir(DATA_DIR):
        if filename.endswith(".txt"):
            print(filename)

            # Create the filepath
            file_path = DATA_DIR / filename
            with open(file_path, "r", encoding="utf-8") as file:
                content = file.read()
                # Use spaCy to tokenize the content into sentences
                doc = nlp(content)
                sentences = [sent.text.strip() for sent in doc.sents]
                # Append each sentence to your data list, along with the filename
                for sentence in sentences:
                    data.append({"filename": filename, "sentence": sentence})


    df = pd.DataFrame(data)

    # Save DF
    df.to_csv(df_path, index=False)

print(df.head())

Needles_Of_Disease_and_Death_Continue_In_The_Name_Of_Saving_Children.txt
Diarrhea-based_Detoxification_Hotel_By_Medical_Doctors.txt
The_FDA_Approved_5_Viruses_for_Food_Treatment.txt
Genius_Children.txt
Dr._Stanley_S._Bass_Interview.txt
Q&A_Of_September_13,_2009.txt
Causes_For_Most_Intestinal_Disease.txt
Are_Raw_Miso_And_Shoyu_Healthy_Sauces?.txt
Safe_Cutting_Boards.txt
Multiple_Lacerations_Healed_Without_Medical_Help.txt
Cholesterol,_LDL_and_HDL.txt
Primal_Diet_Workshop_+_Q&A_Of_May_6,_2000.txt
Can_We_Preserve_Raw_Chicken_In_Vinegar_Or_Lemon_Juice?.txt
Abrasions,_Fractures_and_Breaks.txt
Is_Raw_Chocolate_Made_From_Whole_Raw_Cocoa_Beans_Addictive_Or_Harmful?.txt
What_Is_Constipation_And_How_Do_We_Resolve_It?.txt
Our_Ubiquitous_Microbial_Friends.txt
Quinton.txt
Q&A_Of_December_14,_2008.txt
Q&A_Of_October_14,_2012.txt
My_Survival_Kit.txt
Medical_Propaganda_about_Inflammatory_Breast_Cancer.txt
How_Are_Nutrients_Delivered_To_Our_Cells?.txt
Q&A_Of_August_24,_2008.txt
Vaccines_Ruin_Your_Healt

With_Mercury_Found_In_Wild_Animals,_Do_We_Need_To_Be_Extra_Careful?.txt
Q&A_Of_September_26,_2010.txt
Do_You_Buy_Chicken_While_Traveling?.txt
Cancer_Convention_September_2000.txt
Q&A_Of_November_7,_1999.txt
Q&A_Of_November_26,_2006.txt
How_Bad_Are_MRIs?.txt
Arsenic_In_Poultry_Meat_And_Eggs.txt
Joanne_Unleahsed_Interview.txt
Declaring_Our_Rights_To_Our_Body.txt
We_Want_To_Live.txt
Soy_Toxicity_In_Poultry_Meat_And_Eggs.txt
Hot_Tub_Therapy.txt
Bacteria_and_Other_Microbes_Are_Responsible_for_Vibrant_Health.txt
Gum_And_Tooth_Disease.txt
Rae_Bradbury_Interview_2.txt
                                            filename  \
0  Needles_Of_Disease_and_Death_Continue_In_The_N...   
1  Needles_Of_Disease_and_Death_Continue_In_The_N...   
2  Needles_Of_Disease_and_Death_Continue_In_The_N...   
3  Needles_Of_Disease_and_Death_Continue_In_The_N...   
4  Needles_Of_Disease_and_Death_Continue_In_The_N...   

                                            sentence  
0  On Halloween, I received the most alar

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib

JOBLIB_DIR = Path.cwd() / "aajonus_joblibs"
JOBLIB_DIR.mkdir(exist_ok=True)

vectorizer_path = JOBLIB_DIR / 'tfidf_vectorizer.joblib'
matrix_path = JOBLIB_DIR / 'tfidf_matrix.joblib'

max_df = 0.7
min_df = 0.00
ngram_range = (1, 1)

def custom_tokenizer(text):
    doc = nlp(text)
    return [token.lemma_ for token in doc]

# Check if parameters have changed and files exist
params_changed = False
if vectorizer_path.exists():
    existing_vectorizer = joblib.load(vectorizer_path)
    if (existing_vectorizer.max_df != max_df or 
        existing_vectorizer.min_df != min_df or 
        existing_vectorizer.ngram_range != ngram_range):
        params_changed = True
        os.remove(vectorizer_path)
        os.remove(matrix_path)

if not matrix_path.exists() or params_changed:
    print("Fitting TF-IDF vectorizer to the dataset...")
    vectorizer = TfidfVectorizer(tokenizer=custom_tokenizer, max_df=max_df, min_df=min_df, ngram_range=ngram_range)
    tfidf_matrix = vectorizer.fit_transform(df['sentence'])
    joblib.dump(vectorizer, vectorizer_path)
    joblib.dump(tfidf_matrix, matrix_path)
else:
    print("Loading fitted TF-IDF vectorizer and matrix dataset...")
    vectorizer = joblib.load(vectorizer_path)
    tfidf_matrix = joblib.load(matrix_path)

Fitting TF-IDF vectorizer to the dataset...


In [23]:
from sklearn.metrics.pairwise import cosine_similarity

def search(query, vectorizer, tfidf_matrix, df):
    query_vector = vectorizer.transform([query])  # Preprocessing is handled by vectorizer
    similarities = cosine_similarity(query_vector, tfidf_matrix)
    top_indices = similarities.argsort()[0][-5:]

    # Retrieve the corresponding rows from the DataFrame
    top_docs = df.iloc[top_indices]
    top_scores = similarities[0][top_indices]

    return top_docs, top_scores

In [65]:
import time

columns=["Query", "Result", "Cosine", "Relevance Score", "Filename", "Date", "Max DF", "Min DF", "Ngram Range"]

def search_main(query, vectorizer, tfidf_matrix, df, max_df, min_df, ngram_range, relevance_feedback):
    start_time = time.time()
    top_docs, top_scores = search(query, vectorizer, tfidf_matrix, df)
    execution_time = time.time() - start_time
    
    new_rows = []
    for index, score in zip(top_docs.index, top_scores):
        print(f"\n'{query}': '{top_docs.loc[index]['sentence']}', [{score}]")
        row = top_docs.loc[index]
        new_rows.append({
            "Query": query,
            "Result": row['sentence'],
            "Cosine": score,
            "Filename": row['filename'],
            "Relevance Score": None,
            "Date": pd.Timestamp('now'),
            "Max DF": max_df,
            "Min DF": min_df,
            "Ngram Range": ngram_range
        })

    relevance_feedback = pd.concat([relevance_feedback, pd.DataFrame(new_rows)], ignore_index=True)
    return relevance_feedback

In [66]:
FEEDBACK_DIR = Path.cwd() / "aajonus_feedback"
FEEDBACK_DIR.mkdir(exist_ok=True)

def get_feedback_df(feedback_path):
    if feedback_path.exists():
        df = pd.read_csv(feedback_path)
        df['Date'] = pd.to_datetime(df['Date'])  # Convert to datetime
        return df
    else:
        return pd.DataFrame(columns=columns)

feedback_path = FEEDBACK_DIR / "relevance_feedback.csv"
relevance_feedback = get_feedback_df(feedback_path)

query = "Child genius"
relevance_feedback = search_main(query, vectorizer, tfidf_matrix, df, max_df, min_df, ngram_range, relevance_feedback)

# Save updated DataFrame
relevance_feedback.to_csv(feedback_path, index=False)


'Child genius': 'Child.,', [0.4977997821650079]

'Child genius': 'Your body is always doing the best it can and
it’s a genius.', [0.526598897141908]

'Child genius': 'In the children
who do not get vaccinations, you got three geniuses out of every ten.', [0.5611802814661668]

'Child genius': 'All hyperactive children have potential genius, but unless they can utilize proteins or harness adrenaline, their genius may turn into antisocial behavior.', [0.588753449079798]

'Child genius': 'The child is absolutely a genius.', [0.8416843240422897]
