In [1]:
import pandas as pd
from nltk.tokenize import sent_tokenize
import os
from pathlib import Path
import re
import spacy
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

spacy.cli.download("en_core_web_sm")
# python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m23.4 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m0:01[0m01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [2]:
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])
nlp.add_pipe("sentencizer")

<spacy.pipeline.sentencizer.Sentencizer at 0x164e69f10>

In [6]:
DATA_DIR = Path.cwd() / "aajonus_data"

DF_DIR = Path.cwd() / "aajonus_saved_dfs"
DF_DIR.mkdir(exist_ok=True)

df_path = DF_DIR / "dataframe.csv"

# Conditional that checks whether we saved the dfs as csv files
# If yes, then reinitialise these as dfs
# If not, then create the dfs and save them in csv format for next run
if df_path.exists():
    print("Loading dataset from CSV...")
    df = pd.read_csv(df_path)
else:
    data = []

    for filename in os.listdir(DATA_DIR):
        if filename.endswith(".txt"):
            print(filename)

            # Create the filepath
            file_path = DATA_DIR / filename
            with open(file_path, "r", encoding="utf-8") as file:
                content = file.read()
                # Use spaCy to tokenize the content into sentences
                doc = nlp(content)
                sentences = [sent.text.strip() for sent in doc.sents]
                # Append each sentence to your data list, along with the filename
                for sentence in sentences:
                    data.append({"filename": filename, "sentence": sentence})


    df = pd.DataFrame(data)

    # Save DF
    df.to_csv(df_path, index=False)

print(df.head())

Needles_Of_Disease_and_Death_Continue_In_The_Name_Of_Saving_Children.txt
Diarrhea-based_Detoxification_Hotel_By_Medical_Doctors.txt
The_FDA_Approved_5_Viruses_for_Food_Treatment.txt
Genius_Children.txt
Dr._Stanley_S._Bass_Interview.txt
Q&A_Of_September_13,_2009.txt
Causes_For_Most_Intestinal_Disease.txt
Are_Raw_Miso_And_Shoyu_Healthy_Sauces?.txt
Safe_Cutting_Boards.txt
Multiple_Lacerations_Healed_Without_Medical_Help.txt
Cholesterol,_LDL_and_HDL.txt
Primal_Diet_Workshop_+_Q&A_Of_May_6,_2000.txt
Can_We_Preserve_Raw_Chicken_In_Vinegar_Or_Lemon_Juice?.txt
Abrasions,_Fractures_and_Breaks.txt
Is_Raw_Chocolate_Made_From_Whole_Raw_Cocoa_Beans_Addictive_Or_Harmful?.txt
What_Is_Constipation_And_How_Do_We_Resolve_It?.txt
Our_Ubiquitous_Microbial_Friends.txt
Quinton.txt
Q&A_Of_December_14,_2008.txt
Q&A_Of_October_14,_2012.txt
My_Survival_Kit.txt
Medical_Propaganda_about_Inflammatory_Breast_Cancer.txt
How_Are_Nutrients_Delivered_To_Our_Cells?.txt
Q&A_Of_August_24,_2008.txt
Vaccines_Ruin_Your_Healt

With_Mercury_Found_In_Wild_Animals,_Do_We_Need_To_Be_Extra_Careful?.txt
Q&A_Of_September_26,_2010.txt
Do_You_Buy_Chicken_While_Traveling?.txt
Cancer_Convention_September_2000.txt
Q&A_Of_November_7,_1999.txt
Q&A_Of_November_26,_2006.txt
How_Bad_Are_MRIs?.txt
Arsenic_In_Poultry_Meat_And_Eggs.txt
Joanne_Unleahsed_Interview.txt
Declaring_Our_Rights_To_Our_Body.txt
We_Want_To_Live.txt
Soy_Toxicity_In_Poultry_Meat_And_Eggs.txt
Hot_Tub_Therapy.txt
Bacteria_and_Other_Microbes_Are_Responsible_for_Vibrant_Health.txt
Gum_And_Tooth_Disease.txt
Rae_Bradbury_Interview_2.txt
                                            filename  \
0  Needles_Of_Disease_and_Death_Continue_In_The_N...   
1  Needles_Of_Disease_and_Death_Continue_In_The_N...   
2  Needles_Of_Disease_and_Death_Continue_In_The_N...   
3  Needles_Of_Disease_and_Death_Continue_In_The_N...   
4  Needles_Of_Disease_and_Death_Continue_In_The_N...   

                                            sentence  
0  On Halloween, I received the most alar

In [122]:
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib

JOBLIB_DIR = Path.cwd() / "aajonus_joblibs"
JOBLIB_DIR.mkdir(exist_ok=True)

vectorizer_path = JOBLIB_DIR / 'tfidf_vectorizer.joblib'
matrix_path = JOBLIB_DIR / 'tfidf_matrix.joblib'

max_df = 0.90
min_df = 0.00
ngram_range = (1, 3)
version=10

def custom_tokenizer(text):
    doc = nlp(text)
    return [token.lemma_ for token in doc]

# Check if parameters have changed and files exist
params_changed = False
if vectorizer_path.exists():
    existing_vectorizer = joblib.load(vectorizer_path)
    if (existing_vectorizer.max_df != max_df or 
        existing_vectorizer.min_df != min_df or 
        existing_vectorizer.ngram_range != ngram_range):
        params_changed = True
        os.remove(vectorizer_path)
        os.remove(matrix_path)

if not matrix_path.exists() or params_changed:
    print("Fitting TF-IDF vectorizer to the dataset...")
    vectorizer = TfidfVectorizer(tokenizer=custom_tokenizer, max_df=max_df, min_df=min_df, ngram_range=ngram_range)
    tfidf_matrix = vectorizer.fit_transform(df['sentence'])
    joblib.dump(vectorizer, vectorizer_path)
    joblib.dump(tfidf_matrix, matrix_path)
else:
    print("Loading fitted TF-IDF vectorizer and matrix dataset...")
    vectorizer = joblib.load(vectorizer_path)
    tfidf_matrix = joblib.load(matrix_path)

Fitting TF-IDF vectorizer to the dataset...




In [131]:
from sklearn.metrics.pairwise import cosine_similarity

def search(query, vectorizer, tfidf_matrix, df):
    query_vector = vectorizer.transform([query])  # Preprocessing is handled by vectorizer
    similarities = cosine_similarity(query_vector, tfidf_matrix)
    top_indices = similarities.argsort()[0][-3:]

    # Retrieve the corresponding rows from the DataFrame
    top_docs = df.iloc[top_indices]
    top_scores = similarities[0][top_indices]

    return top_docs, top_scores

In [132]:
import time

test_set_columns=["Query", "Result", "Cosine", "Relevance Score", "Filename", "Date", "Max DF", "Min DF", "Ngram Range"]

def search_main(query, vectorizer, tfidf_matrix, df, max_df, min_df, ngram_range, test_set):
    top_docs, top_scores = search(query, vectorizer, tfidf_matrix, df)
    
    if top_docs.empty:
        print("No documents found for this query.")
        return test_set

    new_rows = []
    for (index, row), score in zip(top_docs.iterrows(), top_scores):
        new_row = {
            "Query": query,
            "Result": row['sentence'],
            "Cosine": score,
            "Filename": row['filename'],
            "Relevance Score": 0,  # Placeholder for manual scoring
            "Date": pd.Timestamp('now'),
            "Max DF": max_df,
            "Min DF": min_df,
            "Ngram Range": ngram_range
        }
        new_rows.append(new_row)
    
    # Create a DataFrame from the new_rows list
    new_rows_df = pd.DataFrame(new_rows)
    
    # Append the new_rows_df to the test_set
    test_set = pd.concat([test_set, new_rows_df], ignore_index=True)
    
    return test_set

In [133]:
def generate_test_set_from_queries(query_file_path, vectorizer, tfidf_matrix, df, max_df, min_df, ngram_range):
    test_set = pd.DataFrame(columns=test_set_columns)
    with open(query_file_path, 'r') as file:
        queries = file.read().splitlines()
    
    for query in queries:
        test_set = search_main(query, vectorizer, tfidf_matrix, df, max_df, min_df, ngram_range, test_set)
    
    return test_set

TEST_SET_DIR = Path.cwd() / "aajonus_test_sets"
TEST_SET_DIR.mkdir(exist_ok=True)
test_set_path = TEST_SET_DIR / f"test_set_v{version}.csv"
query_file_path = TEST_SET_DIR / "queries.txt"

test_set = generate_test_set_from_queries(query_file_path, vectorizer, tfidf_matrix, df, max_df, min_df, ngram_range)

test_set.to_csv(test_set_path, index=False)

In [134]:
# After creating the test_set and evaluating the relevance of the top 5, we now need to compute some metrics. 

from sklearn.metrics import precision_score, recall_score, f1_score

def compute_evaluations(test_set, threshold=0.5):
    evaluation_data = []
    
    # This loops over each unique column name
    for query in test_set['Query'].unique():
        # Then we create a new df which filters the test_set on the query col
        # This df will be 5 rows
        current_query_data = test_set[test_set['Query'] == query]
        
        # We then extract out the columns and create numpy arrays from the respective values 
        true_relevance = current_query_data['Relevance Score'].to_numpy()
        cosine_scores = current_query_data['Cosine'].to_numpy()
        
        # Convert cosine scores to binary predictions
        predicted_relevance = (cosine_scores >= threshold).astype(int)
        
        # print(f"True Relevance: {true_relevance}")
        # print(f"Cosine scores: {cosine_scores}")
        # print(f"Predicted Relevance: {predicted_relevance}")
        
        # Calculate precision, recall, and F1-score with zero_division parameter
        # Precison: True Pos / (True Pos + False Pos)
        # Recall: True Pos / (True Pos + False Neg)
        # Harmonic mean: 2 * (Prec x Rec / (Prec + Rec))
        precision = precision_score(true_relevance, predicted_relevance, zero_division=0)
        recall = recall_score(true_relevance, predicted_relevance, zero_division=0)
        f1 = f1_score(true_relevance, predicted_relevance, zero_division=0)
        
        # print(f"Precision: {precision}")
        # print(f"Recall: {recall}")
        # print(f"F1: {f1}")

        evaluation_data.append({
            'Query': query,
            'Precision': precision,
            'Recall': recall,
            'F1-Score': f1
        })
    
    eval_df = pd.DataFrame(evaluation_data)
    
    return eval_df

EVAL_DIR = Path.cwd() / "aajonus_evaluations"
EVAL_DIR.mkdir(exist_ok=True)

test_set_path = TEST_SET_DIR / f"test_set_v{version}_scored.csv"
test_set = pd.read_csv(test_set_path)

eval_df = compute_evaluations(test_set)
eval_path = EVAL_DIR / f"evaluation_v{version}.csv"
eval_df.to_csv(eval_path, index=False)

print(eval_df.head())

FileNotFoundError: [Errno 2] No such file or directory: '/Users/ethancavill/Documents/code/nlp/aajonus_project/aajonus_test_sets/test_set_v10_scored.csv'