# Auto Search PDF Tool With Additional Similar Search Terms

(A Google Colab notebook.)

1. Upload the PDFs you want to search to the notebook's working directory (the file pane on the left)
2. Update the list of keywords (in the cell containing "all_search_terms...") to meet your needs
3. Run the cells
4. Once the following files appear in the pane to the left, you can download them by hovering over the filename, clicking on the three dots, then clicking 'download'

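- file_df.csv (one row per filename / keyword / page hit)
- hits_per_file.csv (number of hits, number of pages and hits/pages ratio for each file)
- keywords_by_file.csv (the unique keywords found in each file)
- pages_by_file.csv (the pages of each file that contain at least one keyword)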

In [None]:
!pip install PyMuPDF



In [None]:
import spacy
import spacy.cli
spacy.cli.download("en_core_web_lg")
import en_core_web_lg
nlp = en_core_web_lg.load()
import numpy as np
import fitz
import nltk
import os 
import pandas as pd

# Show the contents of the working directory so you can confirm the PDFs
# are present. If they are missing, upload them now and re-run this cell.
list_of_filenames = os.listdir(".")
list_of_filenames

✔ Download and installation successful
You can now load the model via spacy.load('en_core_web_lg')


['.config',
 'Will blockchain emerge as a tool to break the poverty chain in the Global South.pdf',
 '.ipynb_checkpoints',
 'Technology Adoption Dynamics of the Press Workers in Bangladesh.pdf',
 'sample_data']

In [None]:
# Keywords to look for in the PDFs -- edit this list to meet your needs.
all_search_terms = [
    "technology",
    "internet",
    "smartphone",
]

## Get similar search terms

In [None]:
# define a function to get the x most similar words to a word
def most_similar(word, topn=2):
    """Return the `topn` vocabulary entries most similar to `word`.

    Candidates are the lexemes of `nlp.vocab` that
      * have the same `is_lower` flag as the query word,
      * have a log-probability (`prob`) of at least -15,
      * have a vector with at least one non-zero component, and
      * are not the query word itself (compared on the lower-cased text).

    Parameters
    ----------
    word : str (anything accepted by ``str()``)
        The query word; it is looked up in ``nlp.vocab``.
    topn : int, default 2
        Maximum number of similar words to return.

    Returns
    -------
    list of (str, float)
        ``(lower-cased word, similarity to the query)`` pairs, most similar
        first. The list is shorter than `topn` when fewer candidates exist.
    """
    vocab = nlp.vocab
    target = vocab[str(word)]

    # The query word is excluded *before* taking the top `topn`, so exactly
    # `topn` neighbours are returned even when the query word itself does not
    # pass the probability / vector filters.
    # NOTE(review): the `prob >= -15` filter relies on the loaded model
    # exposing meaningful lexeme probabilities -- confirm for your spaCy setup.
    candidates = [
        w for w in vocab
        if w.is_lower == target.is_lower
        and w.prob >= -15
        and np.count_nonzero(w.vector)
        and w.lower_ != target.lower_
    ]

    # Score each candidate once, then rank (stable sort, highest score first).
    scored = [(w.lower_, target.similarity(w)) for w in candidates]
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored[:topn]

In [None]:
# create function to receive a list of words and return the 
# top 2 similar words for each word in the list

def get_similar_words(list_of_words, topn=2):
    """Collect the words most similar to each word in `list_of_words`.

    Parameters
    ----------
    list_of_words : iterable of str
        The words to expand. Words without a vector in ``nlp.vocab`` are
        skipped, because no similarity can be computed for them.
    topn : int, default 2
        How many similar words to request from `most_similar` for each word.

    Returns
    -------
    list of str
        The similar words of all input words, de-duplicated and sorted
        alphabetically. Empty when no input word has a vector.
    """
    similar_words = set()

    for word in list_of_words:
        if not nlp.vocab[str(word)].has_vector:
            continue
        # most_similar returns (word, score) pairs; only the words are kept
        similar_words.update(match for match, _score in most_similar(word, topn=topn))

    return sorted(similar_words)


In [None]:
# expand the user's search terms with their nearest neighbours in the
# spaCy vocabulary, then display the extra terms that were found
new_search_terms = get_similar_words(list_of_words=all_search_terms)
new_search_terms

technology
<spacy.lexeme.Lexeme object at 0x7f418c945eb0>
internet
<spacy.lexeme.Lexeme object at 0x7f418c9eafa0>
smartphone
<spacy.lexeme.Lexeme object at 0x7f418c7e7eb0>


['handset', 'online', 'smartphones', 'technological', 'technologies', 'web']

In [None]:
# Add the similar terms to the search list. Duplicates are dropped while the
# original order is kept, so re-running this cell does not grow the list with
# repeated entries.
all_search_terms = list(dict.fromkeys(all_search_terms + new_search_terms))
all_search_terms

['technology',
 'internet',
 'smartphone',
 'handset',
 'online',
 'smartphones',
 'technological',
 'technologies',
 'web']

## Run the search

In [None]:
# Read every PDF in the working directory into one row of a dataframe:
# filename, full text of the document, and number of pages.
#
# The rows are collected in a list and the dataframe is built once at the end:
# DataFrame.append no longer exists in pandas 2.x, and growing a dataframe
# row by row is slow. PyMuPDF is called through its snake_case API
# (Page.get_text); the camelCase names (loadPage, getText, pageCount) are
# deprecated aliases.
pdf_records = []
for filename in sorted(os.listdir()):
    # accept ".pdf" as well as ".PDF"
    if not filename.lower().endswith('.pdf'):
        continue
    # the context manager closes the document once its text has been read
    with fitz.open(filename) as doc:
        pdf_records.append({
            "filename": filename,
            # concatenate the plain text of every page, in page order
            "text": "".join(page.get_text("text") for page in doc),
            "num_pages": len(doc),
        })

# passing `columns` keeps the expected columns even when no PDF was found
df = pd.DataFrame(pdf_records, columns=["filename", "text", "num_pages"])

# create lower case version of the text
df["text_lower"] = df["text"].str.lower()


# Step 1 -- pre-filter: find the files whose text mentions at least one of the
# search terms, so that only those files are scanned page by page.
files_found_list = []
page_nums = []

for search_term in all_search_terms:
    # The text column is lower-cased, so the term is lower-cased as well.
    # regex=False matches the term literally (characters such as "+" or "("
    # would otherwise be interpreted as a regular expression).
    contains_term = df["text_lower"].str.contains(search_term.lower(), regex=False, na=False)
    files_found_list.extend(df.loc[contains_term, "filename"])

# A file matching several terms was collected several times; keep each file
# once (first occurrence order) so it is only scanned once.
files_found_list = list(dict.fromkeys(files_found_list))

# Step 2 -- for every candidate file, record each (keyword, page) combination
# where the keyword is found on the page. Page numbers are 1-based.
hit_records = []
for filename in files_found_list:
    # the context manager closes the document after it has been scanned
    with fitz.open(filename) as pdf_document:
        for page_index, page in enumerate(pdf_document):
            for search_term in all_search_terms:
                # search_for returns the list of matches on the page
                # (empty list -> no hit)
                if page.search_for(search_term):
                    page_nums.append(page_index + 1)
                    hit_records.append({"filename": filename,
                                        "keyword": search_term,
                                        "page": page_index + 1})

if not hit_records:
    print("Search terms not found in the documents provided.")

# Build the dataframe in one go; `columns` guarantees the expected columns
# exist even when there are no hits, so the following steps still run.
file_df = pd.DataFrame(hit_records, columns=["filename", "keyword", "page"])
file_df = file_df.drop_duplicates(keep='first')
file_df = file_df.reset_index(drop=True)
file_df = file_df.sort_values(by=["filename", "keyword", "page"])
file_df.to_csv('file_df.csv')

# Summarise the hits per file: count the keyword-page rows of each file,
# order by number of hits (then filename), both descending, attach the page
# count of each file and derive the hits-per-page ratio.
hits_per_file = (
    file_df.groupby("filename")["keyword"]
    .count()
    .to_frame()
    .reset_index()
    .rename(columns={"keyword": "hits"})
    .sort_values(by=["hits", "filename"], ascending=False)
    .merge(df[["filename", "num_pages"]])
    .assign(**{"hits/pages ratio": lambda table: table["hits"] / table["num_pages"]})
)

# write the per-file summary to disk
hits_per_file.to_csv('hits_per_file.csv')

# one row per distinct (file, keyword) combination
keywords_by_file = file_df.loc[:, ["filename", "keyword"]].drop_duplicates(keep='first')
keywords_by_file.to_csv('keywords_by_file.csv')

# one row per distinct (file, page) combination
pages_by_file = file_df.loc[:, ["filename", "page"]].drop_duplicates(keep='first')
pages_by_file.to_csv('pages_by_file.csv')