# PDF Auto Search Proof of Concept

In [None]:
#conda install pymupdf

In [None]:
import os  # for getting list of files in a directory and looking only at PDFs

import fitz  # from PyMuPDF - for converting PDFs to text data
import nltk  # for stemming the keywords
import numpy as np  # for checking that word vectors are non-zero
import pandas as pd  # for dataframe manipulation
import spacy # for finding similar words

In [None]:
# List every entry in the current working directory (os.listdir() with no
# argument lists the working directory). The PDF-reading cell further down
# scans this same directory.
list_of_filenames = os.listdir()
# bare expression so the notebook displays the list
list_of_filenames

## Loop Through PDFs in Directory and Read Text into Dataframe

In [None]:
# Read every PDF in the working directory into one dataframe row per file
# (columns: filename, full text, number of pages).
#
# Rows are collected in a plain list and the dataframe is built once at the
# end: DataFrame.append was removed in pandas 2.0, and calling it inside the
# loop copied the whole frame for every file.
pdf_rows = []
for filename in os.listdir():
    # compare in lower case so that files named e.g. "REPORT.PDF" are included
    if not filename.lower().endswith('.pdf'):
        continue

    doc = fitz.open(filename)
    try:
        # Iterating the document yields its pages in order, so there is no
        # need to load each page a second time by index.
        # NOTE(review): getText is the camelCase spelling; newer PyMuPDF
        # releases name it get_text - confirm against the installed version.
        page_texts = [page.getText("text") for page in doc]
        pdf_rows.append({"filename": filename,
                         "text": "".join(page_texts),
                         "num_pages": len(doc)})
    finally:
        # release the file handle even if text extraction fails
        doc.close()

# passing the column names keeps the expected columns when no PDF was found
df = pd.DataFrame(pdf_rows, columns=["filename", "text", "num_pages"])

# create lower case version of the text
df["text_lower"] = df["text"].str.lower()

# view first 2 rows of dataframe showing the text for each file
df.head(2).style.set_properties(subset=['text','text_lower'], **{'width': '400px'})

## Define Keywords to Search For (In Lower Case)

In [None]:
# keywords supplied by the user; keep them in lower case
search_terms = [
    "internet",
    "technology",
    "access",
    "mobile phone",
    "coverage",
    "smartphone",
]

## Stem Keywords

Get the stemmed versions of the keywords provided and **add** them to the keywords list.

For example, 'enrolment' is stemmed to 'enrol' so that other forms of the word, such as 'enrolled', can also be found.

**Note**: the stemmed versions are added to the keywords list rather than replacing it, because the stemmed version sometimes excludes the original search term. For example, using the Porter stemming method, 'literacy' is stemmed to 'literaci', so searching only for the stem would not identify 'literacy' in the documents.

In [None]:
#call the nltk downloader and download the 'punkt' model
#nltk.download()

In [None]:
# import the package for doing stemming using the Porter method
from nltk.stem import PorterStemmer

# create one PorterStemmer instance at notebook level;
# get_stemmed_words() reads it through the global name `porter`
porter = PorterStemmer()

In [None]:
def get_stemmed_words(list_of_search_terms):
    """Return the Porter stem of every search term, in input order.

    Each term is handed to the notebook-level ``porter`` stemmer as a single
    string, so a multi-word term such as "mobile phone" is stemmed as one
    unit rather than word by word.

    Args:
        list_of_search_terms: iterable of search-term strings.

    Returns:
        A new list with one stemmed string per input term. The input is not
        modified and duplicates are not removed.
    """
    return [porter.stem(term) for term in list_of_search_terms]

In [None]:
# show the search terms exactly as supplied, before any stemmed variants
# are derived from them
print("Original search terms:", search_terms)

In [None]:
# stem every supplied search term; the result has one entry per input term,
# in the same order
stemmed_terms = get_stemmed_words(search_terms)

# view the stemmed list of search terms
print("Stemmed search terms:", stemmed_terms)

## Get Similar Words

e.g. keyword = "education", similar words = school, literacy etc?

We could use this functionality to add on / suggest keywords for the search in addition to the keywords supplied by the Social Scientist.

In [None]:
# load the spaCy pipeline named 'en_core_web_md'; the model package has to be
# installed separately (e.g. `python -m spacy download en_core_web_md`).
# The similar-word lookup and the entity extraction below both use `nlp`.
nlp = spacy.load('en_core_web_md')

In [None]:
def most_similar(word, topn=2):
    """Return the ``topn`` vocabulary entries most similar to ``word``.

    Candidates are the lexemes of the notebook-level ``nlp`` vocabulary that
    have the same ``is_lower`` flag as ``word``, a ``prob`` of at least -15
    and a vector with at least one non-zero component. The query word itself
    (compared by lower-cased text) is excluded *before* the top ``topn`` are
    taken, so the result never holds more than ``topn`` entries and the query
    word never uses up one of the slots.

    NOTE(review): ``prob`` may need optional lookup tables to be populated in
    recent spaCy releases; if every candidate is filtered out the result is
    an empty list - confirm against the installed spaCy version.

    Args:
        word: the query word; converted with ``str()`` before the lookup.
        topn: maximum number of similar words to return.

    Returns:
        A list of ``(lower_cased_word, similarity)`` tuples ordered from most
        to least similar.
    """
    word = nlp.vocab[str(word)]
    candidates = [
        w for w in word.vocab
        if w.lower_ != word.lower_  # never suggest the query word itself
        and w.is_lower == word.is_lower
        and w.prob >= -15
        and np.count_nonzero(w.vector)
    ]

    by_similarity = sorted(candidates, key=word.similarity, reverse=True)
    return [(w.lower_, w.similarity(word)) for w in by_similarity[:topn]]
        

In [None]:
def get_similar_words(list_of_words, topn=2):
    """Return the words most similar to any word in ``list_of_words``.

    For each input word that has a vector in the notebook-level ``nlp``
    vocabulary, ``most_similar`` is asked for its ``topn`` nearest words.
    Input words without a vector are skipped because they cannot be compared.

    Args:
        list_of_words: iterable of words to find similar words for.
        topn: how many similar words to request per input word. Defaults to
            2, the value that used to be hard-coded here.

    Returns:
        An alphabetically sorted list of the similar words found, without
        duplicates. The similarity scores are discarded.
    """
    similar_words = set()

    for word in list_of_words:
        if not nlp.vocab[str(word)].has_vector:
            continue
        # keep only the word from each (word, similarity_score) pair
        similar_words.update(
            similar for similar, _score in most_similar(word, topn=topn)
        )

    return sorted(similar_words)


# test the function: display the similar words found for the supplied
# search terms (bare expression, so the notebook shows the returned list)
get_similar_words(search_terms)

## UK and US Spellings

We could automatically find alternative spellings, e.g. if the Social Scientist searched for 'Labor' we could also add 'Labour' as a search term.

UK and US spelling pairs are available here: http://www.tysto.com/uk-us-spelling-list.html

In [None]:
# load the UK/US spelling pairs from a CSV in the working directory;
# get_alternative_spellings() reads its "uk_spelling" and "us_spelling" columns
uk_us_spelling_pairs = pd.read_csv('uk_us_spellings.csv')
# preview the first rows
uk_us_spelling_pairs.head()

In [None]:
def get_alternative_spellings(list_of_words):
    """Return the UK/US counterpart spellings of the given words.

    Every word is looked up in both columns of the notebook-level
    ``uk_us_spelling_pairs`` table: a match in "uk_spelling" contributes the
    row's "us_spelling", and a match in "us_spelling" contributes the row's
    "uk_spelling". The comparison is an exact, case-sensitive equality test.

    Args:
        list_of_words: iterable of words to look up.

    Returns:
        An alphabetically sorted list of the counterpart spellings found,
        without duplicates; empty when no word has a counterpart.
    """
    pairs = uk_us_spelling_pairs
    counterparts = set()

    for term in list_of_words:
        # UK spelling supplied -> collect the US counterpart(s)
        counterparts.update(pairs.loc[pairs["uk_spelling"] == term, "us_spelling"])
        # US spelling supplied -> collect the UK counterpart(s)
        counterparts.update(pairs.loc[pairs["us_spelling"] == term, "uk_spelling"])

    return sorted(counterparts)

# test the function: "color" is added to the search terms for this call only,
# as an extra word to look up in the spelling table
get_alternative_spellings(search_terms + ["color"])

## Extract Entities Per File

- organisations
- money
- dates
- people
- URLs
- References

In [None]:
# Run the spaCy pipeline over every document and record each named entity
# it reports as one (filename, entity text, entity label) row.
#
# Rows are gathered in a list and the dataframe is built once afterwards:
# DataFrame.append was removed in pandas 2.0, and calling it once per entity
# copied the whole frame each time. Building from the list also removes the
# need for a placeholder NaN row that had to be dropped again later.
filenames = df["filename"]
texts = df["text_lower"]

entity_rows = []
# pair each filename with its text positionally instead of by index label
for filename, text in zip(filenames, texts):
    # NOTE(review): the lower-cased text is analysed here; capitalisation may
    # matter for entity recognition - consider trying df["text"] instead.
    doc = nlp(text)

    for ent in doc.ents:
        entity_rows.append({"filename": filename,
                            "entity": ent.text,
                            "entity_type": ent.label_})

# passing the column names keeps the expected columns when nothing was found
extracted_entities = pd.DataFrame(entity_rows,
                                  columns=["filename", "entity", "entity_type"])

# remove duplicates
extracted_entities = extracted_entities.drop_duplicates(keep='first')

In [None]:
# (rows, columns) of the de-duplicated entity table
print(extracted_entities.shape)
# number of distinct entity rows per file
print(extracted_entities.groupby("filename").size())
# number of distinct entity rows per entity label
print(extracted_entities.groupby("entity_type").size())
# preview the first 10 rows
extracted_entities.head(10)

In [None]:
# save the extracted entities to a csv in the working directory
# (to_csv also writes the dataframe index as the first column by default)
extracted_entities.to_csv('extracted_entities.csv')

**Note** - this entity extraction is far from perfect. Would be useful to look into ways to improve it.

## Create Dataframe of Files, Pages and Keywords Found

In [None]:
# Grow the supplied terms in three passes; each pass works on everything
# collected so far, so similar words also get alternative spellings and
# every term gets a stemmed form.
expanded_terms = search_terms + get_similar_words(search_terms)
expanded_terms = expanded_terms + get_alternative_spellings(expanded_terms)
expanded_terms = expanded_terms + get_stemmed_words(expanded_terms)

# drop duplicates and order alphabetically in one step
all_search_terms = sorted(set(expanded_terms))

# view the updated list of search terms
print("Updated search terms list:", all_search_terms)

In [None]:
# Hand-curated final keyword list: irrelevant suggestions removed and extra
# relevant terms added manually. This replaces the generated list.
all_search_terms = [
    # internet / technology / access
    'internet',
    'technology', 'technolog',
    'access',
    # phones and coverage
    'mobile phone',
    'coverag', 'coverage',
    'smartphone', 'smart phone',
    'handset', 'hand-set', 'hand set',
    'cell phone', 'cellphone',
    # connectivity and devices
    'online', 'web',
    'wifi', 'wi-fi',
    'comput', 'digital', 'laptop', 'connect',
]

In [None]:
# Build file_df: one row per (filename, keyword, page) where the keyword was
# found on that page, then save it to file_df.csv.
files_found_list = []
page_nums = []
hit_rows = []

# Pre-filter: note the files whose extracted text contains a search term.
for search_term in all_search_terms:
    # regex=False: search terms are literal text, not regular expressions.
    # The term is lower-cased because it is compared with lower-cased text.
    has_term = df["text_lower"].str.contains(search_term.lower(), regex=False, na=False)
    files_found_list.extend(df.loc[has_term, "filename"])

# A file that matches several terms is listed once per term above. Keep each
# file once (in first-seen order) so that every PDF is opened and scanned a
# single time instead of once per matching term.
files_found_list = list(dict.fromkeys(files_found_list))

# if there was at least one file containing one of the search terms
if files_found_list:
    for filename in files_found_list:
        pdf_document = fitz.open(filename)
        try:
            # pages are reported 1-based
            for page_number, page in enumerate(pdf_document, start=1):
                for search_term in all_search_terms:
                    # NOTE(review): searchFor is the camelCase spelling; newer
                    # PyMuPDF releases name it search_for - confirm against
                    # the installed version.
                    if page.searchFor(search_term):
                        page_nums.append(page_number)
                        hit_rows.append({"filename": filename,
                                         "keyword": search_term,
                                         "page": page_number})
        finally:
            # release the file handle even if searching fails
            pdf_document.close()
else:
    print("Search terms not found in the documents provided.")

# Build the frame once from the collected rows (DataFrame.append was removed
# in pandas 2.0). Passing the column names means file_df exists, empty but
# with the expected columns, even when nothing was found.
file_df = pd.DataFrame(hit_rows, columns=["filename", "keyword", "page"])
file_df = file_df.drop_duplicates(keep='first')
file_df = file_df.reset_index(drop=True)
file_df = file_df.sort_values(by=["filename", "keyword", "page"])
file_df.to_csv('file_df.csv')

In [None]:
# view first 10 rows of pages per keyword per file;
# the filename column is given a fixed 600px width in the rendered table
file_df.head(10).style.set_properties(subset=['filename'], **{'width': '600px'})

### Performance Note

For 2 initial search words, and **10** similar words, this took **10 mins** for 3 MICS reports (~300 to ~500 pages each) plus one 50 page document, one 3 page document, and 3 documents with 1 page each. i.e. **approx 1200 pages**.

For 2 initial search words, and **5** similar words, and stemming, this took **8 mins 51 s** for the same reports as above.

For 2 initial search words, and **3** similar words, and stemming, this took **6 mins 27 s** for the same reports as above

For 2 initial search words, and **2** similar words, and stemming, this took **5 mins 8 s** for the same reports as above

For **3** initial search words, and **2** similar words, and stemming, this took **9 mins 13 s** for the same reports as above.

**Outstanding Question:** How could we optimise the code so that it runs faster?

In [None]:
# Count the "hits" (keyword-page rows) for each file and list the files with
# the most hits first.
hits_per_file = (
    file_df.groupby("filename")["keyword"]
    .count()
    .reset_index(name="hits")
    .sort_values(by=["hits", "filename"], ascending=False)
)

# bring in each file's page count (joined on the shared "filename" column)
page_counts = df[["filename", "num_pages"]]
hits_per_file = hits_per_file.merge(page_counts)

# hits relative to document length
hits_per_file["hits/pages ratio"] = hits_per_file["hits"] / hits_per_file["num_pages"]

# save to a csv
hits_per_file.to_csv('hits_per_file.csv')

# view the hits per file
hits_per_file.style.set_properties(subset=['filename'], **{'width': '600px'})

In [None]:
# Reduce the hit table to the distinct (filename, keyword) combinations,
# keeping the first occurrence of each, and save them.
file_keyword_columns = file_df[["filename", "keyword"]]
keywords_by_file = file_keyword_columns.drop_duplicates()
keywords_by_file.to_csv('keywords_by_file.csv')

# view the keyword per file
keywords_by_file.style.set_properties(subset=['filename'], **{'width': '600px'})

In [None]:
# Reduce the hit table to the distinct (filename, page) combinations,
# keeping the first occurrence of each, and save them.
file_page_columns = file_df[["filename", "page"]]
pages_by_file = file_page_columns.drop_duplicates()
pages_by_file.to_csv('pages_by_file.csv')

# view the relevant pages per file (first 10 rows)
pages_by_file.head(10).style.set_properties(subset=['filename'], **{'width': '600px'})