# The Robot Reviewer 🤖
Here, we load a pretrained language model (the code below currently uses spaCy's `en_core_web_sm`, not a HuggingFace model) and extract the text of PDFs to pull out the relevant content. The goal is to take any number of PDFs (even folders of them) and mark the most interesting parts, using a set of options that make it clear what you're looking for, e.g. size of output edges, specific search terms, etc.

In [33]:
# Dependencies: PDF text extraction (textract), regex clean-up, NLP (spaCy) and numerics
import textract
import re
import spacy
import numpy as np
# Load spaCy's small English pipeline once at module level; the functions
# below use it both to parse documents (nlp(text)) and to reach the
# vocabulary (nlp.vocab).
# NOTE(review): spaCy documents its small (`_sm`) pipelines as shipping
# without static word vectors, while MostSimilar keeps only lexemes with a
# non-zero vector — confirm whether an `_md`/`_lg` model is needed here.
nlp = spacy.load("en_core_web_sm")

In [32]:
def MostSimilar(word, topn=5):
    """Return the vocabulary entries most similar to *word*.

    Candidates are taken from ``nlp.vocab`` and must have the same
    ``is_lower`` flag as the query, a ``prob`` of at least -15 and a
    non-zero vector. The query word itself is never part of the result.

    Args:
        word: Query term; converted with ``str()`` before the vocab lookup.
        topn: Maximum number of neighbours to return.

    Returns:
        A list of at most ``topn`` ``(lowercased_text, similarity)`` tuples,
        ordered from most to least similar. The list is empty when the
        query has no usable (all-zero or missing) vector, because every
        similarity would then be meaningless.
    """
    word = nlp.vocab[str(word)]

    # Without a vector for the query there is nothing to rank against.
    if not np.count_nonzero(word.vector):
        return []

    # Exclude the query *before* truncating, and score each candidate once.
    # Slicing first and filtering afterwards returned topn + 1 entries
    # whenever the query was not among the leading hits.
    scored = [
        (w.lower_, word.similarity(w))
        for w in nlp.vocab
        if w.is_lower == word.is_lower
        and w.prob >= -15
        and np.count_nonzero(w.vector)
        and w.lower_ != word.lower_
    ]
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored[:topn]

def Pipeline(pdf_path):
    """Extract the text of a PDF as one whitespace-flattened string.

    Args:
        pdf_path: Path of the file to read. Only paths ending in ``.pdf``
            (case-insensitive) are processed.

    Returns:
        The extracted text, decoded as UTF-8, with every newline, carriage
        return and tab replaced by a space. Returns ``None`` (after
        printing a notice) when the path does not end in ``.pdf``.
    """
    # Test the real suffix: a substring check would accept "a.pdf.txt"
    # and reject "A.PDF".
    if not str(pdf_path).lower().endswith(".pdf"):
        print("Not a .pdf file")
        return None

    # Extract text from PDF; the result is decoded from bytes below.
    raw = textract.process(pdf_path)
    text = raw.decode("utf-8")

    # Replace each line break / tab individually. A character class is
    # required here: the pattern "\n\r\t" only matches those three
    # characters appearing consecutively, which left the text untouched.
    return re.sub(r"[\n\r\t]", " ", text)

def Reviewer(keywords, pdf_path):
    """Parse a PDF with spaCy and print similar words for each keyword.

    Args:
        keywords: Iterable of search terms; the result of
            ``MostSimilar(term)`` is printed for each one.
        pdf_path: Path of the PDF to analyse, passed to ``Pipeline``.

    Returns:
        The object returned by ``nlp(text)`` for the extracted text, or
        ``None`` when ``Pipeline`` yields no text (it returns ``None`` for
        paths that are not PDFs).
    """
    text = Pipeline(pdf_path)
    if text is None:
        # Pipeline has already printed why; calling nlp(None) would raise.
        return None

    result = nlp(text)

    for word in keywords:
        print(MostSimilar(word))

    return result


In [40]:
# doc = Reviewer(["AI", "game"], "test.pdf")

# print("Noun phrases:", [chunk.text for chunk in doc.noun_chunks])
# print("Verbs:", [token.lemma_ for token in doc if token.pos_ == "VERB"])

# for entity in doc.ents:
#     print(entity.text, entity.label_)


[]