## Disclaimer
#### This is my personal repository, this is not Google approved code
#### Most of the code was taken from the documentation of the modules used

In [1]:
import urllib.request

pdf_url = 'https://www.toyota.com/t3Portal/document/om-s/OM60R53U/pdf/OM60R53U.pdf'
webUrl  = urllib.request.urlopen(pdf_url)
pdf_data = webUrl.read()

In [3]:
from io import StringIO, BytesIO
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser


output_string = StringIO()
parser = PDFParser(BytesIO(pdf_data))
doc = PDFDocument(parser)
rsrcmgr = PDFResourceManager()
device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
interpreter = PDFPageInterpreter(rsrcmgr, device)
for page in PDFPage.create_pages(doc):
    interpreter.process_page(page)

pdf_text = output_string.getvalue()

In [6]:
import spacy
nlp = spacy.load('en_core_web_lg')

spacy_doc = nlp(pdf_text)
word_list_raw = [i.text.upper() for i in spacy_doc if i.string.isalpha() == True] #isalnum
word_count = len(word_list_raw)
print(f'{word_count} words were found')

23377 words were found


In [7]:
blacklist = ['CARDINAL', 'QUANTITY', 'DATE', 'TIME', 'PERCENT']
entities = set([(ent.text, ent.label_) for ent in spacy_doc.ents if ent.label_ not in blacklist])
list(entities)[:10]

[('essary', 'GPE'),
 ('the State of California', 'ORG'),
 ('Trip meters', 'FAC'),
 ('Rear', 'ORG'),
 ('U.S.A', 'GPE'),
 ('SAE', 'ORG'),
 ('Federal\n\nMotor Vehicle Safety Standards', 'ORG'),
 ('sary', 'PERSON'),
 ('Navigation', 'ORG'),
 ('Headlight System Malfunction Visit Your Dealer', 'WORK_OF_ART')]

In [12]:
import textacy
import collections
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

spacy_lang = textacy.load_spacy_lang('en')
textacy_doc = textacy.make_spacy_doc(pdf_text)

bigrams = list(textacy.extract.ngrams(spacy_doc, 2, filter_stops=True, filter_punct=True, filter_nums=True, min_freq=2))
bigrams = [i.string.upper().strip() for i in bigrams]
bigrams_counts_d = dict(collections.Counter(bigrams))
bigrams_counts_df = pd.DataFrame.from_dict(bigrams_counts_d, orient='index', columns=['count']).reset_index().rename(columns={'index': 'word'})

bigrams_counts_df.head()

Unnamed: 0,word,count
0,PICTORIAL INDEX,15
1,WARNING LIGHTS,19
2,INTERIOR FEATURES,31
3,VEHICLE SPECIFICATIONS,3
4,CUSTOMIZABLE FEATURES,17


In [13]:
from textacy import preprocessing

clean_text_1 = preprocessing.remove_punctuation(pdf_text)
clean_text_2 = preprocessing.normalize_quotation_marks(clean_text_1)
clean_text_3 = preprocessing.normalize_hyphenated_words(clean_text_2)
clean_text_4 = preprocessing.normalize_whitespace(clean_text_3)

print(pdf_text[-300:], '\n***** CLEANED BELOW *****\n')
print(clean_text_4[-300:])

leaded gasoline only

Cold tire inflation 
pressure

Engine oil capacity
(Drain and refill — 
reference)

Engine oil type

With filter
Without filter

7.9 qt. (7.5 L, 6.6 Imp.qt.)
7.5 qt. (7.1 L, 6.2 Imp.qt.)

“Toyota Genuine Motor Oil” or equivalent

P. 570

P. 574

P. 570

LC200_OM_OM60R53U_(U)

 
***** CLEANED BELOW *****

 L 20 4 Imp gal 
Fuel type
Unleaded gasoline only
Cold tire inflation 
pressure
Engine oil capacity
 Drain and refill 
reference 
Engine oil type
With filter
Without filter
7 9 qt 7 5 L 6 6 Imp qt 
7 5 qt 7 1 L 6 2 Imp qt 
 Toyota Genuine Motor Oil or equivalent
P 570
P 574
P 570
LC200 OM OM60R53U U


In [14]:
import textacy.ke
textacy.ke.textrank(textacy_doc, normalize="lemma", topn=10)

 ('automatic light control system', 0.006426136496327587),
 ('passenger vehicle tire', 0.00619370440178992),
 ('vehicle distance control mode', 0.0061133277897856015),
 ('Impact detection door lock release system', 0.00586548902076033),
 ('vehicle system', 0.005704889425308061)]

In [15]:
terms = textacy_doc._.to_bag_of_terms(ngrams=(2, 3), entities=True, weighting="count", as_strings=True)
dict(list(terms.items())[0:10])

{'pictorial index': 15,
 '4 drive': 2,
 '5 Interior': 2,
 'Interior feature': 3,
 'interior feature': 28,
 'vehicle specification': 3,
 'customizable feature': 16,
 'U.S. owner': 8,
 'report safety': 5}