## Disclaimer
#### This is my personal repository, this is not Google approved code
#### Most of the code was taken from the documentation of the modules used

In [2]:
from pprint import pprint as prt
import pandas as pd
pd.set_option('display.max_rows', 100)

from pprint import pprint as prt
import collections
from io import StringIO, BytesIO

import warnings
warnings.filterwarnings('ignore')

import spacy
nlp = spacy.load('en_core_web_lg')

import textacy

# How to build a spelling dictionary with industry terms
### Download the top 10k most common used words
#### This is derived from Google's Trillion Word Corpus

In [4]:
import urllib

top_10k_american_english = 'https://raw.githubusercontent.com/first20hours/google-10000-english/master/google-10000-english-usa.txt'
webUrl  = urllib.request.urlopen(top_10k_american_english)
top_10k_ae = webUrl.read()
top_10k_ae = top_10k_ae.decode('utf-8').upper().split('\n')
top_10k_ae[:10]


['THE', 'OF', 'AND', 'TO', 'A', 'IN', 'FOR', 'IS', 'ON', 'THAT']

### Let's mine a PDF for industry specific words
#### You'll want to use a corpus of documents to mine for words & terms
#### Be sure to pick source documents that have a high likelyhood of spelling accuracy

In [6]:
pdf_url = 'https://www.toyota.com/t3Portal/document/om-s/OM60R53U/pdf/OM60R53U.pdf'
webUrl  = urllib.request.urlopen(pdf_url)
pdf_data = webUrl.read()

In [7]:
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser


output_string = StringIO()
parser = PDFParser(BytesIO(pdf_data))
doc = PDFDocument(parser)
rsrcmgr = PDFResourceManager()
device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
interpreter = PDFPageInterpreter(rsrcmgr, device)
for page in PDFPage.create_pages(doc):
    interpreter.process_page(page)

pdf_text = output_string.getvalue()

### Extract and normalize words
#### This example does not extract words that have numbers or special characters embedded
#### Words are uppercased to standardize for matching

In [8]:
spacy_doc = nlp(pdf_text)
word_list_raw = [i.text.upper() for i in spacy_doc if i.string.isalpha() == True] #isalnum
word_count = len(word_list_raw)
print(f'{word_count} words were found')

23377 words were found


#### Get word frequency counts
#### If you use a large enough corpus you should see a delinator for misspelled words
#### An example would be removing any words that have less then 3 occurrences
#### When you look through the words you should see examples of singular & plural words
#### we will convert the plural words to singular using lemmatization

In [9]:
word_list_raw_df = pd.DataFrame(word_list_raw, columns=['word'])
word_list_raw_df['count'] = 1
word_list_raw_counts = word_list_raw_df.groupby('word').count().reset_index()
word_list_raw_counts.sort_values('count', ascending=False, inplace=True )
# remove words that are to short
word_list_raw_counts = word_list_raw_counts[ word_list_raw_counts['word'].str.len() > 2 ]
word_list_raw_counts.head(20)

Unnamed: 0,word,count
2195,THE,525
2377,VEHICLE,443
1489,OFF,283
2156,SYSTEM,281
1390,MODE,175
2409,WARNING,165
660,DISPLAY,163
2157,SYSTEMS,161
1421,MULTI,154
779,ETC,151


In [78]:
word_list_raw_counts.tail(20)

Unnamed: 0,word,count
1707,PURPOSE,1
801,EXPLOSIONS,1
1710,PUSHING,1
800,EXPLODE,1
1731,RAINDROPS,1
799,EXPLANATION,1
1714,QUALITIES,1
798,EXPLAINED,1
1716,QUANTITY,1
1717,QUICKLY,1


#### Check if word is plural and build a hash lookup

In [10]:
replacements = {}

for token in spacy_doc:
    if len(token.text.replace('\n', '')) > 1:
        word = token.text.upper()
        singular = token.lemma_.upper()
        if word != singular:
            replacements[word] = singular
            

dict(list(replacements.items())[0:10])

{'THEM': '-PRON-',
 'GAUGES': 'GAUGE',
 'METERS': 'METER',
 'LIGHTS': 'LIGHT',
 'INDICATORS': 'INDICATOR',
 'OPENING': 'OPEN',
 'CLOSING': 'CLOSE',
 'DOORS': 'DOOR',
 'WINDOWS': 'WINDOW',
 'DRIVING': 'DRIVE'}

In [11]:
auto_dictionary = set(word_list_raw_counts['word'])

for k,v in replacements.items():
    if v != '-PRON-': # Remove PRON as it will throw off modeling
        try:
            auto_dictionary.remove(k)
            auto_dictionary.add(v)
        except:
            pass

list(auto_dictionary)[:10]

['PURCHASE',
 'FRENCH',
 'POWER',
 'DUST',
 'OUTBOARD',
 'BASE',
 'PLATE',
 'FOOT',
 'SKID',
 'NAVIGATION']

#### Join the 10k & auto industry lists

In [56]:
spelling_list = list(set(top_10k_ae + list(auto_dictionary)))
print('10k\t', len(top_10k_ae), '\nAuto\t', len(list(auto_dictionary)), '\ntotal\t', len(spelling_list))

10k	 10000 
Auto	 1861 
total	 10549


#### Create a custom spell checker

In [None]:
### Reddit comments to check with spelling list

In [18]:
%%bigquery words_raw

WITH comments AS (SELECT SPLIT(REPLACE(body, '\n', ' '), ' ') AS words
                  FROM `fh-bigquery.reddit_comments.20*`
                  WHERE    UPPER(subreddit) LIKE '%AUTOMOTIVE%'
                        OR UPPER(subreddit) LIKE '%MECHANIC%'),
  word_list  AS (SELECT   UPPER(words) AS word
                  FROM    comments,
                          UNNEST(words) as words
                  WHERE   REGEXP_CONTAINS(words, '[^\\w]') = False)

SELECT word, count(*) as freq
FROM word_list
WHERE ABS(MOD(FARM_FINGERPRINT(word), 10)) < 3
GROUP BY 1
ORDER BY freq DESC


In [79]:
words_raw.tail()

Unnamed: 0,word,freq
93201,B235,1
93202,CIRCUSTANCE,1
93203,TOTLA,1
93204,ISSIED,1
93205,COLORWAYZ,1


In [62]:
dictionary_df = pd.DataFrame(words_series, columns=['word'])
dictionary_df['match'] = True
dictionary_df.sort_values('word')

Unnamed: 0,word,match
5221,A,True
3539,AA,True
2700,AAA,True
934,AARON,True
2245,AB,True
...,...,...
10020,ZOPE,True
4827,ZSHOPS,True
79,ZU,True
2711,ZUM,True


In [64]:
words_raw['word']     = words_raw['word'].apply(lambda x: x.strip())
dictionary_df['word'] = dictionary_df['word'].apply(lambda x: x.strip())
check = pd.merge(words_raw, dictionary_df, on='word', how='left')
check

Unnamed: 0,word,freq,match
0,THE,9336398,True
1,A,5198960,True
2,I,4829875,True
3,AND,3934071,True
4,IS,2541116,True
...,...,...,...
93201,B235,1,
93202,CIRCUSTANCE,1,
93203,TOTLA,1,
93204,ISSIED,1,


In [75]:
check[(check['match'] != True) & (check['freq'] > 100)]

Unnamed: 0,word,freq,match
77,DUCKY,82018,
117,GATERON,49817,
119,TOPRE,48968,
122,TKL,48280,
126,5,47448,
...,...,...,...
5996,INVALIDATE,101,
5997,386,101,
5998,ACGAM,101,
5999,TINNING,101,


In [80]:
import enchant

words_series = pd.Series(list(spelling_list))
words_series.to_csv('auto_words_custom_dic.txt', index=False)

auto_spell_checker = enchant.PyPWL('auto_words_custom_dic.txt')

print('Check word in list', auto_spell_checker.check('HOMELINK'))
print('Check word not in list', auto_spell_checker.check('HOMLUNK'))

print('\nSuggestions', auto_spell_checker.suggest('HOMLUNK'))

Check word in list True
Check word not in list False

Suggestions ['HOMELINK', 'HOMELAND', 'HOLLAND', 'HOME', 'HOMES', 'HOMELESS', 'HOMEWORK', 'HOL', 'HOLE', 'HOLY']
