In [23]:
import nltk
from nltk import word_tokenize
from nltk.corpus import words
from nltk.metrics.distance import (
    edit_distance,
    jaccard_distance,
    )
from nltk.util import ngrams
nltk.download('words')
import pandas as pd
import numpy as np
import enchant
d = enchant.Dict("en_UK")

from tqdm import tqdm

[nltk_data] Downloading package words to /home/elubrini/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [24]:
with open('../data/onto_taxa.txt','r') as f:
    correct_spellings = f.read().split("\n")

In [25]:
spellings_series = pd.Series(correct_spellings)
spellings_series

0                                00003A/5
1                                 0016J/2
2                                 001 Na2
3                                 001-Na2
4                                  003A/5
                        ...              
10310736                    Z . zygospora
10310737                      Z.zygospora
10310738                     Z. zygospora
10310739    Z. Z. Zhou & Y. J. Min 090512
10310740                                 
Length: 10310741, dtype: object

In [26]:

def jaccard(entry, gram_number, spellings_series):
    spellings = spellings_series[spellings_series.str.startswith(entry[0])] 
    distances = ((jaccard_distance(set(ngrams(entry, gram_number)),
                                    set(ngrams(word, gram_number))), word)
                    for word in spellings)
    try:
        closest = min(distances)
        outcome = (closest[1])
    except ValueError:
        outcome = entry    
    return outcome

In [27]:
def correct(entry, gram_number, spellings_series):
    if d.check(entry):
        return entry
    else:
        return jaccard(entry, gram_number, spellings_series)

def correct_text(text, gram_number, spellings_series, df_out=False, save=False):
    entries = word_tokenize(text)
    corrections = []
    for entry in tqdm(entries):
        correction = correct(entry, gram_number, spellings_series)
        corrections.append(correction)
    if df_out:
        dic = dict(entries=entries,corrections=corrections)
        df = pd.DataFrame(dic)
        df['diff'] = np.where( df['entries'] == df['corrections'] , 1, 0)
        df[df['diff'] == 0]
        return df
    else:
        return corrections

In [28]:
with open('../data/sample_doc.txt', 'r') as f:
    sample = f.read()

In [29]:
correction = correct_text(sample, 3, spellings_series, df_out=True)
correction.head()

100%|██████████| 1462/1462 [1:28:45<00:00,  3.64s/it]


Unnamed: 0,entries,corrections,diff
0,,,1
1,10,1 08-B(F),0
2,LI,LI,1
3,VIO,V . abbotti,0
4,T,T,1


In [30]:
correction[correction['diff']==0][['entries','corrections']]

Unnamed: 0,entries,corrections
1,10,1 08-B(F)
3,VIO,V . abbotti
5,AMANINI,AMAND7
13,PSILLIDI,PSI07
17,Psgllina,Puellina
...,...,...
1453,A.Via,"Aulonemia cincta P.L.Viana & Filg., 2014"
1454,Monticelli,Monticellia
1456,IlGENOVA,Influenza A virus (A/GENOA/4/2002(H3N2))
1457,``,`ohai
