In [1]:
#importing libraries
import glob
import fileinput
import re
from nltk import word_tokenize
from nltk.util import ngrams

In [2]:
#natural-sort key so that numbered files are ordered numerically (e.g. 2.txt before 10.txt)
numbers = re.compile(r'(\d+)')
def numericalSort(value):
    """Build a sort key that compares the digit runs inside ``value`` as integers.

    ``numbers.split`` uses a capturing group, so the resulting list alternates
    between non-digit text (even positions) and digit runs (odd positions).
    The digit runs are converted to ``int`` and the text pieces are kept as-is.

    Parameters
    ----------
    value : str
        The string to build a key for (here: a file path).

    Returns
    -------
    list
        Alternating ``str`` and ``int`` items, e.g. ``['file', 10, '.txt']``.
    """
    return [
        int(piece) if position % 2 else piece
        for position, piece in enumerate(numbers.split(value))
    ]

In [3]:
# NOTE: both paths below are absolute and machine-specific; adjust them to
# wherever the noisy/clean text files live on your machine.
#noisy-dataset path
data_path = '/Users/muntabir/Desktop/Graduate-School-ODU/CS895/project/source/noisy/*.txt'
#clean-dataset path ('*.txt', matching the noisy pattern, so only .txt files are picked up)
dict_path = '/Users/muntabir/Desktop/Graduate-School-ODU/CS895/project/source/clean/*.txt'
# glob returns paths in arbitrary order; numericalSort orders them by the
# numbers embedded in the path so that e.g. 2.txt comes before 10.txt.
noisy_files = sorted(glob.glob(data_path), key=numericalSort)
clean_files = sorted(glob.glob(dict_path), key=numericalSort)

In [4]:
def _merge_files(paths, out_path, newline_after_each_line=False):
    """Concatenate the text files in ``paths`` into ``out_path`` (UTF-8).

    Each input file is opened explicitly as UTF-8 so that what is read matches
    the encoding of the merged output. An empty ``paths`` simply produces an
    empty output file (``fileinput.input([])`` would instead fall back to
    reading standard input).

    Parameters
    ----------
    paths : iterable of str
        Input files, concatenated in the given order.
    out_path : str
        File to (over)write with the merged content.
    newline_after_each_line : bool, optional
        When true, an extra ``"\\n"`` is written after every copied line.
    """
    with open(out_path, 'w', encoding='utf-8') as outfile:
        for path in paths:
            with open(path, 'r', encoding='utf-8') as infile:
                for line in infile:
                    outfile.write(line)
                    if newline_after_each_line:
                        outfile.write("\n")

#merge all the noisy data files from the directory, in sorted order, into one file
_merge_files(noisy_files, 'noisy_data.txt')

#merge all the clean data files from the directory, in sorted order, into one file
#(an extra newline is written after every line, as before)
_merge_files(clean_files, 'dictionary.txt', newline_after_each_line=True)

# Lexicon Lookup

In [5]:
#create a curated dictionary from OCR output which is clean and without misspellings
def lexicon(path='dictionary.txt'):
    """Load the clean reference vocabulary from a text file.

    Every line is lower-cased and split on whitespace; each resulting word has
    leading/trailing commas, spaces and full stops stripped. Words are returned
    in file order and duplicates are kept. A word made up only of stripped
    characters (e.g. ``"."``) becomes the empty string and is kept as well.

    Parameters
    ----------
    path : str, optional
        UTF-8 text file to read. Defaults to ``'dictionary.txt'``.

    Returns
    -------
    list of str
        The normalised dictionary words.
    """
    dictionaryWords = []
    # 'with' guarantees the handle is closed even if reading/decoding fails.
    with open(path, 'r', encoding='utf-8') as handle:
        for line in handle:
            dictionaryWords.extend(
                word.strip(', .') for word in line.lower().split()
            )
    return dictionaryWords

#read the noisy data
def noisy_data(path='noisy_data.txt'):
    """Tokenise the noisy OCR text and strip punctuation from each token.

    Every line is lower-cased, stripped of surrounding whitespace and passed
    to NLTK's ``word_tokenize``. Each token then has the listed punctuation
    characters removed from both ends. Tokens that consist solely of those
    characters become empty strings and are still kept.

    Parameters
    ----------
    path : str, optional
        UTF-8 text file to read. Defaults to ``'noisy_data.txt'``.

    Returns
    -------
    list of str
        Tokens in file order, duplicates included.
    """
    tokens = []
    # 'with' guarantees the handle is closed even if tokenisation fails.
    with open(path, 'r', encoding='utf-8') as handle:
        for line in handle:
            for token in word_tokenize(line.lower().strip()):
                tokens.append(token.strip('\n\n~,.*;/\\(){}<>?@#$%^&_+--+:""'))
    return tokens

#lexicon lookup and check OCRed error tokens
def ocr_ERROR(lexicon_words, noisy_words):
    """Return the noisy tokens that do not occur in the lexicon.

    Parameters
    ----------
    lexicon_words : iterable of hashable
        The reference vocabulary.
    noisy_words : iterable of hashable
        Tokens to check.

    Returns
    -------
    list
        Every item of ``noisy_words`` that is missing from ``lexicon_words``,
        in original order and with repeats preserved.
    """
    # Build the lookup set once: membership in a list is a linear scan, which
    # made the original loop O(len(noisy_words) * len(lexicon_words)).
    known = set(lexicon_words)
    return [word for word in noisy_words if word not in known]

def check_ERROR(error_check):
    """Print every item of ``error_check`` on its own line.

    Parameters
    ----------
    error_check : iterable
        The items to print (e.g. the tokens returned by ``ocr_ERROR``).
    """
    for flagged_token in error_check:
        print(flagged_token)

# Regular Expressions

In [6]:
#splitting the text on delimiters (newline, comma, full stop)
def split_regx(text):
    """Split ``text`` at every newline, comma or full stop.

    Parameters
    ----------
    text : str
        The string to split.

    Returns
    -------
    list of str
        The pieces between delimiters; consecutive delimiters yield empty
        strings, and a string without delimiters comes back as a single item.
    """
    return re.split('[\n\n,.]', text)

#pre-process the delimited text
def list_to_string(list_str):
    """Join the strings in ``list_str`` into one string, separated by single spaces.

    Parameters
    ----------
    list_str : iterable of str
        The pieces to join.

    Returns
    -------
    str
        The joined text (empty string for an empty input).
    """
    return " ".join(list_str)

# Characters deleted by process(). The table is built once so that each call
# is a single pass over the text instead of one regex substitution per character.
_PROCESS_DELETE_TABLE = str.maketrans('', '', '\\?!()][}{*;~@&%#^+=_><$/')

def process(string):
    """Remove a fixed set of punctuation/symbol characters from ``string``.

    The characters removed are::

        \\ ? ! ( ) ] [ } { * ; ~ @ & % # ^ + = _ > < $ /

    All other characters (letters, digits, whitespace, and punctuation such
    as ``, . : - ' "``) are left untouched.

    Parameters
    ----------
    string : str
        The text to clean.

    Returns
    -------
    str
        ``string`` with every occurrence of the characters above deleted.
    """
    return string.translate(_PROCESS_DELETE_TABLE)

In [7]:
def regex_err(path="noisy_data.txt"):
    """Tokenise the noisy OCR text after regex-based cleaning.

    Each line is lower-cased, split on delimiters with ``split_regx``, re-joined
    with single spaces by ``list_to_string``, cleaned of symbol characters by
    ``process`` and finally tokenised with NLTK's ``word_tokenize``.

    Parameters
    ----------
    path : str, optional
        UTF-8 text file to read. Defaults to ``"noisy_data.txt"``.

    Returns
    -------
    list of str
        Tokens in file order, duplicates included.
    """
    tokens = []
    # 'with' guarantees the handle is closed even if tokenisation fails.
    with open(path, "r", encoding="utf-8") as in_file:
        for line in in_file:
            split_text = split_regx(line.lower())
            processed = process(list_to_string(split_text))
            tokens.extend(word_tokenize(processed))
    return tokens

# Unigram Setting

In [8]:
def n_gram(noisy, clean, n=1):
    """Return the n-grams of ``noisy`` that never occur among the n-grams of ``clean``.

    Parameters
    ----------
    noisy : sequence of str
        Tokens from the noisy text.
    clean : sequence of str
        Tokens from the clean reference text.
    n : int, optional
        Order of the n-grams to compare. Defaults to 1 (unigrams).

    Returns
    -------
    list of tuple
        The n-grams (as produced by ``nltk.util.ngrams``) of ``noisy`` that are
        absent from ``clean``, in original order and with repeats preserved.
    """
    # A set gives constant-time membership; the original list lookup rescanned
    # every clean n-gram for every noisy n-gram.
    clean_grams = set(ngrams(clean, n))
    return [gram for gram in ngrams(noisy, n) if gram not in clean_grams]

In [9]:
if __name__ == "__main__":
    # Clean vocabulary: every check below is a comparison against it.
    read_dict = lexicon()

    # Pipeline 1: plain tokenisation of the noisy text, checked by lexicon
    # lookup and by unigram comparison.
    read_noisy = noisy_data()
    error_check = ocr_ERROR(read_dict, read_noisy)
    check_ngram = n_gram(read_noisy, read_dict)

    # Pipeline 2: regex-cleaned tokenisation of the same noisy text, checked
    # by lexicon lookup.
    read_regex = regex_err()
    regex_check = ocr_ERROR(read_dict, read_regex)

In [10]:
# Tokens from noisy_data() that were not found in the lexicon.
# NOTE: this prints the entire list.
print(error_check)

['seience', 'ee', 'departiiént', 'bagge', 'arc', 'hives', '7ee', 'sjb', 'ra', 'ries', 'ppccedupes', 'peppesfntaticn', 'fop', 'mata', 'ppcgpaim', 'fcr', 'undepstatipe', 'ic', 'matupal', 'hy', 'iinograd', 'suppittter', '¢', 'paptial', 'cf', 'pequepemfnts', 'fcp', 'thf', 'pfopee', 'poctopr', 'nf', 'imnstitutf', 'tfchnology', '12790', 'wun', 'soe', 'ee', 'ee', 'mat', 'feratic', 'tugust', '2h', 'woe', 'ee', 'ee', 'thests', 'hy', '78', '«', '°', '‘', "'", '«', '°', 'denartmental', 'corrittee', 'craduate', 'fece', '1279', 'ven', 'saies', 'cescscce', 'crn', 'cce', 'vases', 'eee', 'asm', 'ees', 'mee', 'earv', 'reser', 'eee', 'neers', 'ns', 'engipeering', 'lt', 'you', 'eee', 'steer', 'teeta', 'ees', 'dy', 'cc', 'cecvese', 'sree', 'ene', 'ge', 'veume', 'ter', 'ceve', 'seman', 'rseeess', 'een', 'ene', 'ewe', 'ereens', 'é', 'wo', 'ee', 'fo', 'oe', 'ert', 'ee', 'fj', 'te', 'hst', 'le', 'jerr', 'ries', 'queen', "'s", 'tees', 'cls', 'ctr', 'ance', 'ehocesascces', 'ee', 'eerweern', 'sone', '“', 'eevees

In [11]:
# Tokens from regex_err() that were not found in the lexicon.
# NOTE: this prints the entire list.
print(regex_check)

['seience', 'ee', 'departiiént', 'bagge', 'arc', 'hives', '7ee', 'sjb', 'ra', 'ries', 'ppccedupes', 'peppesfntaticn', 'fop', 'mata', 'ppcgpaim', 'fcr', 'undepstatipe', 'ic', 'matupal', 'hy', 'iinograd', 'suppittter', '¢n', 'paptial', 'cf', 'pequepemfnts', 'fcp', 'thf', 'pfopee', 'poctopr', 'nf', 'imnstitutf', 'tfchnology', '12790', 'wun', 'soe', 'ee', 'ee', 'mat', 'feratic', 'tugust', '2h', 'woe', 'ee', 'ee', 'thests', 'hy', '78', '«', '°', '‘', "'", '«', '°', 'denartmental', 'corrittee', 'craduate', 'fece', '1279', 'ven', 'saies', 'cescscce', 'crn', 'cce', 'vases', 'eee', 'asm', 'ees', 'mee', 'earv', 'reser', 'eee', 'neers', 'ns', 'engipeering', 'lt', 'you', 'eee', 'steer', 'teeta', 'ees', 'dy', 'cc', 'cecvese', 'sree', 'ene', 'ge', 'veume', 'ter', 'ceve', 'seman', 'rseeess', 'een', 'ene', 'ewe', 'ereens', 'é', 'wo', 'ee', 'fo', 'oe', 'ert', 'ee', 'fj', 'te', 'hst', 'le', 'jerr', 'ries', 'queen', "'s", 'tees', 'cls', 'ctr', 'ance', 'ehocesascces', 'ee', 'eerweern', 'sone', '“', 'eevee

In [12]:
# Unigrams (1-tuples) of the noisy tokens that do not occur among the lexicon unigrams.
# NOTE: this prints the entire list.
print(check_ngram)

[('seience',), ('ee',), ('departiiént',), ('bagge',), ('arc',), ('hives',), ('7ee',), ('sjb',), ('ra',), ('ries',), ('ppccedupes',), ('peppesfntaticn',), ('fop',), ('mata',), ('ppcgpaim',), ('fcr',), ('undepstatipe',), ('ic',), ('matupal',), ('hy',), ('iinograd',), ('suppittter',), ('¢',), ('paptial',), ('cf',), ('pequepemfnts',), ('fcp',), ('thf',), ('pfopee',), ('poctopr',), ('nf',), ('imnstitutf',), ('tfchnology',), ('12790',), ('wun',), ('soe',), ('ee',), ('ee',), ('mat',), ('feratic',), ('tugust',), ('2h',), ('woe',), ('ee',), ('ee',), ('thests',), ('hy',), ('78',), ('«',), ('°',), ('‘',), ("'",), ('«',), ('°',), ('denartmental',), ('corrittee',), ('craduate',), ('fece',), ('1279',), ('ven',), ('saies',), ('cescscce',), ('crn',), ('cce',), ('vases',), ('eee',), ('asm',), ('ees',), ('mee',), ('earv',), ('reser',), ('eee',), ('neers',), ('ns',), ('engipeering',), ('lt',), ('you',), ('eee',), ('steer',), ('teeta',), ('ees',), ('dy',), ('cc',), ('cecvese',), ('sree',), ('ene',), ('ge'