In [1]:
import string
import numpy as np
import re

In [28]:
class TextCleaner():
    """
    Atikamekw text cleaner.

    ``clean_text`` lower-cases the input, replaces punctuation and digits with
    spaces, drops every token that contains a character outside the Atikamekw
    alphabet, and collapses whitespace so that the result is a single line of
    space-separated words.
    """
    def __init__(self):
        #https://omniglot.com/writing/atikamekw.htm
        # Tokens are validated character by character, so the digraph 'tc'
        # never matches on its own; it is accepted through 't' and 'c'.
        self.charchters = ['p', 't','k','s','c','tc', 'm','n','r','h', 'w','a','e','i','o']
        self.punctuation = string.punctuation + '،' + '؛' + '؟' + '؛' + '۔' + '»' + '«' + '-'
        # One or more whitespace characters (matches the same spans as the
        # previous, more convoluted pattern).
        self.one_space_regex = r"\s+"
        self.text = None

        # Maps every punctuation character to a single space.
        self.dict_punct = dict.fromkeys(self.punctuation, ' ')

    def remove_punct(self, text):
        """Return ``text`` with every punctuation character replaced by a space."""
        table = str.maketrans(self.dict_punct)
        return text.translate(table)

    def remove_num(self, text):
        """
        Return ``text`` with every decimal digit replaced by a space.

        ``\\d`` covers ASCII digits as well as the Extended Arabic-Indic digits
        (U+06F0-U+06F9) that were the only ones handled before; without this,
        a word with an ASCII digit attached was discarded as a whole by
        ``remove_foreign_lang``.
        """
        return re.sub(r'\d', ' ', text)

    def remove_spaces(self, text):
        """
        Collapse every run of whitespace into one space and trim both ends.

        Returns an empty string for empty or whitespace-only input.
        """
        return re.sub(self.one_space_regex, ' ', text).strip(' ')

    def is_not_fa_token(self, token):
        """
        Return True when ``token`` is made only of Atikamekw letters.

        Despite its (historical) name, a True result means the token is KEPT.
        The comparison is case-insensitive; an empty token yields True.
        """
        return all(ch in self.charchters for ch in set(token.lower()))

    def remove_foreign_lang(self, text):
        """Return the whitespace-separated tokens of ``text`` that pass
        ``is_not_fa_token``, joined by single spaces."""
        return ' '.join(token for token in text.split() if self.is_not_fa_token(token))

    def clean_text(self, text):
        """
        Run the full cleaning pipeline on ``text`` and return the result.

        Order matters: punctuation and digits are turned into spaces first so
        that they split tokens instead of invalidating them.
        """
        text = text.lower()
        text = self.remove_punct(text)
        text = self.remove_num(text)
        text = self.remove_foreign_lang(text)
        text = self.remove_spaces(text)
        return text

In [29]:
# Read the whole sample text at once. The context manager closes the file even
# if reading raises, and the explicit encoding makes the result independent of
# the platform's default locale.
with open('Pikokw.txt', mode='r', encoding='utf-8') as file:
    all_of_it = file.read()

In [30]:
all_of_it

'Enko anihe atoske irapatcikan pikokw ka icinikatek. Napitc kinoskon kaie arimatc apatisiw neta nikanik anaha atos ka icinikasotc kitci ki nipahatc awesisa mekwatc ka atoskewaketc. Matci oskan kaie asini apatisiw kitci ocihakoniwitc anaha atos ka icinikasotc, arimatc kinikosiw kirika kinisiw ote nikanik. Mictikw apatan e sokaskok aka kitci natowaparik kaie kitci mirokotek. Enko tca arimatc atcapi ka apatisitc kitci matcekotcik pikokw. Nicawek icinakoniw anaha pikokw ote nikanik acitc otananik. Ote nikanik ekotc e apitc anaha atos mina ote otananik atcinikan icinikatew.\n\n'

In [31]:
len(all_of_it)

576

In [32]:
# Build one cleaner instance; it is used below to clean the loaded text.
cleaner = TextCleaner()

In [33]:
# Run the full cleaning pipeline on the raw file contents.
cleaned = cleaner.clean_text(all_of_it)

In [34]:
len(cleaned)

566

In [35]:
cleaned

'enko anihe atoske irapatcikan pikokw ka icinikatek napitc kinoskon kaie arimatc apatisiw neta nikanik anaha atos ka icinikasotc kitci ki nipahatc awesisa mekwatc ka atoskewaketc matci oskan kaie asini apatisiw kitci ocihakoniwitc anaha atos ka icinikasotc arimatc kinikosiw kirika kinisiw ote nikanik mictikw apatan e sokaskok aka kitci natowaparik kaie kitci mirokotek enko tca arimatc atcapi ka apatisitc kitci matcekotcik pikokw nicawek icinakoniw anaha pikokw ote nikanik acitc otananik ote nikanik ekotc e apitc anaha atos mina ote otananik atcinikan icinikatew'

In [37]:
# Scratch check: slicing with [:-1] drops the last character of a string.
print ('Enko'[:-1])

Enk


In [None]:
class Atikamekw_basic_lemma():
    """
    Basic rule-based suffix stripper for Atikamekw nouns.

    Each noun method scans a whitespace-separated text and, for every token
    that carries one of the handled suffixes, stores a guessed root and a tag
    in ``self.words`` as ``{token: {'root': ..., 'POS': ...}}``. Tokens without
    a handled suffix are left out. The methods return None.

    Attributes:
        words: per-token analysis filled in by the noun methods.
        verbs: placeholder for verb analysis (nothing writes to it yet).
        processed_text: placeholder for the processed text (unused so far).
    """

    # --- possessive pronouns -------------------------------------------------
    # mine, yours, his -- prefix used before 'w' or a consonant
    possisive_pronouns_prefexis_before_consonant = ['ki','ni','o']
    # prefix used before 'o'
    possisive_pronouns_prefexis_before_o = ['k','n','o']
    # prefix used before a vowel
    possisive_pronouns_prefexis_before_vowel = ['kit','nit','ot']
    # The original cell assigned all three lists to this one name, so only the
    # last one survived; the name is kept with that value.
    possisive_pronouns_prefexis = possisive_pronouns_prefexis_before_vowel

    # Possessive patterns, kept as notes (they were bare pseudo-code lines that
    # made the class a syntax error):
    #   our (mine and his):      ni + word + in/an
    #   our (mine and yours):    ki + word + in/o
    #   your (plural):           ki + word + iwaw
    #   their:                   o + word + iwaw
    #                            o + word + iwa
    #                            o + word + aw
    #   his, other forms:        o + word + iriw
    #   someone's property (indefinite form):
    #                            o + word + nan or awik
    #   surobviative:            (not written yet)
    # The possessive suffix comes before the plural suffix.

    # --- stop words ----------------------------------------------------------
    stop_words = ['kotahik','awihik','kotak','awik', 'wirawaw','wir','ninan',
                  'kirano','kirawaw','nin','kir' , 'kekwan']
    ask_question_stop_words =['awin','awiritake','kekwan' ,'tan' ]
    presentative_stop_words = ['enko','namaiew']
    alternative_stop_words = ['kotak']
    demonstrative_stop_words =['nahwe','ohki' , 'nohwe', 'niheriw','neheriw', 'neriw']

    def __init__(self):
        # Per-instance state; defining these at class level with ``self.`` was
        # a NameError and would have shared the dicts between instances.
        self.words = {}
        self.verbs = {}
        self.processed_text = ''

    def _record(self, token, root, pos):
        """Store ``root`` and ``pos`` for ``token``, creating its entry if needed."""
        entry = self.words.setdefault(token, {})
        entry['root'] = root
        entry['POS'] = pos

    # nouns

    #https://www.langueatikamekw.ca/grammaire/noms/locatif/
    def replace_locatives(self, text):
        """
        Tag tokens ending in a locative suffix with POS ``'LN'``.

        ``-ok`` is replaced by ``w``; any other final ``-k`` is dropped.
        NOTE(review): the one-character strip for ``-k`` follows the original
        slice; confirm whether the vowel before it should be removed as well.
        """
        for token in text.split():
            if token.endswith('ok'):
                self._record(token, token[:-2] + 'w', 'LN')
            elif token.endswith('k'):
                self._record(token, token[:-1], 'LN')

    #https://www.langueatikamekw.ca/grammaire/noms/le-nombre-des-noms-singulier-pluriel/
    def remove_plurals(self, text):
        """
        Tag tokens ending in a plural suffix and store the singular guess.

        ``-ok`` -> root + ``w`` and ``-ak`` -> root, both tagged ``'NAP'``;
        a final ``-a`` -> root, tagged ``'NIP'``. An entry already present for
        the token (e.g. from ``replace_locatives``) is overwritten.
        NOTE(review): 'NAP'/'NIP' presumably stand for animate / inanimate
        plural noun -- confirm.
        """
        for token in text.split():
            if token.endswith('ok'):
                self._record(token, token[:-2] + 'w', 'NAP')
            elif token.endswith('ak'):
                self._record(token, token[:-2], 'NAP')
            elif token.endswith('a'):
                self._record(token, token[:-1], 'NIP')

    #verbs

In [None]:
# Nouns that mark whether a profession is held by a woman or a man.
male_female_words_of_profession = [
    'iskwew',     # female
    'iriniw',     # male
    'irinikwew',  # male or female
]

# Verbs that indicate a profession.
male_female_verbs_of_profession = ['iriniwin', 'iskwewin', 'riniskwewin']

# Words that mark the gender of an animal.
male_Female_particular_animals = [
    'noce',  # female
    'nape',  # male
]

# NOUNS
#
# Affix order noted by the author: po -> p -> di -> loc
# (presumably possessive, plural, diminutive, locative).

possisive_pref = 'kit nit ot ki ni o k n o'.split()
possisive_suff = 'inan ino iwaw iwa aw iriw nan awik im'.split()

plural_suff = 'a ak ok>w'.split()

diminutif_suff = 'icic ocic'.split()
# alternative considered: diminutif_suff = ['ic', 'cic']

locative_suff = 'ik ok>w'.split()

# The next two lists can be combined.
obviativ_suff = 'iw riw a'.split()
surobviatif = 'iriw riw'.split()

derivation_suffix = 'an ekin apo api apiskw'.split()

# PRONOUNS

perosonal_pron = 'nin kir wir'.split()
indefinant_pron = 'awik kekwan kotak'.split()
# 'tan apitc' is a two-word entry, so this list is spelled out in full.
interogatice_pron = [
    'awin',
    'kekwan',
    'tan',
    'tanta',
    'tante',
    'tan apitc',
]

personal_pron_prefixes = 'ni ki nit kit'.split()
personal_pron_independant = 'nin kir wir ninan kirano kirawaw wirawaw'.split()
personal_pron_independant_priority = [
    'ninctam', 'kirctam', 'wirctam',
    'nirctaminan', 'kirctamiwaw', 'wirctamiwaw',
]

# Demonstratives ('nahwe' is listed twice, as in the original table).
demostrative_words = [
    'nahwe', 'ohwa', 'ohwe', 'oma', 'nahwe', 'anahwe',
    'naha', 'anaha', 'na', 'ohki', 'niki', 'neki',
    'nihe', 'ohwi', 'nehe', 'anehe', 'nihi', 'neta', 'nete',
]

# Particles. The author's category notes follow; the list below presumably
# goes through them in this order (TODO confirm the grouping):
#   time
#   repetition
#   comparing
#   connectors
#   subordinate + verb
#   expressing quantity
#   modality of action (way of doing)
#   proximity
#   spatial orientation
#   mark of interrogation
#   interjection
# NOTE(review): 'Aptic' is the only capitalised entry; text produced by
# TextCleaner.clean_text is lower-cased, so it cannot match as written. It may
# also be a misspelling of 'apitc' -- confirm before changing.
# Several entries ('nota', 'orina', 'notc', 'neta', 'nete') appear more than once.
particules = ['Aptic','minawatc','mocak','nama wiskat','nac','kinowec','ko' 
             ,'kiapatc','kiapatci','kiapatc peikwa','koski','oscamec','awocamec',
             'kekat','wiec','kirowe','nota','orina','tapicoktc','patok','mia','towi',
             'acitc','kirica','aima','aric','kaie',
             'e','ka','epwamoci','esko','ickwa','kitci','nota','wetci',
             'notc','memantcic','micta','tipi','orina','tepirak',
             'tekaci','pekatc','mamar'
             ,'warowik','pecotcik','ota','ote','neta','nete','nte',
             'icpimik','notc','opimera','okitc','cipa','pitc','nikanik','otananik','orowitimik','pitakamik',
             'la','a','aia',
             'ekocka','icine']

# Glosses for some function words:
#   'kecpin'       == if
#   'kitci'        == for
#   'e'            == that
#   'wetci'        == for
#   'tan'          == what
#   'tan apitc'    == when
#   'kekwan wetci' == why
#   'aka'          == negative
#   'ka'           == relative

preverbes_for_immidiatly = 'ta wi ka ki'.split()

# VERBS
#
# Draft suffix tables kept for reference (the arrow gives the ending they
# reduce to):
#verbs_suff = ['an','in', 'w','n','ano','awaw','ok']
# verbs_suff_AI_independent = ['n','nan', 'nano','nawaw','wok'] ->['w']
# verbs_suff_TI_independent = ['en','am','an','ano','awaw','ok'] -> ['am']
# verb_suff_AI_conjunctive = ['n','ian','an','in','iin','tc','ak','iak','iakw','kw','ikw','iekw','ekw','tcik'] ->['w']
# verb_suff_AI_imperative = ['kw','an','kan','kekw','tan'] ->['w']
# verb_suf_TI_imperative = ['amokw','eta','ekan','amokan','ameta'] -> ['am']
# verb_suf_TA_imperative = ['ici','inan','icikw','icinan','icikan','icikanan','icikekw','icikanan'] -> ['ew']

lexical_final = 'ote ska kowi h aso acte ckow pw'.split()
abstract_final = 'aia cin t n r h'.split()

doutfull_suffix = ['take']
subjonctive_suffix = ['e']

imperfect_suffix = ['tai', 'pan']
# Note: a plain string, unlike the neighbouring lists.
past_suffix = 'ki'
future_after_verb = ['ka', 'kata', 'ickwa']

preform_lexical = 'miro kice matci micta mata kakike'.split()

# reflexive (oneself): ['itiso']
# reciprocal (each other): ['ito']

# derivation









VII: verb inanime intransitive

VAI  : anime intransitive

VTA : verb transitive anime

VTI : verb transitive inanime

VTI2 : verb transitive inanime 

#########


01 independant indicatif present

03 independant indicatif imperfect

09 independant dubitatif present

10 independant dubitatif past

11a conjonctive indicatif present

12a conjonctive subjonctif

12b conjonctive iteratif 

13 conjonctive imperfect

14 conjonctive dubitatif

15 conjonctive dubitatif imperfect

17a imperative indicative present

17b imperative indicative future

In [2]:
# VAI (animate intransitive verb) endings, one list per paradigm; the numeric
# suffix of each name is the paradigm number from the legend in the notes
# (01 independent indicative present, 03 ... imperfect, and so on).
# Some endings are listed more than once within a list (e.g. 'tai' in vai_03,
# 'ritci' in vai_11).
vai_01 = ['n','nawaw','nano','nan','wok','riw','riwa']
vai_03 = ['tai','tawaw','tanano','tai','tan','pan','panak','ripan']
vai_09 = ['natake' , 'nawatake','nanotake','natake','nanatake','take','takenak','ritake','ritakena']
vai_10 = ['nakopan','nawakopan','nanokopan','nanakopan','kopan','wakopan','rikopan','rikopana']
vai_11 = ['in','iekw','ikw','ian','ika','tc','tcik','ritci','ritci']
vai_12a = ['ine','iekwe','ikwe','iane','iake','te','wate','rite','ritena']
# Paradigm 12b would reuse the 12a endings:
#vai_12b = vai_12a
vai_13 =['ipan','iekopan','ikopan','iapan','iakipan','span','waspan','rispan','rispana']
vai_14 = ['wonen','wewokwen','wokwen','wanen','waken','kwen','wekwen','rikwen','rikwena']
vai_15 = ['wopanen','wekopanen','wokopanen','wapanen','wakipanen','kopanen','wakopanen','rikopanen','rikopanena']
vai_17a = ['kw','tan']
vai_17b = ['kan','kekw','tan']

# VII (inanimate intransitive verb) endings, one list per paradigm number.
# NOTE(review): paradigms 01-10 use the prefix 'vii_' while 11-15 use 'vi_';
# presumably the same verb class -- confirm before relying on either prefix.
vii_01 = ['o','iw','iwa','a','riw','riwa']
vii_03 = ['pan','pana','ripan','ripana']
vii_09 = ['take','takena','ritake','ritakena']
vii_10 = ['kopan','kopana','rikopan','rikopana']
vi_11 = ['k','ki','rik','riki']
vi_12a = ['ke','kawe','rike','rikawe']
# Paradigm 12b would reuse the 12a endings:
#vi_12b = vi_12a
vi_13 = ['kipan','kipana','rikipan','rikipana']
vi_14 = ['kwen','kwena','rikwen','rikwena','ikwen','ikwena']
vi_15 = ['kopanen','kopanena','rikopanen']


# VTA (transitive animate verb) endings for paradigm 01
# (independent indicative present in the legend).
vta_01 = ['in','itin','inan','itinan','inawaw','itinawaw',
         'imawaw','imikowaw','anano','ikonano','ananowok', 'ikonanowok', 'imananowa', 'imikonano'
         , 'aw','ikw','awok','ikok','imawa', 'imikw'
         , 'anan' , 'ikonan' ,'ananak', 'ikonanak','imanana','imikonan'
         , 'awaw' , 'ikowaw','awawok','ikowawok'
         ,'ew','ewok','imew','imewok','imeriw','iko','ikowok','eriw','eriwa','ikoriw','ikoriwa']
# VTA (transitive animate verb) endings for paradigm 03.
# The first two rows used to end without a comma, so Python's implicit
# string-literal concatenation fused 'ikotananowok'+'atai' and
# 'ikotananak'+'atawaw' into two bogus endings and dropped the four real ones.
vta_03 = ['itai', 'ititai','itan', 'itinan','itawaw', 'itinaw','atanano','ikotanano','atananowok','ikotananowok',
         'atai', 'ikotai','ataiik','ikoktaiik','atan','ikotan','atananak','ikotananak',
         'atawaw','ikotawaw','atawawok','ikotawawok','epan','epanak','ikopan','ikowapanak'
         ,'eripan','eripana','ikoripan','ikor']
# VTA endings for paradigms 09 and 10 (the two independent dubitative
# paradigms in the legend).
vta_09 = ['inatake' , 'inanatake', 'inawatake', 'itinatake','itinanatake'
         ,'itinawatake','ananotake','ananotakenak','ikonanotake','ikonanotakenak'
         ,'atake', 'atakenak' , 'ikonatakenak', 'ananatake','ananatakenak'
         ,'ikonanatakenak','awatake','awatakenak','ikotake','ikowatake','ikotakenak',
         'ikowatakenak','etake','etakena','ikotakena','eritake','eritakena','ikoritake','ikoritakena']
# NOTE(review): 'ananakopank' differs from the other '-kopan'/'-kopanak'
# entries and may be a typo -- confirm. Several endings are repeated
# (e.g. 'akopan', 'ikokopan').
vta_10 = ['inakopan', 'inanakopan','inawakopan','itinakopan','itinanakopan'
         ,'itinawakopan','ananokopan','ananokopanak','ikonanokopan','ikonanokopanak'
         ,'akopan','akopanak','ikokopan','ikokopanak','akopan','ananakopank','ikonanakopan'
         ,'ananakopanak','ikonanakopanak','awakopan','awakopanak','ikokopan'
         ,'ikokopanak','ikowakopan','ekopan','ekopana','ikokopan','ikokopan','ikowakopan',
         'ikowakopana','erikopan','erikopana','ikorikopan','ikorikopana']

# VTA endings for paradigm 11 (conjunctive indicative present in the legend).
# Some endings occur more than once (e.g. 'atc', 'atcik', 'itakok').
vta_11 = ['iin','iak','iekw','itan','itak','itakok','imeko',
         'okw','okok','imoko','imitako','itokw','itokok','imitoko'
         ,'ak','akik','imaki','itc','itcik','imitci','akitc','imitc'
         ,'akitcik','imitcik','imakitci','imtci','atc','iskw',
         'atcik','imatci','iskik','imiski','itakw','itakok','ekw','ekok',
         'atc','atcik','imatc','imatcik','imaritci','eritci','ewaritci',
         'ikoritci','ikowaritci','kotc','ikotcik']

# VTA endings for paradigm 12 (the legend lists 12a subjunctive and
# 12b iterative; this table carries no a/b suffix).
vta_12 = ['iine','itaane','iake','iekwe','itake','itakokwe',
         'okwe','okwawe','itokwe','itokwawe','amake','akawe',
         'ikote','ikowate','imite','imitawe','ate','akite','atawe','iske','iskawe'
         ,'ewekwe','etawe','itakwe','itakawe','amate','awate','ikote','ikowate',
         'arite','ariwate','ariwatena','ikorite','ikoriwate']
# VTA endings for paradigm 13 (conjunctive imperfect in the legend).
# 'iekopan' and 'itakipan' each appear twice.
vta_13 = ['ipan','iekopan','iakipan','iekopan','itapan','itakipan','itakokipan'
         , 'itakipan','okopan','okwapan','itakopan','itokwapan','akipan','akwapan','ispan',
         'iwaspan','akitipan','akitwapan','atipan','atwapan','ewekopan','ewekwapan','aspan',
         'aspana','awaspan','awaspana','arispan','arispana','imitipan','imitwapan','iskipan',
         'iskwapan','itakopan','itakwapan','ikospan','ikowaspana',
         'ikorispan','ikorwiwarispan']
# VTA endings for paradigm 14 (conjunctive dubitative in the legend).
# NOTE(review): 'itkowen' sits next to 'itokwen'/'itokwenak' and may be a
# typo -- confirm.
vta_14 = ['iwonen','iwakwen','iwekwen','itawonen','itakokwen','owokwen', 'owokwenak'
         ,'awoken','awokenak','awakiten','awakitenak','awoten','awotenak','ewekwen','ewekwenak',
         'akwen','awakwen','itokwen','itokwenak','ikwen','iwakwen','itkowen','itokwenak',
         'iskwen','iskwaken','itakwen','itawenak','ikokwen','ikowakwen',
         'ikowakwen','arikwen','arikwena','ikorikwen','ikorikwena']

# VTA endings for paradigm 15 (conjunctive dubitative imperfect in the legend).
vta_15 = ['iwopanen','iakopanen','iekopanen','itawopanen','itakopanen',
         'owokopanen','awokopanenak','awokipanen','awokipanenak','awokitipanen',
         'awokitipanenak','awotipanen','awotipanenak','ewekopanen','ewekopanenak',
         'ikokopanenak','ikokopanen','iskwakopanen','iskopanen','iamitokopanenak','iamitokopanen',
         'iwakopanen','ikopanen','itokokopanenak','itokokopanen']

# VTA endings for paradigms 17a and 17b. These tables are named with an extra
# underscore (vta_17_a) compared with the other verb classes (e.g. vai_17a).
vta_17_a = ['ici','icinan','icikw','imik','imakw','atan','atanak','imatan','i','ik','im','akw','akok']
vta_17_b  = ['icikan','icikanan','icikekw','imakanak','imakan','akan','akanak','imakan','akekw','akekok']


# VTI (transitive inanimate verb) endings, one list per paradigm number.
# NOTE(review): 'amopank' (vti_03) may be a typo for an '-ak' form, and
# 'ritakena' is listed twice in vti_09 -- confirm against the paradigm tables.
vti_01 = ['en','enawaw','enano','enan','am','amok','amiriw','amiriwa','w','wa','riw','riwawa']
vti_03 = ['etai','etawaw','etano','etai','etan','amopan','amopank','amiripan','amiripana','pan','pana','ripan','ripana']
vti_09 = ['enatake','enawatake','enanotake','amotake','amotakenak','amiritake',
         'amiritakena','take','takena','ritakena','ritakena']
vti_10 = ['enakopan','enawakopan','enanokopan','enanakopan','amokopan','amowakopan','amirikopan','amirikopana']
vti_11 = ['aman','amekw','amokw','amak','ak','akik','amiritci','k','ki','riki']
vti_12a = ['amane','amekwe','amiekwe','amokwe','amoikwe','amane','amake','amane','amote','amowate','amirite','amiritena']
# Paradigm 12b would reuse the 12a endings:
#vti_12b = vti_12a
vti_13 = ['amopan','amekopan','amokopan','amapan','amakipan','akipan','akwapan',
          'amirispan', 'amirispana']
vti_14 = ['amowonen','amwewokwen','amowokwen','amowaken','amokwen','amowakwen','amirikwen','amirikwena']
vti_15 = ['amowopanen','amokwekopanen','amokopanen','amowapanen','amowakipanen','amokopanen','amowkopanen','amirikopanen','amirikopanena']
vti_17a = ['a','amokw','eta']
vti_17b = ['ekan','amokan']



# VTI2 (second class of transitive inanimate verbs) endings, one list per
# paradigm number.
# Paradigms 13-17b used to be assigned to vti_13 ... vti_17b, which silently
# overwrote the VTI tables of the same names; they now carry the vti2_ prefix
# like the rest of this section.
# NOTE(review): 'rikwena' is listed twice in vti2_14 (the VAI table has
# 'rikwen', 'rikwena') -- confirm.
vti2_01 = ['n','nawaw','nano','nan','w','wok','riw','riwa','o','iw','iwa']
vti2_03 = ['tai','tawaw','tanano','tai','tan','pan','panak','ripan','ripana','pan','ipana','opan','opana']
vti2_09 = ['natake','nawatake','nanotake',
          'nanatake','take','takenak','ritake',
         'ritakena','otake','otakena','oritakena','oritakena']
vti2_10 = ['nakopan','nawakopan','nanokopan','nakopan','kopan','wakopan','rikopan','rikopana']
vti2_11 = ['in','iekw','ikw','ian','iak','tc','tcik','ritci','ki','niki','nik']
vti2_12a = ['ine','iekwe','ikwe','iane','iake','te','wate','rite','ritena']
# Paradigm 12b would reuse the 12a endings:
#vti2_12b = vti2_12a
vti2_13 = ['ipan','iekopan','ikopan','iapan','iakipan','span','waspan',
           'rispan', 'rispana']
vti2_14 = ['wonen','wewokwen','wokwen','wanen','waken','kwen','wakwen','rikwena','rikwena']
vti2_15 = ['wopanen','wekopanen','wokopanen','wapanen','wakipanen','kopanen','wakopanen','rikopanen','rikopanena']
vti2_17a = ['kw','tan']
vti2_17b = ['kan','kekw','nano']

!pip install multilingual-pdf2text

In [44]:
from multilingual_pdf2text.pdf2text import PDF2Text
# This import was split across two lines in the middle of "document_model",
# which is a syntax error; it is rejoined here.
from multilingual_pdf2text.models.document_model.document import Document
import logging
logging.basicConfig(level=logging.INFO)

# OCR the PDF and dump the extracted content to a UTF-8 text file.
# NOTE(review): the recorded run logged "Unable to get page count. Is poppler
# installed and in PATH?", so the PDF-to-image step presumably needs poppler
# to be installed -- confirm the output is not empty.
pdf_document = Document(
        document_path="eScholarship UC item 7fk44815.pdf",
        language='eng'
        )
pdf2text = PDF2Text(document=pdf_document)
content = pdf2text.extract()
with open('output_file.txt', 'w', encoding='utf-8') as f:
    f.write(f"{content}\n")


INFO:multilingual_pdf2text.doc2img.parse_document:Parsing document from pdf to image
INFO:multilingual_pdf2text.doc2img.parse_document:Unable to get page count. Is poppler installed and in PATH?
INFO:multilingual_pdf2text.ocr.image_to_text:Extracting text from images via OCR
