diff --git a/.travis.yml b/.travis.yml
index 62f6677db..aa174825b 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -15,14 +15,13 @@ before_script:
 - pip install --upgrade pip
 - pip install codecov
 - pip install coveralls
-- pip install pandas # for the Indian syllabifier
 - pip install greek-accentuation # for the phonetic transcriber
 - pip install fuzzywuzzy
 - pip install python-Levenshtein
 - pip install gensim # for word2vec.py
 - pip install Sphinx
 - pip install numpy
-- pip install scipy 
+- pip install scipy
 - pip install scikit-learn

 script:
diff --git a/cltk/stem/sanskrit/indian_syllabifier.py b/cltk/stem/sanskrit/indian_syllabifier.py
index b3338e3a5..7e7de417e 100644
--- a/cltk/stem/sanskrit/indian_syllabifier.py
+++ b/cltk/stem/sanskrit/indian_syllabifier.py
@@ -7,12 +7,12 @@
 """

 import os
+import csv

 try:
     import numpy as np
-    import pandas as pd
 except ImportError:
-    print('"pandas" and "numpy" libraries not installed.')
+    print('"numpy" is not installed.')
     raise

 __author__ = ['Anoop Kunchukuttan']
@@ -93,12 +93,26 @@ def get_lang_data(self):
         csv_dir_path = os.path.join(root, 'cltk_data/sanskrit/model/sanskrit_models_cltk/phonetics')

         all_phonetic_csv = os.path.join(csv_dir_path, 'all_script_phonetic_data.csv')
-        all_phonetic_data = pd.read_csv(all_phonetic_csv, encoding='utf-8')
         tamil_csv = os.path.join(csv_dir_path, 'tamil_script_phonetic_data.csv')
-        tamil_phonetic_data = pd.read_csv(tamil_csv, encoding='utf-8')

-        all_phonetic_vectors = all_phonetic_data.ix[:, PHONETIC_VECTOR_START_OFFSET:].values
-        tamil_phonetic_vectors = tamil_phonetic_data.ix[:, PHONETIC_VECTOR_START_OFFSET:].values
+        # TODO: refactor the two blocks below into a helper function
+        with open(all_phonetic_csv, 'r', encoding='utf-8') as f:
+            reader = csv.reader(f, delimiter=',', quotechar='"')
+            next(reader, None)  # skip the header row
+            all_phonetic_data = [row for row in reader]
+
+        with open(tamil_csv, 'r', encoding='utf-8') as f:
+            reader = csv.reader(f, delimiter=',', quotechar='"')
+            next(reader, None)  # skip the header row
+            # tamil_phonetic_data = [row[PHONETIC_VECTOR_START_OFFSET:] for row in reader]
+            tamil_phonetic_data = [row for row in reader]
+
+        # TODO: handle this 0/1-to-int coercion more robustly
+        all_phonetic_data = [[int(cell) if cell == '0' or cell == '1' else cell for cell in row] for row in all_phonetic_data]
+        tamil_phonetic_data = [[int(cell) if cell == '0' or cell == '1' else cell for cell in row] for row in tamil_phonetic_data]
+
+        all_phonetic_vectors = np.array([row[PHONETIC_VECTOR_START_OFFSET:] for row in all_phonetic_data])
+        tamil_phonetic_vectors = np.array([row[PHONETIC_VECTOR_START_OFFSET:] for row in tamil_phonetic_data])

         phonetic_vector_length = all_phonetic_vectors.shape[1]

@@ -106,7 +120,7 @@ def get_lang_data(self):

     @staticmethod
     def in_coordinated_range_offset(c_offset):
-        """Applicable to Brahmi derived Indic scripts. Used to determine 
+        """Applicable to Brahmi derived Indic scripts. Used to determine
         whether offset is of a alphabetic character or not.
""" return COORDINATED_RANGE_START_INCLUSIVE <= c_offset <= COORDINATED_RANGE_END_INCLUSIVE @@ -140,7 +154,8 @@ def get_phonetic_feature_vector(self, c, lang): phonetic_data, phonetic_vectors = self.get_phonetic_info(lang) - if phonetic_data.ix[offset, 'Valid Vector Representation'] == 0: + # 'Valid Vector Representation' is the [5] column + if phonetic_data[offset][5] == 0: return self.invalid_vector() return phonetic_vectors[offset] diff --git a/cltk/tests/test_nlp/test_corpus.py b/cltk/tests/test_nlp/test_corpus.py index 4b64e91b5..02b17e740 100644 --- a/cltk/tests/test_nlp/test_corpus.py +++ b/cltk/tests/test_nlp/test_corpus.py @@ -122,7 +122,7 @@ def test_tlgu_convert(self): """Test TLGU convert. This reads the file ``tlgu_test_text_beta_code.txt``, which mimics a TLG file, and converts it. - Note: assertEquals fails on some accented characters ('ή', 'ί'). + Note: assertEqual fails on some accented characters ('ή', 'ί'). """ in_test = os.path.abspath('cltk/tests/test_nlp/tlgu_test_text_beta_code.txt') out_test = os.path.expanduser('~/cltk_data/tlgu_test_text_unicode.txt') diff --git a/cltk/tests/test_nlp/test_stem.py b/cltk/tests/test_nlp/test_stem.py index 68adae073..82db6e0f2 100644 --- a/cltk/tests/test_nlp/test_stem.py +++ b/cltk/tests/test_nlp/test_stem.py @@ -242,7 +242,7 @@ def test_akkadian_bound_form(self): word = "awīlum" bound_form = bound_former.get_bound_form(word, 'm') target = "awīl" - self.assertEquals(bound_form, target) + self.assertEqual(bound_form, target) def test_akkadian_cv_pattern(self): """Test Akkadian CV pattern method""" @@ -250,7 +250,7 @@ def test_akkadian_cv_pattern(self): word = "iparras" cv_pattern = cv_patterner.get_cv_pattern(word, pprint=True) target = "V₁C₁V₂C₂C₂V₂C₃" - self.assertEquals(cv_pattern, target) + self.assertEqual(cv_pattern, target) def test_akkadian_declension(self): """Test Akkadian noun declension""" @@ -264,7 +264,7 @@ def test_akkadian_declension(self): ('iltān', {'case': 'nominative', 'number': 'dual'}), ('ilātim', {'case': 'oblique', 'number': 'plural'}), ('ilātum', {'case': 'nominative', 'number': 'plural'})] - self.assertEquals(sorted(declension), sorted(target)) + self.assertEqual(sorted(declension), sorted(target)) def test_akkadian_stemmer(self): """Test Akkadian stemmer""" @@ -272,7 +272,7 @@ def test_akkadian_stemmer(self): word = "šarrū" stem = stemmer.get_stem(word, 'm') target = "šarr" - self.assertEquals(stem, target) + self.assertEqual(stem, target) def test_akkadian_syllabifier(self): """Test Akkadian syllabifier""" @@ -549,7 +549,7 @@ def french_stemmer_test(self): target = "j depart a it quant par la vil v err tut a cheval un pucel en tut le siecl n' o si bel un blanc palefre" \ " chevalcho " self.assertEqual(stemmed_text, target) - + def test_middle_english_stemmer(self): sentence = ['the', 'speke', 'the', 'henmest', 'kyng', 'in', 'the', 'hillis', 'he', 'beholdis','he', 'lokis', 'vnder', 'his', 'hondis', 'and', 'his', 'hed', 'heldis'] diff --git a/requirements.txt b/requirements.txt index b0e08c734..49198f570 100644 --- a/requirements.txt +++ b/requirements.txt @@ -84,6 +84,4 @@ nose==1.3.7 scikit-learn==0.19.2 fuzzywuzzy==0.17.0 python-Levenshtein==0.12.0 -pandas==0.23.4 greek-accentuation==1.2.0 -