Skip to content

Commit

Permalink
Pandas check; fixes #879 (#880)
Browse files Browse the repository at this point in the history
* Update backoff.py

Fix lemmatizer order

* Remove pandas dependency from Indian syllabifier

* Add delimiter to csv reader

* Replace pandas indexing with lists

* Fix indexing

* Update stem tests

* Keep integers in read csv

* Fix list comprehension

* Fixes #879; replaces pandas functions with csv and list comprehension
  • Loading branch information
diyclassics authored and kylepjohnson committed Mar 2, 2019
1 parent 4b6b6be commit 47f00b5
Show file tree
Hide file tree
Showing 5 changed files with 30 additions and 18 deletions.
3 changes: 1 addition & 2 deletions .travis.yml
Expand Up @@ -15,14 +15,13 @@ before_script:
- pip install --upgrade pip
- pip install codecov
- pip install coveralls
- pip install pandas # for the Indian syllabifier
- pip install greek-accentuation # for the phonetic transcriber
- pip install fuzzywuzzy
- pip install python-Levenshtein
- pip install gensim # for word2vec.py
- pip install Sphinx
- pip install numpy
- pip install scipy
- pip install scikit-learn

script:
Expand Down
31 changes: 23 additions & 8 deletions cltk/stem/sanskrit/indian_syllabifier.py
Expand Up @@ -7,12 +7,12 @@
"""

import os
import csv

try:
import numpy as np
import pandas as pd
except ImportError:
print('"pandas" and "numpy" libraries not installed.')
print('"numpy" is not installed.')
raise

__author__ = ['Anoop Kunchukuttan']
def get_lang_data(self):
    """Load the phonetic feature tables for Indic scripts.

    Reads two CSV tables from the CLTK data directory: one covering
    all scripts and one specific to Tamil.

    :return: 5-tuple of
        all_phonetic_data -- rows of the all-script table, as lists
        tamil_phonetic_data -- rows of the Tamil table, as lists
        all_phonetic_vectors -- numpy array of the trailing 0/1 feature columns
        tamil_phonetic_vectors -- numpy array of the Tamil feature columns
        phonetic_vector_length -- number of feature columns per row
    """
    # NOTE(review): 'root' was assigned just above the visible hunk;
    # expanduser('~') matches its use in csv_dir_path below — confirm.
    root = os.path.expanduser('~')
    csv_dir_path = os.path.join(root, 'cltk_data/sanskrit/model/sanskrit_models_cltk/phonetics')

    def read_phonetic_csv(file_path):
        """Read one phonetic CSV: skip the header row and convert the
        0/1 feature cells to ints, leaving all other cells as strings."""
        with open(file_path, 'r') as infile:
            reader = csv.reader(infile, delimiter=',', quotechar='"')
            next(reader, None)  # skip the header row
            return [[int(cell) if cell in ('0', '1') else cell
                     for cell in row]
                    for row in reader]

    all_phonetic_csv = os.path.join(csv_dir_path, 'all_script_phonetic_data.csv')
    tamil_csv = os.path.join(csv_dir_path, 'tamil_script_phonetic_data.csv')

    all_phonetic_data = read_phonetic_csv(all_phonetic_csv)
    tamil_phonetic_data = read_phonetic_csv(tamil_csv)

    # The feature vector is the run of numeric columns at the end of each row.
    all_phonetic_vectors = np.array([row[PHONETIC_VECTOR_START_OFFSET:] for row in all_phonetic_data])
    tamil_phonetic_vectors = np.array([row[PHONETIC_VECTOR_START_OFFSET:] for row in tamil_phonetic_data])

    phonetic_vector_length = all_phonetic_vectors.shape[1]

    return all_phonetic_data, tamil_phonetic_data, all_phonetic_vectors, tamil_phonetic_vectors, phonetic_vector_length

@staticmethod
def in_coordinated_range_offset(c_offset):
    """Applicable to Brahmi-derived Indic scripts. Used to determine
    whether an offset is an alphabetic character or not.

    :param c_offset: integer offset of a character within its script block
    :return: True if the offset falls inside the coordinated (alphabetic) range
    """
    return COORDINATED_RANGE_START_INCLUSIVE <= c_offset <= COORDINATED_RANGE_END_INCLUSIVE
Expand Down Expand Up @@ -140,7 +154,8 @@ def get_phonetic_feature_vector(self, c, lang):

phonetic_data, phonetic_vectors = self.get_phonetic_info(lang)

if phonetic_data.ix[offset, 'Valid Vector Representation'] == 0:
# 'Valid Vector Representation' is the [5] column
if phonetic_data[offset][5] == 0:
return self.invalid_vector()

return phonetic_vectors[offset]
Expand Down
2 changes: 1 addition & 1 deletion cltk/tests/test_nlp/test_corpus.py
Expand Up @@ -122,7 +122,7 @@ def test_tlgu_convert(self):
"""Test TLGU convert. This reads the file
``tlgu_test_text_beta_code.txt``, which mimics a TLG file, and
converts it.
Note: assertEquals fails on some accented characters ('ή', 'ί').
Note: assertEqual fails on some accented characters ('ή', 'ί').
"""
in_test = os.path.abspath('cltk/tests/test_nlp/tlgu_test_text_beta_code.txt')
out_test = os.path.expanduser('~/cltk_data/tlgu_test_text_unicode.txt')
Expand Down
10 changes: 5 additions & 5 deletions cltk/tests/test_nlp/test_stem.py
Expand Up @@ -242,15 +242,15 @@ def test_akkadian_bound_form(self):
word = "awīlum"
bound_form = bound_former.get_bound_form(word, 'm')
target = "awīl"
self.assertEquals(bound_form, target)
self.assertEqual(bound_form, target)

def test_akkadian_cv_pattern(self):
    """Test the Akkadian CV-pattern method."""
    cv_patterner = AkkadianCVPattern()
    word = "iparras"
    cv_pattern = cv_patterner.get_cv_pattern(word, pprint=True)
    target = "V₁C₁V₂C₂C₂V₂C₃"
    # assertEqual, not the deprecated assertEquals alias.
    self.assertEqual(cv_pattern, target)

def test_akkadian_declension(self):
"""Test Akkadian noun declension"""
Expand All @@ -264,15 +264,15 @@ def test_akkadian_declension(self):
('iltān', {'case': 'nominative', 'number': 'dual'}),
('ilātim', {'case': 'oblique', 'number': 'plural'}),
('ilātum', {'case': 'nominative', 'number': 'plural'})]
self.assertEquals(sorted(declension), sorted(target))
self.assertEqual(sorted(declension), sorted(target))

def test_akkadian_stemmer(self):
    """Test the Akkadian stemmer."""
    stemmer = AkkadianStemmer()
    word = "šarrū"
    stem = stemmer.get_stem(word, 'm')
    target = "šarr"
    # assertEqual, not the deprecated assertEquals alias.
    self.assertEqual(stem, target)

def test_akkadian_syllabifier(self):
"""Test Akkadian syllabifier"""
Expand Down Expand Up @@ -549,7 +549,7 @@ def french_stemmer_test(self):
target = "j depart a it quant par la vil v err tut a cheval un pucel en tut le siecl n' o si bel un blanc palefre" \
" chevalcho "
self.assertEqual(stemmed_text, target)

def test_middle_english_stemmer(self):
sentence = ['the', 'speke', 'the', 'henmest', 'kyng', 'in', 'the', 'hillis', 'he', 'beholdis','he', 'lokis', 'vnder',
'his', 'hondis', 'and', 'his', 'hed', 'heldis']
Expand Down
2 changes: 0 additions & 2 deletions requirements.txt
Expand Up @@ -84,6 +84,4 @@ nose==1.3.7
scikit-learn==0.19.2
fuzzywuzzy==0.17.0
python-Levenshtein==0.12.0
pandas==0.23.4
greek-accentuation==1.2.0

0 comments on commit 47f00b5

Please sign in to comment.