Skip to content

Commit

Permalink
Pandas check; fixes #879 (#880)
Browse files Browse the repository at this point in the history
* Update backoff.py

Fix lemmatizer order

* Remove pandas dependency from Indian syllabifier

* Add delimiter to csv reader

* Replace pandas indexing with lists

* Fix indexing

* Update stem tests

* Keep integers in read csv

* Fix list comprehension

* Fixes #879; replaces pandas functions with csv and list comprehension
  • Loading branch information
diyclassics authored and kylepjohnson committed Mar 2, 2019
1 parent 4b6b6be commit 47f00b5
Show file tree
Hide file tree
Showing 5 changed files with 30 additions and 18 deletions.
3 changes: 1 addition & 2 deletions .travis.yml
Expand Up @@ -15,14 +15,13 @@ before_script:
- pip install --upgrade pip
- pip install codecov
- pip install coveralls
- pip install pandas # for the Indian syllabifier
- pip install greek-accentuation # for the phonetic transcriber
- pip install fuzzywuzzy
- pip install python-Levenshtein
- pip install gensim # for word2vec.py
- pip install Sphinx
- pip install numpy
- pip install scipy
- pip install scikit-learn

script:
Expand Down
31 changes: 23 additions & 8 deletions cltk/stem/sanskrit/indian_syllabifier.py
Expand Up @@ -7,12 +7,12 @@
"""

import os
import csv

try:
import numpy as np
import pandas as pd
except ImportError:
print('"pandas" and "numpy" libraries not installed.')
print('"numpy" is not installed.')
raise

__author__ = ['Anoop Kunchukuttan']
def get_lang_data(self):
    """Load the phonetic feature tables for Indic scripts.

    Reads two CSV tables from the CLTK data directory: one covering
    all scripts and one specific to Tamil.

    :return: 5-tuple of
        all_phonetic_data -- rows of the all-script table, as lists
        tamil_phonetic_data -- rows of the Tamil table, as lists
        all_phonetic_vectors -- numpy array of the trailing 0/1 feature columns
        tamil_phonetic_vectors -- numpy array of the Tamil feature columns
        phonetic_vector_length -- number of feature columns per row
    """
    # NOTE(review): 'root' was assigned just above the visible hunk;
    # expanduser('~') matches its use in csv_dir_path below — confirm.
    root = os.path.expanduser('~')
    csv_dir_path = os.path.join(root, 'cltk_data/sanskrit/model/sanskrit_models_cltk/phonetics')

    def read_phonetic_csv(file_path):
        """Read one phonetic CSV: skip the header row and convert the
        0/1 feature cells to ints, leaving all other cells as strings."""
        with open(file_path, 'r') as infile:
            reader = csv.reader(infile, delimiter=',', quotechar='"')
            next(reader, None)  # skip the header row
            return [[int(cell) if cell in ('0', '1') else cell
                     for cell in row]
                    for row in reader]

    all_phonetic_csv = os.path.join(csv_dir_path, 'all_script_phonetic_data.csv')
    tamil_csv = os.path.join(csv_dir_path, 'tamil_script_phonetic_data.csv')

    all_phonetic_data = read_phonetic_csv(all_phonetic_csv)
    tamil_phonetic_data = read_phonetic_csv(tamil_csv)

    # The feature vector is the run of numeric columns at the end of each row.
    all_phonetic_vectors = np.array([row[PHONETIC_VECTOR_START_OFFSET:] for row in all_phonetic_data])
    tamil_phonetic_vectors = np.array([row[PHONETIC_VECTOR_START_OFFSET:] for row in tamil_phonetic_data])

    phonetic_vector_length = all_phonetic_vectors.shape[1]

    return all_phonetic_data, tamil_phonetic_data, all_phonetic_vectors, tamil_phonetic_vectors, phonetic_vector_length

@staticmethod
def in_coordinated_range_offset(c_offset):
    """Applicable to Brahmi-derived Indic scripts. Used to determine
    whether an offset is an alphabetic character or not.

    :param c_offset: integer offset of a character within its script block
    :return: True if the offset falls inside the coordinated (alphabetic) range
    """
    return COORDINATED_RANGE_START_INCLUSIVE <= c_offset <= COORDINATED_RANGE_END_INCLUSIVE
Expand Down Expand Up @@ -140,7 +154,8 @@ def get_phonetic_feature_vector(self, c, lang):

phonetic_data, phonetic_vectors = self.get_phonetic_info(lang)

if phonetic_data.ix[offset, 'Valid Vector Representation'] == 0:
# 'Valid Vector Representation' is the [5] column
if phonetic_data[offset][5] == 0:
return self.invalid_vector()

return phonetic_vectors[offset]
Expand Down
2 changes: 1 addition & 1 deletion cltk/tests/test_nlp/test_corpus.py
Expand Up @@ -122,7 +122,7 @@ def test_tlgu_convert(self):
"""Test TLGU convert. This reads the file
``tlgu_test_text_beta_code.txt``, which mimics a TLG file, and
converts it.
Note: assertEquals fails on some accented characters ('ή', 'ί').
Note: assertEqual fails on some accented characters ('ή', 'ί').
"""
in_test = os.path.abspath('cltk/tests/test_nlp/tlgu_test_text_beta_code.txt')
out_test = os.path.expanduser('~/cltk_data/tlgu_test_text_unicode.txt')
Expand Down
10 changes: 5 additions & 5 deletions cltk/tests/test_nlp/test_stem.py
Expand Up @@ -242,15 +242,15 @@ def test_akkadian_bound_form(self):
word = "awīlum"
bound_form = bound_former.get_bound_form(word, 'm')
target = "awīl"
self.assertEquals(bound_form, target)
self.assertEqual(bound_form, target)

def test_akkadian_cv_pattern(self):
    """Test the Akkadian CV-pattern method."""
    cv_patterner = AkkadianCVPattern()
    word = "iparras"
    cv_pattern = cv_patterner.get_cv_pattern(word, pprint=True)
    target = "V₁C₁V₂C₂C₂V₂C₃"
    # assertEqual, not the deprecated assertEquals alias.
    self.assertEqual(cv_pattern, target)

def test_akkadian_declension(self):
"""Test Akkadian noun declension"""
Expand All @@ -264,15 +264,15 @@ def test_akkadian_declension(self):
('iltān', {'case': 'nominative', 'number': 'dual'}),
('ilātim', {'case': 'oblique', 'number': 'plural'}),
('ilātum', {'case': 'nominative', 'number': 'plural'})]
self.assertEquals(sorted(declension), sorted(target))
self.assertEqual(sorted(declension), sorted(target))

def test_akkadian_stemmer(self):
    """Test the Akkadian stemmer."""
    stemmer = AkkadianStemmer()
    word = "šarrū"
    stem = stemmer.get_stem(word, 'm')
    target = "šarr"
    # assertEqual, not the deprecated assertEquals alias.
    self.assertEqual(stem, target)

def test_akkadian_syllabifier(self):
"""Test Akkadian syllabifier"""
Expand Down Expand Up @@ -549,7 +549,7 @@ def french_stemmer_test(self):
target = "j depart a it quant par la vil v err tut a cheval un pucel en tut le siecl n' o si bel un blanc palefre" \
" chevalcho "
self.assertEqual(stemmed_text, target)

def test_middle_english_stemmer(self):
sentence = ['the', 'speke', 'the', 'henmest', 'kyng', 'in', 'the', 'hillis', 'he', 'beholdis','he', 'lokis', 'vnder',
'his', 'hondis', 'and', 'his', 'hed', 'heldis']
Expand Down
2 changes: 0 additions & 2 deletions requirements.txt
Expand Up @@ -84,6 +84,4 @@ nose==1.3.7
scikit-learn==0.19.2
fuzzywuzzy==0.17.0
python-Levenshtein==0.12.0
pandas==0.23.4
greek-accentuation==1.2.0

0 comments on commit 47f00b5

Please sign in to comment.