In [1]:
# Display the value of every top-level expression in a cell, not just the last
# one. Several cells below rely on this to show more than one result per cell.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

**Eric Meinhardt / emeinhardt@ucsd.edu**

In [2]:
from funcy import *
from functools import reduce

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Motivation" data-toc-modified-id="Motivation-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Motivation</a></span></li><li><span><a href="#Load-data" data-toc-modified-id="Load-data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Load data</a></span></li><li><span><a href="#Check-well-formedness" data-toc-modified-id="Check-well-formedness-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Check well-formedness</a></span></li><li><span><a href="#Preprocess-and-transform-into-ndarray" data-toc-modified-id="Preprocess-and-transform-into-ndarray-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Preprocess and transform into <code>ndarray</code></a></span></li><li><span><a href="#Export" data-toc-modified-id="Export-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Export</a></span></li></ul></div>

# Motivation

This notebook illustrates usage of the `prague` package, focusing exclusively on loading and preparing data. We will:
 - look at some example input data (specifying a phonological feature system)
 - turn that data into a ternary matrix (NumPy ndarray)
 - export our transformed data and metadata.

# Load data

In [3]:
# Move up to the parent directory so that the relative `data/...` paths used
# in the cells below resolve.
# NOTE: this is not idempotent -- re-running the cell moves up one more level.
%cd ..

/mnt/cube/home/AD/emeinhar/prague


In [4]:
import numpy as np

In [5]:
import prague

In [6]:
%ls data

bakovic_chart_riggle_hayes_remapped.tsv  brh.npy             hayes_remapped.tsv
bakovic_chart_riggle_hayes.tsv           hayes_features.txt  hayes.tsv
brh_features.txt                         hayes.npy


In [7]:
print(prague.convert.__doc__)


Module to take
 - a tab-separated value file specifying a binary or ternary feature matrix.
   - the file is assumed to have a header row indicating feature labels
   - each non-header row represents an object's feature vector
   - features are assumed to be {`+`,`-`,`0`} (following phonological
     convention) by default
 - a (potentially -- comma-separated -- list of) column name(s) indicating any
   columns not containing feature-value information (e.g. an object's label =
  IPA symbol in the case of phonological feature matrices).

and write
 - the ordered list of feature names (reflecting the ordering in the input
   `.tsv`) to a `.txt` file
 - a serialized numpy ndarray `.npy` file representing the unique feature
   vectors of the `.tsv` file as a matrix, with each row corresponding to
   an object and feature values represented as `1`, `-1`, or `0`
 - a tab-separated-value file with `+` replaced with `1` and `-` replaced with
   `-1`.

Note that if your feature matrix contains

In [8]:
# %cat data/hayes.tsv | head -n 5
%cat data/bakovic_chart_riggle_hayes.tsv | head -n 5

symbol	syll	cons	son	labial	coronal	dorsal	voice	cont	del. rel.	ant	dist	strid	high	low	back	front	approx	nas	lat	s.g.	c.g.	round	ATR
p	-	+	-	+	-	-	-	-	-	0	0	0	0	0	0	0	-	-	-	-	-	0	0
b	-	+	-	+	-	-	+	-	-	0	0	0	0	0	0	0	-	-	-	-	-	0	0
ɸ	-	+	-	+	-	-	-	+	+	0	+	-	0	0	0	0	-	-	-	-	-	0	0
β	-	+	-	+	-	-	+	+	+	0	+	-	0	0	0	0	-	-	-	-	-	0	0


In [9]:
# Paths and settings used by the rest of the notebook. The commented-out
# `hayes` lines are the equivalent settings for the other dataset in `data/`.

# Tab-separated feature chart to load.
# my_input_filepath = 'data/hayes.tsv'
my_input_filepath = 'data/bakovic_chart_riggle_hayes.tsv'

# Keys that do not hold feature values (here, the object's label); they are
# ignored when checking for duplicates and dropped during preprocessing.
my_columns_to_remove = ('symbol',)

# Destination of the exported feature matrix (`.npy`).
# my_output_filepath = 'data/hayes.npy'
my_output_filepath = 'data/brh.npy'

# Destination of the exported, ordered list of feature names.
# my_features_list_output_filepath = 'data/hayes_features.txt'
my_features_list_output_filepath = 'data/brh_features.txt'

In [10]:
my_objects = prague.convert.load_objects(my_input_filepath)
my_objects[:10]

[OrderedDict([('symbol', 'p'),
              ('syll', '-'),
              ('cons', '+'),
              ('son', '-'),
              ('labial', '+'),
              ('coronal', '-'),
              ('dorsal', '-'),
              ('voice', '-'),
              ('cont', '-'),
              ('del. rel.', '-'),
              ('ant', '0'),
              ('dist', '0'),
              ('strid', '0'),
              ('high', '0'),
              ('low', '0'),
              ('back', '0'),
              ('front', '0'),
              ('approx', '-'),
              ('nas', '-'),
              ('lat', '-'),
              ('s.g.', '-'),
              ('c.g.', '-'),
              ('round', '0'),
              ('ATR', '0')]),
 OrderedDict([('symbol', 'b'),
              ('syll', '-'),
              ('cons', '+'),
              ('son', '-'),
              ('labial', '+'),
              ('coronal', '-'),
              ('dorsal', '-'),
              ('voice', '+'),
              ('cont', '-'),
              ('de

In [11]:
len(my_objects)

94

# Check well-formedness

 - We want to make sure every object is defined on the same set of features.
 - We at least want a heads-up about duplicate objects.

In [12]:
print(prague.convert.have_universal_feature_definitions.__doc__)


    Returns True iff all objects are defined for the same set of features.

    If behavior is 'Exception', then this function will raise an exception
    if this property does not hold of the set of objects.
    


In [13]:
prague.convert.have_universal_feature_definitions(my_objects, 
                                                  behavior='Exception')

True

In [14]:
print(prague.convert.objects_are_unique.__doc__)


    Returns True iff all objects are unique (excluding features in
    `features_to_ignore`).

    If behavior is 'Exception', then this function will raise an exception if
    this property does not hold of the set of objects.
    


In [15]:
prague.convert.objects_are_unique(my_objects,
                                  features_to_ignore=None,
                                  behavior='Exception')

True

In [16]:
# Total number of loaded objects, for comparison with the count below.
len(my_objects)

# Collect the objects that `has_duplicates` flags when it is called without
# any keys to ignore.
objects_with_duplicates = [
    obj
    for obj in my_objects
    if prague.convert.has_duplicates(obj, my_objects)
]
len(objects_with_duplicates)

94

0

The `symbol` key or feature is currently responsible for maintaining uniqueness in our demo data:

In [17]:
# Keys passed to `has_duplicates` below.
my_columns_to_remove

len(my_objects)

# Collect the objects that `has_duplicates` flags when `my_columns_to_remove`
# is passed as its third argument.
objects_with_duplicates = [
    obj
    for obj in my_objects
    if prague.convert.has_duplicates(obj, my_objects, my_columns_to_remove)
]
len(objects_with_duplicates)

('symbol',)

94

6

A subsequent processing step will (optionally) remove these duplicates, keeping a single copy of each repeated feature vector.

We can shorten both of these checks (for uniqueness at a certain level and universality of feature definition) with one call to:

In [18]:
print(prague.convert.sanitized_objects.__doc__)


    Given a collection of objects (dicts),
     - checks that all objects are defined for the same set of features. If
       they're not, this function will raise an exception.

    If duplicate_behavior is `Exception`, this will also check if there are
    any duplicate objects (with equality up to `features_to_ignore`) and raises
    an exception if so.

    Returns True if no exceptions are raised.
    


In [19]:
prague.convert.sanitized_objects(my_objects, duplicate_behavior='Exception',
                                 features_to_ignore=None)

True

# Preprocess and transform into `ndarray`

In [20]:
print(prague.convert.preprocess_objects.__doc__)


    Given a sanitized collection of objects (dicts) and keys to be removed
    (e.g. symbol columns), this function creates a copy of objects, and then
     - remaps values to integers
     - removes the keys in keys_to_remove

    and returns the resulting collection of feature vectors.
    


In [21]:
my_columns_to_remove
my_preprocessed_objects = prague.convert.preprocess_objects(my_objects, 
                                                            keys_to_remove=my_columns_to_remove)
my_preprocessed_objects[:5]

('symbol',)

[OrderedDict([('syll', -1),
              ('cons', 1),
              ('son', -1),
              ('labial', 1),
              ('coronal', -1),
              ('dorsal', -1),
              ('voice', -1),
              ('cont', -1),
              ('del. rel.', -1),
              ('ant', 0),
              ('dist', 0),
              ('strid', 0),
              ('high', 0),
              ('low', 0),
              ('back', 0),
              ('front', 0),
              ('approx', -1),
              ('nas', -1),
              ('lat', -1),
              ('s.g.', -1),
              ('c.g.', -1),
              ('round', 0),
              ('ATR', 0)]),
 OrderedDict([('syll', -1),
              ('cons', 1),
              ('son', -1),
              ('labial', 1),
              ('coronal', -1),
              ('dorsal', -1),
              ('voice', 1),
              ('cont', -1),
              ('del. rel.', -1),
              ('ant', 0),
              ('dist', 0),
              ('strid', 0),
           

In [22]:
print(prague.convert.to_ternary_feature_vectors.__doc__)


    Given a preprocessed collection of N objects (dicts) and an optional
    ordering of the M features of the objects, this returns an M x N ternary
    NumPy ndarray representing the collection.

    If feature_ordering is not specified, then the features of the first object
    will be sorted and used.

    If remove_duplicates is False (or if objects contains no duplicates), this
    will preserve the ordering (if any) in objects.

    If remove_duplicates is True, this function will return a tuple where the
    second value is a list of deleted indices.
    


In [23]:
# Fix the column order of the matrix: the first object's feature names, sorted.
# (Iterating over a dict yields its keys.)
my_feature_ordering = tuple(sorted(my_preprocessed_objects[0]))
my_feature_ordering

('ATR',
 'ant',
 'approx',
 'back',
 'c.g.',
 'cons',
 'cont',
 'coronal',
 'del. rel.',
 'dist',
 'dorsal',
 'front',
 'high',
 'labial',
 'lat',
 'low',
 'nas',
 'round',
 's.g.',
 'son',
 'strid',
 'syll',
 'voice')

In [24]:
# Build the matrix with duplicate rows removed; with `remove_duplicates=True`
# the call also returns the indices of the rows that were deleted.
my_objects_np_no_dups, removed_indices = prague.convert.to_ternary_feature_vectors(
    my_preprocessed_objects,
    remove_duplicates=True,
    feature_ordering=my_feature_ordering,
)

# Inspect the first rows, the deleted indices, and the resulting shape/dtype.
my_objects_np_no_dups[:5]
print(removed_indices)
my_objects_np_no_dups.shape
my_objects_np_no_dups.dtype

array([[ 0,  0, -1,  0, -1,  1, -1, -1, -1,  0, -1,  0,  0,  1, -1,  0,
        -1,  0, -1, -1,  0, -1, -1],
       [ 0,  0, -1,  0, -1,  1, -1, -1, -1,  0, -1,  0,  0,  1, -1,  0,
        -1,  0, -1, -1,  0, -1,  1],
       [ 0,  0, -1,  0, -1,  1,  1, -1,  1,  1, -1,  0,  0,  1, -1,  0,
        -1,  0, -1, -1, -1, -1, -1],
       [ 0,  0, -1,  0, -1,  1,  1, -1,  1,  1, -1,  0,  0,  1, -1,  0,
        -1,  0, -1, -1, -1, -1,  1],
       [ 0,  0, -1,  0, -1,  1,  1, -1,  1, -1, -1,  0,  0,  1, -1,  0,
        -1,  0, -1, -1,  1, -1, -1]], dtype=int8)

[40, 65, 84]


(91, 23)

dtype('int8')

To keep track of the mapping from symbols to ternary feature vectors, we'll skip de-duplication for now:

In [25]:
# Build the matrix without removing duplicates, so that row i still
# corresponds to `my_objects[i]`.
my_objects_np = prague.convert.to_ternary_feature_vectors(
    my_preprocessed_objects,
    remove_duplicates=False,
    feature_ordering=my_feature_ordering,
)

my_objects_np.shape
my_objects_np.dtype

(94, 23)

dtype('int8')

As you can see, ordering of objects has been preserved across transformations here:

In [26]:
my_objects[0:2]

[OrderedDict([('symbol', 'p'),
              ('syll', '-'),
              ('cons', '+'),
              ('son', '-'),
              ('labial', '+'),
              ('coronal', '-'),
              ('dorsal', '-'),
              ('voice', '-'),
              ('cont', '-'),
              ('del. rel.', '-'),
              ('ant', '0'),
              ('dist', '0'),
              ('strid', '0'),
              ('high', '0'),
              ('low', '0'),
              ('back', '0'),
              ('front', '0'),
              ('approx', '-'),
              ('nas', '-'),
              ('lat', '-'),
              ('s.g.', '-'),
              ('c.g.', '-'),
              ('round', '0'),
              ('ATR', '0')]),
 OrderedDict([('symbol', 'b'),
              ('syll', '-'),
              ('cons', '+'),
              ('son', '-'),
              ('labial', '+'),
              ('coronal', '-'),
              ('dorsal', '-'),
              ('voice', '+'),
              ('cont', '-'),
              ('de

In [27]:
my_preprocessed_objects[0:2]

[OrderedDict([('syll', -1),
              ('cons', 1),
              ('son', -1),
              ('labial', 1),
              ('coronal', -1),
              ('dorsal', -1),
              ('voice', -1),
              ('cont', -1),
              ('del. rel.', -1),
              ('ant', 0),
              ('dist', 0),
              ('strid', 0),
              ('high', 0),
              ('low', 0),
              ('back', 0),
              ('front', 0),
              ('approx', -1),
              ('nas', -1),
              ('lat', -1),
              ('s.g.', -1),
              ('c.g.', -1),
              ('round', 0),
              ('ATR', 0)]),
 OrderedDict([('syll', -1),
              ('cons', 1),
              ('son', -1),
              ('labial', 1),
              ('coronal', -1),
              ('dorsal', -1),
              ('voice', 1),
              ('cont', -1),
              ('del. rel.', -1),
              ('ant', 0),
              ('dist', 0),
              ('strid', 0),
           

In [28]:
my_objects_np[0:2].T

array([[ 0,  0],
       [ 0,  0],
       [-1, -1],
       [ 0,  0],
       [-1, -1],
       [ 1,  1],
       [-1, -1],
       [-1, -1],
       [-1, -1],
       [ 0,  0],
       [-1, -1],
       [ 0,  0],
       [ 0,  0],
       [ 1,  1],
       [-1, -1],
       [ 0,  0],
       [-1, -1],
       [ 0,  0],
       [-1, -1],
       [-1, -1],
       [ 0,  0],
       [-1, -1],
       [-1,  1]], dtype=int8)

In [29]:
len(my_objects)

# Each object's label, in the same order as `my_objects`.
my_symbols = [obj['symbol'] for obj in my_objects]
my_symbols[:5]

94

['p', 'b', 'ɸ', 'β', 'f']

In [30]:
# Pair each symbol with the matrix row at the same position.
symbol_to_ternary_feature_vector_map = {
    symbol: feature_vector
    for symbol, feature_vector in zip(my_symbols, my_objects_np)
}

In [31]:
# Show the row stored for 'i' next to the original (string-valued) object it
# came from, so the two encodings can be compared by eye.
print(symbol_to_ternary_feature_vector_map['i'])
list(filter(lambda obj: obj['symbol'] == 'i', my_objects))[0]

[ 1  0  1 -1 -1 -1  1 -1  0  0  1  1  1 -1 -1 -1 -1 -1 -1  1  0  1  1]


OrderedDict([('symbol', 'i'),
             ('syll', '+'),
             ('cons', '-'),
             ('son', '+'),
             ('labial', '-'),
             ('coronal', '-'),
             ('dorsal', '+'),
             ('voice', '+'),
             ('cont', '+'),
             ('del. rel.', '0'),
             ('ant', '0'),
             ('dist', '0'),
             ('strid', '0'),
             ('high', '+'),
             ('low', '-'),
             ('back', '-'),
             ('front', '+'),
             ('approx', '+'),
             ('nas', '-'),
             ('lat', '-'),
             ('s.g.', '-'),
             ('c.g.', '-'),
             ('round', '-'),
             ('ATR', '+')])

To remove duplicates, we can make use of the relevant NumPy functionality:

In [32]:
my_objects_np.shape

(94, 23)

In [33]:
# `np.unique(..., axis=0)` returns the unique rows in sorted order, which
# would discard the row ordering maintained up to this point. Instead, ask for
# the index of each unique row's first occurrence and select those rows in
# ascending index order, so the surviving rows keep their original relative
# order. The number of unique rows (and hence the shape) is unaffected.
first_occurrence_indices = np.unique(my_objects_np, axis=0, return_index=True)[1]
my_unique_objects_np = my_objects_np[np.sort(first_occurrence_indices)]
my_unique_objects_np.shape

(91, 23)

# Export

In [34]:
print(prague.convert.export_ternary_feature_vectors.__doc__)


    Writes the object matrix and the sequence of feature labels to the specified filepaths.

    The object matrix is saved using `np.save` (i.e. as .npy file) and the feature labels
    are written to a textfile.
    


In [35]:
my_output_filepath
my_features_list_output_filepath

'data/brh.npy'

'data/brh_features.txt'

In [36]:
# Write the feature matrix and the ordered feature names to the two output
# paths configured earlier in the notebook.
# NOTE: the matrix exported here is `my_objects_np`, which was built with
# `remove_duplicates=False` and therefore still contains duplicate rows.
prague.convert.export_ternary_feature_vectors(my_objects_np,
                                              my_feature_ordering,
                                              my_output_filepath,
                                              my_features_list_output_filepath)

In [37]:
%ls data

bakovic_chart_riggle_hayes_remapped.tsv  brh.npy             hayes_remapped.tsv
bakovic_chart_riggle_hayes.tsv           hayes_features.txt  hayes.tsv
brh_features.txt                         hayes.npy
