In [2]:
# Display the value of *every* top-level expression in a cell, not just the
# last one. Later cells rely on this to show several results at once
# (e.g. a length followed by a sample of the data).
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

**Eric Meinhardt / emeinhardt@ucsd.edu**

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Motivation" data-toc-modified-id="Motivation-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Motivation</a></span></li><li><span><a href="#Load-data" data-toc-modified-id="Load-data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Load data</a></span></li><li><span><a href="#Identifying-a-possible-Boolean-concept" data-toc-modified-id="Identifying-a-possible-Boolean-concept-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Identifying a possible Boolean concept</a></span></li></ul></div>

# Motivation

In this notebook we show how `prague` can be used to identify feature vectors compatible with (or that exactly pick out) an observed set of objects, using phonology as a motivating example. Some implementation details potentially relevant for usage are also discussed.

# Load data

In [3]:
# Move up one directory so that the relative 'data/...' paths used below resolve
# (presumably to the repository root — confirm against where the notebook lives).
# NOTE(review): not idempotent — re-running this cell moves up one more level.
%cd ..

/mnt/cube/home/AD/emeinhar/prague


In [4]:
from funcy import *
from functools import reduce

In [5]:
import numpy as np

In [6]:
import prague

In [7]:
# List the files available in data/ (informational only).
%ls data

bakovic_chart_riggle_hayes_remapped.tsv  brh.npy             hayes_remapped.tsv
bakovic_chart_riggle_hayes.tsv           hayes_features.txt  hayes.tsv
brh_features.txt                         hayes.npy


In [8]:
# Input files, relative to the current working directory.
# Objects in TSV form; passed to prague.load_objects below.
objects_in_fp = 'data/hayes.tsv'
# The NumPy array counterpart; loaded with np.load below.
objects_np_in_fp = 'data/hayes.npy'
# Feature names; read line by line below.
feature_list_fp = 'data/hayes_features.txt'

In [9]:
# Load the objects; as the displayed sample shows, each one is a mapping with a
# 'symbol' entry plus one '+', '-' or '0' value per feature.
objects = prague.load_objects(objects_in_fp)
len(objects)
objects[:3]

# The symbol of every object, in the same order as `objects`.
symbols = [o['symbol'] for o in objects]
symbols[:10]

345

[OrderedDict([('symbol', 'n̩'),
              ('anterior', '+'),
              ('approximant', '-'),
              ('back', '0'),
              ('consonantal', '+'),
              ('constricted glottis', '-'),
              ('continuant', '-'),
              ('coronal', '+'),
              ('delayed_release', '0'),
              ('diphthong', '0'),
              ('distributed', '-'),
              ('dorsal', '-'),
              ('front', '0'),
              ('front-diphthong', '0'),
              ('high', '0'),
              ('labial', '-'),
              ('labiodental', '-'),
              ('lateral', '-'),
              ('long', '-'),
              ('low', '0'),
              ('nasal', '+'),
              ('round', '-'),
              ('segment', '+'),
              ('sonorant', '+'),
              ('spread glottis', '-'),
              ('stress', '-'),
              ('strident', '-'),
              ('syllabic', '+'),
              ('tap', '-'),
              ('tense', '0'),
        

['n̩', 'k͡p', 'i', 'ɡ͡b', 'ʑ', 'ʕ', 'ã', 'ɣ', 'ɶ̃', 'ɵː']

In [10]:
# Number of entries per object (the 'symbol' entry plus the features).
len(objects[0])

32

In [11]:
# Read the feature names, one per line.
# Iterating over a text file yields each line *including* its trailing newline,
# so strip it; otherwise every stored name would end in '\n' and could never
# match the feature keys of `objects`.
with open(feature_list_fp, 'r') as feature_file:
    feature_list = [line.rstrip('\n') for line in feature_file]
len(feature_list)

31

In [12]:
# Each object carries one entry per feature plus the non-feature 'symbol' entry, hence the -1.
assert len(feature_list) == len(objects[0].keys()) - 1

In [13]:
# Load the array stored alongside the TSV and inspect its shape and dtype;
# the assertions that follow check that it lines up with `objects` and `feature_list`.
objects_np = np.load(objects_np_in_fp)
objects_np.shape
objects_np.dtype

(345, 31)

dtype('int8')

In [14]:
# Sanity check: one column per feature name.
assert objects_np.shape[1] == len(feature_list)

In [15]:
# Sanity check: one row per object (later code pairs objects[i] with objects_np[i]).
assert objects_np.shape[0] == len(objects)

In [16]:
# Symbolic ('+', '-', '0') view of the first two objects, for comparison with their numeric rows.
objects[:2]

[OrderedDict([('symbol', 'n̩'),
              ('anterior', '+'),
              ('approximant', '-'),
              ('back', '0'),
              ('consonantal', '+'),
              ('constricted glottis', '-'),
              ('continuant', '-'),
              ('coronal', '+'),
              ('delayed_release', '0'),
              ('diphthong', '0'),
              ('distributed', '-'),
              ('dorsal', '-'),
              ('front', '0'),
              ('front-diphthong', '0'),
              ('high', '0'),
              ('labial', '-'),
              ('labiodental', '-'),
              ('lateral', '-'),
              ('long', '-'),
              ('low', '0'),
              ('nasal', '+'),
              ('round', '-'),
              ('segment', '+'),
              ('sonorant', '+'),
              ('spread glottis', '-'),
              ('stress', '-'),
              ('strident', '-'),
              ('syllabic', '+'),
              ('tap', '-'),
              ('tense', '0'),
        

In [17]:
# Numeric rows of the same two objects; in the sample displayed, '+' appears as 1,
# '-' as -1 and '0' as 0.
objects_np[:2]

array([[ 1, -1,  0,  1, -1, -1,  1,  0,  0, -1, -1,  0,  0,  0, -1, -1,
        -1, -1,  0,  1, -1,  1,  1, -1, -1, -1,  1, -1,  0, -1,  1],
       [ 0, -1,  0,  1, -1, -1, -1, -1,  0,  0,  1,  0,  0,  1,  1, -1,
        -1, -1, -1, -1, -1,  1, -1, -1, -1,  0, -1, -1,  0, -1, -1]],
      dtype=int8)

In [18]:
# Lookup table from an object's symbol to its row of `objects_np`
# (objects[i] is paired with objects_np[i]).
symbol_to_pfv = {
    obj['symbol']: objects_np[row_index]
    for row_index, obj in enumerate(objects)
}

# Identifying a possible Boolean concept

Consider the two sets of objects defined below:

In [19]:
from random import choice

In [20]:
# Draw `num_observations` objects at random, with replacement (repeats are
# possible). No seed is set, so the sample differs from run to run.
num_observations = 4
random_observation = [choice(objects) for _ in range(num_observations)]
# Show just the symbols first, then the full objects.
[obj['symbol'] for obj in random_observation]
random_observation

['ɘ̥', 'ʑ̥', 'k̟x̟', 'oʊ']

[OrderedDict([('symbol', 'ɘ̥'),
              ('anterior', '0'),
              ('approximant', '+'),
              ('back', '-'),
              ('consonantal', '-'),
              ('constricted glottis', '-'),
              ('continuant', '+'),
              ('coronal', '-'),
              ('delayed_release', '0'),
              ('diphthong', '-'),
              ('distributed', '0'),
              ('dorsal', '+'),
              ('front', '-'),
              ('front-diphthong', '0'),
              ('high', '-'),
              ('labial', '-'),
              ('labiodental', '-'),
              ('lateral', '-'),
              ('long', '-'),
              ('low', '-'),
              ('nasal', '-'),
              ('round', '-'),
              ('segment', '+'),
              ('sonorant', '+'),
              ('spread glottis', '-'),
              ('stress', '-'),
              ('strident', '0'),
              ('syllabic', '+'),
              ('tap', '-'),
              ('tense', '+'),
        

In [21]:
# NOTE: a larger target set such as {'t','d','p','b','k','g'} will probably
# consume ALL available memory a few cells downstream.
matching_symbols = {'j','w'}
# Keep exactly the objects whose symbol is in the target set, in their original order.
non_random_observation = [obj for obj in objects
                          if obj['symbol'] in matching_symbols]
len(non_random_observation)
non_random_observation

2

[OrderedDict([('symbol', 'j'),
              ('anterior', '0'),
              ('approximant', '+'),
              ('back', '-'),
              ('consonantal', '-'),
              ('constricted glottis', '-'),
              ('continuant', '+'),
              ('coronal', '-'),
              ('delayed_release', '0'),
              ('diphthong', '0'),
              ('distributed', '0'),
              ('dorsal', '+'),
              ('front', '+'),
              ('front-diphthong', '0'),
              ('high', '+'),
              ('labial', '-'),
              ('labiodental', '-'),
              ('lateral', '-'),
              ('long', '-'),
              ('low', '-'),
              ('nasal', '-'),
              ('round', '-'),
              ('segment', '+'),
              ('sonorant', '+'),
              ('spread glottis', '-'),
              ('stress', '-'),
              ('strident', '0'),
              ('syllabic', '-'),
              ('tap', '-'),
              ('tense', '+'),
         

Suppose we didn't know how either set of observations was generated, but we want to know whether all of the examples in each set are instances of at least one common Boolean concept (i.e. whether they could plausibly have been generated by, or be instances of, the same defining partial feature vector).

`prague`'s main functionality is to facilitate this kind of calculation and analysis via two functions:

In [22]:
# Show the documented contract of the "extension contains the observations" query.
print(prague.get_pfvs_whose_extension_contains.__doc__)

Given
        a set of observed objects (a stack of feature vectors)
    this returns
        the set of partial feature vectors (a stack, one vector per row)
    whose extension must contain the set of observed objects.
    


In [23]:
# Show the documented contract of the "extension is exactly the observations" query.
print(prague.get_pfvs_whose_extension_is_exactly.__doc__)

Given
        a set of observed objects (a stack of feature vectors)
        a set of potentially observable objects (another stack of vectors)
    this returns
        the set of partial feature vectors (a stack, one vector per row)
    whose extension must be exactly the set of observed objects.
    


In [24]:
# Look up the numeric feature vector of each randomly drawn object by symbol
# and stack them into one array (one row per observation).
random_observation_pfvs = np.array(
    [symbol_to_pfv[observed['symbol']] for observed in random_observation]
)
random_observation_pfvs

array([[ 0,  1, -1, -1, -1,  1, -1,  0, -1,  0,  1, -1,  0, -1, -1, -1,
        -1, -1, -1, -1, -1,  1,  1, -1, -1,  0,  1, -1,  1, -1, -1],
       [ 1, -1, -1,  1, -1,  1,  1,  1,  0,  1,  1,  1,  0,  1, -1, -1,
        -1, -1, -1, -1, -1,  1, -1, -1, -1,  1, -1, -1,  0, -1, -1],
       [ 0, -1, -1,  1, -1, -1, -1,  1,  0,  0,  1,  1,  0,  1, -1, -1,
        -1, -1, -1, -1, -1,  1, -1, -1, -1,  0, -1, -1,  0, -1, -1],
       [ 0,  1,  1, -1, -1,  1, -1,  0,  1,  0,  1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1,  1,  1,  1, -1,  1,  0,  1, -1,  1, -1,  1]],
      dtype=int8)

In [25]:
# Per the function's docstring: the stack of partial feature vectors (one per row)
# whose extension contains all of the randomly drawn objects.
# The result can be very large — check the displayed shape before working with it.
possible_explanations_for_random_observation = prague.get_pfvs_whose_extension_contains(random_observation_pfvs)
possible_explanations_for_random_observation.shape
possible_explanations_for_random_observation

(1048576, 31)

array([[ 0,  0, -1, ...,  1, -1, -1],
       [ 0,  0,  0, ...,  1, -1, -1],
       [ 0,  0, -1, ...,  1, -1, -1],
       ...,
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0, -1, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0]], dtype=int8)

In [26]:
# precise_explanations_for_random_observation = prague.get_pfvs_whose_extension_is_exactly(random_observation_pfvs,
#                                                                                          objects_np)
# precise_explanations_for_random_observation.shape
# precise_explanations_for_random_observation

In [27]:
# Look up the numeric feature vector of each hand-picked object by symbol
# and stack them into one array (one row per observation).
non_random_observation_pfvs = np.array(
    [symbol_to_pfv[observed['symbol']] for observed in non_random_observation]
)
non_random_observation_pfvs

array([[ 0,  1, -1, -1, -1,  1, -1,  0,  0,  0,  1,  1,  0,  1, -1, -1,
        -1, -1, -1, -1, -1,  1,  1, -1, -1,  0, -1, -1,  1, -1,  1],
       [ 0,  1,  1, -1, -1,  1, -1,  0,  0,  0,  1, -1,  0,  1,  1, -1,
        -1, -1, -1, -1,  1,  1,  1, -1, -1,  0, -1, -1,  1, -1,  1]],
      dtype=int8)

In [28]:
# Per the function's docstring: the stack of partial feature vectors (one per row)
# whose extension contains both hand-picked objects.
# The result can be very large — check the displayed shape before working with it.
possible_explanations_for_non_random_observation = prague.get_pfvs_whose_extension_contains(non_random_observation_pfvs)
possible_explanations_for_non_random_observation.shape
possible_explanations_for_non_random_observation

(2097152, 31)

array([[ 0,  1,  0, ...,  1, -1,  1],
       [ 0,  0,  0, ...,  1, -1,  1],
       [ 0,  1,  0, ...,  1, -1,  1],
       ...,
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  1,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0]], dtype=int8)

In [29]:
# Per the function's docstring: the partial feature vectors whose extension is
# *exactly* the observed objects, judged against all potentially observable
# objects (`objects_np`).
# FIXME(review): the recorded output of this cell is
#   ValueError: operands could not be broadcast together with shapes (2,345) (2097152,345)
# The failure is raised inside `prague` (source not visible here), so
# `precise_explanations_for_non_random_observation` is never assigned.
precise_explanations_for_non_random_observation = prague.get_pfvs_whose_extension_is_exactly(non_random_observation_pfvs,
                                                                                             objects_np)
precise_explanations_for_non_random_observation.shape
precise_explanations_for_non_random_observation

ValueError: operands could not be broadcast together with shapes (2,345) (2097152,345) 