In [1]:
# Display the value of every top-level expression statement in a cell, not just
# the last one; several cells below show more than one result this way.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

**Eric Meinhardt / emeinhardt@ucsd.edu**

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Motivation" data-toc-modified-id="Motivation-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Motivation</a></span></li><li><span><a href="#Load-data" data-toc-modified-id="Load-data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Load data</a></span></li><li><span><a href="#Overhead" data-toc-modified-id="Overhead-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Overhead</a></span></li><li><span><a href="#Identifying-a-possible-Boolean-concept" data-toc-modified-id="Identifying-a-possible-Boolean-concept-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Identifying a possible Boolean concept</a></span></li></ul></div>

# Motivation

In this notebook we show how `prague` can be used to identify feature vectors compatible with (or that exactly pick out) an observed set of objects, using phonology as a motivating example. Some implementation details potentially relevant for usage are also discussed.

# Load data

In [2]:
# Work from the parent directory so the relative 'data/...' paths below resolve.
# NOTE(review): not idempotent -- re-running this cell moves up one more level.
%cd ..

/mnt/cube/home/AD/emeinhar/prague


In [3]:
from funcy import *
from functools import reduce

In [4]:
import numpy as np

In [5]:
import prague

In [6]:
# List the files available in the data directory.
%ls data

bakovic_chart_riggle_hayes_remapped.tsv  brh.npy             hayes_remapped.tsv
bakovic_chart_riggle_hayes.tsv           hayes_features.txt  hayes.tsv
brh_features.txt                         hayes.npy


In [7]:
# Input files: a TSV of labelled objects (read with prague.load_objects), a
# NumPy array with one row per object (read with np.load), and a text file of
# feature names, one per line.
#
# Alternative dataset, currently disabled:
# objects_in_fp = 'data/hayes.tsv'
# objects_np_in_fp = 'data/hayes.npy'
# feature_list_fp = 'data/hayes_features.txt'

objects_in_fp = 'data/bakovic_chart_riggle_hayes.tsv'
objects_np_in_fp = 'data/brh.npy'
feature_list_fp = 'data/brh_features.txt'

In [8]:
# Parse the object table: each record maps 'symbol' and every feature name
# to that object's value.
objects = prague.load_objects(objects_in_fp)
len(objects)
objects[:3]

# Keep just the labels, in the same order as `objects`.
symbols = [record['symbol'] for record in objects]
symbols[:10]

94

[OrderedDict([('symbol', 'p'),
              ('syll', '-'),
              ('cons', '+'),
              ('son', '-'),
              ('labial', '+'),
              ('coronal', '-'),
              ('dorsal', '-'),
              ('voice', '-'),
              ('cont', '-'),
              ('del. rel.', '-'),
              ('ant', '0'),
              ('dist', '0'),
              ('strid', '0'),
              ('high', '0'),
              ('low', '0'),
              ('back', '0'),
              ('front', '0'),
              ('approx', '-'),
              ('nas', '-'),
              ('lat', '-'),
              ('s.g.', '-'),
              ('c.g.', '-'),
              ('round', '0'),
              ('ATR', '0')]),
 OrderedDict([('symbol', 'b'),
              ('syll', '-'),
              ('cons', '+'),
              ('son', '-'),
              ('labial', '+'),
              ('coronal', '-'),
              ('dorsal', '-'),
              ('voice', '+'),
              ('cont', '-'),
              ('de

['p', 'b', 'ɸ', 'β', 'f', 'v', 't̪', 'd̪', 'θ', 'ð']

In [9]:
# Number of keys in one object record (the 'symbol' label plus its features).
len(objects[0])

24

In [10]:
# Read the feature names, one per line, dropping the newline and any
# surrounding whitespace.
with open(feature_list_fp, 'r') as feature_file:
    feature_list = [line.strip() for line in feature_file]
len(feature_list)

23

In [11]:
# Each object record has one non-feature key ('symbol'), so it should carry
# exactly one more key than there are feature names.
assert len(feature_list) == len(objects[0].keys()) - 1, (
    f"{len(feature_list)} feature names, but "
    f"{len(objects[0].keys()) - 1} features per object record"
)

In [12]:
# Load the numeric encoding of the objects and inspect its shape and dtype.
objects_np = np.load(objects_np_in_fp)
objects_np.shape
objects_np.dtype

(94, 23)

dtype('int8')

In [13]:
# The array must have one column per feature name.
assert objects_np.shape[1] == len(feature_list), (
    f"array has {objects_np.shape[1]} columns, "
    f"but there are {len(feature_list)} feature names"
)

In [14]:
# The array must have one row per object record.
assert objects_np.shape[0] == len(objects), (
    f"array has {objects_np.shape[0]} rows, "
    f"but there are {len(objects)} object records"
)

In [15]:
# Peek at the first two object records (dict form).
objects[:2]

[OrderedDict([('symbol', 'p'),
              ('syll', '-'),
              ('cons', '+'),
              ('son', '-'),
              ('labial', '+'),
              ('coronal', '-'),
              ('dorsal', '-'),
              ('voice', '-'),
              ('cont', '-'),
              ('del. rel.', '-'),
              ('ant', '0'),
              ('dist', '0'),
              ('strid', '0'),
              ('high', '0'),
              ('low', '0'),
              ('back', '0'),
              ('front', '0'),
              ('approx', '-'),
              ('nas', '-'),
              ('lat', '-'),
              ('s.g.', '-'),
              ('c.g.', '-'),
              ('round', '0'),
              ('ATR', '0')]),
 OrderedDict([('symbol', 'b'),
              ('syll', '-'),
              ('cons', '+'),
              ('son', '-'),
              ('labial', '+'),
              ('coronal', '-'),
              ('dorsal', '-'),
              ('voice', '+'),
              ('cont', '-'),
              ('de

In [16]:
# Peek at the first two rows of the numeric array (row i is paired with
# objects[i] when symbol_to_pfv is built below).
objects_np[:2]

array([[ 0,  0, -1,  0, -1,  1, -1, -1, -1,  0, -1,  0,  0,  1, -1,  0,
        -1,  0, -1, -1,  0, -1, -1],
       [ 0,  0, -1,  0, -1,  1, -1, -1, -1,  0, -1,  0,  0,  1, -1,  0,
        -1,  0, -1, -1,  0, -1,  1]], dtype=int8)

In [17]:
# Map each object's symbol to the row of objects_np at the same index.
symbol_to_pfv = dict(
    (record['symbol'], objects_np[row_index])
    for row_index, record in enumerate(objects)
)

In [18]:
# Collapse objects that share a feature vector: np.unique with axis=0 keeps
# one copy of each distinct row. Shapes are shown before and after.
objects_np.shape
unique_objects_np = np.unique(objects_np, axis=0)
unique_objects_np.shape

(94, 23)

(91, 23)

In [19]:
# Size of the labelled inventory and of the feature space it lives in.
f"|O| = {len(objects)} distinct object types (including labels)"
f"{unique_objects_np.shape[1]} object features"
# Each feature of a fully specified vector takes one of two values.
f"{2**unique_objects_np.shape[1]:.2E} logically possible object feature vectors"
f"|V| = {unique_objects_np.shape[0]} distinct object feature vectors"
f"{2**unique_objects_np.shape[0]:.2E} possible subsets of V"
# Each feature of a partial vector takes one of three values.
f"{3**unique_objects_np.shape[1]:.2E} possible partial feature vectors"

'|O| = 94 distinct object types (including labels)'

'23 object features'

'8.39E+06 logically possible object feature vectors'

'|V| = 91 distinct object feature vectors'

'2.48E+27 possible subsets of V'

'9.41E+10 possible partial feature vectors'

# Overhead

In [20]:
from tqdm import tqdm

In [21]:
# Baseline memory usage before the computations below.
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G        914M        115G        3.0M        9.3G        123G
Swap:          2.0G          0B        2.0G


In [22]:
# Took ~7s on wittgenstein.
# One upper closure (strict=False) per distinct object; tqdm reports progress.
upper_closures = [
    prague.upper_closure(object_vector, strict=False)
    for object_vector in tqdm(unique_objects_np)
]

100%|██████████| 91/91 [00:05<00:00, 15.45it/s]


In [23]:
# Join the per-object closures into a single array (np.concatenate joins
# along axis 0 by default).
upper_closures_as_matrix = np.concatenate(upper_closures)

In [24]:
# Took ~2m on wittgenstein.
# Keep one copy of each distinct row across all the closures.
all_pfvs_with_nonempty_extension = np.unique(upper_closures_as_matrix, axis=0)

In [39]:
# (number of distinct partial feature vectors kept, number of features)
all_pfvs_with_nonempty_extension.shape

(9115112, 23)

In [47]:
# Compare the number of partial feature vectors kept above with the number
# that are logically possible (three values per feature).
# sep="\n" puts each figure on its own line; embedding "\n" in the first
# argument and relying on the default separator left a stray leading space
# on the second line.
print("# partial feature vectors that pick out a non-empty subset of O: "
      f"{all_pfvs_with_nonempty_extension.shape[0]:.2E}",
      "# logically possible partial feature vectors: "
      f"{3**all_pfvs_with_nonempty_extension.shape[1]:.2E}",
      sep="\n")

# partial feature vectors that pick out a non-empty subset of O: 9.12E+06
 # logically possible partial feature vectors: 9.41E+10


In [25]:
# Memory usage after building and de-duplicating the upper closures.
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G        2.2G        114G        3.0M        9.3G        122G
Swap:          2.0G          0B        2.0G


In [26]:
# 2m on wittgenstein
# unique_upper_closures = set(lmap(prague.HashableArray,
#                                  list(upper_closures_as_matrix)))

In [27]:
# Memory usage after the set-based de-duplication cell above (now commented out).
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G        2.2G        114G        3.0M        9.3G        122G
Swap:          2.0G          0B        2.0G


In [28]:
# >>>2m on wittgenstein, uses way too much memory
# unique_upper_closures2 = set(lmap(prague.HashableArray,
#                                   cat(map(list, upper_closures_as_matrix))))

In [29]:
# Memory usage after the second set-based attempt above (now commented out).
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G        2.2G        114G        3.0M        9.3G        122G
Swap:          2.0G          0B        2.0G


In [30]:
# Took ~48s on wittgenstein.
# Compute the extension of every retained partial feature vector with respect
# to the distinct objects.
nonempty_pfv_extensions = prague.extensions(all_pfvs_with_nonempty_extension,
                                            object_inventory=unique_objects_np)

In [31]:
# Memory usage after computing all extensions.
!free -h

              total        used        free      shared  buff/cache   available
Mem:           125G        3.0G        113G        3.0M        9.3G        121G
Swap:          2.0G          0B        2.0G


In [32]:
# Draw random partial feature vectors (PFVs) until one picks out at least one
# object, then keep that PFV and its extension in both representations.
# The draw/extend step is written once inside the loop instead of being
# duplicated before and inside it.
# NOTE(review): no seed is set in this notebook, so (assuming make_random_pfv
# is not seeded internally) the example printed below changes between runs.
while True:
    a_random_pfv = prague.feature_vector.make_random_pfv(num_features=len(feature_list))
    random_extension = prague.extension(a_random_pfv,
                                        object_inventory=unique_objects_np)
    # An all-zero indicator vector means the PFV matches no object: redraw.
    if random_extension.sum() != 0:
        break

# Materialize the matching objects once, for the accepted PFV only.
random_objects = prague.extension_vector_to_objects(random_extension,
                                                    object_inventory=unique_objects_np)
my_x = random_extension
my_S = random_objects


print(f"My random PFV = {a_random_pfv}")
print(f"My extension as an 'indicator' vector = \n{my_x}")
print(f"My extension as a stack of {my_S.shape[0]} objects S = \n{my_S}")

My random PFV = [ 0  0  0  1  0  1  0  0  0  0  1  0  0 -1  1  0 -1  0  0  0  0  0  1]
My extension as an 'indicator' vector = 
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
My extension as a stack of 1 objects S = 
[[ 0  0  1  1 -1  1 -1 -1  0  0  1 -1  1 -1  1 -1 -1  0 -1  1  0 -1  1]]


In [52]:
# PFVs whose extension includes S, and those whose extension (within the
# distinct-object inventory) is S itself.
pfvs_containing_my_S = prague.get_pfvs_whose_extension_contains(my_S)
pfvs_exactly_matching_my_S = prague.get_pfvs_whose_extension_is_exactly(
    my_S,
    object_inventory=unique_objects_np,
)

# Sum of absolute entries per exact match; with the -1/0/+1 entries seen in
# the outputs this is the number of specified features. Keep the matches
# that attain the minimum.
specification_of_exact_matches = np.abs(pfvs_exactly_matching_my_S).sum(axis=1)
minimal_specification = np.min(specification_of_exact_matches)
minimal_pfvs_exactly_matching_my_s = pfvs_exactly_matching_my_S[
    specification_of_exact_matches == minimal_specification
]

print(f"{pfvs_containing_my_S.shape[0]:,} PFVs whose extension contains S = \n{pfvs_containing_my_S}")
print(f"{pfvs_exactly_matching_my_S.shape[0]:,} PFVs whose extension is exactly S = \n{pfvs_exactly_matching_my_S}")
print(f"{minimal_pfvs_exactly_matching_my_s.shape[0]:,} PFVs whose extension is exactly S and which are maximally simple"
      f" (i.e. unspecified) = \n{minimal_pfvs_exactly_matching_my_s}")
print(f"original generating pfv = \n {a_random_pfv}")

131,072 PFVs whose extension contains S = 
[[ 0  0  1 ...  0 -1  1]
 [ 0  0  0 ...  0 -1  1]
 [ 0  0  1 ...  0 -1  1]
 ...
 [ 0  0  0 ...  0  0  0]
 [ 0  0  1 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]]
69,312 PFVs whose extension is exactly S = 
[[ 0  0  1 ...  0 -1  1]
 [ 0  0  0 ...  0 -1  1]
 [ 0  0  1 ...  0 -1  1]
 ...
 [ 0  0  1 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]]
2 PFVs whose extension is exactly S and which are maximally simple (i.e. unspecified) = 
[[ 0  0  0  0  0  0  0  0  0  0  0 -1  0  0  1  0  0  0  0  0  0  0  0]
 [ 0  0  0  1  0  0  0  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0  0]]
original generating pfv = 
 [ 0  0  0  1  0  1  0  0  0  0  1  0  0 -1  1  0 -1  0  0  0  0  0  1]


# Identifying a possible Boolean concept

Consider the two sets of objects defined below:

In [18]:
from random import choice

In [19]:
# Draw a handful of object records uniformly at random (with replacement).
# NOTE(review): unseeded, so the sample differs from run to run.
num_observations = 4
random_observation = [choice(objects) for _ in range(num_observations)]
[record['symbol'] for record in random_observation]
random_observation

['ɑ̃ː', 'ɶː', 'ʑ̥', 'ɡ̠̥']

[OrderedDict([('symbol', 'ɑ̃ː'),
              ('anterior', '0'),
              ('approximant', '+'),
              ('back', '+'),
              ('consonantal', '-'),
              ('constricted glottis', '-'),
              ('continuant', '+'),
              ('coronal', '-'),
              ('delayed_release', '0'),
              ('diphthong', '-'),
              ('distributed', '0'),
              ('dorsal', '+'),
              ('front', '-'),
              ('front-diphthong', '0'),
              ('high', '-'),
              ('labial', '-'),
              ('labiodental', '-'),
              ('lateral', '-'),
              ('long', '+'),
              ('low', '+'),
              ('nasal', '+'),
              ('round', '-'),
              ('segment', '+'),
              ('sonorant', '+'),
              ('spread glottis', '-'),
              ('stress', '-'),
              ('strident', '0'),
              ('syllabic', '+'),
              ('tap', '-'),
              ('tense', '0'),
       

In [20]:
# matching_symbols = {'t','d','p','b','k','g'} #will probably consume ALL available memory a few cells downstream
matching_symbols = {'j','w'}
# Hand-picked observation: every object whose symbol is in matching_symbols.
non_random_observation = [record for record in objects
                          if record['symbol'] in matching_symbols]
len(non_random_observation)
non_random_observation

2

[OrderedDict([('symbol', 'j'),
              ('anterior', '0'),
              ('approximant', '+'),
              ('back', '-'),
              ('consonantal', '-'),
              ('constricted glottis', '-'),
              ('continuant', '+'),
              ('coronal', '-'),
              ('delayed_release', '0'),
              ('diphthong', '0'),
              ('distributed', '0'),
              ('dorsal', '+'),
              ('front', '+'),
              ('front-diphthong', '0'),
              ('high', '+'),
              ('labial', '-'),
              ('labiodental', '-'),
              ('lateral', '-'),
              ('long', '-'),
              ('low', '-'),
              ('nasal', '-'),
              ('round', '-'),
              ('segment', '+'),
              ('sonorant', '+'),
              ('spread glottis', '-'),
              ('stress', '-'),
              ('strident', '0'),
              ('syllabic', '-'),
              ('tap', '-'),
              ('tense', '+'),
         

Suppose we didn't know how either set of observations was generated, but that we want to know if all of the examples in each set of observations are instances of at least one Boolean concept (i.e. plausibly generated by/instances of the same defining partial feature vector).

`prague`'s main functionality is to facilitate this kind of calculation and analysis via two functions:

In [21]:
# Show the docstring of the "extension contains the observation" query.
print(prague.get_pfvs_whose_extension_contains.__doc__)


    Given
        a set of observed objects (a stack of feature vectors)
    this returns
        the set of partial feature vectors (a stack, one vector per row)
    whose extension must contain the set of observed objects.
    


In [22]:
# Show the docstring of the "extension is exactly the observation" query.
print(prague.get_pfvs_whose_extension_is_exactly.__doc__)


    Given
        a set of observed objects (a stack of feature vectors)
        a set of potentially observable objects (another stack of vectors)
    this returns
        the set of partial feature vectors (a stack, one vector per row)
    whose extension must be exactly the set of observed objects.
    


In [23]:
# Look up the feature vector of each randomly drawn object by its symbol and
# stack the vectors, one per row.
random_observation_pfvs = np.array(
    [symbol_to_pfv[observed['symbol']] for observed in random_observation]
)
random_observation_pfvs

array([[ 0,  1,  1, -1, -1,  1, -1,  0, -1,  0,  1, -1,  0, -1, -1, -1,
        -1,  1,  1,  1, -1,  1,  1, -1, -1,  0,  1, -1,  0, -1,  1],
       [ 0,  1, -1, -1, -1,  1, -1,  0, -1,  0,  1,  1,  0, -1,  1, -1,
        -1,  1,  1, -1,  1,  1,  1, -1, -1,  0,  1, -1,  0, -1,  1],
       [ 1, -1, -1,  1, -1,  1,  1,  1,  0,  1,  1,  1,  0,  1, -1, -1,
        -1, -1, -1, -1, -1,  1, -1, -1, -1,  1, -1, -1,  0, -1, -1],
       [ 0, -1,  1,  1, -1, -1, -1, -1,  0,  0,  1, -1,  0,  1, -1, -1,
        -1, -1, -1, -1, -1,  1, -1, -1, -1,  0, -1, -1,  0, -1, -1]],
      dtype=int8)

In [24]:
# All partial feature vectors (one per row) whose extension must contain
# every object in the random observation.
possible_explanations_for_random_observation = prague.get_pfvs_whose_extension_contains(random_observation_pfvs)
possible_explanations_for_random_observation.shape
possible_explanations_for_random_observation

(32768, 31)

array([[ 0,  0,  0, ...,  0, -1,  0],
       [ 0,  0,  0, ...,  0, -1,  0],
       [ 0,  0,  0, ...,  0, -1,  0],
       ...,
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0]], dtype=int8)

In [33]:
# Pick one of the compatible partial feature vectors at random and show its
# extension over the distinct objects.
# A row *index* is drawn rather than passing the array to choice() directly:
# some Python versions (3.11.0/3.11.1, as far as we know -- TODO confirm)
# test the truthiness of the argument inside random.choice, which raises
# ValueError for a NumPy array with more than one element. The random draw
# itself is the same.
a_compatible_concept = possible_explanations_for_random_observation[
    choice(range(len(possible_explanations_for_random_observation)))
]
a_compatible_concept
prague.extension(a_compatible_concept, unique_objects_np)

array([ 0,  0,  0,  0,  0,  1,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0, -1,  0,  1,  0, -1, -1,  0,  0, -1,  0,  0,  0], dtype=int8)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0,
       0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
       1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0,
       1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      dtype=int8)

In [26]:
# precise_explanations_for_random_observation = prague.get_pfvs_whose_extension_is_exactly(random_observation_pfvs,
#                                                                                          objects_np)
# precise_explanations_for_random_observation.shape
# precise_explanations_for_random_observation

In [25]:
# Look up the feature vector of each hand-picked object by its symbol and
# stack the vectors, one per row.
non_random_observation_pfvs = np.array(
    [symbol_to_pfv[observed['symbol']] for observed in non_random_observation]
)
non_random_observation_pfvs

array([[ 0,  1, -1, -1, -1,  1, -1,  0,  0,  0,  1,  1,  0,  1, -1, -1,
        -1, -1, -1, -1, -1,  1,  1, -1, -1,  0, -1, -1,  1, -1,  1],
       [ 0,  1,  1, -1, -1,  1, -1,  0,  0,  0,  1, -1,  0,  1,  1, -1,
        -1, -1, -1, -1,  1,  1,  1, -1, -1,  0, -1, -1,  1, -1,  1]],
      dtype=int8)

In [26]:
# All partial feature vectors (one per row) whose extension must contain
# every object in the hand-picked observation.
possible_explanations_for_non_random_observation = prague.get_pfvs_whose_extension_contains(non_random_observation_pfvs)
possible_explanations_for_non_random_observation.shape
possible_explanations_for_non_random_observation

(2097152, 31)

array([[ 0,  1,  0, ...,  1, -1,  1],
       [ 0,  0,  0, ...,  1, -1,  1],
       [ 0,  1,  0, ...,  1, -1,  1],
       ...,
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  1,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0]], dtype=int8)

In [27]:
# precise_explanations_for_non_random_observation = prague.get_pfvs_whose_extension_is_exactly(non_random_observation_pfvs,
#                                                                                              objects_np)
# precise_explanations_for_non_random_observation.shape
# precise_explanations_for_non_random_observation