In [1]:
#Prints **all** console output, not just last item in cell 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

**Eric Meinhardt / emeinhardt@ucsd.edu**

# Introduction

This notebook will take 
 - a `.tsv` file specifying a binary or ternary feature matrix.
   - the file is assumed to have a header row indicating feature labels
   - each non-header row represents an object's feature vector
   - features are assumed to be {`+`,`-`,`0`} (following phonological convention) by default
 - a (potentially -- comma-separated -- list of) column name(s) indicating any columns not containing feature-value information (e.g. an object's label = IPA symbol in the case of phonological feature matrices).
 
and write
 - the ordered list of feature names (reflecting the ordering in the input `.tsv`) to a `.txt` file
 - a serialized numpy ndarray `.npy` file representing the unique feature vectors of the `.tsv` file as a matrix, with each row corresponding to an object and feature values represented as `1`, `-1`, or `0`
 - a `.tsv` with `+` replaced with `1` and `-` replaced with `-1`.

Note that if your feature matrix contains multiple objects that have the same featural description, this notebook will detect and record as much, but the final matrix it produces will only have one row for each unique feature vector.

# Parameters

In [2]:
from os import getcwd, chdir, listdir, path, mkdir, makedirs

**Note:** the cell below has a cell tag `parameters`, meaning this notebook will work with the `papermill` package.

In [3]:
# parameters cell

# i = ''
# i = 'bakovic_chart_riggle_hayes.tsv'
i = 'hayes.tsv'

# c = ''
c = 'symbol'

# f = ''
# # f = '+,-,0'
# # f = '1,-1,0'

# u = ''
# # u = 'tolerate'
# # u = 'raise_exception'

# o = 
# o = 'brh.npy'
o = 'hayes.npy'

# d = ''
# d = 'brh_features.txt'
d = 'hayes_features.txt'

In [4]:
output_dir = path.dirname(o)
if not path.exists(output_dir) and output_dir != '':
    print('Making output path {0}'.format(output_dir))
    makedirs(output_dir)

In [5]:
columns_to_remove = c.split(',')
columns_to_remove

['symbol']

# Imports / load data

In [6]:
from funcy import *

In [7]:
import os
from copy import deepcopy

In [8]:
import numpy as np

In [9]:
import csv

In [10]:
objects = []

with open(i, encoding='utf-8-sig') as csvfile:
    my_reader = csv.DictReader(csvfile, delimiter='\t', quoting=csv.QUOTE_NONE, quotechar='@')
    for row in my_reader:
        #print(row)
        objects.append(row)
len(objects)
objects[:5]

345

[OrderedDict([('symbol', 'n̩'),
              ('anterior', '+'),
              ('approximant', '-'),
              ('back', '0'),
              ('consonantal', '+'),
              ('constricted glottis', '-'),
              ('continuant', '-'),
              ('coronal', '+'),
              ('delayed_release', '0'),
              ('diphthong', '0'),
              ('distributed', '-'),
              ('dorsal', '-'),
              ('front', '0'),
              ('front-diphthong', '0'),
              ('high', '0'),
              ('labial', '-'),
              ('labiodental', '-'),
              ('lateral', '-'),
              ('long', '-'),
              ('low', '0'),
              ('nasal', '+'),
              ('round', '-'),
              ('segment', '+'),
              ('sonorant', '+'),
              ('spread glottis', '-'),
              ('stress', '-'),
              ('strident', '-'),
              ('syllabic', '+'),
              ('tap', '-'),
              ('tense', '0'),
        

# Check that every object is defined on the same set of features

In [11]:
features = lmap(lambda d: tuple(d.keys()),
                objects)
values = lmap(lambda d: set(d.values()),
              objects)

In [12]:
assert all(features[0] == features[i] for i in range(len(features)))

In [13]:
features = features[0]
features
original_fieldnames = deepcopy(features)

('symbol',
 'anterior',
 'approximant',
 'back',
 'consonantal',
 'constricted glottis',
 'continuant',
 'coronal',
 'delayed_release',
 'diphthong',
 'distributed',
 'dorsal',
 'front',
 'front-diphthong',
 'high',
 'labial',
 'labiodental',
 'lateral',
 'long',
 'low',
 'nasal',
 'round',
 'segment',
 'sonorant',
 'spread glottis',
 'stress',
 'strident',
 'syllabic',
 'tap',
 'tense',
 'trill',
 'voice')

In [14]:
values

[{'+', '-', '0', 'n̩'},
 {'+', '-', '0', 'k͡p'},
 {'+', '-', '0', 'i'},
 {'+', '-', '0', 'ɡ͡b'},
 {'+', '-', '0', 'ʑ'},
 {'+', '-', '0', 'ʕ'},
 {'+', '-', '0', 'ã'},
 {'+', '-', '0', 'ɣ'},
 {'+', '-', '0', 'ɶ̃'},
 {'+', '-', '0', 'ɵː'},
 {'+', '-', '0', 'dʒ'},
 {'+', '-', '0', 'd̪ɮ̪'},
 {'+', '-', '0', 'ɺ'},
 {'+', '-', '0', 'ʂ'},
 {'+', '-', '0', 'ɾ̥'},
 {'+', '-', '0', 'w̥'},
 {'+', '-', '0', 'ʉ'},
 {'+', '-', '0', 'b'},
 {'+', '-', '0', 'o̥'},
 {'+', '-', '0', 'ɔ'},
 {'+', '-', '0', 'ɪ'},
 {'+', '-', '0', 'b͡β'},
 {'+', '-', '0', 'ãː'},
 {'+', '-', '0', 'ɽ̥'},
 {'+', '-', '0', 'ɯː'},
 {'+', '-', '0', 'ɨ̥'},
 {'+', '-', '0', 'ɲ'},
 {'+', '-', '0', 'e'},
 {'+', '-', '0', 'ɪ̃ː'},
 {'+', '-', '0', 'aʊ'},
 {'+', '-', '0', 'ʎ'},
 {'+', '-', '0', 'b͡v'},
 {'+', '-', '0', 'õː'},
 {'+', '-', '0', 'ẽː'},
 {'+', '-', '0', 'r'},
 {'+', '-', '0', 'ʉ̃'},
 {'+', '-', '0', 'ɥ'},
 {'+', '-', '0', 'ŋ̟̥'},
 {'+', '-', '0', 'ʉ̃ː'},
 {'+', '-', '0', 'ɒː'},
 {'+', '-', '0', 'øː'},
 {'+', '-', '0', 'k

# Check uniqueness of objects

In [15]:
def areIdentical(a, b):
    for k in features:
        if k not in columns_to_remove:
            if a[k] != b[k]:
                return False
    return True

In [16]:
def get_matches(obj):
    return [o for o in objects if areIdentical(obj, o)]

In [17]:
def hasDuplicates(obj):
    return len(get_matches(obj)) > 1

In [18]:
objectsWithDuplicates = [o for o in objects if hasDuplicates(o)]
len(objectsWithDuplicates)

118

In [19]:
for obj in objectsWithDuplicates:
    print(obj['symbol'])
    for m in get_matches(obj):
        print('\t{0}'.format(m['symbol']))
    print('\n')

k͡p
	k͡p
	kp


ɡ͡b
	ɡ͡b
	ɡb


dʒ
	dʒ
	d͡ʒ
	ʤ


d̪ɮ̪
	d̪ɮ̪
	d̪͡ɮ̪


ʂ
	ʂ
	ʐ̥


b͡β
	b͡β
	bβ


b͡v
	b͡v
	bv


k͡x
	k͡x
	kx


ɢʁ
	ɢʁ
	ɢ͡ʁ


ɣ̥
	ɣ̥
	x


p͡ɸ
	p͡ɸ
	pɸ


t̪͡θ
	t̪͡θ
	t̪θ


ɟʝ
	ɟʝ
	ɟ͡ʝ


p͡t
	p͡t
	pt


t
	t
	d̥


ɣ̟̥
	ɣ̟̥
	x̟


ɟ̥
	ɟ̥
	c


q
	q
	ɢ̥


v̥
	v̥
	f


t̪s̪
	t̪s̪
	t̪͡s̪


kx
	k͡x
	kx


t͡ɬ
	t͡ɬ
	tɬ


ɡ̠͡ɣ̠
	ɡ̠͡ɣ̠
	ɡ̠ɣ̠


d͡ʒ
	dʒ
	d͡ʒ
	ʤ


ɡɣ
	ɡɣ
	ɡ͡ɣ


d͡z
	d͡z
	dz


t͡s
	t͡s
	ts


ɢ̥
	q
	ɢ̥


ʈ
	ʈ
	ɖ̥


d̠ɮ̠
	d̠ɮ̠
	d̠͡ɮ̠


p͡f
	p͡f
	pf


β̥
	β̥
	ɸ


t̪͡ɬ̪
	t̪͡ɬ̪
	t̪ɬ̪


ʤ
	dʒ
	d͡ʒ
	ʤ


ʝ̥
	ʝ̥
	ç


k̠
	k̠
	ɡ̠̥


t̪ɬ̪
	t̪͡ɬ̪
	t̪ɬ̪


ɦ̥
	ɦ̥
	h


ɖʐ
	ɖʐ
	ɖ͡ʐ


f
	v̥
	f


ɡ̠̥
	k̠
	ɡ̠̥


d̪z̪
	d̪z̪
	d̪͡z̪


bv
	b͡v
	bv


d̠͡ɮ̠
	d̠ɮ̠
	d̠͡ɮ̠


p
	p
	b̥


ç
	ʝ̥
	ç


ɬ
	ɬ
	ɮ̥


tɕ
	tɕ
	t͡ɕ


ʈ͡ʂ
	ʈ͡ʂ
	ʈʂ


b̥
	p
	b̥


d͡ʑ
	d͡ʑ
	dʑ


k̟x̟
	k̟x̟
	k̟͡x̟


c
	ɟ̥
	c


cç
	cç
	c͡ç


ts
	t͡s
	ts


bβ
	b͡β
	bβ


h
	ɦ̥
	h


k̟
	k̟
	ɡ̟̥


x̟
	ɣ̟̥
	x̟


ɣ̠̥
	ɣ̠̥
	x̠


b͡d
	b͡d
	bd


ɡ̟ɣ̟
	ɡ̟ɣ̟
	ɡ̟͡ɣ̟


s
	s
	z̥


d̪͡ɮ̪
	d̪ɮ̪
	d̪͡ɮ̪


ɮ̥
	ɬ
	ɮ̥


ɸ
	β̥
	ɸ


ɡ͡ɣ
	ɡɣ
	ɡ͡ɣ


ɖ͡ʐ
	ɖʐ
	ɖ͡ʐ




# Remap +/-/0 to +1/-1/0

In [20]:
valueRemap = {'+':1,
              '-':-1,
              '0':0}

def remapValue(v):
    return valueRemap[v]

remapValue('-')

def remapValues(d):
    for k in d:
        if d[k] in valueRemap:
            d[k] = remapValue(d[k])
    return d

-1

In [21]:
objects = [remapValues(o) for o in objects]
objects

[OrderedDict([('symbol', 'n̩'),
              ('anterior', 1),
              ('approximant', -1),
              ('back', 0),
              ('consonantal', 1),
              ('constricted glottis', -1),
              ('continuant', -1),
              ('coronal', 1),
              ('delayed_release', 0),
              ('diphthong', 0),
              ('distributed', -1),
              ('dorsal', -1),
              ('front', 0),
              ('front-diphthong', 0),
              ('high', 0),
              ('labial', -1),
              ('labiodental', -1),
              ('lateral', -1),
              ('long', -1),
              ('low', 0),
              ('nasal', 1),
              ('round', -1),
              ('segment', 1),
              ('sonorant', 1),
              ('spread glottis', -1),
              ('stress', -1),
              ('strident', -1),
              ('syllabic', 1),
              ('tap', -1),
              ('tense', 0),
              ('trill', -1),
              ('voice',

In [22]:
objects_with_remapped_values = deepcopy(objects)

# Remove undesired columns

In [23]:
features = tuple([f for f in features if f not in columns_to_remove])
features

('anterior',
 'approximant',
 'back',
 'consonantal',
 'constricted glottis',
 'continuant',
 'coronal',
 'delayed_release',
 'diphthong',
 'distributed',
 'dorsal',
 'front',
 'front-diphthong',
 'high',
 'labial',
 'labiodental',
 'lateral',
 'long',
 'low',
 'nasal',
 'round',
 'segment',
 'sonorant',
 'spread glottis',
 'stress',
 'strident',
 'syllabic',
 'tap',
 'tense',
 'trill',
 'voice')

In [24]:
for obj in objects:
    for c in columns_to_remove:
        del obj[c]

# Identify feature/dimension labels

In [25]:
dimension_labels = tuple(k for k in features)
dimension_labels

('anterior',
 'approximant',
 'back',
 'consonantal',
 'constricted glottis',
 'continuant',
 'coronal',
 'delayed_release',
 'diphthong',
 'distributed',
 'dorsal',
 'front',
 'front-diphthong',
 'high',
 'labial',
 'labiodental',
 'lateral',
 'long',
 'low',
 'nasal',
 'round',
 'segment',
 'sonorant',
 'spread glottis',
 'stress',
 'strident',
 'syllabic',
 'tap',
 'tense',
 'trill',
 'voice')

# Project off feature labels + convert each object to a tuple of values + unique-ify the set of objects

In [26]:
def toTuple(o):
    return tuple([o[f] for f in dimension_labels])

In [27]:
from random import choice

In [28]:
random_object = choice(objects)
random_object

OrderedDict([('anterior', 0),
             ('approximant', -1),
             ('back', 0),
             ('consonantal', 1),
             ('constricted glottis', -1),
             ('continuant', -1),
             ('coronal', -1),
             ('delayed_release', -1),
             ('diphthong', 0),
             ('distributed', 0),
             ('dorsal', 1),
             ('front', 0),
             ('front-diphthong', 0),
             ('high', 1),
             ('labial', -1),
             ('labiodental', -1),
             ('lateral', -1),
             ('long', -1),
             ('low', -1),
             ('nasal', -1),
             ('round', -1),
             ('segment', 1),
             ('sonorant', -1),
             ('spread glottis', -1),
             ('stress', -1),
             ('strident', 0),
             ('syllabic', -1),
             ('tap', -1),
             ('tense', 0),
             ('trill', -1),
             ('voice', 1)])

In [29]:
toTuple(random_object)

(0,
 -1,
 0,
 1,
 -1,
 -1,
 -1,
 -1,
 0,
 0,
 1,
 0,
 0,
 1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 1,
 -1,
 -1,
 -1,
 0,
 -1,
 -1,
 0,
 -1,
 1)

In [30]:
objects_tupled = tuple([toTuple(o) for o in objects])
objects_tupled

((1,
  -1,
  0,
  1,
  -1,
  -1,
  1,
  0,
  0,
  -1,
  -1,
  0,
  0,
  0,
  -1,
  -1,
  -1,
  -1,
  0,
  1,
  -1,
  1,
  1,
  -1,
  -1,
  -1,
  1,
  -1,
  0,
  -1,
  1),
 (0,
  -1,
  0,
  1,
  -1,
  -1,
  -1,
  -1,
  0,
  0,
  1,
  0,
  0,
  1,
  1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  1,
  -1,
  -1,
  -1,
  0,
  -1,
  -1,
  0,
  -1,
  -1),
 (0,
  1,
  -1,
  -1,
  -1,
  1,
  -1,
  0,
  -1,
  0,
  1,
  1,
  0,
  1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  1,
  1,
  -1,
  -1,
  0,
  1,
  -1,
  1,
  -1,
  1),
 (0,
  -1,
  0,
  1,
  -1,
  -1,
  -1,
  -1,
  0,
  0,
  1,
  0,
  0,
  1,
  1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  1,
  -1,
  -1,
  -1,
  0,
  -1,
  -1,
  0,
  -1,
  1),
 (1,
  -1,
  -1,
  1,
  -1,
  1,
  1,
  1,
  0,
  1,
  1,
  1,
  0,
  1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  1,
  -1,
  -1,
  -1,
  1,
  -1,
  -1,
  0,
  -1,
  1),
 (0,
  -1,
  1,
  1,
  -1,
  1,
  -1,
  -1,
  0,
  0,
  1,
  -1,
  0,
  -1,
  -1,
  -1,
  -1,
  -1,
  1,
  -1,
  -1,
  1,
  -1,
  -1,
  -1

In [31]:
len(objects_tupled)
len(set(objects_tupled))

345

285

In [32]:
unique_objects = set(objects_tupled)

In [33]:
unique_objects_np = np.array([o for o in unique_objects], dtype='int8')
unique_objects_np.shape
unique_objects_np.dtype
unique_objects_np

(285, 31)

dtype('int8')

array([[-1, -1,  0, ...,  0, -1, -1],
       [ 0,  1, -1, ..., -1, -1,  1],
       [ 0,  1, -1, ...,  1, -1, -1],
       ...,
       [ 0, -1,  0, ...,  0, -1, -1],
       [ 0,  1, -1, ...,  1, -1,  1],
       [ 1,  1,  0, ...,  0, -1,  1]], dtype=int8)

# Export

In [34]:
with open(os.path.splitext(i)[0] + '_remapped.tsv', 'w', newline='\n') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=original_fieldnames, delimiter='\t', quoting=csv.QUOTE_NONE, quotechar='@')

    writer.writeheader()
    writer.writerows(objects_with_remapped_values)

In [35]:
objects_with_remapped_values

[OrderedDict([('symbol', 'n̩'),
              ('anterior', 1),
              ('approximant', -1),
              ('back', 0),
              ('consonantal', 1),
              ('constricted glottis', -1),
              ('continuant', -1),
              ('coronal', 1),
              ('delayed_release', 0),
              ('diphthong', 0),
              ('distributed', -1),
              ('dorsal', -1),
              ('front', 0),
              ('front-diphthong', 0),
              ('high', 0),
              ('labial', -1),
              ('labiodental', -1),
              ('lateral', -1),
              ('long', -1),
              ('low', 0),
              ('nasal', 1),
              ('round', -1),
              ('segment', 1),
              ('sonorant', 1),
              ('spread glottis', -1),
              ('stress', -1),
              ('strident', -1),
              ('syllabic', 1),
              ('tap', -1),
              ('tense', 0),
              ('trill', -1),
              ('voice',

In [36]:
%%capture

with open(d, 'w') as the_file:
    for seq in dimension_labels:
        the_file.write(seq + '\n')

In [37]:
d

'hayes_features.txt'

In [38]:
!cat 'brh_features.txt'

syll
cons
son
labial
coronal
dorsal
voice
cont
del. rel.
ant
dist
strid
high
low
back
front
approx
nas
lat
s.g.
c.g.
round
ATR


In [39]:
o

'hayes.npy'

In [40]:
np.save(o, unique_objects_np)