In [46]:
import csv
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from math import log
from collections import Counter
from nltk.util import ngrams
import pandas as pd
import plotly.express as px
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
from sklearn.neighbors import NearestNeighbors

data = [row for row in csv.DictReader(open('aligned_data.tsv'), delimiter='\t')]

In [54]:
MODE = 'raw' # how to represent states, one of "raw", "endpoints", "objects"
OBJS = False # whether to include "used objects" as features (separate from the states)
POS = 'VERB' # which POS to use
WIN_SIZE = 1 # word use +/- how many states?
K = 2 # how large of ngrams to use (max length will be K+1)
CUTOFF = 2 # how many unique people need do actions have to occur with? 
SUPERVISED = False # whether the dimensionality reduction has access to labels

In [55]:
def make_heatmap(M, row_names, col_names, row_tots, col_tots,
                 col_cutoff = 20, row_cutoff = 20, row_scale=0.15, col_scale=0.2, cmap='coolwarm',
                sort = None, annotate=False, saveas=None):
    
    if (row_cutoff is not None) and (col_cutoff is not None):          
        plot_M = M[row_tots>row_cutoff, :][:, col_tots>col_cutoff]
        plot_row = [w for i, w in enumerate(row_names) if row_tots[i] > row_cutoff]
        plot_col = [w for i, w in enumerate(col_names) if col_tots[i] > col_cutoff]
    else:
        plot_M = M
        plot_row = row_names
        plot_col = col_names
        
    if sort == 'diag':
        idxs = np.argmax(plot_M, axis=1) 
        order = np.argsort(idxs)
        sorted_M = plot_M[order]
        plot_M = sorted_M
        plot_row = [plot_row[i] for i in order]
    elif sort == 'strength':
        idxs = np.max(plot_M, axis=1) 
        order = np.flip(np.argsort(idxs))
        sorted_M = plot_M[order]
        plot_M = sorted_M
        plot_row = [plot_row[i] for i in order]
    elif sort == 'row':
        rows = sorted(list(set(plot_row)))
        order = np.argsort([rows.index(n) for n in plot_row])
        sorted_M = plot_M[order]
        plot_M = sorted_M
        plot_row = [plot_row[i] for i in order]
        
    height, width = plot_M.shape
    plt.figure(figsize=(int(round(width*col_scale)), int(round(height*row_scale))))

    if annotate:
        sns.heatmap(plot_M, cmap=cmap, annot=True, fmt=".01f")
    else:
        sns.heatmap(plot_M, cmap=cmap, cbar=False)
    plt.yticks(np.arange(plot_M.shape[0])+0.5, plot_row, rotation=0)
    plt.xticks(np.arange(plot_M.shape[1])+0.5, plot_col, rotation=90)
    if saveas:
        plt.savefig(saveas, bbox_inches='tight')
    plt.show()
    plt.clf()
    
def get_pmi(freq):
    col_tots = np.sum(freq, axis=0)
    row_tots = np.sum(freq, axis=1)
    tot = np.sum(freq)
    
    rows, cols = freq.shape
    PMI = np.zeros((rows, cols))
    for i in range(rows):
        for j in range(cols):
            pmi = (freq[i][j]/tot) / ((row_tots[i]/tot) * (col_tots[j]/tot))
            PMI[i][j] = log(pmi) if pmi > 0 else 0
    return PMI
    
def make_mats(D, POS = 'VERB', window_size=0):
    poslst = POS.split(',')
    zs = sorted(list(set([r['z'] for r in data])),
                key=lambda e:int(e) if not e == 'NA' else -1)
    words = sorted(list(set([r['lemma'] for r in D if r['pos'] in poslst])))
  
    
    N = len(D)
    freq = np.zeros((len(words), len(zs)))
    straight_count = np.zeros((len(words), len(zs)))
    for i, d in enumerate(D):
        if d['pos'] in poslst:
            w = d['lemma']
            for j in range(max(0, i-window_size), min(i+window_size, N)+1):
                s = D[j]['z']
                freq[words.index(w)][zs.index(s)] += 1
                if i == j:
                    straight_count[words.index(w)][zs.index(s)] += 1
    
    col_tots = np.sum(straight_count, axis=0)
    row_tots = np.sum(straight_count, axis=1)
    PMI = get_pmi(freq)
    return freq, PMI, words, zs, row_tots, col_tots

def make_ngram_mats(D, vocab, K = 3, POS = 'VERB', window_size=0):
    poslst = POS.split(',')
    zs = vocab
    words = sorted(list(set([r['lemma'] for r in D if r['pos'] in poslst])))

    N = len(D)
    freq = np.zeros((len(words), len(zs)))
    straight_count = np.zeros((len(words), len(zs)))
    for i, d in enumerate(D):
        w = d['lemma']
        if d['pos'] in poslst:
            lower = max(0, i-window_size)
            upper = min(i+window_size, N)+1
            for j in range(lower, upper):
                for k in range(K+1):
                    if j+k < len(D):
                        acts = []
                        for idx in range(j, j+k+1):
                            z = get_z(D[idx], MODE)
                            acts.append(z)
                        s = ' '.join(acts)
                        if s in zs:
                            freq[words.index(w)][zs.index(s)] += 1
                            if (i == j):
                                straight_count[words.index(w)][zs.index(s)] += 1
    
    col_tots = np.sum(straight_count, axis=0)
    row_tots = np.sum(straight_count, axis=1)
    
    PMI = get_pmi(freq)
    return freq, PMI, words, zs, row_tots, col_tots

def get_z(D, mode):
    z = D['z']
    if mode == 'endpoints':
        z += "-" if D["end_obj"] == "None" else "+"
    elif mode == 'objects':
        z += D["end_obj"]
    return z

def make_token_mats(D, _vocab, K = 1, window_size = 1, use_objects=True):   
    vocab = [v for v in _vocab if len(v.split()) <= K+1]
    if use_objects:
        objs = list(set([d['end_obj'] for d in D]))
        vocab += objs
    vocabset = set(vocab)
    N = len(D)
    freq = np.zeros((N, len(vocab)))
    lbls = []
    meta = []
    for i, d in enumerate(D):
        start = d['step']
        end = D[i+1]['step'] if i < N-1 else ''
        w = d['lemma']+'_'+d['pos']
        lbls.append(w)
        meta.append(d['participant'] + ' ' + d['task'] + ' ' + '%s-%s'%(start, end))
        lower = max(0, i-window_size)
        upper = min(i+window_size, N)
        for j in range(lower, upper):
            if use_objects:
                o = D[j]['end_obj']
                freq[i][vocab.index(o)] += 1
            for k in range(K+1):
                if j+k < len(D):
                    acts = []
                    for idx in range(j, j+k+1):
                        z = get_z(D[idx], MODE)
                        acts.append(z)
                    s = ' '.join(acts)
                    if s in vocabset:
                        freq[i][vocab.index(s)] += 1

    #PMI = get_pmi(freq)
    return freq, lbls, meta, vocab

In [56]:
# make sequences of states for computing ngrams over
seqs = {}
lasts = {}
for d in data:
    p = d['participant']
    t = d['task']
    k = (p,t)
    if k not in seqs:
        seqs[k] = [[], []]
        lasts[k] = [None, None]

    z = get_z(d, MODE)
    if not z == lasts[k][0]:
        seqs[k][0].append(z)
        lasts[k][0] = z
        
    w = '%s_%s'%(d['lemma'], d['pos'])
    if not w == lasts[k][1]:
        seqs[k][1].append(w)
        lasts[k][1] = w
        
# compute vocabularies to use later
vocab = [{}, {}] # actions, words
by_person = [{}, {}] # actions, words
ns = [K, 1] # how large of ngrams

for k, lsts in seqs.items():
    for i, lst in enumerate(lsts):
        for n in range(ns[i]+1):
            for ngm in ngrams(lst, n):
                w = ' '.join(['%s'%e for e in ngm])
                if w not in vocab[i]:
                    vocab[i][w] = 0
                    by_person[i][w] = set()
                vocab[i][w] += 1
                by_person[i][w].add(k[0])
                
vocab_lsts = [[], []]

for i, v in enumerate(vocab):
    for w, c in sorted(v.items(), key=lambda e:e[1], reverse=True):
        if len(by_person[i][w]) >= CUTOFF:
            vocab_lsts[i].append(w)
            
for l in vocab_lsts:
    print(len(l), l[:10])

92 ['3', '1', '2', '5', '8', '1 3', '7', '3 1', '4', '6']
577 ['NA_NA', '-PRON-_PRON', 'be_AUX', 'and_CCONJ', 'the_DET', 'go_VERB', 'to_PART', 'up_ADP', 'put_VERB', 'a_DET']


## Cluster word instances

In [57]:
X, y, meta, vocab = make_token_mats(data, vocab_lsts[0], K = K,
                                    window_size = WIN_SIZE, use_objects=OBJS)
print(X.shape)
print(len(y))

(31535, 92)
31535
96223.0


In [58]:
#top_words = [w for w in vocab_lsts[1] if w.split('_')[1] == POS][:20] 

top_words = []
for l in open('../nbc/target_words.txt').readlines():
    pos, w = l.strip().split('\t')
    if pos == POS:
        top_words.append('%s_%s'%(w, pos))
print(top_words)

is_top = [yy in top_words for yy in y]
X_plot = X[is_top, :]
y_plot = np.array(y)[is_top]
m_plot = np.array(meta)[is_top]
print(X_plot.shape)
print(np.sum(X_plot))

['pick_VERB', 'put_VERB', 'push_VERB', 'get_VERB', 'drop_VERB', 'throw_VERB', 'hold_VERB', 'open_VERB', 'close_VERB', 'give_VERB', 'go_VERB', 'shake_VERB', 'take_VERB', 'eat_VERB', 'wash_VERB', 'play_VERB', 'walk_VERB', 'cook_VERB', 'stop_VERB']
(1347, 92)
3725.0


In [59]:
#reducer = TruncatedSVD(n_components=2)
reducer = TSNE(n_components=2)

if SUPERVISED:
    red = reducer.fit_transform(X_plot, y_plot)
else:
    red = reducer.fit_transform(X_plot)

In [60]:
lemmas = [y.split('_')[0] for y in y_plot]
pos = [y.split('_')[1] for y in y_plot]
ps = [y.split(' ')[0] for y in m_plot]
ts = [y.split(' ')[1] for y in m_plot]
pts = [y.rsplit(' ', 1)[0] for y in m_plot]
steps = [y.split(' ')[2] for y in m_plot]

ddict = {'x': red[:,0], 'y': red[:,1], 'lemma': lemmas, 'pos': pos,
         'lemmapos': y_plot, 'participant': ps, 'task': ts, 'pt': pts, 'step': steps}

top_feat = [vocab[i] for i in np.argmax(X_plot, axis=1)]
ddict['top'] = top_feat
    
feats = ['participant', 'top', 'step', 'pt'] 

if red.shape[1] == 3:
    ddict['z'] = red[:,2]
    d = pd.DataFrame.from_dict(ddict)
    fig = px.scatter_3d(d, x="x", y="y", z="z", color='lemma',
                 hover_data=feats)
else:
    d = pd.DataFrame.from_dict(ddict)
    fig = px.scatter(d, x="x", y="y", color='lemma', hover_data=feats,
                    color_discrete_sequence=px.colors.qualitative.Light24)
    
name = '%s_supervised=%s_mode=%s_obj=%s_win=%s_k=%s'%(POS, SUPERVISED, MODE,
                                                      OBJS, WIN_SIZE, K)
fig.write_image("figures/%s.pdf"%name)
fig.show()

In [78]:
mat = X_plot
#r = TruncatedSVD(n_components=50)
#r = TSNE(n_components=50)
#mat = r.fit_transform(X_plot)

nbrs = NearestNeighbors(n_neighbors=6).fit(mat)
distances, indices = nbrs.kneighbors(mat)


by_verb = {}
for i in range(mat.shape[0]):
    v = y_plot[i]
    if v not in by_verb:
        by_verb[v] = []
    neighbors = [y_plot[j] for j in indices[i, 1:]]
    p = sum([1 if w == v else 0 for w in neighbors])/len(neighbors)
    by_verb[v].append(p)
    
macro = []
micro = []
tot = 0.
for v, lst in sorted(by_verb.items(), key=lambda e:len(e[1]), reverse=True):
    n = len(lst)
    tp = sum(lst)
    p = tp / n
    macro.append(p)
    micro.append(tp)
    tot += n
    print('%s\t%.02f\t%s'%(v, p, n))
    
print("Macro: %.02f"%(sum(macro)/len(macro)))
print("Micro: %.02f"%(sum(micro)/tot))

go_VERB	0.41	300
put_VERB	0.23	292
pick_VERB	0.23	223
take_VERB	0.12	104
eat_VERB	0.11	104
wash_VERB	0.07	51
play_VERB	0.07	51
get_VERB	0.10	50
hold_VERB	0.12	39
throw_VERB	0.10	35
walk_VERB	0.11	32
drop_VERB	0.05	26
stop_VERB	0.05	20
give_VERB	0.04	11
open_VERB	0.20	9
Macro: 0.14
Micro: 0.22


## Analyze word type <-> action cooccurances

In [None]:
freq, PMI, row_names, col_names, row_tots, col_tots = make_ngram_mats(data, vocab=vocab_lsts[0],
                                                                    POS = POS,
                                                                    window_size=WIN_SIZE, K=K)

make_heatmap(freq, row_names, col_names, row_tots, col_tots, col_cutoff=50,
                row_cutoff=50, row_scale=0.2, col_scale=0.2, sort="strength")