# Measuring Communication Efficiency and Learnability of Colors using an Information Bottleneck Framework

In [None]:
import pandas as pd
import numpy as np
from ibhelpers import *
from scipy.spatial import distance

In [2]:
# Load the World Color Survey term file: tab-separated, no header row.
# The four columns are named at read time.
df = pd.read_csv(
    "wcs/term.txt",
    sep="\t",
    header=None,
    names=["language", "speaker", "chip", "word"],
)

## Calculate prior probabilities

For each language $l$, we use a frequentist approach (relative counts) to estimate two conditional probabilities: $p(w|c,l)$, the probability of word $w$ given chip $c$, and $p(c|w,l)$, the probability of chip $c$ given word $w$.

In [3]:
# Number of speaker responses for every (language, chip, word) combination.
per_chip_count_df = (
    df.groupby(["language", "chip", "word"])["speaker"]
    .count()
    .to_frame("individual_count_per_word_per_chip")
)

# Number of word responses recorded for every (language, chip) pair.
total_word_count_df = (
    df.groupby(["language", "chip"])["word"]
    .count()
    .to_frame("total_words_per_chip")
)

# frequentist probability of a word given chip and language:
# per-word count divided by the chip's total count (pandas aligns the two
# Series on their shared index levels).
p_word_chip_language = (
    per_chip_count_df["individual_count_per_word_per_chip"]
    / total_word_count_df["total_words_per_chip"]
)
p_word_chip_language.head()

language  chip  word
1         1     F       0.08
                G       0.52
                LB      0.36
                LF      0.04
          2     F       0.60
dtype: float64

In [4]:
# Number of speaker responses for every (language, word, chip) combination.
per_word_count_df = (
    df.groupby(["language", "word", "chip"])["speaker"]
    .count()
    .to_frame("individual_count_per_chip_per_word")
)

# Number of chip responses recorded for every (language, word) pair.
total_chip_count_df = (
    df.groupby(["language", "word"])["chip"]
    .count()
    .to_frame("total_chips_per_word")
)

# frequentist probability of a chip given word and language:
# per-chip count divided by the word's total count (pandas aligns the two
# Series on their shared index levels).
p_chip_word_language = (
    per_word_count_df["individual_count_per_chip_per_word"]
    / total_chip_count_df["total_chips_per_word"]
)
p_chip_word_language.head()

language  word  chip
1         F     1       0.001366
                2       0.010246
                4       0.001366
                5       0.011612
                6       0.003415
dtype: float64

## Information Bottleneck

Code taken from Frank (osfstorage)

In [None]:
# Labels for the seven states; used later as column names.
items = ['a', 'b', 'c', 'r', 'x', 'y', 'z']

# probabilities from Geoff's dissertation from Google N-grams (1985)
allp = np.array([0.1034, 0.0795, 0.1839, 0.6183, 0.0074, 0.0048, 0.0028])
# Renormalise the first three and the last three entries within their own group.
pastprops = allp[:3] / allp[:3].sum()
futprops = allp[4:] / allp[4:].sum()

duboisprobs = np.array([27.4, 47.5, 25.1])  # From Twitter Corpus
duboisprobs = duboisprobs / duboisprobs.sum()

# Prior over the seven states: each group's mass (duboisprobs) is spread over
# its members in proportion to pastprops / futprops; the middle state keeps
# its mass undivided.
p_x = np.concatenate([
    duboisprobs[0] * pastprops,
    duboisprobs[[1]],
    duboisprobs[2] * futprops,
])
p_x = p_x / p_x.sum()

# Initial encoder: identity mixed with a small uniform component.
eps = 0.01
q0 = eps * np.ones((7, 7)) + (1 - eps) * np.eye(7)

kap = 0.5
lam = 0.1

# Symmetric 7x7 similarity matrix built from products of powers of kap and lam
# (unnormalised at this point).
p_xGy = np.array([
    [1,                   kap,                 kap ** 2,            lam * kap ** 2, lam ** 2 * kap ** 2, lam ** 2 * kap ** 3, lam ** 2 * kap ** 4],
    [kap,                 1,                   kap,                 lam * kap,      lam ** 2 * kap,      lam ** 2 * kap ** 2, lam ** 2 * kap ** 3],
    [kap ** 2,            kap,                 1,                   lam,            lam ** 2,            lam ** 2 * kap,      lam ** 2 * kap ** 2],
    [lam * kap ** 2,      lam * kap,           lam,                 1,              lam,                 lam * kap,           lam * kap ** 2],
    [lam ** 2 * kap ** 2, lam ** 2 * kap,      lam ** 2,            lam,            1,                   kap,                 kap ** 2],
    [lam ** 2 * kap ** 3, lam ** 2 * kap ** 2, lam ** 2 * kap,      lam * kap,      kap,                 1,                   kap],
    [lam ** 2 * kap ** 4, lam ** 2 * kap ** 3, lam ** 2 * kap ** 2, lam * kap ** 2, kap ** 2,            kap,                 1],
])

# Order matters: p_mGs is the column-normalised version of the *raw* matrix,
# so it must be computed before p_xGy is row-normalised in place of itself.
p_mGs = p_xGy / p_xGy.sum(axis=0)
p_xGy = p_xGy / p_xGy.sum(axis=1, keepdims=True)

# Weight each row by the prior and renormalise so all entries sum to one.
p_xy = p_xGy * p_x[:, np.newaxis]
p_xy = p_xy / p_xy.sum()

### The Pareto Frontier
Now let's run the Information Bottleneck method to trace out the optimal (Pareto) frontier.

In [None]:
# trace out optimal frontier
# Row-normalise the initial encoder so every row sums to one.
q0 = q0 / q0.sum(axis=1, keepdims=True) # q0 initial encoder - can set it to identity
# Sweep beta from 2**5 downward in exponent steps of 0.001 (end point 0 excluded).
betas = np.array([2.0 ** x for x in np.arange(5, 0, -0.001)])
focalbeta = 5.3

# NOTE(review): fit_ib comes from ibhelpers; the meaning of each returned value
# is inferred from how it is used below - confirm against ibhelpers.
q, beta, ibscores, qresult, qseq, qseqresults, allqs = fit_ib(p_xy, q0, focalbeta, betas, verbose=1)

# create data frames for plotting and analysis
# Use the `pd` alias imported at the top of the notebook; the bare name
# `pandas` is only defined if the ibhelpers star-import happens to leak it.
ib_scores_df = pd.DataFrame(np.array(ibscores), columns=['rate', 'distortion', 'elen'])
ib_scores_df['beta'] = betas
ib_scores_df['q'] = allqs
# Number of columns left after mergecols() for each encoder in the sweep.
ib_scores_df['Wn'] = [mergecols(enc).shape[1] for enc in ib_scores_df['q']]

# The structural phase transitions along the pareto frontier
# The loop variables are deliberately NOT named `q`, so the encoder returned by
# fit_ib above is still bound to `q` after this cell has run.
stoch_rows = []
for step, (seq_q, seq_result) in enumerate(zip(qseq, qseqresults)):
    for w in mergecols(seq_q).transpose():
        # One record per merged column: n label, rate, distortion, then the
        # column's entries (one per name in `items`).
        stoch_rows.append([len(qseq) - step, seq_result[0], seq_result[1]] + list(w))

stochSys = pd.DataFrame(data=np.array(stoch_rows), columns=['n', 'rate', 'distortion'] + items)


In [5]:
# compute distance from optimal frontier
def fd(asys, ibscores):
    """Return the smallest distance from one system to a set of frontier points.

    Parameters
    ----------
    asys : mapping-like
        Anything indexable by ``'rate'`` and ``'distortion'`` (e.g. a dict or
        a DataFrame row).
    ibscores : pandas.DataFrame
        Table with ``'rate'`` and ``'distortion'`` columns, one row per
        frontier point.

    Returns
    -------
    float
        Minimum over all rows of ``ibscores`` of the distance (``cdist``'s
        default metric, Euclidean) between the system's (rate, distortion)
        point and that row.
    """
    point = [[asys['rate'], asys['distortion']]]
    frontier = ibscores[['rate', 'distortion']]
    pairwise = distance.cdist(point, frontier)
    return pairwise.min()


def gNID_d(asys, paretoQs, betas, pX):
    """Find, for every system, the closest frontier encoder under ``gNID``.

    Parameters
    ----------
    asys : pandas.DataFrame
        One row per system; the ``'q'`` column holds the system's encoder.
    paretoQs : sequence
        Encoders along the frontier (must support ``len``).
    betas : numpy.ndarray
        Indexed with the positions of the best encoders, so it is expected to
        line up with ``paretoQs``.
    pX
        Passed through unchanged as the third argument of ``gNID``.

    Returns
    -------
    tuple
        ``(best_index, best_value, best_beta)``: per system, the position of
        the minimising encoder in ``paretoQs``, the minimum ``gNID`` value,
        and the entry of ``betas`` at that position.
    """
    n_systems = len(asys)
    scores = np.zeros((n_systems, len(paretoQs)))
    for row in range(n_systems):
        system_q = asys.iloc[row]['q']
        for col, pareto_q in enumerate(paretoQs):
            scores[row, col] = gNID(system_q, pareto_q, pX)
    best = np.argmin(scores, axis=1)
    return best, scores.min(axis=1), betas[best]

