In [3]:
from matplotlib import pyplot as plt
from math import log2
import numpy as np
import seaborn as sns
import csv

# Mutual Information

Considering multiple variables simultaneously

In [176]:
rows = 0
with open('./data/world-series.csv', mode='r', encoding='utf-8') as f:
    reader = csv.reader(f, delimiter=',')
    # store column names
    headers = next(reader, None)
#     print(headers)
    
    # store data
    column = dict()
    for h in headers:
        column[h] = []
    
    for row in reader:
        for h, v in zip(headers, row):
            column[h].append(v)
        rows+=1
f.close()
print("Num. rows read:", rows)
print("First 3 entries: \ufeffYear:", column['\ufeffYear'][:3])
print("First 3 entries: X (American):", column['X (American)'][:3])
print("First 3 entries: Y (National):", column['Y (National)'][:3])

Num. rows read: 20
First 3 entries: ﻿Year: ['2001', '2002', '2003']
First 3 entries: X (American): ['Yankees', 'Angels', 'Yankees']
First 3 entries: Y (National): ['Diamondbacks', 'Giants', 'Marlins']


In [36]:
# P(Rays)
# Rays
rays = len([team for team in column['X (American)'] if team == 'Rays'])
n = len(column['X (American)'])
print("P(Rays):\t", rays/n)
del rays
del n

P(Rays):	 0.1


In [35]:
# P(Rays, Dodgers)
xy = list(zip(column['X (American)'], column['Y (National)']))
raysdodgers = len([team for team in xy if team == ('Rays', 'Dodgers')])
n = len(xy)
print("P(Rays, Dodgers):\t", raysdodgers/n)
del xy
del raysdodgers
del n

P(Rays, Dodgers):	 0.05


In [45]:
# P(Rays | Dodgers)
xy = list(zip(column['X (American)'], column['Y (National)']))
dodgers_y = [team for team in xy if team[1] == 'Dodgers']
rays_x_dodgers_y = [team for team in dodgers_y if team[0] == 'Rays'] 
n = len(dodgers_y)
print("P(Rays | Dodgers):\t", len(rays_x_dodgers_y)/n)
del xy
del dodgers_y
del rays_x_dodgers_y
del n

P(Rays | Dodgers):	 0.3333333333333333


In [48]:
# P(Dodgers | Rays)
xy = list(zip(column['X (American)'], column['Y (National)']))
rays_y = [team for team in xy if team[0] == 'Rays']
dodgers_x_rays_y = [team for team in rays_y if team[1] == 'Dodgers'] 
n = len(rays_y)
print("P(Dodgers | Rays):\t", len(dodgers_x_rays_y)/n)
del xy
del rays_y
del dodgers_x_rays_y
del n

P(Dodgers | Rays):	 0.5


## Joint Entropy

In [88]:
# Entropy for joint pairs of teams
xy = list(zip(column['X (American)'], column['Y (National)']))
counts = list({v:xy.count(v) for v in xy}.values())
entropy = sum([-1*(p/sum(counts))*log2(p/sum(counts)) for p in counts])
print("Entropy:\t", entropy)
del xy
del counts
del entropy

Entropy:	 4.221928094887362


In [89]:
# Entropy if all teams were equally likely
xy = list(zip(column['X (American)'], column['Y (National)']))
log2(len(xy))

4.321928094887363

## Conditional Entropy
Expected entropy of Y when a specific event occurred in X

In [124]:
# Joint probability
x = 'Red Sox'
y = 'Cardinals'
xy = list(zip(column['X (American)'], column['Y (National)']))
pjoint = len([(teams[0], teams[1]) for teams in xy if teams == (x, y)])/len(xy)
print("len xy:\t", len(xy))
print("pjoint:\t", pjoint)
del x, y, xy, pjoint

# Make a function of it
x = 'Red Sox'
y = 'Cardinals'
X = column['X (American)']
Y = column['Y (National)']

def pjoint(x_, y_, X_, Y_):
    xy = list(zip(X_, Y_))
    n_xy = len(xy)
    
    joint_xy = [(x, y) for (x, y) in xy if (x, y) == (x_, y_)]
    prob_joint_xy = len(joint_xy)/n_xy
    return prob_joint_xy

pjoint(x, y, X, Y)

len xy:	 20
pjoint:	 0.1


0.1

In [118]:
# Conditional probability of y, given x
x = 'Red Sox'
y = 'Cardinals'

xy = list(zip(column['X (American)'], column['Y (National)']))
x_events = [teams for teams in xy if teams[0] == x]
y_cond_x = [teams for teams in x_events if teams[1] == y]
print("x_events:\t", x_events)
print("y_cond_x:\t", y_cond_x)
prob_y_cond_x = len(y_cond_x)/len(x_events)
print("p(y|x):\t", prob_y_cond_x)
del x, y, xy, x_events, y_cond_x, prob_y_cond_x

# Make a function of it
def pcond(y_, x_, Y_, X_):
    xy = list(zip(X_, Y_))
#     n_xy = len(xy)
    
    x_events = [x for x in xy if x[0] == x_]
    y_cond_x = [y for y in x_events if y[1] == y_]
    prob_y_cond_x = len(y_cond_x)/len(x_events)
    return prob_y_cond_x
    
pcond('Cardinals', 'Red Sox', Y_ = column['Y (National)'], X_ = column['X (American)'])

x_events:	 [('Red Sox', 'Cardinals'), ('Red Sox', 'Rockies'), ('Red Sox', 'Cardinals'), ('Red Sox', 'Dodgers')]
y_cond_x:	 [('Red Sox', 'Cardinals'), ('Red Sox', 'Cardinals')]
p(y|x):	 0.5


0.5

In [174]:
# Conditional entropy of y, given x 
# H(Y|X)
xy = list(zip(column['X (American)'], column['Y (National)']))
X = [xy[0] for xy in xy]
Y = [xy[1] for xy in xy]

def entropy_cond(Y_, X_):
    xy = list(zip(X_, Y_))
    
    # Initialize entropy to zero.
    entropy = 0
    
    for y in list(set(Y_)):
        for x in list(set(X_)):
            pjoint_xy = pjoint(x, y, X_, Y_)
            pcond_yx = pcond(y, x, Y_, X_)
            if pcond_yx != 0:
#                 print("x:", x, "y:", y)
#                 print("pjoint_xy:\t", pjoint_xy)
#                 print("pcond_yx:\t", pcond_yx)
#                 print("log2(pcond_yx):\t",log2(pcond_yx))
#                 print("-1*pjoint_xy*log2(pcond_yx):", -1*pjoint_xy*log2(pcond_yx))
#                 print("")
                entropy += -1*pjoint_xy*log2(pcond_yx)

            else:
                pass
    return entropy

print("Conditional entropy H(Y|X):\t", entropy_cond(Y, X))

Conditional entropy H(Y|X):	 1.0377443751081736


## Mutual information

In [207]:
y = column['Y (National)']
x = column['X (American)']
print("x:")
print(x)
print("\ncounts of x:")
print({team: y.count(team) for team in list(set(y))})

x_counts = [x.count(team) for team in list(set(x))]
x_probs = [p/sum(x_counts) for p in x_counts]
entropy = sum([-1*p*log2(p) for p in x_probs])
print("\nEntropy of X:\t", entropy)
print("2^entropy:", 2**entropy)

x:
['Yankees', 'Angels', 'Yankees', 'Red Sox', 'White Sox', 'Tigers', 'Red Sox', 'Rays', 'Yankees', 'Rangers', 'Rangers', 'Tigers', 'Red Sox', 'Royals', 'Royals', 'Indians', 'Astros', 'Red Sox', 'Astros', 'Rays']

counts of x:
{'Cubs': 1, 'Phillies': 2, 'Giants': 4, 'Cardinals': 4, 'Mets': 1, 'Marlins': 1, 'Diamondbacks': 1, 'Dodgers': 3, 'Nationals': 1, 'Astros': 1, 'Rockies': 1}

Entropy of X:	 3.1841837197791882
2^entropy: 9.089391531770154


In [206]:
# Create self-information function
# Accepts a list of probabilities of each event
def H(probs_):
    """
    probs: probability space
    """
    self_information = sum([-1*p*log2(p) for p in probs_])
    return self_information

H(x_probs)

3.184183719779189

In [221]:
xy = list(zip(column['X (American)'], column['Y (National)']))
x_counts = [x.count(team) for team in list(set(x))]
y_counts = [y.count(team) for team in list(set(y))]
xy_counts = [xy.count(teams) for teams in list(set(xy))]
# print({teams: xy.count(teams) for teams in list(set(xy))})

x_probs = [p/sum(x_counts) for p in x_counts]
y_probs = [p/sum(y_counts) for p in y_counts]
xy_probs = [p/sum(xy_counts) for p in xy_counts]

HX = H(x_probs)
print("H(X):\t", HX)
HY = H(y_probs)
print("H(Y):\t", HY)
HXY = H(xy_probs)
print("H(XY):\t", HXY)

H(X):	 3.1841837197791882
H(Y):	 3.184183719779189
H(XY):	 4.221928094887362


In [224]:
print("Mutual information:\t", HX + HY - HXY)

Mutual information:	 2.1464393446710153
