Calculate the information entropy of a random variable that showed the following behavior:
A, B, B, A, C, B, A, C, A, B, C, A, A, A, A, A, A, A, C, A

In [16]:
from math import log2
import csv

In [1]:
# Observed outcomes of the random variable, in the order they occurred
observation = "A B B A C B A C A B C A A A A A A A C A".split()

# Expected self-information

In [10]:
# Relative frequency of each distinct symbol in the observed sequence
probabilities = {
    symbol: observation.count(symbol)/len(observation)
    for symbol in set(observation)
}
probabilities

{'C': 0.2, 'A': 0.6, 'B': 0.2}

In [12]:
# Shannon entropy (expected self-information) of a discrete distribution
def H(x):
    """
    Return the Shannon entropy, in bits, of a discrete distribution.

    x: probability space -- an iterable of probabilities, one per outcome.
       The values are used as given; no normalisation is applied.

    Outcomes with probability exactly 0 are skipped: p*log2(p) tends to 0 as
    p tends to 0, whereas evaluating log2(0) directly raises ValueError.
    Negative values are not skipped and still raise ValueError from log2.
    """
    self_information = -1*sum(p*log2(p) for p in x if p != 0)
    return self_information

In [15]:
# Entropy depends only on the probability values, not on the outcome labels
prob_space = list(probabilities.values())
H(prob_space)

1.3709505944546687

## Entropy of a dataset

In [35]:
# Read the World Series CSV into `column`: a dict mapping each header name to
# the list of (string) values found in that column, in file order.
rows = 0
# newline='' lets the csv module do its own line-ending handling, as its
# documentation recommends for files passed to csv.reader.
with open('./data/world-series.csv', mode='r', encoding='utf-8', newline='') as f:
    reader = csv.reader(f, delimiter=',')
    # store column names
    headers = next(reader, None)
    if headers is None:
        raise ValueError("./data/world-series.csv is empty: no header row found")

    # NOTE(review): the first header is looked up below as '\ufeffYear', i.e. the
    # file seems to start with a UTF-8 byte-order mark that plain 'utf-8' keeps.
    # encoding='utf-8-sig' would strip it, but would also rename that key to
    # 'Year'; the key is left unchanged here so existing lookups keep working.

    # store data: one empty list per column, filled row by row
    column = {h: [] for h in headers}

    for row in reader:
        # zip stops at the shorter of headers/row, so a short row adds no
        # value to the trailing columns
        for h, v in zip(headers, row):
            column[h].append(v)
        rows += 1
# The with-block has already closed the file; no explicit close() is needed.
print("Num. rows read:", rows)
print("First 3 entries: \ufeffYear:", column['\ufeffYear'][:3])
print("First 3 entries: X (American):", column['X (American)'][:3])
print("First 3 entries: Y (National):", column['Y (National)'][:3])

Num. rows read: 20
First 3 entries: ï»¿Year: ['2001', '2002', '2003']
First 3 entries: X (American): ['Yankees', 'Angels', 'Yankees']
First 3 entries: Y (National): ['Diamondbacks', 'Giants', 'Marlins']


### Amount of uncertainty in the MLB American League, 2001–2020

In [70]:
# Number of appearances of each distinct team in the "X (American)" column.
# The tally is built once so the printed dict and counts_X share one ordering.
tally_X = {team: column["X (American)"].count(team) for team in set(column["X (American)"])}
counts_X = list(tally_X.values())

print("Num. American League teams:", len(tally_X))
print("Tally of wins:")
print(tally_X, "\n")

# Entropy (in bits) of the empirical team distribution, computed with the
# notebook's H function instead of repeating its formula inline.
# NOTE: the name y_entropy is kept even though this is the X (American) column,
# because the "Effective number of teams" cell reads it.
total_X = sum(counts_X)
y_entropy = H([count/total_X for count in counts_X])
print(y_entropy)

Num. American League teams: 10
Tally of wins:
{'Indians': 1, 'Red Sox': 4, 'White Sox': 1, 'Angels': 1, 'Yankees': 3, 'Rangers': 2, 'Tigers': 2, 'Astros': 2, 'Rays': 2, 'Royals': 2} 

3.1841837197791882


In [53]:
# Effective number of teams: 2 raised to the entropy in bits, i.e. how many
# equally likely teams would produce the same entropy
2 ** y_entropy

9.08939153177016

### Joint Entropy of a dataset