In [None]:
import pandas as pd
import math
import numpy as np
from collections import Counter

In [None]:
# Load the World Series table (one row per year) directly from the meetup repository.
df = pd.read_csv(
    'https://raw.githubusercontent.com/grantaguinaldo/meetup/master/world-series.csv'
)

In [None]:
# Normalise the raw CSV headers to snake_case. The result is reassigned instead of
# using inplace=True, which keeps the call chainable and the cell's effect explicit.
df = df.rename(columns={'X (American)': 'x_american',
                        'Y (National)': 'y_national',
                        'Year': 'year'})
# One label per series ("<AL team>, <NL team>"), used later to count match-ups.
df['combined'] = df['x_american'] + ', ' + df['y_national']
df

### Notes on Information Theory (Self-Information is defined for an individual event).
In information theory, the amount of information obtained from an event does not have to be an integer.

In [None]:
# How much information is obtained on learning that a card has the heart suit?
# A heart is one of four equally likely suits, so p = 1/4.
print('{:.4f} bits'.format(-math.log2(1/4)))
# Two bits of information gained.

In [None]:
# How much information is gained if you draw a random card from a standard 52-card deck and you get a king?
# There are 4 kings among 52 cards, so p = 4/52.
king_num = 4/52
print('{:.4f} bits'.format(-math.log2(king_num)))
# Observing a king on a single draw gives you 3.7004 bits of information.

In [None]:
2**(3.7004)
# 2**3.7004 is about 13: seeing a king is as informative as singling out one of 13 equally likely outcomes.
# Computers are based on binary choices (on or off), which is why information is measured in bits.

In [None]:
# A fair binary choice (p = 0.5) corresponds to exactly one bit of information.
-math.log2(0.5)

In [None]:
# Calculate the amount of information of the following:
#Roll a die and observe a 6
#Roll two dice and the sum is six
#Roll a die and a 6 appears for the first time in the 10th roll.

In [None]:
# Self-information of four events when drawing from a standard 52-card deck:
#   - the card is a spade
#   - the card is a king
#   - the card is the king of spades
#   - the card is a king, given that it is already known to be a spade

prob_spade = 13/52
prob_king = 4/52
prob_spade_king = 1/52
prob_king_given_spade = 1/13

# Printed in the order listed above. The king-of-spades value equals
# info(prob_spade) + info(prob_king), because suit and rank are independent.
for prob in (prob_spade, prob_king, prob_spade_king, prob_king_given_spade):
    print('{:.4f} bits'.format(-math.log2(prob)))

### Information Entropy (defined for a single probabilistic system).
- Event
- Sample Space
- Probability Space

Self Information is defined for each individual event that is observed and is calculated by the following:

\begin{equation}
I(p) = -log_2(p)
\end{equation}

Is it possible to measure the amount of information for a stochastic system even before making observations?

The **information entropy** (the expected amount of self-information) is given by the following:

\begin{equation}
H(x) = \sum p_i \cdot I(p_i) = - \sum_{i=1}^{n} p_i \cdot log_2(p_i)
\end{equation}

$H(x)$ is the expected self-information.
 
$H(x)$ means the following:
 
 - Average amount of self-information the observer can obtain from one observation.
 - Average "newsworthiness" the observer should expect for one event.
 
Information entropy is minimal if the system state is uniquely determined with no fluctuation. It increases as the randomness within the system increases. Entropy is maximal if the system is completely random (if every event is equally likely to occur).

In [None]:
def information(s):
    """Return the Shannon entropy, in bits, of a discrete probability distribution.

    Parameters
    ----------
    s : iterable of float
        Probabilities of the individual outcomes. The values are used as given;
        they are neither normalised nor checked to sum to 1.

    Returns
    -------
    float
        The sum of ``-p * log2(p)`` over the non-zero entries of ``s``
        (``0`` for an empty input).

    Raises
    ------
    ValueError
        If an entry is negative, because ``math.log2`` is undefined there.

    Notes
    -----
    Zero-probability outcomes are skipped: ``p * log2(p)`` tends to 0 as ``p``
    tends to 0, so they contribute nothing, whereas evaluating ``math.log2(0)``
    raises ``ValueError: math domain error``.
    """
    return sum(-x * math.log2(x) for x in s if x != 0)

In [None]:
# Entropy of three example distributions. The skewed three-outcome case is lower than
# the uniform one: information goes down when one outcome becomes more probable.
for dist in ([1/3, 1/3, 1/3], [1/2, 1/4, 1/4], [1/4, 1/4, 1/4, 1/4]):
    print('{:.4f} bits'.format(information(dist)))

In [None]:
# Sample sequence of 20 symbols over the alphabet {A, B, C}, one list element per symbol.
alpha_list = list('ABBACBACAB'
                  'CAAAAAAACA')

In [None]:
# Expected self-information (entropy) of the symbol frequencies observed in alpha_list.
alpha_dict = dict(Counter(alpha_list))
symbol_total = sum(alpha_dict.values())
exp_self_info = sum(
    -(count / symbol_total) * math.log2(count / symbol_total)
    for count in alpha_dict.values()
)
print('{:.4f} bits'.format(exp_self_info))

In [None]:
#Calculate entropy for an entire corpus.

#4.83 bits

In [None]:
2**4.83 # effective number of equally likely characters (about 28), similar in size to the Latin alphabet.

In [None]:
# print('{:.4f} bits'.format(information([1/3, 1/3, 1/3, 0, 0]))) # A naive -p*log2(p) raises ValueError: math domain error, because log2(0) is undefined.
# In the limit p -> 0 the term p*log2(p) goes to zero; therefore the zero-probability values can be removed.

### Differential Entropy

\begin{equation}
H_{diff}(X) = - \int f_X(x) \cdot log_2 f_X(x) \, dx
\end{equation}

where $f_X$ is the probability density function of the continuous random variable $X$.

The higher the differential entropy, the higher the uncertainty.

#### Butterfly Effect
- X: A butterfly in Brazil flaps its wings, or not.
- Y: A tornado appears in Texas, or not.

If probabilities are given for each event, you can calculate the joint probability (assume independence).

\begin{align}
    \begin{pmatrix}
        \mathbb{P}(x1, y1) & \mathbb{P}(x1, y2) \\
        \mathbb{P}(x2, y1) & \mathbb{P}(x2, y2)
    \end{pmatrix}
\end{align}

Assuming independence, each joint probability can be calculated from the probabilities of the individual events; the second identity below relates the joint probability to the conditional probabilities.

\begin{equation}
\mathbb{P}(x, y) = \mathbb{P}(y, x) = \mathbb{P}(x) \cdot \mathbb{P}(y)
\end{equation}

$\mathbb{P}(x | y) \cdot \mathbb{P}(y) = \mathbb{P}(y | x) \cdot \mathbb{P}(x)$

In [None]:
# Count how many rows (series) each team appears in, one dict per league column.
am_dict = dict(Counter(df['x_american'].tolist()))
nat_dict = dict(Counter(df['y_national'].tolist()))

In [None]:
# Appearance counts only (team names dropped); their sum is the number of series.
am_values = [*am_dict.values()]
nat_values = [*nat_dict.values()]

In [None]:
# Marginal probability of the Dodgers: their appearance count over all National League appearances.
nat_dict['Dodgers'] / sum(nat_values)

In [None]:
# Marginal probability of the Rays: their appearance count over all American League appearances.
am_dict['Rays'] / sum(am_values)

In [None]:
# P(Rays): fraction of series in which the Rays represented the American League.
p_rays = am_dict['Rays'] / sum(am_values)
p_rays

In [None]:
# P(Dodgers): fraction of series in which the Dodgers represented the National League.
p_dodgers = nat_dict['Dodgers'] / sum(nat_values)
p_dodgers

In [None]:
# Joint probability P(Rays, Dodgers): the share of series whose combined label is
# 'Rays, Dodgers', computed from the data instead of a hard-coded 1/20.
p_dodgers_and_p_rays = float((df['combined'] == 'Rays, Dodgers').mean())
p_dodgers_and_p_rays

In [None]:
# P(Rays | Dodgers) = count(Rays and Dodgers) / count(Dodgers).
# The numerator 1 is entered by hand: the single Rays-vs-Dodgers series in the table.
p_rays_given_dodgers = 1 / nat_dict['Dodgers']
p_rays_given_dodgers

In [None]:
# P(Dodgers | Rays) = count(Rays and Dodgers) / count(Rays).
# The numerator 1 is entered by hand: the single Rays-vs-Dodgers series in the table.
p_dodgers_given_rays = 1 / am_dict['Rays']
p_dodgers_given_rays

In [None]:
# Entropy (in bits) for the American League, computed with the notebook's
# information() helper instead of repeating the -p*log2(p) sum inline.
information([x / sum(am_values) for x in am_values])

In [None]:
2**3.1841837197791887
# The exponent is a pasted literal, presumably the American League entropy from the previous cell.
# 2**3.18 is about 9, so there are effectively about 9 equally likely teams in the American League.
# Not all teams have a chance to advance to WS.

In [None]:
# Entropy (in bits) for the National League, computed with the notebook's
# information() helper instead of repeating the -p*log2(p) sum inline.
information([x / sum(nat_values) for x in nat_values])

In [None]:
2**3.1841837197791882
# The exponent is a pasted literal, presumably the National League entropy from the previous cell.
# 2**3.18 is about 9, so there are effectively about 9 equally likely teams in the National League.

### Joint Entropy

\begin{equation}
H(X, Y) = - \sum_{x} \sum_{y} \mathbb{P}(X=x, Y=y) \cdot log_{2} \, \mathbb{P}(X=x, Y=y) \\
H(X, Y) = H(Y, X)\\
H(X, Y) = H(X) + H(Y) \; \textrm{if} \; X \perp \!\!\! \perp Y
\end{equation}

In [None]:
# Number of times each "<AL team>, <NL team>" match-up occurs in the table.
dict(Counter(df['combined']))

In [None]:
# Occurrence count of every distinct match-up; the total is the number of series in the table.
joint_list = [*Counter(df['combined'].to_list()).values()]
sum(joint_list)

In [None]:
# Joint entropy for 20 years of past World Series games, computed with the notebook's
# information() helper instead of repeating the -p*log2(p) sum inline.
print('{:.3f} bits'.format(information([each / sum(joint_list) for each in joint_list])))

In [None]:
# Amount of information if the past 20 WS games were all unique (joint entropy):
# 20 equally likely match-ups, each with probability 1/20.
information([1/20] * 20)

In [None]:
# Recompute the entropy of the observed match-ups step by step.
gameList = list(dict(Counter(df['combined'])).values())
gameTotal = sum(gameList)
gameListProba = [p/gameTotal for p in gameList]
# The probabilities are those of the combined (AL, NL) labels, so this is the joint
# entropy H(X, Y); the printed label previously called it "Conditional Entropy".
print('The Joint Entropy is: {:.3f} bits'.format(sum([-p*math.log2(p) for p in gameListProba])))

In [None]:
# Joint entropy if all 20 WS games were unique: 20 equally likely match-ups give log2(20) bits.
print('The Joint Entropy is: {:.3f} bits'.format(math.log2(20)))

# Since 4.222 < 4.322, the fact that there is a duplicate game means that the entropy went
# down, because there is a level of predictability in the system.

In [1]:
import pandas as pd
import math
import numpy as np
from collections import Counter

# Load the World Series table (one row per year) directly from the meetup repository.
df = pd.read_csv('https://raw.githubusercontent.com/grantaguinaldo/meetup/master/world-series.csv')

# Normalise the raw CSV headers to snake_case. The result is reassigned instead of
# using inplace=True, which keeps the call chainable and the cell's effect explicit.
df = df.rename(columns={'X (American)': 'x_american',
                        'Y (National)': 'y_national',
                        'Year': 'year'})
# One label per series ("<AL team>, <NL team>"), used later to count match-ups.
df['combined'] = df['x_american'] + ', ' + df['y_national']
df

Unnamed: 0,year,x_american,y_national,combined
0,2001,Yankees,Diamondbacks,"Yankees, Diamondbacks"
1,2002,Angels,Giants,"Angels, Giants"
2,2003,Yankees,Marlins,"Yankees, Marlins"
3,2004,Red Sox,Cardinals,"Red Sox, Cardinals"
4,2005,White Sox,Astros,"White Sox, Astros"
5,2006,Tigers,Cardinals,"Tigers, Cardinals"
6,2007,Red Sox,Rockies,"Red Sox, Rockies"
7,2008,Rays,Phillies,"Rays, Phillies"
8,2009,Yankees,Phillies,"Yankees, Phillies"
9,2010,Rangers,Giants,"Rangers, Giants"


### Conditional Entropy

\begin{equation}
H(Y | X) = \sum_{x} \mathbb{P}(X=x) \cdot H(Y | X = x) \\
= - \sum_{x} \sum_{y} \mathbb{P}(Y=y, X=x) \cdot log_{2} \, \mathbb{P}(Y=y | X=x)
\end{equation}

Note that for a conditional probability:
\begin{equation}
\mathbb{P}(Y=y | X=x) = \frac{\mathbb{P}(Y=y, X=x)}{\mathbb{P}(X=x)}
\end{equation}

Notes:

If $Y \perp \!\!\! \perp X$, then:
\begin{equation}
H(Y | X) = H(Y)
\end{equation}


If $Y$ completely depends on $X$, then:
\begin{equation}
H(Y | X) = 0
\end{equation}
This means that if $Y$ is completely determined by $X$, no uncertainty about $Y$ remains once $X$ is known, so observing $Y$ afterwards yields no additional information.

In [2]:
# Count how many rows (series) each team appears in, one dict per league column.
am_dict = dict(Counter(df['x_american'].tolist()))
nat_dict = dict(Counter(df['y_national'].tolist()))

In [3]:
# Marginal probability of each American League team. The counts are normalised by
# their own total instead of a hard-coded 20, so the values still sum to 1 if the
# table gains or loses rows.
al_total = sum(am_dict.values())
alTeamFreq = {team: count / al_total for team, count in am_dict.items()}
alTeamFreq

{'Yankees': 0.15,
 'Angels': 0.05,
 'Red Sox': 0.2,
 'White Sox': 0.05,
 'Tigers': 0.1,
 'Rays': 0.1,
 'Rangers': 0.1,
 'Royals': 0.1,
 'Indians': 0.05,
 'Astros': 0.1}

In [4]:
# Marginal probability of each National League team. The counts are normalised by
# their own total instead of a hard-coded 20, so the values still sum to 1 if the
# table gains or loses rows.
nl_total = sum(nat_dict.values())
nlTeamFreq = {team: count / nl_total for team, count in nat_dict.items()}
nlTeamFreq

{'Diamondbacks': 0.05,
 'Giants': 0.2,
 'Marlins': 0.05,
 'Cardinals': 0.2,
 'Astros': 0.05,
 'Rockies': 0.05,
 'Phillies': 0.1,
 'Mets': 0.05,
 'Cubs': 0.05,
 'Dodgers': 0.15,
 'Nationals': 0.05}

In [5]:
# Number of times each "<AL team>, <NL team>" match-up occurs in the table.
wsTeams = dict(Counter(df['combined']))
wsTeams # Next step: calculate the joint probability of each game using the marginals.

{'Yankees, Diamondbacks': 1,
 'Angels, Giants': 1,
 'Yankees, Marlins': 1,
 'Red Sox, Cardinals': 2,
 'White Sox, Astros': 1,
 'Tigers, Cardinals': 1,
 'Red Sox, Rockies': 1,
 'Rays, Phillies': 1,
 'Yankees, Phillies': 1,
 'Rangers, Giants': 1,
 'Rangers, Cardinals': 1,
 'Tigers, Giants': 1,
 'Royals, Giants': 1,
 'Royals, Mets': 1,
 'Indians, Cubs': 1,
 'Astros, Dodgers': 1,
 'Red Sox, Dodgers': 1,
 'Astros, Nationals': 1,
 'Rays, Dodgers': 1}

In [6]:
# Two-column lookup table (team, marginal_al) built from the American League frequencies.
df_al_marginals = pd.DataFrame(list(alTeamFreq.items()), columns=['team', 'marginal_al'])
df_al_marginals

Unnamed: 0,team,marginal_al
0,Yankees,0.15
1,Angels,0.05
2,Red Sox,0.2
3,White Sox,0.05
4,Tigers,0.1
5,Rays,0.1
6,Rangers,0.1
7,Royals,0.1
8,Indians,0.05
9,Astros,0.1


In [7]:
# Two-column lookup table (team, marginal_nl) built from the National League frequencies.
df_nl_marginals = pd.DataFrame(list(nlTeamFreq.items()), columns=['team', 'marginal_nl'])
df_nl_marginals

Unnamed: 0,team,marginal_nl
0,Diamondbacks,0.05
1,Giants,0.2
2,Marlins,0.05
3,Cardinals,0.2
4,Astros,0.05
5,Rockies,0.05
6,Phillies,0.1
7,Mets,0.05
8,Cubs,0.05
9,Dodgers,0.15


In [8]:
# Attach each series' National League marginal by matching y_national to the lookup's team column.
df = df.merge(df_nl_marginals, left_on='y_national', right_on='team', how='inner')

In [9]:
# Attach each series' American League marginal by matching x_american to the lookup's team column.
# Both frames carry a 'team' column at this point, so pandas suffixes them as team_x / team_y.
df = df.merge(df_al_marginals, left_on='x_american', right_on='team', how='inner')

In [10]:
# Joint probability of each match-up under the assumption that the two leagues are
# independent: P(x, y) = P(x) * P(y).
df['joint'] = df['marginal_nl'].mul(df['marginal_al'])
df

Unnamed: 0,year,x_american,y_national,combined,team_x,marginal_nl,team_y,marginal_al,joint
0,2001,Yankees,Diamondbacks,"Yankees, Diamondbacks",Diamondbacks,0.05,Yankees,0.15,0.0075
1,2003,Yankees,Marlins,"Yankees, Marlins",Marlins,0.05,Yankees,0.15,0.0075
2,2009,Yankees,Phillies,"Yankees, Phillies",Phillies,0.1,Yankees,0.15,0.015
3,2002,Angels,Giants,"Angels, Giants",Giants,0.2,Angels,0.05,0.01
4,2010,Rangers,Giants,"Rangers, Giants",Giants,0.2,Rangers,0.1,0.02
5,2011,Rangers,Cardinals,"Rangers, Cardinals",Cardinals,0.2,Rangers,0.1,0.02
6,2012,Tigers,Giants,"Tigers, Giants",Giants,0.2,Tigers,0.1,0.02
7,2006,Tigers,Cardinals,"Tigers, Cardinals",Cardinals,0.2,Tigers,0.1,0.02
8,2014,Royals,Giants,"Royals, Giants",Giants,0.2,Royals,0.1,0.02
9,2015,Royals,Mets,"Royals, Mets",Mets,0.05,Royals,0.1,0.005


In [11]:
# Put the rows back in chronological order with a fresh 0..n-1 index
# (the displayed merge result is no longer ordered by year).
df = (
    df
    .sort_values('year')
    .reset_index(drop=True)
)
df

Unnamed: 0,year,x_american,y_national,combined,team_x,marginal_nl,team_y,marginal_al,joint
0,2001,Yankees,Diamondbacks,"Yankees, Diamondbacks",Diamondbacks,0.05,Yankees,0.15,0.0075
1,2002,Angels,Giants,"Angels, Giants",Giants,0.2,Angels,0.05,0.01
2,2003,Yankees,Marlins,"Yankees, Marlins",Marlins,0.05,Yankees,0.15,0.0075
3,2004,Red Sox,Cardinals,"Red Sox, Cardinals",Cardinals,0.2,Red Sox,0.2,0.04
4,2005,White Sox,Astros,"White Sox, Astros",Astros,0.05,White Sox,0.05,0.0025
5,2006,Tigers,Cardinals,"Tigers, Cardinals",Cardinals,0.2,Tigers,0.1,0.02
6,2007,Red Sox,Rockies,"Red Sox, Rockies",Rockies,0.05,Red Sox,0.2,0.01
7,2008,Rays,Phillies,"Rays, Phillies",Phillies,0.1,Rays,0.1,0.01
8,2009,Yankees,Phillies,"Yankees, Phillies",Phillies,0.1,Yankees,0.15,0.015
9,2010,Rangers,Giants,"Rangers, Giants",Giants,0.2,Rangers,0.1,0.02


In [12]:
# Conditional probability P(x | y) = P(x, y) / P(y). Because 'joint' is the product of
# the two marginals, this column equals marginal_al up to floating-point rounding.
df['prob_cond_nl'] = df['joint'].div(df['marginal_nl'])
df

Unnamed: 0,year,x_american,y_national,combined,team_x,marginal_nl,team_y,marginal_al,joint,prob_cond_nl
0,2001,Yankees,Diamondbacks,"Yankees, Diamondbacks",Diamondbacks,0.05,Yankees,0.15,0.0075,0.15
1,2002,Angels,Giants,"Angels, Giants",Giants,0.2,Angels,0.05,0.01,0.05
2,2003,Yankees,Marlins,"Yankees, Marlins",Marlins,0.05,Yankees,0.15,0.0075,0.15
3,2004,Red Sox,Cardinals,"Red Sox, Cardinals",Cardinals,0.2,Red Sox,0.2,0.04,0.2
4,2005,White Sox,Astros,"White Sox, Astros",Astros,0.05,White Sox,0.05,0.0025,0.05
5,2006,Tigers,Cardinals,"Tigers, Cardinals",Cardinals,0.2,Tigers,0.1,0.02,0.1
6,2007,Red Sox,Rockies,"Red Sox, Rockies",Rockies,0.05,Red Sox,0.2,0.01,0.2
7,2008,Rays,Phillies,"Rays, Phillies",Phillies,0.1,Rays,0.1,0.01,0.1
8,2009,Yankees,Phillies,"Yankees, Phillies",Phillies,0.1,Yankees,0.15,0.015,0.15
9,2010,Rangers,Giants,"Rangers, Giants",Giants,0.2,Rangers,0.1,0.02,0.1


In [13]:
# Per-row term -P(x, y) * log2 P(x | y) of the conditional-entropy sum.
# NOTE(review): 'joint' is the product of the marginals (independence assumed) and the
# later total runs only over the observed series rows rather than every (x, y) pair —
# confirm that this total is the intended estimate of the conditional entropy.
df['info_elements'] = -df['joint'] * np.log2(df['prob_cond_nl'])

In [14]:
# Columns kept for the final table; 'combined' and the team_x / team_y merge-key copies are dropped.
mask = [
    'year', 'x_american', 'y_national',
    'marginal_nl', 'marginal_al',
    'joint', 'prob_cond_nl', 'info_elements',
]
df = df.loc[:, mask]

In [15]:
df

Unnamed: 0,year,x_american,y_national,marginal_nl,marginal_al,joint,prob_cond_nl,info_elements
0,2001,Yankees,Diamondbacks,0.05,0.15,0.0075,0.15,0.020527
1,2002,Angels,Giants,0.2,0.05,0.01,0.05,0.043219
2,2003,Yankees,Marlins,0.05,0.15,0.0075,0.15,0.020527
3,2004,Red Sox,Cardinals,0.2,0.2,0.04,0.2,0.092877
4,2005,White Sox,Astros,0.05,0.05,0.0025,0.05,0.010805
5,2006,Tigers,Cardinals,0.2,0.1,0.02,0.1,0.066439
6,2007,Red Sox,Rockies,0.05,0.2,0.01,0.2,0.023219
7,2008,Rays,Phillies,0.1,0.1,0.01,0.1,0.033219
8,2009,Yankees,Phillies,0.1,0.15,0.015,0.15,0.041054
9,2010,Rangers,Giants,0.2,0.1,0.02,0.1,0.066439


In [19]:
#df.to_csv('cond-entropy.csv', index=False)

In [16]:
# Add up the per-row info_elements terms and report the total in bits.
cond_entropy_total = df['info_elements'].sum()
print('The Conditional Entropy is: {:.3f} bits'.format(cond_entropy_total))

The Conditional Entropy is: 0.924 bits


In [17]:
# Rank the series by the size of their info_elements term, largest first (display only; df is not modified).
df.sort_values(by='info_elements', ascending=False)

Unnamed: 0,year,x_american,y_national,marginal_nl,marginal_al,joint,prob_cond_nl,info_elements
3,2004,Red Sox,Cardinals,0.2,0.2,0.04,0.2,0.092877
12,2013,Red Sox,Cardinals,0.2,0.2,0.04,0.2,0.092877
17,2018,Red Sox,Dodgers,0.15,0.2,0.03,0.2,0.069658
10,2011,Rangers,Cardinals,0.2,0.1,0.02,0.1,0.066439
9,2010,Rangers,Giants,0.2,0.1,0.02,0.1,0.066439
5,2006,Tigers,Cardinals,0.2,0.1,0.02,0.1,0.066439
13,2014,Royals,Giants,0.2,0.1,0.02,0.1,0.066439
11,2012,Tigers,Giants,0.2,0.1,0.02,0.1,0.066439
16,2017,Astros,Dodgers,0.15,0.1,0.015,0.1,0.049829
19,2020,Rays,Dodgers,0.15,0.1,0.015,0.1,0.049829
