In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import entropy

Reminder: If the particles inside a system have many possible positions to move around, then the system has high entropy, and if they have to stay rigid, then the system has low entropy.

Similarly, random variables that can take on many possible values tend to carry more information, whereas variables biased towards a single value provide little information.


# Monedas

In [None]:
# Fair coin: heads and tails are equally likely (probability 0.5 each).
# Its entropy is the sum of -p * log2(p) over both outcomes.
coin = (0.5, 0.5)
H = sum(-q * np.log2(q) for q in coin)
print(f"Entropy of [0.5, 0.5] -> H = {H:.3f} bit")

Entropy of [0.5, 0.5] -> H = 1.000 bit


In [None]:
# What is the entropy of a loaded coin whose probabilities of heads and
# tails are 0.9 and 0.1, respectively?
# Exercise placeholder: the right-hand side is intentionally left blank for the
# reader, so this cell does not run until an expression is written in.
H =   # FILL IN HERE
print(f"Entropy of [0.1, 0.9] -> H = {H:.3f} bits")

Entropy of [0.1, 0.9] -> H = 0.469 bits


In [None]:
# Sweep the probability of one outcome of a binary variable from 0 to 1 in
# steps of 0.1 and print the resulting entropy.
# p = 0 and p = 1 are special-cased because log2(0) is undefined; a certain
# outcome is assigned zero entropy.
for p in [k / 10 for k in range(11)]:
  q = 1 - p
  H = 0.0 if p in (0, 1.0) else -p * np.log2(p) - q * np.log2(q)

  print(f"Entropy of [{p:0.1f}, {1-p:0.1f}] -> H = {H:.3f} bit(s)")

Entropy of [0.0, 1.0] -> H = 0.000 bit(s)
Entropy of [0.1, 0.9] -> H = 0.469 bit(s)
Entropy of [0.2, 0.8] -> H = 0.722 bit(s)
Entropy of [0.3, 0.7] -> H = 0.881 bit(s)
Entropy of [0.4, 0.6] -> H = 0.971 bit(s)
Entropy of [0.5, 0.5] -> H = 1.000 bit(s)
Entropy of [0.6, 0.4] -> H = 0.971 bit(s)
Entropy of [0.7, 0.3] -> H = 0.881 bit(s)
Entropy of [0.8, 0.2] -> H = 0.722 bit(s)
Entropy of [0.9, 0.1] -> H = 0.469 bit(s)
Entropy of [1.0, 0.0] -> H = 0.000 bit(s)


In [None]:
# ¿Cuál de los dos casos anteriores provee mayor información, el de la moneda
# balanceada o desbalanceada? ¿Por qué?

# Dado

In [None]:
# Consider a fair die, i.e. one whose faces all have the same probability.
# Compute its entropy.
# Exercise placeholder: the right-hand side is intentionally left blank for the
# reader, so this cell does not run until an expression is written in.
H =     # FILL IN HERE
print(f"Entropy of fair dice is -> H = {H:.3f} bits")

Entropy of fair dice is -> H = 2.585 bits


In [None]:
# Now consider a loaded die that was rolled 1000 times and produced the
# following results (face: number of rolls)
# 1: 102; 2: 296; 3: 294; 4: 104; 5: 103; 6: 101.

# Vector n with the count of each outcome, in face order 1..6
face_counts = {1: 102, 2: 296, 3: 294, 4: 104, 5: 103, 6: 101}
n = np.array(list(face_counts.values()))
print(n, n.sum(), sep="\n")

[102 296 294 104 103 101]
1000


In [None]:
# In a vector p, store the probability of each outcome (pdf).
# Exercise placeholder: the right-hand side is intentionally left blank for the
# reader, so this cell does not run until an expression is written in.
p =      # FILL IN HERE
print("pdf: ", p)   # should print a vector of probabilities
print("Sum of pdf = ", p.sum()) # should print 1.0 (up to floating-point rounding)

pdf:  [0.102 0.296 0.294 0.104 0.103 0.101]
Sum of pdf =  0.9999999999999999


In [None]:
# Now compute its entropy.
# Exercise placeholder: the right-hand side is intentionally left blank for the
# reader, so this cell does not run until an expression is written in.
H =      # FILL IN HERE
print(f"{H:.3f}")

2.386


In [None]:
# ¿Cuál de los dos dados anteriores provee mayor información? ¿Por qué?

# Base de datos

In [None]:
# Load Titanic dataset
# The path is relative, so the CSV is looked up in the kernel's current
# working directory.
df = pd.read_csv('train-titanic.csv')
print(df.shape)
df.head()

(891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
# Discard the identifier-like and free-text columns; they are not analysed below
df = df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.2500,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.9250,S
3,1,1,female,35.0,1,0,53.1000,S
4,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S
887,1,1,female,19.0,0,0,30.0000,S
888,0,3,female,,1,2,23.4500,S
889,1,1,male,26.0,0,0,30.0000,C


In [None]:
# Encode and impute some variables:
#   - Sex: replaced by its integer category codes (female -> 0, male -> 1)
#   - Embarked: missing entries filled with the most frequent value
#   - Age: missing entries filled with the mean age
# The filled columns are assigned back to the frame. Calling
# fillna(inplace=True) on `df.Embarked` / `df.Age` is a chained in-place
# update: it raises a FutureWarning on pandas 2.x and, once Copy-on-Write is
# active, only modifies a temporary object and leaves `df` unchanged.
df["Sex"] = df["Sex"].astype('category').cat.codes
df["Embarked"] = df["Embarked"].fillna(df["Embarked"].mode()[0])
df["Age"] = df["Age"].fillna(df["Age"].mean())
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.000000,1,0,7.2500,S
1,1,1,0,38.000000,1,0,71.2833,C
2,1,3,0,26.000000,0,0,7.9250,S
3,1,1,0,35.000000,1,0,53.1000,S
4,0,3,1,35.000000,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
886,0,2,1,27.000000,0,0,13.0000,S
887,1,1,0,19.000000,0,0,30.0000,S
888,0,3,0,29.699118,1,2,23.4500,S
889,1,1,1,26.000000,0,0,30.0000,C


# Entropía de variables independientes

In [None]:
# Look at the unique values of Pclass and their counts
# (value_counts lists them from most to least frequent)
print(df.Pclass.value_counts())

3    491
1    216
2    184
Name: Pclass, dtype: int64


In [None]:
# Extract the counts and convert them into a distribution.
# n keeps the order of value_counts (most frequent class first); dividing by
# the total turns the counts into relative frequencies that sum to 1.
n = df.Pclass.value_counts().to_numpy()
p = n / n.sum()
print(p)

[0.55106622 0.24242424 0.20650954]


In [None]:
# Entropy of Pclass, written as the expected surprisal -log2(p) under p
surprisal = -np.log2(p)
Hclass = sum(p * surprisal)
print(f"Entropy of variable Pclass is -> H = {Hclass:.3f} bits")

Entropy of variable Pclass is -> H = 1.439 bits


In [None]:
# Calcula la entropía para todas las variables independientes discretas

# Encuentra la manera de cuantizar (discretizar) las variables continuas

In [None]:
# Hints:
# - No necesariamente existe una manera única de cuantizarlas.
# - Cada variable puede ser cuantizada dependiendo de su naturaleza;
#   por ejemplo, Edad puede ser: niños, adolescentes, adultos, ancianos.
# - Algunas funciones que podrían ayudar son np.histogram, pd.cut o pd.qcut

In [None]:
# Cuantizar una distribución continua en una discreta es un problema viejo,
# aún abierto. Reglas habituales para elegir el número N de intervalos (bins),
# donde n es el número de puntos:
#
# Freedman-Diaconis: N = ceil((max(x) - min(x)) / (2 * Q * n^(-1/3))); Q: rango intercuartílico (cuantil 75% - cuantil 25%)
# Scott:             N = ceil((max(x) - min(x)) / (3.5 * s * n^(-1/3))); s: desviación estándar
# Sturges:           N = ceil(1 + log_2(n))

In [None]:
# Discretize 'Age' into named life stages; consecutive bin edges delimit
# one stage each (five stages from six edges)
labels = ['Baby', 'Child', 'Teen', 'Adult', 'Older']
bins = [0, 3, 12, 18, 60, 100]
df['Age'] = pd.cut(x=df['Age'], bins=bins, labels=labels)

df['Age'].value_counts()

Adult    730
Teen      70
Child     39
Baby      30
Older     22
Name: Age, dtype: int64

In [None]:
# Discretize 'Fare' into 10 quantile-based intervals (deciles)
df['Fare'] = pd.qcut(df['Fare'], 10)
df['Fare'].value_counts()

(7.854, 8.05]        106
(-0.001, 7.55]        92
(27.0, 39.688]        91
(21.679, 27.0]        89
(39.688, 77.958]      89
(14.454, 21.679]      88
(7.55, 7.854]         87
(77.958, 512.329]     87
(10.5, 14.454]        84
(8.05, 10.5]          78
Name: Fare, dtype: int64

In [None]:
# Show the final version of the dataframe (Age and Fare are now discretized)
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,Adult,1,0,"(-0.001, 7.55]",S
1,1,1,0,Adult,1,0,"(39.688, 77.958]",C
2,1,3,0,Adult,0,0,"(7.854, 8.05]",S
3,1,1,0,Adult,1,0,"(39.688, 77.958]",S
4,0,3,1,Adult,0,0,"(7.854, 8.05]",S
...,...,...,...,...,...,...,...,...
886,0,2,1,Adult,0,0,"(10.5, 14.454]",S
887,1,1,0,Adult,0,0,"(27.0, 39.688]",S
888,0,3,0,Adult,1,2,"(21.679, 27.0]",S
889,1,1,1,Adult,0,0,"(27.0, 39.688]",C


In [None]:
# Define a function to compute entropy
def entropy_func(x, data=None):
  """Return the entropy, in bits, of one discrete column.

  Args:
    x: Name of the column whose empirical distribution is measured.
    data: DataFrame holding the column. Defaults to the notebook-level `df`.

  Returns:
    H = -sum(p * log2(p)), where p are the relative frequencies of the
    values observed in the column.
  """
  frame = df if data is None else data
  n = frame[x].value_counts().to_numpy()
  # Categorical columns also report categories that never occur (count 0).
  # Drop them: 0 * log2(0) evaluates to NaN and would turn the whole sum to NaN.
  n = n[n > 0]
  p = n / n.sum()
  return -np.sum(p * np.log2(p))

In [None]:
# Entropy of every column, collected into a one-column table indexed by variable
H = list(map(entropy_func, df.columns))
df_H = pd.DataFrame({'Entropy': H}, index=df.columns)
df_H

Unnamed: 0,Entropy
Survived,0.960708
Pclass,1.439321
Sex,0.936205
Age,1.018054
SibSp,1.338559
Parch,1.128337
Fare,3.317873
Embarked,1.09545


# Ordena las variables en función de su entropía, de mayor a menor.

In [None]:
# Show the variables sorted by entropy, from highest to lowest.
# sort_values sorts in ascending order by default, so descending order is
# requested explicitly.
df_H = df_H.sort_values(by='Entropy', ascending=False)
df_H

Unnamed: 0,Entropy
Sex,0.936205
Survived,0.960708
Age,1.018054
Embarked,1.09545
Parch,1.128337
SibSp,1.338559
Pclass,1.439321
Fare,3.317873


# Información mutua (variable clase vs variable objetivo)

In [None]:
# Distribution over the target variable (Survived).
# value_counts orders its result by decreasing frequency, so the entries of
# p_y follow that order rather than the order of the labels.
y_var = 'Survived'
n_y = df[y_var].value_counts().to_numpy()
p_y = n_y / n_y.sum()
print(p_y)

[0.61616162 0.38383838]


In [None]:
# Distribution over the independent variable (passenger class).
# value_counts orders its result by decreasing frequency, so the entries of
# p_x follow that order rather than the order of the labels.
x_var = 'Pclass'
n_x = df[x_var].value_counts().to_numpy()
p_x = n_x / n_x.sum()
print(p_x)

[0.55106622 0.24242424 0.20650954]


In [None]:
# Joint counts: N[i, j] is the number of rows whose x_var equals the i-th
# distinct x value and whose y_var equals the j-th distinct y value
# (distinct values are taken in the order returned by unique()).
x_values = df[x_var].unique()
y_values = df[y_var].unique()
N = np.zeros((len(x_values), len(y_values)))
for indx, x in enumerate(x_values):
  is_x = df[x_var] == x
  for indy, y in enumerate(y_values):
    N[indx, indy] = (is_x & (df[y_var] == y)).sum()

print(N)
print(N.sum())

[[372. 119.]
 [ 80. 136.]
 [ 97.  87.]]
891.0


In [None]:
# Joint probabilities: normalize the joint counts by their total so that all
# entries of P sum to 1
P = N / N.sum()
print(P)

[[0.41750842 0.1335578 ]
 [0.08978676 0.15263749]
 [0.10886644 0.0976431 ]]


In [None]:
# Verify that the marginals match the distributions computed earlier:
# summing P over its rows (axis=0) gives the distribution of y_var, and
# summing over its columns (axis=1) gives the distribution of x_var.
print(P.sum(axis=0))
print(P.sum(axis=1))

[0.61616162 0.38383838]
[0.55106622 0.24242424 0.20650954]


In [None]:
# Define a function to compute the mutual surprise
def mutual_surprise(pxy, px, py):
  """Return one term of the mutual-information sum.

  Args:
    pxy: Joint probability of the pair (x, y).
    px: Marginal probability of x.
    py: Marginal probability of y.

  Returns:
    pxy * log2(pxy / (px * py)), or 0 when any of the three probabilities
    is zero (so that the logarithm and the division are never evaluated
    at zero).
  """
  if pxy != 0 and px != 0 and py != 0:
    ratio = pxy / (px * py)
    return pxy * np.log2(ratio)
  return 0

In [None]:
# Compute mutual information: the sum over all (x, y) cells of
# p(x, y) * log2(p(x, y) / (p(x) * p(y))).
# The marginals are taken from P itself so that their entries line up with the
# rows and columns of P. The p_x / p_y vectors built from value_counts() are
# ordered by frequency, whereas P is indexed in unique() (first-appearance)
# order; the two orders happen to agree for Pclass and Survived, but not in
# general.
px_marginal = P.sum(axis=1)
py_marginal = P.sum(axis=0)

I = 0
for x_it in range(P.shape[0]):
  for y_it in range(P.shape[1]):
    I += mutual_surprise(P[x_it, y_it], px_marginal[x_it], py_marginal[y_it])

print(f"The mutual information between {y_var} and {x_var} is -> I = {I:.3f} bits")

The mutual information between Survived and Pclass is -> I = 0.084 bits


In [None]:
# Calcula la información mutua entre la variable objetivo y cada una de las
# variables independientes.

In [None]:
# Function to compute mutual information
def mutual_information_func(x_var, y_var='Survived', data=None):
  """Return the mutual information, in bits, between two discrete columns.

  Args:
    x_var: Name of the first column.
    y_var: Name of the second column (the target by default).
    data: DataFrame holding both columns. Defaults to the notebook-level `df`.

  Returns:
    The sum over every (x, y) pair of distinct values of
    mutual_surprise(p(x, y), p(x), p(y)), with all probabilities estimated
    from the joint counts of the two columns.
  """
  frame = df if data is None else data
  # Distinct values are computed once instead of on every loop iteration
  x_values = frame[x_var].unique()
  y_values = frame[y_var].unique()

  # Joint counts: rows follow x_values, columns follow y_values
  N = np.zeros((len(x_values), len(y_values)))
  for indx, x in enumerate(x_values):
    for indy, y in enumerate(y_values):
      N[indx, indy] = len(frame[(frame[x_var] == x) & (frame[y_var] == y)])

  # Joint probabilities and their marginals; taking the marginals from P
  # guarantees that they are ordered like its rows and columns
  P = N / N.sum()
  p_y = P.sum(axis=0)
  p_x = P.sum(axis=1)

  I = 0
  for x_it in range(P.shape[0]):
    for y_it in range(P.shape[1]):
      I += mutual_surprise(P[x_it, y_it], p_x[x_it], p_y[y_it])

  return I

In [None]:
# Compute the mutual information between each independent variable and the target variable.
# The target column itself is left out: its mutual information with itself is
# just its own entropy, which is not a feature score and would otherwise end
# up as the largest entry of the table.
features = df.columns.drop('Survived')
I = [mutual_information_func(x, 'Survived') for x in features]
df_I = pd.DataFrame(data=I, index=features, columns=['MI'])
df_I

Unnamed: 0,MI
Survived,0.960708
Pclass,0.083831
Sex,0.21766
Age,0.013371
SibSp,0.033466
Parch,0.023611
Fare,0.100648
Embarked,0.020534


# Ordena las variables independientes en función de su información mutua, en relación con la variable objetivo, de mayor a menor.

In [None]:
# Show the variables sorted by mutual information, from highest to lowest.
# sort_values sorts in ascending order by default, so descending order is
# requested explicitly.
df_I = df_I.sort_values(by='MI', ascending=False)
df_I

Unnamed: 0,MI
Age,0.013371
Embarked,0.020534
Parch,0.023611
SibSp,0.033466
Pclass,0.083831
Fare,0.100648
Sex,0.21766
Survived,0.960708
