In [1]:
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import pandas as pd

# Joint distributions and independence on real data

## Let's start using data frames with pandas

In [94]:
# Getting the data into Colab:
#   1. click the folder icon in the left sidebar and upload the two files
#      found on Moodle: XY_pasta and XY_smart;
#   2. read them with pd.read_csv.
# Caveat: when a CSV file is created, the separator used for columns and rows matters.
# The Italian and UK/US conventions differ; Colab follows the US one, i.e.
# comma-separated values (the default for CSV), so opening this file with Excel
# on an Italian computer will not look as you expect.
# header=0 takes the column names from the first line of the file,
# index_col=0 uses the first column as the row labels.
XY = pd.read_csv(
    'XY_pasta.csv',
    index_col=0,
    header=0,
)

In [115]:
# the table holds the joint probability mass function of (X, Y):
# X is the random variable indicating a person's favourite Roman pasta
# (among the people of Rome),
# Y indicates whether the person comes from North Rome or South Rome
# (X and Y are both discrete, so this is a mass function rather than a density)
XY

Unnamed: 0,North Rome,South Rome
carbonara,0.1125,0.1375
amatriciana,0.135,0.165
gricia,0.09,0.11
cacio e pepe,0.1125,0.1375


In [97]:
# columns (the "variables" in pandas terms) can be extracted by their label;
# the result is a pandas Series indexed by the row labels
XY['North Rome']

Unnamed: 0,North Rome
carbonara,0.1125
amatriciana,0.135
gricia,0.09
cacio e pepe,0.1125


In [113]:
# let's extract a single value from that column:
# .iloc indexes by position, so .iloc[0] is the first entry (the carbonara row)
nr = XY['North Rome']
nr.iloc[0]

0.1125

In [98]:
# in pandas, rows usually represent different observations, so their labels form the index
# (that is not really the case here, but it does not matter!)
# to extract a row by its label we can use .loc[row label]
# since carbonara is the first row, we can equivalently use the positional form .iloc[0]
XY.loc['carbonara']
# XY.iloc[0]

Unnamed: 0,carbonara
North Rome,0.1125
South Rome,0.1375


In [118]:
# we can perform arithmetic on a row: the operation is applied to every element,
# and the labels are kept
XY.loc['carbonara'] + 1

Unnamed: 0,carbonara
North Rome,1.1125
South Rome,1.1375


In [117]:
# if we prefer to work with NumPy arrays, .to_numpy() returns the values without the labels
XY.loc['carbonara'].to_numpy() + 1

array([1.1125, 1.1375])

In [102]:
# to extract a single element, give .loc both the row label and the column label
XY.loc['carbonara', 'North Rome']

0.1125

In [114]:
# select several rows at once by passing a list of row labels
XY.loc[['carbonara', 'amatriciana']]

Unnamed: 0,North Rome,South Rome
carbonara,0.1125,0.1375
amatriciana,0.135,0.165


In [103]:
# Are X and Y independent? Step 1: compute the marginal distributions.
# Rows are the values of X and columns the values of Y, so adding up each row
# gives f_X and adding up each column gives f_Y.
x_marginal = XY.sum(axis='columns')
y_marginal = XY.sum(axis='index')

In [104]:
# Step 2: the product f_X(x) * f_Y(y) for every pair (x, y),
# i.e. the outer product of the two marginals (a plain NumPy array, one row per x)
x_marginal_y_marginal = np.multiply.outer(np.asarray(x_marginal), np.asarray(y_marginal))

In [106]:
# Step 3: subtract f_X * f_Y from the joint given by XY, entry by entry.
# Rounding to 5 decimals removes numerical error, so a table of zeros
# means the two are equal.
(XY - x_marginal_y_marginal).round(5)

Unnamed: 0,North Rome,South Rome
carbonara,0.0,0.0
amatriciana,-0.0,-0.0
gricia,-0.0,-0.0
cacio e pepe,0.0,0.0


In [None]:
# every difference is zero, so the joint equals f_X * f_Y: X and Y are independent!

# Repeat the check: is owning a Smart independent of coming from North Rome or South Rome?

In [3]:
# second table: the joint distribution of owning a Smart (rows) and area of Rome (columns);
# as before, the first line holds the column names and the first column the row labels
XY_smart = pd.read_csv(
    'XY_smart.csv',
    index_col=0,
    header=0,
)

# marginals for the independence check: row sums give f_X, column sums give f_Y
# (note: x_marginal and y_marginal are the same names used for the pasta table,
# so they are overwritten here)
x_marginal = XY_smart.sum(axis='columns')
y_marginal = XY_smart.sum(axis='index')

In [4]:
# product of the marginals f_X * f_Y (outer product), then its entry-wise
# difference from the joint, rounded to 5 decimals to remove numerical error
x_marginal_y_marginal = np.multiply.outer(np.asarray(x_marginal), np.asarray(y_marginal))
print((XY_smart - x_marginal_y_marginal).round(5))

          North Rome  South Rome
Smart           0.12       -0.12
No Smart       -0.12        0.12


In [None]:
# the differences are +/-0.12 rather than zero, so the joint is not f_X * f_Y: X and Y are not independent!