## Conditional Probability using Python Itertools

In [1]:
# Python code to study the conditional probability of events, where each event is a subset of the sample space
# First introduce the sample space, which is the set of all possible outcomes
# Example: toss a coin several times

from itertools import product

In [2]:
# Cartesian product of two iterables (repeat defaults to 1):
# every element of the first list is paired with every element of the second
list(product([1,2], ['a', 'b', 'c']))

[(1, 'a'), (1, 'b'), (1, 'c'), (2, 'a'), (2, 'b'), (2, 'c')]

In [19]:
# repeat=2 repeats the whole argument sequence, i.e. this equals
# product([1,2], ['a','b'], [1,2], ['a','b']) and yields 2*2*2*2 = 16 four-tuples
list(product([1,2], ['a', 'b'], repeat=2))

[(1, 'a', 1, 'a'),
 (1, 'a', 1, 'b'),
 (1, 'a', 2, 'a'),
 (1, 'a', 2, 'b'),
 (1, 'b', 1, 'a'),
 (1, 'b', 1, 'b'),
 (1, 'b', 2, 'a'),
 (1, 'b', 2, 'b'),
 (2, 'a', 1, 'a'),
 (2, 'a', 1, 'b'),
 (2, 'a', 2, 'a'),
 (2, 'a', 2, 'b'),
 (2, 'b', 1, 'a'),
 (2, 'b', 1, 'b'),
 (2, 'b', 2, 'a'),
 (2, 'b', 2, 'b')]

In [9]:
# Example: toss a coin several times (here 3 times)

n = 3  # number of coin tosses

In [10]:
# Create the sample space
# Each outcome is a sequence of n tosses, every toss being 'H' or 'T'
# product(..., repeat=n) builds the Cartesian product of n copies of the same list

omega = set(product(['H','T'], repeat=n))  # sample space: all outcomes of n coin tosses
omega

{('H', 'H', 'H'),
 ('H', 'H', 'T'),
 ('H', 'T', 'H'),
 ('H', 'T', 'T'),
 ('T', 'H', 'H'),
 ('T', 'H', 'T'),
 ('T', 'T', 'H'),
 ('T', 'T', 'T')}

In [11]:
# The set omega contains every possible outcome of tossing a coin 3 times,
# so its size is 2**3 = 8
len(omega)

8

In [13]:
# Define several events as subsets of the sample space.
# A: the first toss comes up Tails.
# A set comprehension keeps exactly those outcomes whose first entry is 'T'.

A = {outcome for outcome in omega if outcome[0] == 'T'}  # first toss is Tails
A

{('T', 'H', 'H'), ('T', 'H', 'T'), ('T', 'T', 'H'), ('T', 'T', 'T')}

In [14]:
# B: exactly two of the three tosses come up Tails.

B = {outcome for outcome in omega if outcome.count('T') == 2}  # exactly 2 Tails
B

{('H', 'T', 'T'), ('T', 'H', 'T'), ('T', 'T', 'H')}

In [15]:
# C: all three tosses come up Tails.

C = {outcome for outcome in omega if outcome.count('T') == 3}  # exactly 3 Tails
C

{('T', 'T', 'T')}

In [16]:
# D: exactly one of the three tosses comes up Tails.

D = {outcome for outcome in omega if outcome.count('T') == 1}  # exactly 1 Tail
D

{('H', 'H', 'T'), ('H', 'T', 'H'), ('T', 'H', 'H')}

In [20]:
# Probability of occurrence of an event
# Assume that all outcomes are equally likely (the coin is not biased)
# P(X) = number of outcomes in X / number of outcomes in the sample space

def prob(X, space=None):
    """Return the probability of event ``X`` when all outcomes are equally likely.

    Parameters
    ----------
    X : set
        The event, given as a collection of outcomes. It is expected to be a
        subset of the sample space; this is not checked.
    space : set, optional
        The sample space. When omitted, the notebook-level ``omega`` is used,
        so existing calls such as ``prob(A)`` behave exactly as before.

    Returns
    -------
    float
        ``len(X) / len(space)``.

    Raises
    ------
    ZeroDivisionError
        If the sample space is empty.
    """
    if space is None:
        # Resolved at call time so that a re-defined omega is picked up.
        space = omega
    return len(X)/len(space)

In [21]:
# Conditional probability of event X given that event Y has happened
# P(X|Y) = P(X and Y) / P(Y) = number of outcomes in both X and Y / number of outcomes in Y

def cond_prob(X,Y):
    """Return P(X|Y), the probability of ``X`` given that ``Y`` has happened.

    Both events are sets of equally likely outcomes, so the ratio of
    probabilities reduces to a ratio of counts: ``len(X & Y) / len(Y)``.

    Raises
    ------
    ZeroDivisionError
        If ``Y`` is empty, i.e. P(Y) = 0, in which case the conditional
        probability is undefined.
    """
    if len(Y) == 0:
        # Same exception type the bare division produced, but with a message
        # that explains the actual problem.
        raise ZeroDivisionError(
            "cond_prob(X, Y) is undefined because event Y is empty (P(Y) = 0)"
        )
    return len(X & Y)/len(Y)

In [23]:
# P(A): probability that the first toss is Tails
prob(A)

0.5

In [24]:
# P(B): probability of exactly two Tails in the three tosses
prob(B)

0.375

In [25]:
# P(A|B): probability that the first toss is Tails, given exactly two Tails occurred
cond_prob(A,B)

0.6666666666666666

In [26]:
# We observe that the probability of A given that B has happened, P(A|B), is 0.666,
# whereas the unconditional probability of A, P(A), is 0.5
# P(A|B) > P(A), which means A and B are not independent events
# We can also check whether A and B are independent by finding the probability of their intersection, P(A & B)

prob(A & B)

0.25

In [27]:
# Compare the probability of "A and B", i.e. P(A & B), with the product of the individual probabilities, P(A)*P(B)

prob(A) * prob(B)

0.1875

In [28]:
# P(A & B) = 0.25 is not equal to P(A) * P(B) = 0.1875, which means A and B are not independent events

In [29]:
# Function to check whether two events are independent
def are_indep(X,Y, space=None):
    """Return True when events ``X`` and ``Y`` are independent.

    Independence means P(X & Y) == P(X) * P(Y). With equally likely outcomes
    and N = len(space) this is |X & Y| / N == (|X| / N) * (|Y| / N), which is
    checked here in the cross-multiplied form |X & Y| * N == |X| * |Y|.
    Working with integer counts keeps the comparison exact; comparing the
    floating-point probabilities with ``==`` can report a false negative
    (for example 0.1 * 0.2 is not exactly 0.02 in binary floating point).

    Parameters
    ----------
    X, Y : set
        The events, given as sets of outcomes of the sample space.
    space : set, optional
        The sample space. When omitted, the notebook-level ``omega`` is used,
        so existing calls such as ``are_indep(A, B)`` keep working.

    Returns
    -------
    bool

    Raises
    ------
    ZeroDivisionError
        If the sample space is empty (no probabilities are defined).
    """
    if space is None:
        # Resolved at call time so that a re-defined omega is picked up.
        space = omega
    total = len(space)
    if total == 0:
        # The probability-based formula divided by len(space); keep that failure mode.
        raise ZeroDivisionError("sample space is empty, probabilities are undefined")
    return len(X & Y) * total == len(X) * len(Y)

In [30]:
are_indep(A,B)  # False: P(A & B) = 0.25 differs from P(A)*P(B) = 0.1875, so A and B are not independent

False

In [32]:
# Events used to illustrate independence: each one looks at a single toss only.

P = {outcome for outcome in omega if outcome[0] == 'H'}  # first toss is Heads
Q = {outcome for outcome in omega if outcome[1] == 'H'}  # second toss is Heads

In [33]:
# Show both events, one per line
print(P, Q, sep='\n')

{('H', 'H', 'T'), ('H', 'T', 'T'), ('H', 'T', 'H'), ('H', 'H', 'H')}
{('H', 'H', 'T'), ('T', 'H', 'T'), ('T', 'H', 'H'), ('H', 'H', 'H')}


In [34]:
are_indep(A,P)  # False: A (first toss is Tails) and P (first toss is Heads) cannot both happen, so they are not independent

False

In [36]:
are_indep(A,Q)  # True: A looks only at the first toss and Q only at the second, so they are independent

True

## Conditional Probability using Pandas groupby

In [41]:
import pandas as pd
import numpy as np

In [61]:
# Example frame: two animals with two maximum-speed measurements each,
# built from (Animal, Max Speed) records
df = pd.DataFrame(
    [('Falcon', 380.), ('Falcon', 370.), ('Parrot', 24.), ('Parrot', 26.)],
    columns=['Animal', 'Max Speed'],
)
df

Unnamed: 0,Animal,Max Speed
0,Falcon,380.0
1,Falcon,370.0
2,Parrot,24.0
3,Parrot,26.0


In [62]:
# For each Animal, count how often every distinct 'Max Speed' value occurs
df.groupby(by='Animal')['Max Speed'].value_counts()

Animal  Max Speed
Falcon  370.0        1
        380.0        1
Parrot  24.0         1
        26.0         1
Name: Max Speed, dtype: int64

In [63]:
# Mean of the remaining column ('Max Speed') within each Animal group
df.groupby(['Animal']).mean()

Unnamed: 0_level_0,Max Speed
Animal,Unnamed: 1_level_1
Falcon,375.0
Parrot,25.0


In [64]:
# Toy frame: 'A' and 'B' hold categorical labels, 'C' and 'D' hold random noise.
# The noise comes from a seeded generator so that re-running the notebook
# reproduces exactly the same table (an unseeded np.random.randn would not).
rng = np.random.RandomState(42)
df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
                   'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
                   'C': rng.randn(8),
                   'D': rng.randn(8)})
df

Unnamed: 0,A,B,C,D
0,foo,one,-0.096207,-0.093878
1,bar,one,-0.298578,0.168029
2,foo,two,0.147191,0.51117
3,bar,three,-0.358389,0.133577
4,foo,two,0.676798,-0.859751
5,bar,two,-2.171591,2.242258
6,foo,one,0.247083,0.285046
7,foo,three,-0.927893,0.132269


In [65]:
# Joint counts n(A & B): number of rows for every (A, B) combination
df.groupby('A')['B'].value_counts()

A    B    
bar  one      1
     three    1
     two      1
foo  one      2
     two      2
     three    1
Name: B, dtype: int64

In [66]:
# Group sizes n(A): number of (non-missing) B entries for each value of A
df.groupby('A')['B'].count()

A
bar    3
foo    5
Name: B, dtype: int64

In [67]:
# Conditional probability of 'B' given 'A': P(B|A) = n(A & B) / n(A)
# The joint counts are divided by the size of the matching 'A' group
df.groupby('A')['B'].value_counts() / df.groupby('A')['B'].count()

A    B    
bar  one      0.333333
     three    0.333333
     two      0.333333
foo  one      0.400000
     two      0.400000
     three    0.200000
Name: B, dtype: float64

In [68]:
# Conditional probability P(B|A) using apply and a lambda function:
# within each 'A' group, divide the counts of every B value by the group's size

df.groupby('A')['B'].apply(lambda x : x.value_counts()/x.count())

A         
bar  one      0.333333
     three    0.333333
     two      0.333333
foo  one      0.400000
     two      0.400000
     three    0.200000
Name: B, dtype: float64

In [69]:
# Conditional probability P(B|A) using apply and a lambda function, with len(x) instead of count()
# len(x) also counts missing values whereas count() skips them; this data has none, so both agree
df.groupby('A')['B'].apply(lambda x : x.value_counts()/len(x))

A         
bar  one      0.333333
     three    0.333333
     two      0.333333
foo  one      0.400000
     two      0.400000
     three    0.200000
Name: B, dtype: float64

## Conditional Probability using Pandas crosstab

In [70]:
# Show the frame again; the crosstab examples use its 'A' and 'B' columns
df

Unnamed: 0,A,B,C,D
0,foo,one,-0.096207,-0.093878
1,bar,one,-0.298578,0.168029
2,foo,two,0.147191,0.51117
3,bar,three,-0.358389,0.133577
4,foo,two,0.676798,-0.859751
5,bar,two,-2.171591,2.242258
6,foo,one,0.247083,0.285046
7,foo,three,-0.927893,0.132269


In [82]:
# Contingency table of counts n(A & B): rows are the values of A, columns the values of B
pd.crosstab(index=df.A, columns=df.B)

B,one,three,two
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,1,1,1
foo,2,1,2


In [83]:
# normalize='index' divides each row by its total: P(B|A) = n(A & B)/n(A), so every row sums to 1
pd.crosstab(index=df.A, columns=df.B, normalize='index')

B,one,three,two
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,0.333333,0.333333,0.333333
foo,0.4,0.2,0.4


In [84]:
# normalize='columns' divides each column by its total: P(A|B) = n(A & B)/n(B), so every column sums to 1
pd.crosstab(index=df.A, columns=df.B, normalize='columns')

B,one,three,two
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,0.333333,0.5,0.333333
foo,0.666667,0.5,0.666667


In [88]:
# Second example frame: two columns of single-character string labels,
# spelled out character by character
df2 = pd.DataFrame({'a': list('xxxyyyyz'),
                    'b': list('12345123')})
df2

Unnamed: 0,a,b
0,x,1
1,x,2
2,x,3
3,y,4
4,y,5
5,y,1
6,y,2
7,z,3


In [89]:
# pd.crosstab(df2.a, df2.b) returns the frequency table of df2.a and df2.b,
# with the rows being the values of df2.a and the columns being the values of df2.b

pd.crosstab(index=df2.a, columns=df2.b)

b,1,2,3,4,5
a,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
x,1,1,1,0,0
y,1,1,0,1,1
z,0,0,1,0,0


In [90]:
# The normalize keyword turns the frequency table into conditional probabilities.
# normalize='columns' divides every column by its total, so the column for a given
# value B of b holds the conditional probabilities P(a | b=B).
# Notice that the columns sum to 1.

pd.crosstab(df2.a, df2.b, normalize='columns')

b,1,2,3,4,5
a,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
x,0.5,0.5,0.5,0.0,0.0
y,0.5,0.5,0.0,1.0,1.0
z,0.0,0.0,0.5,0.0,0.0


In [92]:
# To get P(b | a) instead, normalize over the rows with normalize='index'.
# The row for a given value A of a then holds the conditional probabilities P(b | a=A).
# Notice that the rows sum to 1.

pd.crosstab(df2.a, df2.b, normalize='index')

b,1,2,3,4,5
a,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
x,0.333333,0.333333,0.333333,0.0,0.0
y,0.25,0.25,0.0,0.25,0.25
z,0.0,0.0,1.0,0.0,0.0
