# Simpson's Paradox
Use `admission_data.csv` for this exercise.

In [1]:
import pandas as pd

In [2]:
# Read the admissions records into a DataFrame, then preview the top five rows
df = pd.read_csv(filepath_or_buffer=r'admission_data.csv')
df.head(n=5)

Unnamed: 0,student_id,gender,major,admitted
0,35377,female,Chemistry,False
1,56105,male,Physics,True
2,31441,female,Chemistry,False
3,51765,male,Physics,True
4,53714,female,Physics,True


### Proportion and admission rate for each gender

In [3]:
# Share of all students whose gender is recorded as female
is_female = df['gender'].eq('female')
df_female = df.loc[is_female]  # female-only rows, reused by later cells
proportion_female = df_female['gender'].count() / df['gender'].count()
proportion_female

0.514

In [4]:
# Share of all students whose gender is recorded as male
is_male = df['gender'].eq('male')
df_male = df.loc[is_male]  # male-only rows, reused by later cells
proportion_male = df_male['gender'].count() / df['gender'].count()
proportion_male

0.486

#### Comparing the overall admission rates computed below (about 29% for females versus about 49% for males), it appears that **males** are being favored in the admission process.

In [13]:
# Female admission rate: admitted female applicants divided by all female applicants
female_was_admitted = df_female['admitted'].eq(True)
total_female_admitted_table = df_female.loc[female_was_admitted]  # reused by the per-major cells
total_female_admitted = total_female_admitted_table['admitted'].sum()
total_female_applied = df_female['admitted'].count()
rate_female = total_female_admitted / total_female_applied
rate_female

0.28793774319066145

In [6]:
# Male admission rate: admitted male applicants divided by all male applicants
male_was_admitted = df_male['admitted'].eq(True)
total_male_admitted_table = df_male.loc[male_was_admitted]  # reused by the per-major cells
total_male_admitted = total_male_admitted_table['admitted'].sum()
total_male_applied = df_male['admitted'].count()
rate_male = total_male_admitted / total_male_applied
rate_male

0.48559670781893005

### Proportion and admission rate for physics majors of each gender

In [7]:
# Fraction of female students whose major is physics
female_in_physics = df_female['major'].eq('Physics')
df_female_physics = df_female.loc[female_in_physics]
females_studying_physics = df_female_physics['major'].count()
female_proportion_physics = females_studying_physics / df_female['major'].count()
female_proportion_physics

0.12062256809338522

In [8]:
# Fraction of male students whose major is physics
male_in_physics = df_male['major'].eq('Physics')
df_male_physics = df_male.loc[male_in_physics]
males_studying_physics = df_male_physics['major'].count()
male_proportion_physics = males_studying_physics / df_male['major'].count()
male_proportion_physics

0.9259259259259259

In [18]:
# Physics admission rate among females: admitted physics applicants / all physics applicants
total_female_admitted_physics_table = total_female_admitted_table.loc[
    total_female_admitted_table['major'].eq('Physics')
]
total_female_applied_physics_table = df_female.loc[df_female['major'].eq('Physics')]
total_female_admitted_physics = total_female_admitted_physics_table['admitted'].count()
total_female_applied_physics = total_female_applied_physics_table['major'].count()
female_rate_physics = total_female_admitted_physics / total_female_applied_physics
female_rate_physics

0.7419354838709677

In [19]:
# Physics admission rate among males: admitted physics applicants / all physics applicants
total_male_admitted_physics_table = total_male_admitted_table.loc[
    total_male_admitted_table['major'].eq('Physics')
]
total_male_applied_physics_table = df_male.loc[df_male['major'].eq('Physics')]
total_male_admitted_physics = total_male_admitted_physics_table['admitted'].count()
total_male_applied_physics = total_male_applied_physics_table['major'].count()
male_rate_physics = total_male_admitted_physics / total_male_applied_physics
male_rate_physics

0.5155555555555555

#### This is where the paradox appears: although the overall admission rates suggested that males were favored over females, among physics majors the females are admitted at a higher rate (about 74%) than the males (about 52%).

### Proportion and admission rate for chemistry majors of each gender

In [21]:
# Fraction of female students whose major is chemistry
female_in_chem = df_female['major'].eq('Chemistry')
df_female_chem = df_female.loc[female_in_chem]
females_studying_chem = df_female_chem['major'].count()
female_proportion_chem = females_studying_chem / df_female['major'].count()
female_proportion_chem

0.8793774319066148

In [22]:
# Fraction of male students whose major is chemistry
male_in_chem = df_male['major'].eq('Chemistry')
df_male_chem = df_male.loc[male_in_chem]
males_studying_chem = df_male_chem['major'].count()
male_proportion_chem = males_studying_chem / df_male['major'].count()
male_proportion_chem

0.07407407407407407

In [20]:
# Chemistry admission rate among females: admitted chemistry applicants / all chemistry applicants
total_female_admitted_chem_table = total_female_admitted_table.loc[
    total_female_admitted_table['major'].eq('Chemistry')
]
total_female_applied_chem_table = df_female.loc[df_female['major'].eq('Chemistry')]
total_female_admitted_chem = total_female_admitted_chem_table['admitted'].count()
total_female_applied_chem = total_female_applied_chem_table['major'].count()
female_rate_chem = total_female_admitted_chem / total_female_applied_chem
female_rate_chem

0.22566371681415928

In [23]:
# Chemistry admission rate among males: admitted chemistry applicants / all chemistry applicants
total_male_admitted_chem_table = total_male_admitted_table.loc[
    total_male_admitted_table['major'].eq('Chemistry')
]
total_male_applied_chem_table = df_male.loc[df_male['major'].eq('Chemistry')]
total_male_admitted_chem = total_male_admitted_chem_table['admitted'].count()
total_male_applied_chem = total_male_applied_chem_table['major'].count()
male_rate_chem = total_male_admitted_chem / total_male_applied_chem
male_rate_chem

0.1111111111111111

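#### The same reversal happens in chemistry: females are admitted at a higher rate (about 23%) than males (about 11%), even though the overall rates favor males. The explanation is that most females (about 88%) apply to chemistry, while most males (about 93%) apply to physics, and the per-major rates below show that chemistry admits a much smaller share of its applicants.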

### Admission rate for each major

In [31]:
# Physics admission rate across all students, regardless of gender
is_physics = df['major'].eq('Physics')
is_admitted_physics = df['admitted'].eq(True) & is_physics
df_physics_applied_table = df.loc[is_physics]
df_physics_admitted_table = df.loc[is_admitted_physics]
rate_physics = (
    df_physics_admitted_table['admitted'].count()
    / df_physics_applied_table['admitted'].count()
)
rate_physics

0.54296875

In [32]:
# Chemistry admission rate across all students, regardless of gender
is_chem = df['major'].eq('Chemistry')
is_admitted_chem = df['admitted'].eq(True) & is_chem
df_chem_applied_table = df.loc[is_chem]
df_chem_admitted_table = df.loc[is_admitted_chem]
rate_chem = (
    df_chem_admitted_table['admitted'].count()
    / df_chem_applied_table['admitted'].count()
)
rate_chem

0.21721311475409835