# Med School Acceptance vs. Ethnicity

This project gathers 2019-2020 U.S. medical school applicant, acceptance, and matriculation data so that we can examine how acceptance rates vary by race/ethnicity.

1. Import necessary libraries

In [1]:
pip install tabula-py

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
import tabula

2. Scrape web data tables into pandas DataFrames

2a. from webpage: Medical School Acceptance Rates by Race (2020): Does Ethnicity Play a Role?

In [3]:
# Blog post whose HTML tables hold the 2019-2020 figures by race/ethnicity.
msaccept_byrace_url = "https://www.shemmassianconsulting.com/blog/medical-school-acceptance-rates-by-race"

# pd.read_html returns a list with one DataFrame per table parsed from the page.
msaccept_byrace_table_list = pd.read_html(msaccept_byrace_url)

# Pick the two tables of interest by their position in that list:
#   [0] rates of acceptance and matriculation to U.S. med schools
#       by race/ethnicity, 2019-2020
#   [1] average MCAT score and GPAs for applicants to U.S. med schools
#       by race/ethnicity, 2019-2020
msaccept_byrace_df, applicantscores_byrace_df = (
    msaccept_byrace_table_list[0],
    msaccept_byrace_table_list[1],
)

2b. from webpage: 2019 FACTS: Applicants and Matriculants data

(will most likely be used to check for confounders)

# Table A-5 Applicants to U.S. Medical Schools by In or Out-of-State Matriculation Status, 2019-2020

In [4]:
# Table A-5: Applicants to U.S. Medical Schools by In or Out-of-State
# Matriculation Status, 2019-2020
a5_path_in = "./res/datafiles/2019_FACTS_Table_A-5.pdf"
a5_path_out = "./res/datafiles/2019_FACTS_Table_A-5.csv"

# Extract the tables on every page of the PDF into a CSV file on disk.
# NOTE(review): convert_into writes the file rather than returning the data, so
# a5_list is presumably None - confirm before relying on it.
a5_list = tabula.convert_into(
    a5_path_in,
    a5_path_out,
    pages="all",
)

# Read the CSV back with positional (0..n) column labels; names are assigned
# during cleaning.
a5_df = pd.read_csv(a5_path_out, header=None)
a5_df.head()

Got stderr: Mar 08, 2020 7:05:43 PM org.apache.pdfbox.rendering.PDFRenderer suggestKCMS
INFO: Your current java version is: 1.8.0_181
Mar 08, 2020 7:05:43 PM org.apache.pdfbox.rendering.PDFRenderer suggestKCMS
INFO: To get higher rendering speed on old java 1.8 or 9 versions,
Mar 08, 2020 7:05:43 PM org.apache.pdfbox.rendering.PDFRenderer suggestKCMS
INFO:   update to the latest 1.8 or 9 version (>= 1.8.0_191 or >= 9.0.4),
Mar 08, 2020 7:05:43 PM org.apache.pdfbox.rendering.PDFRenderer suggestKCMS
INFO:   or
Mar 08, 2020 7:05:43 PM org.apache.pdfbox.rendering.PDFRenderer suggestKCMS
INFO:   use the option -Dsun.java2d.cmm=sun.java2d.cmm.kcms.KcmsServiceProvider
Mar 08, 2020 7:05:43 PM org.apache.pdfbox.rendering.PDFRenderer suggestKCMS
INFO:   or call System.setProperty("sun.java2d.cmm", "sun.java2d.cmm.kcms.KcmsServiceProvider")



Unnamed: 0,0,1,2,3,4,5,6,7,8
0,Northeast,Connecticut,615,124,20.2,158,25.7,333,54.1
1,,Delaware,101,0,0.0,38,37.6,63,62.4
2,,District of Columbia,103,19,18.4,28,27.2,56,54.4
3,,Maine,99,0,0.0,41,41.4,58,58.6
4,,Maryland,1174,122,10.4,406,34.6,646,55.0


Clean table A-5

In [5]:
# Give the nine positional columns descriptive names
# ('#' columns are counts, '%' columns are percentages).
a5_df.columns = [
    'region',
    'state',
    'applicants',
    'Matriculated In-State #',
    'Matriculated In-State %',
    'Matriculated Out-of-State #',
    'Matriculated Out-of-State %',
    'Did Not Matriculate to any U.S. Med School #',
    'Did Not Matriculate to any U.S. Med School %',
]

# The region label is not needed; discard it and key each row by its state.
a5_df = a5_df.drop(columns='region').set_index('state')
a5_df.head()

Unnamed: 0_level_0,applicants,Matriculated In-State #,Matriculated In-State %,Matriculated Out-of-State #,Matriculated Out-of-State %,Did Not Matriculate to any U.S. Med School #,Did Not Matriculate to any U.S. Med School %
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Connecticut,615,124,20.2,158,25.7,333,54.1
Delaware,101,0,0.0,38,37.6,63,62.4
District of Columbia,103,19,18.4,28,27.2,56,54.4
Maine,99,0,0.0,41,41.4,58,58.6
Maryland,1174,122,10.4,406,34.6,646,55.0


# Table A-8: Applicants to U.S. Medical Schools by Selected Combinations of Race/Ethnicity and Sex, 2016-2017 through 2019-2020

In [6]:
# Table A-8: Applicants to U.S. Medical Schools by Selected Combinations of
# Race/Ethnicity and Sex, 2016-2017 through 2019-2020
a8_path_in = "./res/datafiles/2019_FACTS_Table_A-8.pdf"
a8_path_out = "./res/datafiles/2019_FACTS_Table_A-8.csv"

# Extract the tables on every page of the PDF into a CSV file on disk.
# NOTE(review): convert_into writes the file rather than returning the data, so
# a8_list is presumably None - confirm before relying on it.
a8_list = tabula.convert_into(
    a8_path_in,
    a8_path_out,
    pages="all",
)

# header=None keeps the PDF's own header lines as ordinary data rows; they are
# removed during cleaning.
a8_df = pd.read_csv(a8_path_out, header=None)
a8_df.head()

Got stderr: Mar 08, 2020 7:05:47 PM org.apache.pdfbox.rendering.PDFRenderer suggestKCMS
INFO: Your current java version is: 1.8.0_181
Mar 08, 2020 7:05:47 PM org.apache.pdfbox.rendering.PDFRenderer suggestKCMS
INFO: To get higher rendering speed on old java 1.8 or 9 versions,
Mar 08, 2020 7:05:47 PM org.apache.pdfbox.rendering.PDFRenderer suggestKCMS
INFO:   update to the latest 1.8 or 9 version (>= 1.8.0_191 or >= 9.0.4),
Mar 08, 2020 7:05:47 PM org.apache.pdfbox.rendering.PDFRenderer suggestKCMS
INFO:   or
Mar 08, 2020 7:05:47 PM org.apache.pdfbox.rendering.PDFRenderer suggestKCMS
INFO:   use the option -Dsun.java2d.cmm=sun.java2d.cmm.kcms.KcmsServiceProvider
Mar 08, 2020 7:05:47 PM org.apache.pdfbox.rendering.PDFRenderer suggestKCMS
INFO:   or call System.setProperty("sun.java2d.cmm", "sun.java2d.cmm.kcms.KcmsServiceProvider")



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,Applicants,,2016-2017*,,,2017-2018,,,2018-2019,,,2019-2020,
1,Selected Combinations of Race/Ethnicity,Men,Women,Total,Men,Women,Total,Men,Women,Total,Men,Women,Total
2,American Indian or Alaska Native Only,61,66,127,45,55,100,58,51,109,40,49,89
3,"American Indian or Alaska Native, Black or Afr...",14,29,43,6,25,31,13,36,49,13,31,44
4,"American Indian or Alaska Native, White",126,106,232,113,114,227,119,123,242,116,132,248


Clean table A-8

In [7]:
# Rows 0 and 1 hold the two-line header carried over from the PDF, not data.
a8_df = a8_df.drop(index=[0, 1])

# Name the columns: the race/ethnicity label, then Men/Women/Total for each of
# the four application cycles, in the order they appear in the table.
a8_df.columns = ['Race/Ethnicity'] + [
    cycle + ' ' + group
    for cycle in ('2016-2017', '2017-2018', '2018-2019', '2019-2020')
    for group in ('Men', 'Women', 'Total')
]

# Key the rows by race/ethnicity and keep only the 2019-2020 applicant pool.
a8_df = a8_df.set_index('Race/Ethnicity')[[
    '2019-2020 Men',
    '2019-2020 Women',
    '2019-2020 Total',
]]
a8_df.head()

Unnamed: 0_level_0,2019-2020 Men,2019-2020 Women,2019-2020 Total
Race/Ethnicity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
American Indian or Alaska Native Only,40,49,89
"American Indian or Alaska Native, Black or African American",13,31,44
"American Indian or Alaska Native, White",116,132,248
Asian Only,5233,5793,11026
"Asian, Black or African American",47,51,98


# Table A-10 Applicants to U.S. Medical Schools by State of Legal Residence, 2010-2011 through 2019-2020

In [8]:
# Table A-10: Applicants to U.S. Medical Schools by State of Legal Residence,
# 2010-2011 through 2019-2020
a10_path_in = "./res/datafiles/2019_FACTS_Table_A-10.pdf"
a10_path_out_1 = "./res/datafiles/2019_FACTS_Table_A-10-1.csv"
a10_path_out_2 = "./res/datafiles/2019_FACTS_Table_A-10-2.csv"

# Each PDF page is extracted into its own CSV so the two pages can be labelled
# separately during cleaning.
# NOTE(review): convert_into writes the file rather than returning the data, so
# a10_list_1 / a10_list_2 are presumably None - confirm before relying on them.
a10_list_1 = tabula.convert_into(
    a10_path_in,
    a10_path_out_1,
    pages="1",
)
a10_list_2 = tabula.convert_into(
    a10_path_in,
    a10_path_out_2,
    pages="2",
)

# Read both pages back with positional column labels.
a10_df_1 = pd.read_csv(a10_path_out_1, header=None)
a10_df_2 = pd.read_csv(a10_path_out_2, header=None)


Got stderr: Mar 08, 2020 7:05:49 PM org.apache.pdfbox.rendering.PDFRenderer suggestKCMS
INFO: Your current java version is: 1.8.0_181
Mar 08, 2020 7:05:49 PM org.apache.pdfbox.rendering.PDFRenderer suggestKCMS
INFO: To get higher rendering speed on old java 1.8 or 9 versions,
Mar 08, 2020 7:05:49 PM org.apache.pdfbox.rendering.PDFRenderer suggestKCMS
INFO:   update to the latest 1.8 or 9 version (>= 1.8.0_191 or >= 9.0.4),
Mar 08, 2020 7:05:49 PM org.apache.pdfbox.rendering.PDFRenderer suggestKCMS
INFO:   or
Mar 08, 2020 7:05:49 PM org.apache.pdfbox.rendering.PDFRenderer suggestKCMS
INFO:   use the option -Dsun.java2d.cmm=sun.java2d.cmm.kcms.KcmsServiceProvider
Mar 08, 2020 7:05:49 PM org.apache.pdfbox.rendering.PDFRenderer suggestKCMS
INFO:   or call System.setProperty("sun.java2d.cmm", "sun.java2d.cmm.kcms.KcmsServiceProvider")

Got stderr: Mar 08, 2020 7:05:52 PM org.apache.pdfbox.rendering.PDFRenderer suggestKCMS
INFO: Your current java version is: 1.8.0_181
Mar 08, 2020 7:05:52 PM

Clean table A-10

In [9]:
# Race/ethnicity columns shared by both pages of Table A-10. They are assigned
# positionally after the two leading label columns of each page.
a10_value_columns = [
    'Native American',
    'Asian',
    'Black/African American',
    'Hispanic/Latinx',
    'Native Hawaiian/Pacific Islander',
    'White',
    'Other',
    'Multiple Race/Ethnicity',
    'Unknown Race/Ethnicity',
    'Non-U.S. Citizen',
    'Total',
]

# Page 1: the two leading columns are labelled region, then state.
a10_df_1.columns = ['region', 'state'] + a10_value_columns

# Page 2: the two leading columns are labelled state, then region.
# NOTE(review): this is the reverse of page 1 - presumably the page-2 extract
# lays its label columns out differently; confirm against the page-2 CSV.
a10_df_2.columns = ['state', 'region'] + a10_value_columns

# The region label is not needed; discard it and key both pages by state.
a10_df_1 = a10_df_1.drop(columns=['region']).set_index('state')
a10_df_2 = a10_df_2.drop(columns=['region']).set_index('state')

# Stack page 2 underneath page 1. pd.concat is used because DataFrame.append
# was deprecated in pandas 1.4 and removed in pandas 2.0.
a10_df = pd.concat([a10_df_1, a10_df_2])
a10_df

Unnamed: 0_level_0,Native American,Asian,Black/African American,Hispanic/Latinx,Native Hawaiian/Pacific Islander,White,Other,Multiple Race/Ethnicity,Unknown Race/Ethnicity,Non-U.S. Citizen,Total
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Connecticut,0,118,52,19,0,320,14,43,37,12,615
Delaware,0,24,16,2,0,41,0,17,1,0,101
District of Columbia,0,11,22,7,0,36,1,9,8,9,103
Maine,2,3,2,1,0,76,0,2,12,1,99
Maryland,1,301,258,31,0,420,16,74,48,25,1174
Massachusetts,1,280,92,34,1,641,28,90,85,46,1298
New Hampshire,0,10,1,1,1,85,2,8,8,1,117
New Jersey,1,633,195,91,0,636,66,162,127,23,1934
New York,5,813,396,200,1,1427,96,301,238,56,3533
Pennsylvania,0,311,109,24,0,954,20,108,83,33,1642


# Table A-23: MCAT and GPA Grid for Applicants and Acceptees to U.S. Medical Schools, 2017-2018 through 2019-2020 (aggregated)

In [10]:
# Table A-23: MCAT and GPA Grid for Applicants and Acceptees to U.S. Medical
# Schools, 2017-2018 through 2019-2020 (aggregated)
a23_path_in = "./res/datafiles/2019_FACTS_Table_A-23.pdf"
a23_path_out = "./res/datafiles/2019_FACTS_Table_A-23.csv"

# Extract the tables on every page of the PDF into a CSV file on disk.
# NOTE(review): convert_into writes the file rather than returning the data, so
# a23_list is presumably None - confirm before relying on it.
a23_list = tabula.convert_into(
    a23_path_in,
    a23_path_out,
    pages="all",
)

# Read the CSV back with positional column labels; names are assigned during
# cleaning.
a23_df = pd.read_csv(a23_path_out, header=None)
a23_df.head()

Got stderr: Mar 08, 2020 7:05:54 PM org.apache.pdfbox.rendering.PDFRenderer suggestKCMS
INFO: Your current java version is: 1.8.0_181
Mar 08, 2020 7:05:54 PM org.apache.pdfbox.rendering.PDFRenderer suggestKCMS
INFO: To get higher rendering speed on old java 1.8 or 9 versions,
Mar 08, 2020 7:05:54 PM org.apache.pdfbox.rendering.PDFRenderer suggestKCMS
INFO:   update to the latest 1.8 or 9 version (>= 1.8.0_191 or >= 9.0.4),
Mar 08, 2020 7:05:54 PM org.apache.pdfbox.rendering.PDFRenderer suggestKCMS
INFO:   or
Mar 08, 2020 7:05:54 PM org.apache.pdfbox.rendering.PDFRenderer suggestKCMS
INFO:   use the option -Dsun.java2d.cmm=sun.java2d.cmm.kcms.KcmsServiceProvider
Mar 08, 2020 7:05:54 PM org.apache.pdfbox.rendering.PDFRenderer suggestKCMS
INFO:   or call System.setProperty("sun.java2d.cmm", "sun.java2d.cmm.kcms.KcmsServiceProvider")
Mar 08, 2020 7:05:55 PM org.apache.pdfbox.rendering.TTFGlyph2D getPathForGID



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,Greater than 3.79,Acceptees,4.0,9.0,58.0,298.0,1007.0,2603.0,4945.0,6573.0,6312.0,6893.0,28702.0
1,,Applicants,159.0,320.0,726.0,1584.0,3272.0,5435.0,7957.0,8778.0,7711.0,7848.0,43790.0
2,,Acceptance rate %,2.5,2.8,8.0,18.8,30.8,47.9,62.1,74.9,81.9,87.8,65.5
3,3.60-3.79,Acceptees,3.0,8.0,55.0,300.0,1017.0,2181.0,3770.0,4670.0,3526.0,2364.0,17894.0
4,,Applicants,382.0,613.0,1313.0,2492.0,4251.0,6291.0,7565.0,7228.0,4793.0,2924.0,37852.0


Clean table A-23: keep only the acceptance-rate row for each GPA band

In [11]:
# Name the columns: the GPA band, the kind of row, one column per score range
# (presumably MCAT total-score bands, per the table title - confirm), and the
# overall figure.
a23_df.columns = [
    'Total GPA',
    'Groups',
    '< 486',
    '486-489',
    '490-493',
    '494-497',
    '498-501',
    '502-505',
    '506-509',
    '510-513',
    '514-517',
    '> 517',
    'All Applicants'
    ]

# Each GPA band spans three rows (Acceptees, Applicants, Acceptance rate %),
# but the band label is only filled in on one of them.
is_rate_row = a23_df['Groups'] == "Acceptance rate %"
is_count_row = a23_df['Groups'].isin(["Acceptees", "Applicants"])

# Carry each band label down onto the rows beneath it and copy it onto the
# acceptance-rate rows. Forward-filling leaves labels that are already present
# untouched, so re-running this cell does not overwrite them.
gpa_labels = a23_df['Total GPA'].ffill()
a23_df.loc[is_rate_row, 'Total GPA'] = gpa_labels[is_rate_row]

# Drop the raw-count rows in a single pass; any other rows are left as they are.
a23_df = a23_df.drop(index=a23_df.index[is_count_row])

3. Check the head of each DataFrame to verify it loaded correctly

In [12]:
# Preview the scraped acceptance/matriculation-rate table (first table on the page).
msaccept_byrace_df.head()

Unnamed: 0,0,1,2,3,4,5,6
0,Rates of Acceptance and Matriculation to U.S. ...,Rates of Acceptance and Matriculation to U.S. ...,Rates of Acceptance and Matriculation to U.S. ...,Rates of Acceptance and Matriculation to U.S. ...,Rates of Acceptance and Matriculation to U.S. ...,Rates of Acceptance and Matriculation to U.S. ...,Rates of Acceptance and Matriculation to U.S. ...
1,Race/Ethnicity,Applicants,Acceptees,% Accepted,Matriculants,% Matriculated,
2,American Indian or Alaska Native,89,46,52%,44,49%,
3,Asian,11027,4848,44%,4687,43%,
4,Black or African American,4419,1685,38%,1627,37%,


In [13]:
# Preview the scraped average MCAT/GPA table (second table on the page).
applicantscores_byrace_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,Average MCAT Score and GPAs for Applicants to ...,Average MCAT Score and GPAs for Applicants to ...,Average MCAT Score and GPAs for Applicants to ...,Average MCAT Score and GPAs for Applicants to ...,Average MCAT Score and GPAs for Applicants to ...,Average MCAT Score and GPAs for Applicants to ...,Average MCAT Score and GPAs for Applicants to ...,Average MCAT Score and GPAs for Applicants to ...,Average MCAT Score and GPAs for Applicants to ...,Average MCAT Score and GPAs for Applicants to ...,Average MCAT Score and GPAs for Applicants to ...,Average MCAT Score and GPAs for Applicants to ...
1,,American Indian or Alaska Native,Asian,Black or African American,"Hispanic, Latino, or of Spanish Origin",Native Hawaiian or Other Pacific Islander Whit...,,,,,,
2,MCAT CPBS,124.7,127.2,124.2,124.9,125.6,126.6,126.2,126.0,126.9,127.0,126.4
3,MCAT CARS,124.4,126.1,123.8,124.3,124.5,126.4,125.2,125.7,126.6,125.4,125.9
4,MCAT BBLS,125.4,127.3,124.4,125.3,126.1,127.0,126.4,126.3,127.1,127.3,126.7


In [14]:
# Preview cleaned Table A-5 (matriculation status, indexed by state).
a5_df.head()

Unnamed: 0_level_0,applicants,Matriculated In-State #,Matriculated In-State %,Matriculated Out-of-State #,Matriculated Out-of-State %,Did Not Matriculate to any U.S. Med School #,Did Not Matriculate to any U.S. Med School %
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Connecticut,615,124,20.2,158,25.7,333,54.1
Delaware,101,0,0.0,38,37.6,63,62.4
District of Columbia,103,19,18.4,28,27.2,56,54.4
Maine,99,0,0.0,41,41.4,58,58.6
Maryland,1174,122,10.4,406,34.6,646,55.0


In [15]:
# Preview cleaned Table A-8 (2019-2020 applicants, indexed by race/ethnicity).
a8_df.head()

Unnamed: 0_level_0,2019-2020 Men,2019-2020 Women,2019-2020 Total
Race/Ethnicity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
American Indian or Alaska Native Only,40,49,89
"American Indian or Alaska Native, Black or African American",13,31,44
"American Indian or Alaska Native, White",116,132,248
Asian Only,5233,5793,11026
"Asian, Black or African American",47,51,98


In [16]:
# Preview cleaned Table A-10 (both PDF pages combined, indexed by state).
a10_df.head()

Unnamed: 0_level_0,Native American,Asian,Black/African American,Hispanic/Latinx,Native Hawaiian/Pacific Islander,White,Other,Multiple Race/Ethnicity,Unknown Race/Ethnicity,Non-U.S. Citizen,Total
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Connecticut,0,118,52,19,0,320,14,43,37,12,615
Delaware,0,24,16,2,0,41,0,17,1,0,101
District of Columbia,0,11,22,7,0,36,1,9,8,9,103
Maine,2,3,2,1,0,76,0,2,12,1,99
Maryland,1,301,258,31,0,420,16,74,48,25,1174


In [17]:
# Preview cleaned Table A-23 (acceptance-rate rows, labelled by GPA band).
a23_df.head()

Unnamed: 0,Total GPA,Groups,< 486,486-489,490-493,494-497,498-501,502-505,506-509,510-513,514-517,> 517,All Applicants
2,Greater than 3.79,Acceptance rate %,2.5,2.8,8.0,18.8,30.8,47.9,62.1,74.9,81.9,87.8,65.5
5,3.60-3.79,Acceptance rate %,0.8,1.3,4.2,12.0,23.9,34.7,49.8,64.6,73.6,80.8,47.3
8,3.40-3.59,Acceptance rate %,0.9,1.2,3.3,9.9,18.6,26.5,37.2,50.9,62.2,70.4,32.2
11,3.20-3.39,Acceptance rate %,0.1,0.7,2.4,7.1,15.0,21.5,29.6,39.9,50.8,56.9,21.6
14,3.00-3.19,Acceptance rate %,0.4,0.9,1.7,6.6,14.2,20.6,25.4,33.8,41.2,49.8,16.1
