In [1]:
import pandas as pd
import numpy as np

## Abortion/Birth/Pregnancy Rates over Time:

In [279]:
nat_state = pd.read_csv('Data/NationalAndStatePregnancy_PublicUse.csv') #Source: https://osf.io/duj6a
nat_state['year'] = nat_state['year'].astype(int)
nat_state_years = nat_state[nat_state['year'] >= 2005]
print(nat_state_years.shape)
nat_state_years.head(20)

(832, 104)


Unnamed: 0,state,year,pregnancyratelt15,pregnancyrate1517,pregnancyrate1819,pregnancyrate1519,pregnancyratelt20,pregnancyrate2024,pregnancyrate2529,pregnancyrate3034,...,abortionstotal,birthstotal,pregnancyratetotal,birthratetotal,abortionratetotal,abortionratiototal,miscarriagestotal,pregnanciestotal,notes,versiondate
4,AL,2005,8.7,38.6,118.7,70.8,72.5,165.8,154.8,100.5,...,10840,60453,88.8,63.6,11.4,179.3,13170,84470,,3 Oct 2024
5,AL,2006,9.3,40.3,128.4,75.3,77.1,171.6,159.1,100.8,...,10980,63232,92.0,66.1,11.5,173.7,13740,87960,,3 Oct 2024
6,AL,2007,8.6,41.3,125.3,74.9,76.6,175.1,160.7,104.5,...,11130,64767,93.9,67.6,11.6,171.8,14070,89960,,3 Oct 2024
7,AL,2008,9.7,38.4,122.5,73.1,74.9,171.4,160.6,104.5,...,11020,64508,93.2,67.1,11.5,170.8,14000,89530,,3 Oct 2024
8,AL,2009,9.0,37.4,112.7,69.2,70.8,160.6,155.3,103.8,...,10530,62430,89.9,64.9,10.9,168.6,13540,86500,,3 Oct 2024
9,AL,2010,7.4,33.9,102.7,62.8,64.2,148.8,152.4,101.8,...,10030,60005,86.4,62.5,10.4,167.2,13000,83040,,3 Oct 2024
10,AL,2011,6.1,30.6,99.6,58.9,60.0,142.7,150.2,102.7,...,9600,59301,85.2,61.8,10.0,162.0,12820,81730,,3 Oct 2024
11,AL,2012,5.3,26.7,97.4,55.4,56.5,134.6,147.9,104.1,...,8920,58400,83.2,60.8,9.3,152.7,12570,79890,,3 Oct 2024
12,AL,2013,4.3,22.1,86.5,48.2,49.1,134.6,147.5,103.6,...,8230,58111,81.9,60.4,8.6,141.6,12450,78780,,3 Oct 2024
13,AL,2014,3.6,20.8,82.3,45.3,46.0,134.4,150.9,110.2,...,8310,59378,83.7,61.8,8.7,140.0,12710,80400,,3 Oct 2024


In [280]:
age_groups = ['lt15', '1517', '1819', '1519', 'lt20', '2024', '2529', '3034', '3539', '40plus']
prefixes = ['pregnancyrate', 'abortionrate', 'birthrate', 'abortionratio', 'pregnancies', 'abortions', 'births', 'miscarriages', 'population']
variables_of_int = [
    f'{prefix}{age}' for prefix in prefixes
    for age in age_groups
] + [
    'populationsource', 'population1544', 'abortionstotal', 'birthstotal', 
    'pregnancyratetotal', 'birthratetotal', 'abortionratetotal', 
    'abortionratiototal', 'miscarriagestotal', 'pregnanciestotal'
]

melted_abortion_tot = nat_state_years.pivot_table(index="state", columns="year", values=variables_of_int, aggfunc="first")
melted_abortion_tot.reset_index(inplace=True)
melted_abortion_tot.columns = [f"{col1}_{col2}" if col1 and col2 else col1 for col1, col2 in melted_abortion_tot.columns]
melted_abortion_tot = melted_abortion_tot.rename(columns=lambda x: x.rstrip("_"))

In [281]:
age_groups = ['lt15', '1517', '1819', '1519', 'lt20', '2024', '2529', '3034', '3539', '40plus']
yrs = [f'_{year}' for year in range(2005, 2021)]
prefixes = ['abortionratio', 'pregnancyrate']
vars = [f'{pre}{age}{yr}' for pre in prefixes for yr in yrs for age in age_groups] 
ratio = vars + ["state"]


rate_abortion2 = melted_abortion_tot[ratio].melt(id_vars=["state"], var_name="year", value_name="rate_abortions")
rate_abortion = melted_abortion_tot.melt(
    id_vars=["state"], 
    value_vars=vars, 
    var_name="year", 
    value_name="rate"
)
rate_abortion["rate_type"] = rate_abortion["year"].str[:8]
rate_abortion['rate_type'] = rate_abortion['rate_type'].apply(
    lambda x: 'rate_abortions' if 'abortion' in x else 'rate_pregnancies'
)
rate_abortion["age"] = rate_abortion["year"].str[12:-5]
rate_abortion["year"] = rate_abortion["year"].str[-4:].astype(int)
rate_abortion.rename(columns={'state': 'State'}, inplace=True)
rate_abortion['age'] = rate_abortion['age'].apply(lambda x: x[1:] if x.startswith('e') else x)
rate_abortion['age'] = rate_abortion['age'].apply(lambda x: x[1:] if x.startswith('o') else x)
rate_abortion['age'] = rate_abortion['age'].apply(lambda x: x[3:] if x.startswith('olt') else x)

#rate_abortion.to_csv('Data/Cleaned_Datasets/Pregnancy_Abortion_Rate_Age_State.csv', index=False)


In [267]:
tot_rate_abortion = nat_state_years[["state", "abortionratetotal", "year"]]
tot_rate_abortion.columns = ["state", "AbortionTotal", "Year"]
#tot_rate_abortion.to_csv('Data/Cleaned_Datasets/TotalRateAbortions.csv', index=False)

## Cost of Abortion by State:

In [10]:
abortion_costs = pd.read_csv('Data/AbortionCostsByState.csv') #Source: https://www.ansirh.org/sites/default/files/2024-08/AFD%20Trends%20in%20Abortion%20Services%20in%20the%20United%20States%202017-2023_Final%20UPDATED.pdf
print(abortion_costs.shape)
abortion_costs.head()

(65, 22)


Unnamed: 0,Geographic Region and State,Median cost of medication abortion services 2017,Median cost of medication abortion services 2018,Median cost of medication abortion services 2019,Median cost of medication abortion services 2020,Median cost of medication abortion services 2021,Median cost of medication abortion services 2022,Median cost of medication abortion services 2023,Median cost of first trimester procedural abortion services 2017,Median cost of first trimester procedural abortion services 2018,...,Median cost of first trimester procedural abortion services 2021,Median cost of first trimester procedural abortion services 2022,Median cost of first trimester procedural abortion services 2023,Median cost of second trimester procedural abortion services 2017,Median cost of second trimester procedural abortion services 2018,Median cost of second trimester procedural abortion services 2019,Median cost of second trimester procedural abortion services 2020,Median cost of second trimester procedural abortion services 2021,Median cost of second trimester procedural abortion services 2022,Median cost of second trimester procedural abortion services 2023
0,United States (Total),495,500,560,560,568,560,563,475,495,...,625,625,650,935,960,*,898,775,800,1000
1,Northeast,495,495,535,500,550,550,550,450,456,...,555,575,650,802,852,*,770,650,713,896
2,New England,619,619,650,555,555,555,525,657,619,...,657,657,729,852,877,*,828,650,852,965
3,Connecticut,619,619,650,650,620,619,600,657,638,...,657,657,657,802,852,*,802,650,852,852
4,Maine,525,525,500,500,500,500,525,500,500,...,525,525,651,600,600,*,828,763,763,1000


In [13]:
state_map = {
    "AL": "Alabama", "AK": "Alaska", "AZ": "Arizona", "AR": "Arkansas", "CA": "California",
    "CO": "Colorado", "CT": "Connecticut", "DE": "Delaware", "FL": "Florida", "GA": "Georgia",
    "HI": "Hawaii", "ID": "Idaho", "IL": "Illinois", "IN": "Indiana", "IA": "Iowa",
    "KS": "Kansas", "KY": "Kentucky", "LA": "Louisiana", "ME": "Maine", "MD": "Maryland",
    "MA": "Massachusetts", "MI": "Michigan", "MN": "Minnesota", "MS": "Mississippi", "MO": "Missouri",
    "MT": "Montana", "NE": "Nebraska", "NV": "Nevada", "NH": "New Hampshire", "NJ": "New Jersey",
    "NM": "New Mexico", "NY": "New York", "NC": "North Carolina", "ND": "North Dakota", "OH": "Ohio",
    "OK": "Oklahoma", "OR": "Oregon", "PA": "Pennsylvania", "RI": "Rhode Island", "SC": "South Carolina",
    "SD": "South Dakota", "TN": "Tennessee", "TX": "Texas", "UT": "Utah", "VT": "Vermont",
    "VA": "Virginia", "WA": "Washington", "WV": "West Virginia", "WI": "Wisconsin", "WY": "Wyoming"
}
abortion_costs["State"] = abortion_costs["Geographic Region and State"].map({v: k for k, v in state_map.items()})


cost_over_time = abortion_costs[['Geographic Region and State',
 'Median cost of medication abortion services 2019',
 'Median cost of medication abortion services 2020',
 'Median cost of medication abortion services 2021',
 'Median cost of medication abortion services 2022',
 'Median cost of medication abortion services 2023']]


cost_over_time["State"] = cost_over_time["Geographic Region and State"].map({v: k for k, v in state_map.items()})
cost_over_time = cost_over_time.dropna(subset=["State"])
cost_over_time = cost_over_time.drop('Geographic Region and State', axis=1)
cost_over_time = cost_over_time.melt(id_vars=["State"], var_name="year", value_name="cost")
cost_over_time["year"] = cost_over_time["year"].str.extract(r"(\d+)").astype(int)
cost_over_time['cost'] = cost_over_time['cost'].replace('*', 0).replace('--', 1).astype(int)    
#cost_over_time.to_csv('Data/Cleaned_Datasets/AbortionCostOverTime.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cost_over_time["State"] = cost_over_time["Geographic Region and State"].map({v: k for k, v in state_map.items()})


## Number of Women per Clinic:

In [14]:
number_clinics_by_state = pd.read_csv('Data/NumberClinicsByState.csv') #Source: https://www.ansirh.org/sites/default/files/2024-08/AFD%20Trends%20in%20Abortion%20Services%20in%20the%20United%20States%202017-2023_Final%20UPDATED.pdf
print(number_clinics_by_state.shape)
number_clinics_by_state.head()

(65, 15)


Unnamed: 0,Geographic Region and State,Facilities (n) 2017,Facilities (n) 2018,Facilities (n) 2019,Facilities (n) 2020,Facilities (n) 2021,Facilities (n) 2022,Facilities (n) 2023,Women of Reproductive Age (15-49 years) per facility 2017,Women of Reproductive Age (15-49 years) per facility 2018,Women of Reproductive Age (15-49 years) per facility 2019,Women of Reproductive Age (15-49 years) per facility 2020,Women of Reproductive Age (15-49 years) per facility 2021,Women of Reproductive Age (15-49 years) per facility 2022,Women of Reproductive Age (15-49 years) per facility 2023
0,United States (Total),774,749,752,758,790,812,967,95866,99199,98936,98153,94177,91626,77923
1,Northeast,232,222,216,217,223,257,266,54759,56938,58176,57908,56350,48895,47651
2,New England,73,72,67,63,75,87,107,45951,46512,49827,52990,44512,38372,31526
3,Connecticut,19,18,18,12,18,22,27,41825,43957,43686,65529,43686,35743,29816
4,Maine,20,20,18,20,21,26,31,13773,13755,15276,13749,13094,10576,9066


In [15]:
served_centers_over_time = number_clinics_by_state[['Geographic Region and State','Women of Reproductive Age (15-49 years) per facility 2017',
 'Women of Reproductive Age (15-49 years) per facility 2018',
 'Women of Reproductive Age (15-49 years) per facility 2019',
 'Women of Reproductive Age (15-49 years) per facility 2020',
 'Women of Reproductive Age (15-49 years) per facility 2021',
 'Women of Reproductive Age (15-49 years) per facility 2022',
 'Women of Reproductive Age (15-49 years) per facility 2023']]


served_centers_over_time["State"] = served_centers_over_time["Geographic Region and State"].map({v: k for k, v in state_map.items()})
served_centers_over_time = served_centers_over_time.dropna(subset=["State"])
served_centers_over_time = served_centers_over_time.drop('Geographic Region and State', axis=1)

served_centers_over_time = served_centers_over_time.melt(id_vars=["State"], var_name="year", value_name="number_of_women_per_facility")
served_centers_over_time["year"] = served_centers_over_time["year"].str[-4:].astype(int)
served_centers_over_time["number_of_women_per_facility"] = served_centers_over_time["number_of_women_per_facility"].replace('--',115000000)
served_centers_over_time["number_of_women_per_facility"] = served_centers_over_time["number_of_women_per_facility"].replace({',': ''}, regex=True).astype(int)
served_centers_over_time["number_of_women_per_facility"] = served_centers_over_time["number_of_women_per_facility"].replace(115000000,np.nan)
served_centers_over_time['log_number_of_women_per_facility'] = np.log1p(served_centers_over_time['number_of_women_per_facility']) 
#served_centers_over_time.to_csv('Data/Cleaned_Datasets/WomenPerClinicOverTime.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  served_centers_over_time["State"] = served_centers_over_time["Geographic Region and State"].map({v: k for k, v in state_map.items()})


## Gestational Age of Aborted Pregnancies 2022

In [272]:
gestation = pd.read_csv('Data/NumberofAbortionsByGestationAgeState.csv') #https://www.cdc.gov/mmwr/volumes/73/ss/ss7307a1.htm?s_cid=ss7307a1_w
print(gestation.shape)
gestation.head()

(42, 9)


Unnamed: 0,Area,≤6,7–9,10–13,14–15,16–17,18–20,≥21,Total abortions reported by known gestational age
0,Alabama**,848 (22.5),"1,816 (48.1)",731 (19.4),171 (4.5),89 (2.4),97 (2.6),23 (0.6),"3,775 (99.9)"
1,Alaska,417 (33.4),518 (41.5),229 (18.4),39 (3.1),43 (3.4),—††,—††,"1,247 (100.0)"
2,Arizona,"3,444 (29.9)","5,319 (46.2)","1,885 (16.4)",382 (3.3),188 (1.6),172 (1.5),115 (1.0),"11,505 (99.8)"
3,Arkansas**,20 (1.2),530 (32.7),702 (43.3),155 (9.6),94 (5.8),106 (6.5),14 (0.9),"1,621 (100.0)"
4,Colorado,"4,915 (34.8)","5,829 (41.2)","1,907 (13.5)",341 (2.4),344 (2.4),313 (2.2),488 (3.5),"14,137 (99.9)"


In [273]:
gestation["State"] = gestation["Area"].map({v: k for k, v in state_map.items()})
gest = gestation.dropna(subset=["State"])
gest = gest.drop('Area', axis=1)
gest = gest.melt(id_vars=["State"], var_name="age", value_name="percent")
gest["percent"] = gest["percent"].str.extract(r'\((.*?)\)', expand=False)
gest['percent'] = gest['percent'].replace('NaN', 200).replace('††', 200).replace(np.nan, 200)
gest['percent'] = gest['percent'].astype(float)
gest['percent'] = gest['percent'].replace(200, np.nan)
gest = gest[gest['age'] != 'Total abortions reported by known gestational age']
gest = gest.dropna()
#gest.to_csv('Data/Cleaned_Datasets/GestationalAgeByState')

## Legality of Abortion by State:

In [2]:
legality = pd.read_csv('Data/NYTLegalityAbr.csv') #https://www.nytimes.com/interactive/2024/us/abortion-laws-roe-v-wade.html
legality = legality[legality['State'] != "Washington, D.C."]
state_map = {
    "AL": "Alabama", "AK": "Alaska", "AZ": "Arizona", "AR": "Arkansas", "CA": "California",
    "CO": "Colorado", "CT": "Connecticut", "DE": "Delaware", "FL": "Florida", "GA": "Georgia",
    "HI": "Hawaii", "ID": "Idaho", "IL": "Illinois", "IN": "Indiana", "IA": "Iowa",
    "KS": "Kansas", "KY": "Kentucky", "LA": "Louisiana", "ME": "Maine", "MD": "Maryland",
    "MA": "Massachusetts", "MI": "Michigan", "MN": "Minnesota", "MS": "Mississippi", "MO": "Missouri",
    "MT": "Montana", "NE": "Nebraska", "NV": "Nevada", "NH": "New Hampshire", "NJ": "New Jersey",
    "NM": "New Mexico", "NY": "New York", "NC": "North Carolina", "ND": "North Dakota", "OH": "Ohio",
    "OK": "Oklahoma", "OR": "Oregon", "PA": "Pennsylvania", "RI": "Rhode Island", "SC": "South Carolina",
    "SD": "South Dakota", "TN": "Tennessee", "TX": "Texas", "UT": "Utah", "VT": "Vermont",
    "VA": "Virginia", "WA": "Washington", "WV": "West Virginia", "WI": "Wisconsin", "WY": "Wyoming"
}
legality["state"] = legality["State"].map({v: k for k, v in state_map.items()})
legal = legality[["state", "Status of abortion","More details"]]
legal['state'] = legal['state'].astype('category')
legal['Status of abortion'] = legal['Status of abortion'].astype('category')
#legal.to_csv('Data/Cleaned_Datasets/legality')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  legal['state'] = legal['state'].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  legal['Status of abortion'] = legal['Status of abortion'].astype('category')


## Combined Data for Regression:

In [277]:
number_clinics_by_state["State"] = number_clinics_by_state["Geographic Region and State"].map({v: k for k, v in state_map.items()})
number_clinics_by_state = number_clinics_by_state.dropna(subset=["State"])
number_clinics_by_state = number_clinics_by_state.drop('Geographic Region and State', axis=1)
table1 = number_clinics_by_state[['Facilities (n) 2017',
 'Facilities (n) 2018',
 'Facilities (n) 2019',
 'Facilities (n) 2020', 'State']].melt(id_vars=["State"], var_name="year", value_name="number_of_centers")
table1["year"] = table1["year"].str[-4:].astype(int)


table2 = nat_state_years[["abortionstotal", "miscarriagestotal", "state", "year"]]
table2.columns = ["abortionstotal", "miscarriagestotal", "State", "year"]


table = pd.merge(table1, table2, on=['State', 'year'])
table = table[['State', 'miscarriagestotal','abortionstotal', 'number_of_centers', 'year']]
table = table[table['State']!='DC']
table['number_of_centers'] = table['number_of_centers'].astype(int)

state_political_affiliation = {
    'AL': 'Republican',
    'AK': 'Republican',
    'AZ': 'Republican',
    'AR': 'Republican',
    'CA': 'Democrat',
    'CO': 'Democrat',
    'CT': 'Democrat',
    'DE': 'Democrat',
    'FL': 'Swing',
    'GA': 'Republican',
    'HI': 'Democrat',
    'ID': 'Republican',
    'IL': 'Democrat',
    'IN': 'Republican',
    'IA': 'Republican',
    'KS': 'Republican',
    'KY': 'Republican',
    'LA': 'Republican',
    'ME': 'Democrat',
    'MD': 'Democrat',
    'MA': 'Democrat',
    'MI': 'Democrat',
    'MN': 'Democrat',
    'MS': 'Republican',
    'MO': 'Republican',
    'MT': 'Republican',
    'NE': 'Republican',
    'NV': 'Democrat',
    'NH': 'Democrat',
    'NJ': 'Democrat',
    'NM': 'Democrat',
    'NY': 'Democrat',
    'NC': 'Swing',
    'ND': 'Republican',
    'OH': 'Republican',
    'OK': 'Republican',
    'OR': 'Democrat',
    'PA': 'Swing',
    'RI': 'Democrat',
    'SC': 'Republican',
    'SD': 'Republican',
    'TN': 'Republican',
    'TX': 'Republican',
    'UT': 'Republican',
    'VT': 'Democrat',
    'VA': 'Democrat',
    'WA': 'Democrat',
    'WV': 'Republican',
    'WI': 'Swing',
    'WY': 'Republican'
}


table['political_affiliation'] = table['State'].map(state_political_affiliation)
#table.to_csv('Data/Miscarriages_Abortions_Centers.csv', index=False)
