# Implicit Bias v Demographics

In [1]:
# ! pip install pyreadstat

import io

import bs4
from IPython.core.interactiveshell import InteractiveShell
import numpy as np
import pandas as pd
import requests

# Display the value of every expression statement in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

### _0. Download bias dataset to `iat.sav` and pickle to `iat.pkl` (already done)_

_This step is already done for you; no need to do it again._

Grab the 2018 dataset `iat.sav` from Harvard Project Implicit
 
    ! curl -L https://osf.io/z4bd2/download > iat.sav

Grab documentation for dataset columns `iat.xlsx`

    ! curl -L https://osf.io/szwuf/download > iat.xlsx
    pd.read_excel('iat.xlsx')

See also the Harvard Project Implicit Dataset homepage https://osf.io/y9hiq/  
and this [Blog post](https://app-prod-03.implicit.harvard.edu/implicit/user/jaxt/blogposts/piblogpost005.html)

Then we convert SPSS `iat.sav` to pickle `iat.pkl`  
It takes ~1h to read in the sav file

    df = pd.read_spss('iat.sav')
    df.to_pickle('iat.pkl')

### 1. Unpickle and clean bias dataset to `Iat`

In [2]:
# Map the source column names in the pickled dataset to the short names used in this notebook.
iat_columns = {
    'raceomb_002': 'Race',
    'ethnicityomb': 'Ethnicity',
    'D_biep.White_Good_all': 'Iat',
    'STATE': 'State',
}

# NOTE(review): unpickling can execute arbitrary code -- only load an `iat.pkl` you produced yourself.
iat_raw = pd.read_pickle('iat.pkl')[list(iat_columns)].rename(columns=iat_columns)

# Keep only respondents recorded as White and Not Hispanic or Latino.
is_white_non_hispanic = (iat_raw.Race == 'White') & (iat_raw.Ethnicity == 'Not Hispanic or Latino')
iat_scores = iat_raw.loc[is_white_non_hispanic, ['State', 'Iat']]

# Mean IAT score per state, one row per state.
# The string 'mean' is used instead of np.mean: pandas >= 2.1 emits a FutureWarning
# when a NumPy reduction callable is passed as an aggregation function.
# NOTE(review): rows with an empty State label are aggregated as a group of their own
# here -- filter them out beforehand if that is not intended.
Iat = pd.pivot_table(iat_scores, index='State', aggfunc='mean').reset_index()

Iat.head()

Unnamed: 0,State,Iat
0,,0.349243
1,Alabama,0.372316
2,Alaska,0.330576
3,Arizona,0.349994
4,Arkansas,0.346702


### 2. Scrape and clean demographic data to `df.Percent_Black`

In [3]:
STATE_ABBREVIATIONS_URL = 'https://en.wikipedia.org/wiki/List_of_U.S._state_abbreviations'
BLACK_POPULATION_URL = 'https://en.wikipedia.org/wiki/List_of_U.S._states_and_territories_by_African-American_population'
REQUEST_TIMEOUT_S = 30  # give up instead of hanging the kernel on a stalled connection


def fetch_soup(url):
    """Download `url` and return its HTML parsed with BeautifulSoup.

    Raises `requests.HTTPError` on a 4xx/5xx response, so an error page is
    never parsed as if it were the article.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT_S)
    response.raise_for_status()
    return bs4.BeautifulSoup(response.text, 'html.parser')


def read_table(soup, position):
    """Return the `<table>` element at index `position` in `soup` as a DataFrame."""
    html = str(soup.find_all('table')[position])
    # pandas >= 2.1 deprecates passing literal HTML to read_html; hand it a file-like object.
    return pd.read_html(io.StringIO(html))[0]


soup1 = fetch_soup(STATE_ABBREVIATIONS_URL)
soup2 = fetch_soup(BLACK_POPULATION_URL)

# State name -> USPS abbreviation.
# NOTE(review): the positional slice (51 rows starting at row 11, columns 0 and 5)
# depends on the live page layout; presumably the 50 states plus DC -- TODO confirm.
State = read_table(soup1, 0).iloc[11:11+51, [0, 5]].reset_index(drop=True)
State.columns = ['State', 'USPS']

# Percentage column and state name from the second table on the demographics page.
# NOTE(review): table index 1 and columns 0 and 2 are also tied to the live page layout.
Demographics = read_table(soup2, 1).iloc[:, [0, 2]]
Demographics.columns = ['Percent_Black', 'State']

# Left merge keeps every row of `State`; states missing from `Demographics` get NaN.
df = pd.merge(State, Demographics, how='left', on='State')
# The scraped percentages are strings such as '26.38%': strip the sign and convert to float.
df['Percent_Black'] = df['Percent_Black'].str.strip('%').astype(float)

df.head(10).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
State,Alabama,Alaska,Arizona,Arkansas,California,Colorado,Connecticut,Delaware,District of Columbia,Florida
USPS,AL,AK,AZ,AR,CA,CO,CT,DE,DC,FL
Percent_Black,26.38,4.27,4.16,15.76,6.67,4.28,10.34,20.95,50.08,15.91


### 3. Merge bias data `Iat` into demographic data `df`

In [4]:
# Attach the per-state mean IAT score to the demographic table.
# Any `Iat` column left over from a previous run of this cell is dropped first, so
# re-running the cell does not produce `Iat_x` / `Iat_y` duplicates.
# Left merge: every row of `df` is kept; states without an IAT score get NaN.
df = pd.merge(df.drop(columns='Iat', errors='ignore'), Iat, how='left', on='State')
df.head()

Unnamed: 0,State,USPS,Percent_Black,Iat
0,Alabama,AL,26.38,0.372316
1,Alaska,AK,4.27,0.330576
2,Arizona,AZ,4.16,0.349994
3,Arkansas,AR,15.76,0.346702
4,California,CA,6.67,0.324187


### 4. Regression

# Next Steps
1. ~~Download sav file using `! curl` with redirect flag -L~~
1. Scrape permalink for wiki demographics
1. Redo using 2010 IAT data, to match demographics data's year