# Notes - race and ethnicity


In [1]:
import duckdb
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from graphviz import Digraph
from sympy.physics.units import magnetic_density
from variables import Variables as vars

# Apply the shared matplotlib style sheet (the path is relative to the
# notebook's working directory).
plt.style.use("../../notebook.mplstyle")

# Make sure the folder that receives the CSV exports exists; `exist_ok=True`
# keeps re-runs from failing when it is already there.
os.makedirs("./outputs", exist_ok=True)

In [2]:
# Open the DuckDB database file; `con` is the connection used by the
# queries against the `us_births` table in this notebook.
con = duckdb.connect("./data/us_births.db")

In [None]:
# Derive the combined four-category maternal race code `mrace_c` in place on
# `us_births`. The statement only reads the source race columns, so it can be
# re-run safely. The trailing `;` suppresses the connection repr that
# `execute` would otherwise echo as cell output.
con.execute(
    """
    -- mrace_c: 1 White, 2 Black, 3 American Indian / Alaskan Native,
    --          4 Asian / Pacific Islander.
    -- The first rule that matches a row wins:
    --   1. `mrace15` present and year outside 2014-2019: 1:1, 2:2, 3:3, 4-14:4
    --      (15 = more than one race has no 4-category equivalent -> NULL)
    --   2. `mracerec` present and year outside 2014-2019: 1:1, 2:2, 3:3, 4:4
    --   3. `mbrace` present: 1:1, 2:2, 3:3, 4:4
    --      (so for 2014-2019 `mbrace` is preferred over `mrace15`)
    --   4. `mrace` present: 1:1, 2:2, 3:3, 4-78:4
    --   5. otherwise NULL (missing)
    UPDATE us_births
    SET mrace_c = CASE
        WHEN mrace15 IS NOT NULL AND (year < 2014 OR year > 2019) THEN
            CASE
                WHEN mrace15 IN(1, 2, 3) THEN mrace15
                WHEN mrace15 BETWEEN 4 AND 14 THEN 4
                ELSE NULL
            END
        WHEN mracerec IS NOT NULL AND (year < 2014 OR year > 2019) THEN
            CASE
                WHEN mracerec IN(1, 2, 3, 4) THEN mracerec
                ELSE NULL
            END
        WHEN mbrace IS NOT NULL THEN
            CASE
                WHEN mbrace IN(1, 2, 3, 4) THEN mbrace
                ELSE NULL
            END
        WHEN mrace IS NOT NULL THEN
            CASE
                WHEN mrace IN(1, 2, 3) THEN mrace
                WHEN mrace BETWEEN 4 AND 78 THEN 4
                ELSE NULL
            END
        ELSE NULL
    END
    """
);

<_duckdb.DuckDBPyConnection at 0x1d1d4c6dcf0>

In [None]:
# Per-year row totals alongside the number of non-NULL values in each race /
# Hispanic-origin column (from 1989 on), to show which variable is populated
# in which years.
race_availability_sql = """
    SELECT
        year,
        COUNT(*) as total,
        COUNT(mrace_c),
        COUNT(mrace),
        COUNT(mracerec),
        COUNT(mraceimp),
        COUNT(mbrace),
        COUNT(mrace31),
        COUNT(mrace6),
        COUNT(mrace15),
        COUNT(umhisp),
        COUNT(mhispx),
        COUNT(mhisp_r),
        COUNT(mracehisp),
        COUNT(orracem)
    FROM
        us_births
    WHERE
        year >= 1989
    GROUP BY year
    ORDER BY year
    """
race_df = con.execute(race_availability_sql).df()

# Keep a timestamped snapshot so successive runs do not overwrite each other.
race_counts_csv = f"./outputs/race_counts_by_year_{pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')}.csv"
race_df.to_csv(race_counts_csv)
race_df

Unnamed: 0,year,total,count(mrace_c),count(mrace),count(mracerec),count(mraceimp),count(mbrace),count(mrace31),count(mrace6),count(mrace15),count(umhisp),count(mhispx),count(mhisp_r),count(mracehisp),count(orracem)
0,1989,4045693,4045693,4045693,0,14781,0,0,0,0,0,0,0,0,4045693
1,1990,4162917,4162917,4162917,0,12625,0,0,0,0,0,0,0,0,4162917
2,1991,4115342,4115342,4115342,0,12884,0,0,0,0,0,0,0,0,4115342
3,1992,4069428,4069428,4069428,0,16171,0,0,0,0,0,0,0,0,4069428
4,1993,4004523,4004523,4004523,0,17761,0,0,0,0,0,0,0,0,4004523
5,1994,3956925,3956925,3956925,0,21723,0,0,0,0,0,0,0,0,3956925
6,1995,3903012,3903012,3903012,0,23469,0,0,0,0,0,0,0,0,3903012
7,1996,3894874,3894874,3894874,0,27455,0,0,0,0,0,0,0,0,3894874
8,1997,3884329,3884329,3884329,0,27927,0,0,0,0,0,0,0,0,3884329
9,1998,3945192,3945192,3945192,0,28627,0,0,0,0,0,0,0,0,3945192


Race variables include:

#### `MRACE`(1989-2013, though declining from 2003)

```
01 White
02 Black
03 American Indian / Alaskan Native
04 Chinese
05 Japanese
06 Hawaiian(includes part Hawaiian)
07 Filipino
18 Asian Indian
28 Korean
38 Samoan
48 Vietnamese
58 Guamanian
68 Other Asian / Pacific Islander in areas reporting codes 18-58
78 Combined other Asian / Pacific Islander includes 18-68 for areas that do not report them separately
```

#### `MRACEREC`(from 2003-2013)

```
1 White
2 Black
3 American Indian / Alaskan Native
4 Asian / Pacific Islander
```

#### `MBRACE`(2003-2019)

```
1 White
2 Black
3 American Indian or Alaskan Native
4 Asian or Pacific Islander
(Puerto Rico excludes 3 and 4)
```

#### `MRACE15`(from 2014)

```
01 White(only)
02 Black(only)
03 American Indian / Alaskan Native(only)
04 Asian Indian(only)
05 Chinese(only)
06 Filipino(only)
07 Japanese(only)
08 Korean(only)
09 Vietnamese(only)
10 Other Asian(only)
11 Hawaiian(only)
12 Guamanian(only)
13 Samoan(only)
14 Other Pacific Islander(only)
15 More than one race
```

#### `MRACE6`(from 2018) - can be derived from `MRACE15`

```
1 White(only)
2 Black(only)
3 American Indian / Alaskan Native(only)
4 Asian(only)
5 Native Hawaiian or Other Pacific Islander(only)
6 More than one race
```

We combine as follows to get back to 1989:

```
MRACE_C(combined)
1 White
2 Black
3 American Indian or Alaskan Native
4 Asian or Pacific Islander
```

From 2014 on, we have `MRACE15`, which is summarised in `MRACE6`; its "more than one race" category is broken out in `MRACE31`.

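We set `mrace_c` as follows (the first rule that applies wins):

- if `mrace15` is available and the year is outside 2014-2019, use `mrace15`, 1:1, 2:2, 3:3, 4-14:4 (15, more than one race, is left missing), otherwise,
- if `mracerec` is available and the year is outside 2014-2019, use `mracerec`, 1:1, 2:2, 3:3, 4:4, otherwise,
- if `mbrace` is available, use `mbrace`, 1:1, 2:2, 3:3, 4:4 (so `mbrace` is preferred over `mrace15` for 2014-2019), otherwise,
- if `mrace` is available, use `mrace`, 1:1, 2:2, 3:3, 4-78:4, otherwise,
- missing.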


In [21]:
# Load year, `down_ind` and the combined race code for every row of
# `us_births` into pandas, indexed by the `vars.YEAR` column, as input for the
# pandas pivot of `mrace_c` counts.
# The query is a plain string: it has no placeholders, so no f-string is needed.
# NOTE(review): `.dropna()` is called without `subset`, so rows with a NULL
# `down_ind` are discarded as well as rows with a NULL `mrace_c` -- confirm
# that restricting to rows with a known `down_ind` is intended.
mrace_c_df = (
    con.execute(
        """
        SELECT
            year,
            down_ind,
            mrace_c
        FROM us_births
        ORDER BY year, mrace_c, down_ind;
        """
    )
    .df()
    .dropna()
    .set_index(vars.YEAR)
)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [26]:
# Row counts per year for each combined race code (1-4), pivoted inside DuckDB
# so that each code becomes its own column; the result is indexed by
# `vars.YEAR`. The `down_ind` filter inside the query is currently disabled.
race_by_year_sql = """
    SELECT *
    FROM(
        SELECT year, mrace_c
        FROM us_births
        -- WHERE down_ind = 1
    )
    PIVOT(
        COUNT(*)
        FOR mrace_c IN(1, 2, 3, 4)
    )
    ORDER BY year;
    """
r_df = con.execute(race_by_year_sql).df().set_index(vars.YEAR)

# Timestamped export; the year index is written as the first CSV column.
live_births_csv = f"./outputs/race_counts_live_births_by_year_{pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')}.csv"
r_df.to_csv(live_births_csv)

In [24]:
# Count rows of `mrace_c_df` per year and combined race code: one row per
# year, one column per `mrace_c` value. `down_ind` is not used as a pivot key
# here. The year index is turned back into an ordinary column for export.
mrace_c_pivot_df = mrace_c_df.pivot_table(
    index=vars.YEAR,
    columns=[vars.MRACE_C],
    aggfunc="size",
).reset_index()

# Timestamped export without the positional index.
mrace_c_pivot_csv = f"./outputs/mrace_c_down_ind_pivot_{pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')}.csv"
mrace_c_pivot_df.to_csv(mrace_c_pivot_csv, index=False)
mrace_c_pivot_df

mrace_c,year,1,2,3,4
0,1989,2664898,525411,28752,115965
1,1990,2898946,586950,29268,126848
2,1991,2863999,590600,32792,130070
3,1992,2868892,589823,32992,133625
4,1993,2970180,600514,32969,139691
5,1994,2980662,586451,32726,144463
6,1995,2916069,532625,32433,143909
7,1996,3028534,586660,33483,163598
8,1997,3010186,591604,33754,163720
9,1998,3048106,601601,34801,166978


For Hispanic, we have:

MRACEHISP(from 2003)

```
1 Non-Hispanic White(only)
2 Non-Hispanic Black(only)
3 Non-Hispanic AIAN(only)
4 Non-Hispanic Asian(only)
5 Non-Hispanic NHOPI(only)
6 Non-Hispanic more than one race
7 Hispanic
8 Origin unknown or not stated
```

UMHISP(2003-2013) - slightly better counts than MRACEHISP

```
0 Non-Hispanic
1 Mexican
2 Puerto Rican
3 Cuban
4 Central American
5 Other and Unknown Hispanic
9 Origin unknown or not stated
```

ORRACEM (1989-2002)

```
1 Mexican
2 Puerto Rican
3 Cuban
4 Central or South American
5 Other and unknown Hispanic
6 Non-Hispanic White
7 Non-Hispanic Black
8 Non-Hispanic other races
9 Origin unknown or not stated
```

MHISP_R(from 2014)

```
0 Non-Hispanic
1 Mexican
2 Puerto Rican
3 Cuban
4 Central and South American
5 Other and Unknown Hispanic origin
9 Hispanic origin not stated
```

MHISPX(from 2018)

```
0 Non-Hispanic
1 Mexican
2 Puerto Rican
3 Cuban
4 Central or South American
5 Dominican
6 Other and Unknown Hispanic
9 Origin unknown or not stated
```

We merge to:

MHISP_C

```
0 Non-Hispanic
1 Mexican
2 Puerto Rican
3 Cuban
4 Other and Unknown Hispanic
5 Origin unknown or not stated
```

Rules:

- if `mhisp_r` is available, then 0:0, 1:1, 2:2, 3:3, 4-5:4, 9:5, otherwise
- if `mhispx` is available, then 0:0, 1:1, 2:2, 3:3, 4-6:4, 9:5, otherwise
- if `umhisp` is available, then 0:0, 1:1, 2:2, 3:3, 4-5:4, 9:5, otherwise
- if `orracem` is available, then 6-8:0, 1:1, 2:2, 3:3, 4-5:4, 9:5, otherwise
- missing
