# A notebook for experiments

### Setup


In [2]:
import numpy as np
import pandas as pd
import pyarrow as pa
import pyreadstat

## Read data

#### Read 2022 data


In [3]:
colspecs = [
    (8, 12),     #   0   DOB_YY         Birth Year
    (12, 14),    #   1   DOB_MM         Birth Month
    (31, 32),    #   2   BFACIL         Birth Place
    (32, 33),    #   3   F_BFACIL       Reporting Flag for Birth Place
    (72, 73),    #   4   MAGE_IMPFLG    Mother's Age Imputed
    (74, 75),    #   5   MAGE_REPFLG    Reported Age of Mother Used Flag
    (74, 76),    #   6   MAGER          Mother's Single Years of Age
    (76, 78),    #   7   MAGER14        Mother's Age Recode 14
    (78, 79),    #   8   MAGER9         Mother's Age Recode 9
    (83, 84),    #   9   MBSTATE_REC    Mother's Nativity
    (103, 104),  #  10   RESTATUS       Residence Status
    (104, 106),  #  11   MRACE31        Mother's Race Recode 31
    (106, 107),  #  12   MRACE6         Mother's Race Recode 6
    (107, 109),  #  13   MRACE15        Mother's Race Recode 15
    (110, 111),  #  14   MRACEIMP       Mother's Race Imputed Flag
    (111, 112),  #  15   MHISPX         Mother's Hispanic Origin
    (114, 115),  #  16   MHISP_R        Mother's Hispanic Origin Recode
    (115, 116),  #  17   F_MHISP        Reporting Flag for Mother's Origin
    (116, 117),  #  18   MRACEHISP      Mother's Race/Hispanic Origin
    (118, 119),  #  19   MAR_P          Paternity Acknowledged
    (119, 120),  #  20   DMAR           Marital Status
    (120, 121),  #  21   MAR_IMP        Mother's Marital Status Imputed
    (122, 123),  #  22   F_MAR_P        Reporting Flag for Paternity Acknowledged
    (123, 124),  #  23   MEDUC          Mother's Education
    (125, 126),  #  24   F_MEDUC        Reporting Flag for Education of Mother
    (141, 142),  #  25   FAGERPT_FLG    Father's Reported Age Used
    (146, 148),  #  26   FAGECOMB       Father's Combined Age
    (148, 150),  #  27   FAGEREC11      Father's Age Recode 11
    (150, 152),  #  28   FRACE31        Father's Race Recode 31
    (152, 153),  #  29   FRACE6         Father’s Race Recode 6
    (153, 155),  #  30   FRACE15        Father’s Race Recode 15
    (158, 159),  #  31   FHISPX         Father's Hispanic Origin
    (159, 160),  #  32   FHISP_R        Father's Hispanic Origin Recode
    (160, 161),  #  33   F_FHISP        Reporting Flag for Father's Hispanic Origin
    (161, 162),  #  34   FRACEHISP      Father's Race/Hispanic Origin
    (162, 163),  #  35   FEDUC          Father's Education
    (170, 172),  #  36   PRIORLIVE      Number of Previous Live Births
    (172, 174),  #  37   PRIORDEAD      Number of Previous Other Pregnancy Outcomes
    (174, 176),  #  38   PRIORTERM      Number of Previous Terminations
    (178, 179),  #  39   LBO_REC        Live Birth Order Recode
    (181, 182),  #  40   TBO_REC        Total Birth Order Recode
    (223, 225),  #  41   PRECARE        Month Prenatal Care Began
    (434, 435),  #  42   PAY            Payment Source for Delivery
    (435, 436),  #  43   PAY_REC        Payment Recode
    (436, 437),  #  44   F_PAY          Reporting Flag for Source of Payment
    (437, 438),  #  45   F_PAY_REC      Reporting Flag for Payment Recode
    (474, 475),  #  46   SEX            Sex of Infant
    (475, 476),  #  47   IMP_SEX        Imputed Sex
    (536, 537),  #  48   CA_ANEN        Anencephaly
    (537, 538),  #  49   CA_MNSB        Meningomyelocele / Spina Bifida
    (538, 539),  #  50   CA_CCHD        Cyanotic Congenital Heart Disease
    (539, 540),  #  51   CA_CDH         Congenital Diaphragmatic Hernia
    (540, 541),  #  52   OMPH           Omphalocele
    (541, 542),  #  53   CA_GAST        Gastroschisis
    (542, 543),  #  54   F_CA_ANEN      Reporting Flag for Anencephaly
    (543, 544),  #  55   F_CA_MENIN     Reporting Flag for Meningomyelocele/Spina Bifida
    (544, 545),  #  56   F_CA_HEART     Reporting Flag for Cyanotic Congenital Heart Disease
    (545, 546),  #  57   F_CA_HERNIA    Reporting Flag for Congenital Diaphragmatic Hernia
    (546, 547),  #  58   F_CA_OMPHA     Reporting Flag for Omphalocele
    (547, 548),  #  59   F_CA_GASTRO    Reporting Flag for Gastroschisis
    (548, 549),  #  60   CA_LIMB        Limb Reduction Defect
    (549, 550),  #  61   CA_CLEFT       Cleft Lip w/ or w/o Cleft Palate
    (550, 551),  #  62   CA_CLPAL       Cleft Palate alone
    (551, 552),  #       CA_DOWN        Down Syndrome
]

data_2022 = pd.read_fwf(
    "data/Nat2022us/Nat2022PublicUS.c20230504.r20230822.txt",
    colspecs=colspecs,
    header=None,
    dtype_backend="pyarrow",
).convert_dtypes()

data_2022.attrs["description"] = "2020 data"

Label columns and configure data types

In [9]:
data_2022.rename(
    columns={
        0: "DOB_YY",
        1: "DOB_MM",
        2: "BFACIL",
        3: "F_BFACIL",
        4: "MAGE_IMPFLG",
        5: "MAGE_REPFLG",
        6: "MAGER",
        7: "MAGER14",
        8: "MAGER9",
        9: "MBSTATE_REC",
        10: "RESTATUS",
        11: "MRACE31",
        12: "MRACE6",
        13: "MRACE15",
        14: "MRACEIMP",
        15: "MHISPX",
        16: "MHISP_R",
        17: "F_MHISP",
        18: "MRACEHISP",
        19: "MAR_P",
        20: "DMAR",
        21: "MAR_IMP",
        22: "F_MAR_P",
        23: "MEDUC",
        24: "F_MEDUC",
        25: "FAGERPT_FLG",
        26: "FAGECOMB",
        27: "FAGEREC11",
        28: "FRACE31",
        29: "FRACE6",
        30: "FRACE15",
        31: "FHISPX",
        32: "FHISP_R",
        33: "F_FHISP",
        34: "FRACEHISP",
        35: "FEDUC",
        36: "PRIORLIVE",
        37: "PRIORDEAD",
        38: "PRIORTERM",
        39: "LBO_REC",
        40: "TBO_REC",
        41: "PRECARE",
        42: "PAY",
        43: "PAY_REC",
        44: "F_PAY",
        45: "F_PAY_REC",
        46: "SEX",
        47: "IMP_SEX",
        48: "CA_ANEN",
        49: "CA_MNSB",
        50: "CA_CCHD",
        51: "CA_CDH",
        52: "OMPH",
        53: "CA_GAST",
        54: "F_CA_ANEN",
        55: "F_CA_MENIN",
        56: "F_CA_HEART",
        57: "F_CA_HERNIA",
        58: "F_CA_OMPHA",
        59: "F_CA_GASTRO",
        60: "CA_LIMB",
        61: "CA_CLEFT",
        62: "CA_CLPAL",
        63: "CA_DOWN",
    },
    inplace=True,
)

data_2022 = data_2022.astype({
    "DOB_YY": "uint16[pyarrow]",
    "DOB_MM": "category",
    "BFACIL": "category",
    "F_BFACIL": "category",
    "MAGE_IMPFLG": "category",
    "MAGE_REPFLG": "category",
    "MAGER": "category",
    "MAGER14": "category",
    "MAGER9": "category",
    "MBSTATE_REC": "category",
    "RESTATUS": "category",
    "MRACE31": "category",
    "MRACE6": "category",
    "MRACE15": "category",
    "MRACEIMP": "category",
    "MHISPX": "category",
    "MHISP_R": "category",
    "F_MHISP": "category",
    "MRACEHISP": "category",
    "MAR_P": "category",
    "DMAR": "category",
    "MAR_IMP": "category",
    "F_MAR_P": "category",
    "MEDUC": "category",
    "F_MEDUC": "category",
    "FAGERPT_FLG": "category",
    "FAGECOMB": "category",
    "FAGEREC11": "category",
    "FRACE31": "category",
    "FRACE6": "category",
    "FRACE15": "category",
    "FHISPX": "category",
    "FHISP_R": "category",
    "F_FHISP": "category",
    "FRACEHISP": "category",
    "FEDUC": "category",
    "PRIORLIVE": "category",
    "PRIORDEAD": "category",
    "PRIORTERM": "category",
    "LBO_REC": "category",
    "TBO_REC": "category",
    "PRECARE": "category",
    "PAY": "category",
    "PAY_REC": "category",
    "F_PAY": "category",
    "F_PAY_REC": "category",
    "SEX": "category",
    "IMP_SEX": "category",
    "CA_ANEN": "category",
    "CA_MNSB": "category",
    "CA_CCHD": "category",
    "CA_CDH": "category",
    "OMPH": "category",
    "CA_GAST": "category",
    "F_CA_ANEN": "category",
    "F_CA_MENIN": "category",
    "F_CA_HEART": "category",
    "F_CA_HERNIA": "category",
    "F_CA_OMPHA": "category",
    "F_CA_GASTRO": "category",
    "CA_LIMB": "category",
    "CA_CLEFT": "category",
    "CA_CLPAL": "category",
    "CA_DOWN": "category",
})

In [None]:
data_2022

Unnamed: 0,DOB_YY,DOB_MM,BFACIL,F_BFACIL,MAGE_IMPFLG,MAGE_REPFLG,MAGER,MAGER14,MAGER9,MBSTATE_REC,...,F_CA_ANEN,F_CA_MENIN,F_CA_HEART,F_CA_HERNIA,F_CA_OMPHA,F_CA_GASTRO,CA_LIMB,CA_CLEFT,CA_CLPAL,CA_DOWN
0,2022,1,2,1,,3,37,11,6,1,...,1,1,1,1,1,1,N,N,N,N
1,2022,1,3,1,,2,27,9,4,1,...,1,1,1,1,1,1,N,N,N,N
2,2022,1,3,1,,2,25,9,4,1,...,1,1,1,1,1,1,N,N,N,N
3,2022,1,3,1,,4,42,12,7,1,...,1,1,1,1,1,1,N,N,N,N
4,2022,1,1,1,,2,27,9,4,2,...,1,1,1,1,1,1,N,N,N,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3676024,2022,12,1,1,,3,39,11,6,2,...,1,1,1,1,1,1,N,N,N,N
3676025,2022,4,3,1,,3,31,10,5,2,...,1,1,1,1,1,1,U,U,U,U
3676026,2022,5,3,1,,3,35,11,6,1,...,1,1,1,1,1,1,U,U,U,U
3676027,2022,6,4,1,,2,26,9,4,1,...,1,1,1,1,1,1,U,U,U,U


In [10]:
data_2022.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3676029 entries, 0 to 3676028
Data columns (total 64 columns):
 #   Column       Dtype          
---  ------       -----          
 0   DOB_YY       uint16[pyarrow]
 1   DOB_MM       category       
 2   BFACIL       category       
 3   F_BFACIL     category       
 4   MAGE_IMPFLG  category       
 5   MAGE_REPFLG  category       
 6   MAGER        category       
 7   MAGER14      category       
 8   MAGER9       category       
 9   MBSTATE_REC  category       
 10  RESTATUS     category       
 11  MRACE31      category       
 12  MRACE6       category       
 13  MRACE15      category       
 14  MRACEIMP     category       
 15  MHISPX       category       
 16  MHISP_R      category       
 17  F_MHISP      category       
 18  MRACEHISP    category       
 19  MAR_P        category       
 20  DMAR         category       
 21  MAR_IMP      category       
 22  F_MAR_P      category       
 23  MEDUC        category       
 24

In [None]:
data_2022["CA_DOWN"].value_counts()

CA_DOWN
N    3666667
U       7501
P       1043
C        818
Name: count, dtype: int64

In [None]:
us_residents_only = data_2022[data_2022["RESTATUS"] != 4]

us_residents_only["CA_DOWN"].value_counts()

CA_DOWN
N    3658502
U       7399
P       1039
C        818
Name: count, dtype: int64

In [11]:
foreign = data_2022[data_2022["RESTATUS"] == 4]

foreign_ds = foreign["CA_DOWN"].value_counts()
foreign_ds

CA_DOWN
N    8165
U     102
P       4
C       0
Name: count, dtype: int64

In [12]:
race_recode_31 = foreign["MRACE31"].value_counts()
race_recode_31

MRACE31
1     7628
2      318
4      257
3       22
13      13
6       11
5        8
10       8
8        2
18       1
15       1
23       1
7        1
28       0
27       0
22       0
30       0
26       0
25       0
24       0
29       0
16       0
21       0
20       0
19       0
17       0
14       0
12       0
11       0
9        0
31       0
Name: count, dtype: int64

In [None]:
race_recode_6 = foreign["MRACE6"].value_counts()
race_recode_6

MRACE6
1    7628
2     318
4     257
6      38
3      22
5       8
Name: count, dtype: int64

In [None]:
race_recode_15 = foreign["MRACE15"].value_counts()
race_recode_15

MRACE15
1     7628
2      318
5       95
4       69
10      39
15      38
6       28
3       22
9       10
7        9
8        7
14       5
12       2
11       1
13       0
Name: count, dtype: int64

In [None]:
m_hisp_origin = foreign["MHISPX"].value_counts()
m_hisp_origin

MHISPX
1    6507
0    1181
9     312
4     131
6     108
5      13
2      11
3       8
Name: count, dtype: int64

Export

In [None]:
# pyreadstat.write_sav(data_2022, "data/data_2022.sav")

# pyreadstat.write_dta(data_2022, "data/data_2022.dta")

# data_2022.to_hdf("data/data_2022.h5", key="data_2022", format="table")

# data_2022.to_csv("data/data_2022.csv", index=False)