# A notebook for experiments

### Setup


In [28]:
import numpy as np
import pandas as pd
import pyreadstat

## Read data

#### Read 2022 data


In [29]:
colspecs = [
    (8, 12),     #   0   DOB_YY          Birth Year
    (12, 14),    #   1   DOB_MM          Birth Month
    (31, 32),    #   2   BFACIL          Birth Place
    (32, 33),    #   3   F_BFACIL        Reporting Flag for Birth Place
    (72, 73),    #   4   MAGE_IMPFLG     Mother's Age Imputed
    (74, 75),    #   5   MAGE_REPFLG     Reported Age of Mother Used Flag
    (74, 76),    #   6   MAGER           Mother's Single Years of Age
    (76, 78),    #   7   MAGER14         Mother's Age Recode 14
    (78, 79),    #   8   MAGER9          Mother's Age Recode 9
    (83, 84),    #   9   MBSTATE_REC     Mother's Nativity
    (103, 104),  #  10   RESTATUS        Residence Status
    (104, 106),  #  11   MRACE31         Mother's Race Recode 31
    (106, 107),  #  12   MRACE6          Mother's Race Recode 6
    (107, 109),  #  13   MRACE15         Mother's Race Recode 15
    (110, 111),  #  14   MRACEIMP        Mother's Race Imputed Flag
    (111, 112),  #  15   MHISPX          Mother's Hispanic Origin
    (114, 115),  #  16   MHISP_R         Mother's Hispanic Origin Recode
    (115, 116),  #  17   F_MHISP         Reporting Flag for Mother's Origin
    (116, 117),  #  18   MRACEHISP       Mother's Race/Hispanic Origin
    (118, 119),  #  19   MAR_P           Paternity Acknowledged
    (119, 120),  #  20   DMAR            Marital Status
    (120, 121),  #  21   MAR_IMP         Mother's Marital Status Imputed
    (122, 123),  #  22   F_MAR_P         Reporting Flag for Paternity Acknowledged
    (123, 124),  #  23   MEDUC           Mother's Education
    (125, 126),  #  24   F_MEDUC         Reporting Flag for Education of Mother
    (141, 142),  #  25   FAGERPT_FLG     Father's Reported Age Used
    (146, 148),  #  26   FAGECOMB        Father's Combined Age
    (148, 150),  #  27   FAGEREC11       Father's Age Recode 11
    (150, 152),  #  28   FRACE31         Father's Race Recode 31

    (551, 552),  #       CA_DOWN         Down Syndrome
]

data_2022 = pd.read_fwf(
    "data/Nat2022us/Nat2022PublicUS.c20230504.r20230822.txt",
    colspecs=colspecs,
    header=None,
)

data_2022.attrs["description"] = "2020 data"

data_2022.rename(
    columns={
        0: "DOB_YY",
        1: "DOB_MM",
        2: "BFACIL",
        3: "F_BFACIL",
        4: "MAGE_IMPFLG",
        5: "MAGE_REPFLG",
        6: "MAGER",
        7: "MAGER14",
        8: "MAGER9",
        9: "MBSTATE_REC",
        10: "RESTATUS",
        11: "MRACE31",
        12: "MRACE6",
        13: "MRACE15",
        14: "MRACEIMP",
        15: "MHISPX",
        16: "MHISP_R",
        17: "F_MHISP",
        18: "MRACEHISP",
        19: "MAR_P",
        20: "DMAR",
        21: "MAR_IMP",
        22: "F_MAR_P",
        23: "MEDUC",
        24: "F_MEDUC",
        25: "FAGERPT_FLG",
        26: "FAGECOMB",
        27: "FAGEREC11",
        28: "FRACE31",
        29: "CA_DOWN",
    },
    inplace=True,
)

data_2022 = data_2022.astype(
    {"DOB_YY": "int32"},
    {"DOB_MM": "int32"},
)

In [30]:
data_2022

Unnamed: 0,DOB_YY,DOB_MM,BFACIL,F_BFACIL,MAGE_IMPFLG,MAGE_REPFLG,MAGER,MAGER14,MAGER9,MBSTATE_REC,...,DMAR,MAR_IMP,F_MAR_P,MEDUC,F_MEDUC,FAGERPT_FLG,FAGECOMB,FAGEREC11,FRACE31,CA_DOWN
0,2022,1,2,1,,3,37,11,6,1,...,1.0,,1,6,1,,40,7,1,N
1,2022,1,3,1,,2,27,9,4,1,...,1.0,,1,3,1,,33,5,1,N
2,2022,1,3,1,,2,25,9,4,1,...,1.0,,1,5,1,,27,4,1,N
3,2022,1,3,1,,4,42,12,7,1,...,1.0,,1,7,1,,55,10,1,N
4,2022,1,1,1,,2,27,9,4,2,...,1.0,,1,3,1,,27,4,1,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3676024,2022,12,1,1,,3,39,11,6,2,...,2.0,,1,2,1,,29,4,1,N
3676025,2022,4,3,1,,3,31,10,5,2,...,1.0,,1,9,1,,35,6,99,U
3676026,2022,5,3,1,,3,35,11,6,1,...,1.0,,1,9,1,,30,5,99,U
3676027,2022,6,4,1,,2,26,9,4,1,...,1.0,,1,9,1,,28,4,99,U


In [31]:
data_2022.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3676029 entries, 0 to 3676028
Data columns (total 30 columns):
 #   Column       Dtype  
---  ------       -----  
 0   DOB_YY       int32  
 1   DOB_MM       int64  
 2   BFACIL       int64  
 3   F_BFACIL     int64  
 4   MAGE_IMPFLG  float64
 5   MAGE_REPFLG  int64  
 6   MAGER        int64  
 7   MAGER14      int64  
 8   MAGER9       int64  
 9   MBSTATE_REC  int64  
 10  RESTATUS     int64  
 11  MRACE31      int64  
 12  MRACE6       int64  
 13  MRACE15      int64  
 14  MRACEIMP     float64
 15  MHISPX       int64  
 16  MHISP_R      int64  
 17  F_MHISP      int64  
 18  MRACEHISP    int64  
 19  MAR_P        object 
 20  DMAR         float64
 21  MAR_IMP      float64
 22  F_MAR_P      int64  
 23  MEDUC        int64  
 24  F_MEDUC      int64  
 25  FAGERPT_FLG  float64
 26  FAGECOMB     int64  
 27  FAGEREC11    int64  
 28  FRACE31      int64  
 29  CA_DOWN      object 
dtypes: float64(5), int32(1), int64(22), object(2)
memo

In [32]:
pyreadstat.write_sav(data_2022, "data_2022.sav")