# Translate NAMCS 2016's DIAG1–DIAG5 columns from ICD-10 to ICD-9 codes

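Note: this notebook overwrites `NAMCS/namcs2016.csv` in place, so it should only be run once. Running it again would try to translate the already-translated diagnosis codes a second time.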

In [1]:
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import pandas as pd
from pandas.util.testing import assert_frame_equal
pd.set_option("display.max_columns",200)

In [2]:
# Load the NAMCS 2016 visit records. dtype=str keeps every column as text, so
# codes such as '0780' keep their leading zero.
tmp = pd.read_csv('NAMCS/namcs2016.csv', dtype=str)
# Load the ICD-9 <-> ICD-10 crosswalk; names= supplies the two column labels.
# NOTE(review): dtype='bytes' still seems to produce str values (the lookups
# against the str DIAG codes further down succeed) -- confirm, and consider
# dtype=str here for consistency with the line above.
translate = pd.read_csv('data_files/2016_I9gem2.csv', dtype='bytes', names=['ICD9','ICD10'])

This first part translates the diagnosis codes

In [3]:
# Display the DIAG1 column to inspect its diagnosis codes.
tmp.DIAG1

0        7455
1        7454
2        4168
3        7454
4        4168
5        7455
6        7455
7        7455
8        7454
9        7470
10       4279
11       7470
12       7455
13       4279
14       7459
15       4279
16        NaN
17       4279
18       7455
19       4240
20       7455
21        NaN
22       7455
23       7470
24        NaN
25       7061
26       0780
27       3659
28       3659
29       3669
         ... 
13135     NaN
13136     NaN
13137     NaN
13138    5531
13139    5531
13140     NaN
13141    7062
13142    5759
13143     NaN
13144     NaN
13145     NaN
13146     NaN
13147     NaN
13148    4556
13149     NaN
13150     NaN
13151    7944
13152     NaN
13153     NaN
13154     NaN
13155     NaN
13156     NaN
13157     NaN
13158    1539
13159     NaN
13160     NaN
13161     NaN
13162    1543
13163    1830
13164     NaN
Name: DIAG1, Length: 13165, dtype: object

In [9]:
# Display the DIAG5 column to inspect its diagnosis codes.
tmp.DIAG5

0          -9
1          -9
2          -9
3          -9
4          -9
5          -9
6          -9
7          -9
8          -9
9          -9
10         -9
11         -9
12         -9
13         -9
14         -9
15         -9
16         -9
17         -9
18         -9
19         -9
20         -9
21         -9
22         -9
23         -9
24         -9
25         -9
26         -9
27       H21-
28         -9
29         -9
         ... 
13135      -9
13136    K648
13137      -9
13138      -9
13139      -9
13140    N189
13141      -9
13142      -9
13143      -9
13144      -9
13145      -9
13146      -9
13147      -9
13148    M199
13149    D649
13150      -9
13151      -9
13152      -9
13153    C349
13154      -9
13155    T451
13156      -9
13157      -9
13158      -9
13159      -9
13160      -9
13161      -9
13162      -9
13163      -9
13164      -9
Name: DIAG5, Length: 13165, dtype: object

In [10]:
def g(x):
    """Blank out placeholder diagnosis codes.

    Returns ``np.nan`` when ``str(x)`` is one of the placeholder codes
    ('ZZZ0' through 'ZZZ5', or '-9'); any other value is handed back
    unchanged.
    """
    placeholders = ('ZZZ0', 'ZZZ1', 'ZZZ2', 'ZZZ3', 'ZZZ4', 'ZZZ5', '-9')
    return np.nan if str(x) in placeholders else x

def f(x, table=None):
    """Translate one ICD-10 code to its ICD-9 equivalent.

    Parameters
    ----------
    x : object
        The ICD-10 code to look up (compared with ``==`` against the
        ``ICD10`` column).
    table : pandas.DataFrame, optional
        Crosswalk with ``ICD9`` and ``ICD10`` columns. Defaults to the
        notebook-level ``translate`` frame, so ``Series.apply(f)`` keeps
        working unchanged.

    Returns
    -------
    The ``ICD9`` value of the first row whose ``ICD10`` equals ``x``, or
    ``np.nan`` when no row matches (this includes ``x`` being NaN).
    """
    if table is None:
        table = translate
    # One scan of the crosswalk instead of a membership test followed by a
    # second .loc scan.
    hits = table.loc[table.ICD10 == x, 'ICD9']
    if len(hits) == 0:
        return np.nan
    return hits.values[0]

In [5]:
# Translate DIAG1 in place: placeholder codes become NaN, then each remaining
# code is looked up in the crosswalk.
string = "DIAG1"
tmp[string] = (
    tmp[string]
    .apply(g)   # placeholder codes -> NaN
    .apply(f)   # ICD-10 -> ICD-9 (NaN when no match)
)

In [12]:
# Apply the same two-step translation (g, then f) to DIAG2 through DIAG5.
strings = [f"DIAG{n}" for n in range(2, 6)]
for string in strings:
    tmp[string] = (
        tmp[string]
        .apply(g)
        .apply(f)
    )

In [13]:
# Display DIAG2 to check the result of the translation.
tmp.DIAG2

0          NaN
1         4168
2         7470
3          NaN
4         7454
5         7459
6          NaN
7         7469
8          NaN
9         4179
10        7455
11        7454
12        4279
13        7454
14        4279
15        7454
16        7455
17        7470
18        4241
19        7454
20         NaN
21        7455
22         NaN
23        4279
24        V682
25         NaN
26         NaN
27        3669
28        3669
29         NaN
         ...  
13135      NaN
13136    29632
13137     1539
13138    55321
13139      NaN
13140      NaN
13141      NaN
13142      NaN
13143      NaN
13144      NaN
13145      NaN
13146      NaN
13147      NaN
13148    53081
13149      NaN
13150      NaN
13151      NaN
13152      NaN
13153      NaN
13154      NaN
13155     V103
13156      NaN
13157      NaN
13158      NaN
13159    72982
13160      NaN
13161      NaN
13162    25000
13163      NaN
13164      NaN
Name: DIAG2, Length: 13165, dtype: object

In [14]:
# Show only the columns listed in keep_cols.csv, in the order they appear
# in tmp.
fields = pd.read_csv("data_files/keep_cols.csv", names=['titles'])
cols = tmp.columns[tmp.columns.isin(fields['titles'])].tolist()
tmp[cols]

Unnamed: 0,VMONTH,VDAYR,AGE,AGER,AGEDAYS,SEX,PREGNANT,GESTWK,ETHUN,RACEUN,ETHIM,PAYTYPER,RFV1,MAJOR,DIAG1,DIAG2,DIAG3,DIAG4,DIAG5,HTIN,WTLB,BMI,TEMPF,BPSYS,BPDIAS,GLUCOSE,BONEDENS,SUBSTED,MED1,MED2,MED3,HTWTFL,YEAR,PATWT
0,3,2,4,1,-7,1,-9,-9,1,-9,1,2,33500,3,7455,,,,,42,42,16.74,974,-9,-9,0,0,0,09545,14930,b'13118',0,2016,241154.17224
1,3,2,6,1,-7,1,-9,-9,1,-9,1,3,65000,3,7454,4168,,,,43,48,18.25,976,-9,-9,0,0,0,09545,19210,b'14930',0,2016,241154.17224
2,3,2,3,1,-7,1,-9,-9,2,2,2,3,42050,5,4168,7470,,,,38,32,15.58,978,-9,-9,1,0,0,09545,15600,b'29250',0,2016,241154.17224
3,3,2,3,1,-7,1,-9,-9,2,1,2,3,25200,2,7454,,7852,,,38,36,17.53,978,-9,-9,1,0,0,09545,13118,b'11540',0,2016,241154.17224
4,3,2,5,1,-7,2,-7,-7,1,-9,1,3,14300,2,4168,7454,,,,41,46,19.24,974,-9,-9,0,0,0,09545,19210,b'93313',0,2016,241154.17224
5,3,2,5,1,-7,1,-9,-9,2,2,2,3,10150,3,7455,7459,7454,,,40,35,15.38,978,-9,-9,1,0,0,09545,13118,b'-9',0,2016,241154.17224
6,3,2,5,1,-7,1,-9,-9,2,1,2,3,10150,2,7455,,,,,39,40,18.49,980,-9,-9,1,0,0,29250,14930,b'09545',0,2016,241154.17224
7,3,2,8,1,-7,1,-9,-9,1,-9,1,3,14051,2,7455,7469,,,,49,56,16.4,980,-9,-9,1,0,0,09545,11540,b'93433',0,2016,241154.17224
8,3,2,6,1,-7,2,-7,-7,2,1,2,1,14150,2,7454,,,,,47,46,14.64,975,-9,-9,1,0,0,09545,29250,b'14930',0,2016,241154.17224
9,3,3,4,1,-7,1,-9,-9,1,-9,1,3,59300,2,7470,4179,7469,,,38,36,17.53,979,-9,-9,1,0,0,09545,19210,b'14930',0,2016,241154.17224


This part fixes the medication columns (MED1–MED3), whose values carry a stray `b'...'` wrapper.

In [8]:
# Show MED1/MED2 before cleaning; the values carry a b'...' wrapper.
print(tmp[['MED1','MED2']])

           MED1      MED2
0      b'09545'  b'14930'
1      b'09545'  b'19210'
2      b'09545'  b'15600'
3      b'09545'  b'13118'
4      b'09545'  b'19210'
5      b'09545'  b'13118'
6      b'29250'  b'14930'
7      b'09545'  b'11540'
8      b'09545'  b'29250'
9      b'09545'  b'19210'
10     b'09545'  b'11540'
11     b'15600'  b'09545'
12     b'09545'  b'13118'
13     b'09545'  b'13118'
14     b'09545'  b'13118'
15     b'09545'  b'10225'
16     b'09545'  b'11540'
17     b'09545'  b'19210'
18     b'09545'  b'29250'
19     b'09545'  b'19210'
20     b'09545'  b'29250'
21     b'09545'  b'14930'
22     b'13118'  b'14930'
23     b'09545'  b'15600'
24     b'07289'     b'-9'
25     b'09645'     b'-9'
26     b'10018'  b'12222'
27     b'60550'     b'-9'
28     b'60550'     b'-9'
29        b'-9'     b'-9'
...         ...       ...
13135  b'17370'  b'61245'
13136  b'25674'     b'-9'
13137  b'93396'     b'-9'
13138  b'91062'  b'06985'
13139     b'-9'     b'-9'
13140  b'01030'  b'02805'
13141  b'010

In [9]:
def h(x):
    """Strip a ``b'...'`` / ``b"..."`` byte-literal wrapper from a value.

    If ``str(x)`` looks like a bytes literal (leading ``b``, then matching
    quotes around the content), the content between the quotes is returned
    as a ``str``. Anything else is returned unchanged, so calling ``h`` on
    an already-clean value (e.g. '09545') no longer chops characters off it.
    """
    text = str(x)
    wrapped = (
        len(text) >= 3
        and text[0] == 'b'
        and text[1] in ('"', "'")
        and text[-1] == text[1]
    )
    return text[2:-1] if wrapped else x

# Clean the first two medication columns with h.
for j in (1, 2):
    string = "MED%d" % j
    tmp[string] = tmp[string].apply(h)

In [15]:
def h(x):
    """Strip a ``b'...'`` / ``b"..."`` byte-literal wrapper from a value.

    If ``str(x)`` looks like a bytes literal (leading ``b``, then matching
    quotes around the content), the content between the quotes is returned
    as a ``str``. Anything else is returned unchanged, so calling ``h`` on
    an already-clean value (e.g. '09545') no longer chops characters off it.
    """
    text = str(x)
    wrapped = (
        len(text) >= 3
        and text[0] == 'b'
        and text[1] in ('"', "'")
        and text[-1] == text[1]
    )
    return text[2:-1] if wrapped else x

# Clean the third medication column with h.
j = 3
string = f"MED{j}"
tmp[string] = tmp[string].apply(h)

In [16]:
# Show the three medication columns after h has been applied.
print(tmp[['MED1','MED2','MED3']])

        MED1   MED2   MED3
0      09545  14930  13118
1      09545  19210  14930
2      09545  15600  29250
3      09545  13118  11540
4      09545  19210  93313
5      09545  13118     -9
6      29250  14930  09545
7      09545  11540  93433
8      09545  29250  14930
9      09545  19210  14930
10     09545  11540  14930
11     15600  09545  29250
12     09545  13118  14930
13     09545  13118  93433
14     09545  13118  29250
15     09545  10225  29250
16     09545  11540  13118
17     09545  19210     -9
18     09545  29250  14930
19     09545  19210  19218
20     09545  29250  41800
21     09545  14930  13118
22     13118  14930  29250
23     09545  15600  29250
24     07289     -9     -9
25     09645     -9     -9
26     10018  12222     -9
27     60550     -9     -9
28     60550     -9     -9
29        -9     -9     -9
...      ...    ...    ...
13135  17370  61245     -9
13136  25674     -9     -9
13137  93396     -9     -9
13138  91062  06985  89020
13139     -9     -9     -9
1

In [17]:
# Re-display the kept columns now that MED1-MED3 have been cleaned.
tmp[cols]

Unnamed: 0,VMONTH,VDAYR,AGE,AGER,AGEDAYS,SEX,PREGNANT,GESTWK,ETHUN,RACEUN,ETHIM,PAYTYPER,RFV1,MAJOR,DIAG1,DIAG2,DIAG3,DIAG4,DIAG5,HTIN,WTLB,BMI,TEMPF,BPSYS,BPDIAS,GLUCOSE,BONEDENS,SUBSTED,MED1,MED2,MED3,HTWTFL,YEAR,PATWT
0,3,2,4,1,-7,1,-9,-9,1,-9,1,2,33500,3,7455,,,,,42,42,16.74,974,-9,-9,0,0,0,09545,14930,13118,0,2016,241154.17224
1,3,2,6,1,-7,1,-9,-9,1,-9,1,3,65000,3,7454,4168,,,,43,48,18.25,976,-9,-9,0,0,0,09545,19210,14930,0,2016,241154.17224
2,3,2,3,1,-7,1,-9,-9,2,2,2,3,42050,5,4168,7470,,,,38,32,15.58,978,-9,-9,1,0,0,09545,15600,29250,0,2016,241154.17224
3,3,2,3,1,-7,1,-9,-9,2,1,2,3,25200,2,7454,,7852,,,38,36,17.53,978,-9,-9,1,0,0,09545,13118,11540,0,2016,241154.17224
4,3,2,5,1,-7,2,-7,-7,1,-9,1,3,14300,2,4168,7454,,,,41,46,19.24,974,-9,-9,0,0,0,09545,19210,93313,0,2016,241154.17224
5,3,2,5,1,-7,1,-9,-9,2,2,2,3,10150,3,7455,7459,7454,,,40,35,15.38,978,-9,-9,1,0,0,09545,13118,-9,0,2016,241154.17224
6,3,2,5,1,-7,1,-9,-9,2,1,2,3,10150,2,7455,,,,,39,40,18.49,980,-9,-9,1,0,0,29250,14930,09545,0,2016,241154.17224
7,3,2,8,1,-7,1,-9,-9,1,-9,1,3,14051,2,7455,7469,,,,49,56,16.4,980,-9,-9,1,0,0,09545,11540,93433,0,2016,241154.17224
8,3,2,6,1,-7,2,-7,-7,2,1,2,1,14150,2,7454,,,,,47,46,14.64,975,-9,-9,1,0,0,09545,29250,14930,0,2016,241154.17224
9,3,3,4,1,-7,1,-9,-9,1,-9,1,3,59300,2,7470,4179,7469,,,38,36,17.53,979,-9,-9,1,0,0,09545,19210,14930,0,2016,241154.17224


Strip a trailing `.0` from every value so that integer-valued entries are stored as ints rather than floats.

In [24]:
def k(x):
    """Render a value as text, dropping a trailing '.0'.

    '3.0' (or the float 3.0) becomes '3'; values without a trailing '.0'
    are returned as ``str(x)``. A missing value (float NaN) is returned
    unchanged instead of being turned into the literal string 'nan'.
    """
    # NaN is the only float that is not equal to itself.
    if isinstance(x, float) and x != x:
        return x
    text = str(x)
    return text[:-2] if text.endswith('.0') else text

# Run k over every column of tmp, one column at a time.
for j in list(tmp.columns):
    column = tmp[j]
    tmp[j] = column.apply(k)

In [25]:
# Last look at the kept columns before the table is written back out.
tmp[cols]

Unnamed: 0,VMONTH,VDAYR,AGE,AGER,AGEDAYS,SEX,PREGNANT,GESTWK,ETHUN,RACEUN,ETHIM,PAYTYPER,RFV1,MAJOR,DIAG1,DIAG2,DIAG3,DIAG4,DIAG5,HTIN,WTLB,BMI,TEMPF,BPSYS,BPDIAS,GLUCOSE,BONEDENS,SUBSTED,MED1,MED2,MED3,HTWTFL,YEAR,PATWT
0,3,2,4,1,-7,1,-9,-9,1,-9,1,2,33500,3,7455,,,,,42,42,16.74,974,-9,-9,0,0,0,09545,14930,13118,0,2016,241154.17224
1,3,2,6,1,-7,1,-9,-9,1,-9,1,3,65000,3,7454,4168,,,,43,48,18.25,976,-9,-9,0,0,0,09545,19210,14930,0,2016,241154.17224
2,3,2,3,1,-7,1,-9,-9,2,2,2,3,42050,5,4168,7470,,,,38,32,15.58,978,-9,-9,1,0,0,09545,15600,29250,0,2016,241154.17224
3,3,2,3,1,-7,1,-9,-9,2,1,2,3,25200,2,7454,,7852,,,38,36,17.53,978,-9,-9,1,0,0,09545,13118,11540,0,2016,241154.17224
4,3,2,5,1,-7,2,-7,-7,1,-9,1,3,14300,2,4168,7454,,,,41,46,19.24,974,-9,-9,0,0,0,09545,19210,93313,0,2016,241154.17224
5,3,2,5,1,-7,1,-9,-9,2,2,2,3,10150,3,7455,7459,7454,,,40,35,15.38,978,-9,-9,1,0,0,09545,13118,-9,0,2016,241154.17224
6,3,2,5,1,-7,1,-9,-9,2,1,2,3,10150,2,7455,,,,,39,40,18.49,980,-9,-9,1,0,0,29250,14930,09545,0,2016,241154.17224
7,3,2,8,1,-7,1,-9,-9,1,-9,1,3,14051,2,7455,7469,,,,49,56,16.4,980,-9,-9,1,0,0,09545,11540,93433,0,2016,241154.17224
8,3,2,6,1,-7,2,-7,-7,2,1,2,1,14150,2,7454,,,,,47,46,14.64,975,-9,-9,1,0,0,09545,29250,14930,0,2016,241154.17224
9,3,3,4,1,-7,1,-9,-9,1,-9,1,3,59300,2,7470,4179,7469,,,38,36,17.53,979,-9,-9,1,0,0,09545,19210,14930,0,2016,241154.17224


In [26]:
# Write the cleaned table back over the file it was loaded from; this is why
# the notebook must not be re-run on its own output.
# NOTE(review): index=False is not passed, so the row index is written as an
# extra unnamed first column -- confirm downstream readers expect that.
tmp.to_csv('NAMCS/namcs2016.csv')