In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
import seaborn as sn
from matplotlib import pyplot

In [2]:
pd.options.display.max_rows = 100
pd.options.display.max_columns = 200

In [3]:
include = ['CaseID', 'SEX', 'RACE_NEW', 'ETHNICITY_HISPANIC', 'PRNCPTX', 'CPT', 'WORKRVU', 'Age', 'ANESTHES', 'SURGSPEC', 'ELECTSURG', 'EMERGNCY', 'WNDCLAS', 'ASACLAS', 'DIABETES', 'SMOKE', 'DYSPNEA', 'FNSTATUS2', 'VENTILAT', 'HXCOPD', 'ASCITES', 'HXCHF', 'HYPERMED', 'RENAFAIL', 'DIALYSIS', 'DISCANCR', 'WNDINF', 'STEROID', 'WTLOSS', 'BLEEDDIS', 'TRANSFUS', 'PRSEPIS', 'HEIGHT', 'WEIGHT', 'PRSODM', 'PRBUN', 'PRCREAT', 'PRALBUM', 'PRBILI', 'PRSGOT', 'PRALKPH', 'PRWBC', 'PRHCT', 'PRPLATE', 'PRPTT', 'PRINR', 'PRPT', 'SEPSHOCKPATOS', 'OTHERCPT1', 'CONCPT1', 'INOUT', 'TRANST','OperYR',  'HtoODay', 'SSSIPATOS', 'DSSIPATOS', 'OSSIPATOS', 'PNAPATOS', 'VENTPATOS', 'UTIPATOS', 'SEPSISPATOS', 'SEPSHOCKPATOS', 'OTHERCPT1', 'OTHERCPT2', 'OTHERCPT3', 'OTHERCPT4', 'OTHERCPT5', 'OTHERCPT6', 'OTHERCPT7', 'OTHERCPT8', 'OTHERCPT9', 'OTHERCPT10', 'CONCPT1', 'CONCPT2','CONCPT3','CONCPT4','CONCPT5','CONCPT6','CONCPT7','CONCPT8','CONCPT9','CONCPT10','RETURNOR','REOPORCPT1','REOPERATION1','REOPORICD91','REOPOR1ICD101','REOPERATION2','REOPOR2ICD91','REOPOR2ICD101','REOPOR2CPT1','PODIAG','PODIAGTX','PODIAG10','PODIAGTX10', 'PODIAG_OTHER', 'PODIAG_OTHER10']

col_features = ['COL_STEROID', 'COL_MECH_BOWEL_PREP', 'COL_ORAL_ANTIBIOTIC', 'COL_CHEMO', 'COL_INDICATION', 'COL_ICD9_INDICATION', 'COL_EMERGENT', 'COL_ICD9_EMERGENT', 'COL_APPROACH', 'COL_ICD10_INDICATION', 'COL_ICD10_EMERGENT']
incl = list(map(lambda x:x.upper(), include))
total = col_features + incl

In [4]:
CPT = ['OTHERCPT1', 'OTHERCPT2', 'OTHERCPT3', 'OTHERCPT4', 'OTHERCPT5', 'OTHERCPT6', 'OTHERCPT7', 'OTHERCPT8', 'OTHERCPT9', 'OTHERCPT10', 'CONCPT1', 'CONCPT2','CONCPT3','CONCPT4','CONCPT5','CONCPT6','CONCPT7','CONCPT8','CONCPT9','CONCPT10']

In [5]:
CPT_dict = {'OTHERCPT1': str, 'OTHERCPT2':str, 'OTHERCPT3':str, 'OTHERCPT4':str, 'OTHERCPT5':str, 'OTHERCPT6':str, 'OTHERCPT7':str, 'OTHERCPT8':str, 'OTHERCPT9':str, 'OTHERCPT10':str, 'CONCPT1':str, 'CONCPT2':str,'CONCPT3':str,'CONCPT4':str,'CONCPT5':str,'CONCPT6':str,'CONCPT7':str,'CONCPT8':str,'CONCPT9':str,'CONCPT10':str, 'REOPORCPT1':str, 'REOPOR2CPT1':str}

In [6]:
data = pd.read_csv(r'/home/kchen/Documents/nsqip_raw/procol/procol.csv', index_col='CASEID', usecols=total, dtype=CPT_dict, low_memory=False, na_values=[-99, '-99', 'Unknown'])

In [7]:
data.shape

(276232, 104)

In [8]:
strICDIU = ['867.2','867.3']
ICDIU = [867.2,867.3]
strICD10 = ['S37']

In [9]:
data['PODIAGTX10'].value_counts(dropna=False)

NaN                                                                                89883
Malignant neoplasm of rectum                                                       12398
Malignant neoplasm of ascending colon                                               9507
Malignant neoplasm of sigmoid colon                                                 9264
Diverticulitis of large intestine with perforation and abscess without bleeding     8286
                                                                                   ...  
DVRTCLOS OF SM INT W/O PERF OR ABSCESS W BLEED                                         1
OTHER HEMORRHOIDS                                                                      1
OTH COMPLICATIONS OF PROCEDURES, NEC, INIT                                             1
ADV EFFECT OF ANTINEOPL AND IMMUNOSUP DRUGS INIT                                       1
Foreign body in anus and rectum                                                        1
Name: PODIAGTX10, Len

In [10]:
data['URETER'] = 0

In [11]:
for icd in strICDIU:
    data.loc[data['PODIAG'].str.contains(icd, na=False, regex=False), 'URETER'] = 1
data['URETER'].value_counts()

0    276232
Name: URETER, dtype: int64

In [12]:
for icd in strICDIU:
    data.loc[data['PODIAG10'].str.contains('S37.1', na=False), 'URETER'] = 1
data['URETER'].value_counts()

0    276231
1         1
Name: URETER, dtype: int64

In [13]:
for icd in strICDIU:
    data.loc[data['REOPORICD91'].str.contains(icd, na=False), 'URETER'] = 1
data['URETER'].value_counts(dropna=False)

0    276218
1        14
Name: URETER, dtype: int64

In [14]:
for icd in strICDIU:
    data.loc[data['REOPOR1ICD101'].str.contains('S37', na=False), 'URETER'] = 1
data['URETER'].value_counts()

0    276173
1        59
Name: URETER, dtype: int64

In [15]:
IUI = [50740, 50750, 50760, 50770, 50780, 50783, 50785, 50800, 50810, 50815, 50820, 50825, 50840, 50845, 50860, 50900, 50947, 50948, 50949, 52334, 50040]
for CPT in IUI:
    data.loc[data['REOPORCPT1'] == CPT, 'URETER'] = 1
    data.loc[data['REOPOR2CPT1'] == CPT, 'URETER'] = 1
data['URETER'].value_counts()

0    276173
1        59
Name: URETER, dtype: int64

In [16]:
strIUI = []
for cpt in IUI:
    strIUI.append(str(cpt))

In [17]:
for CPT in strIUI:
    data.loc[data['REOPORCPT1'].str.contains(CPT, na=False), 'URETER'] = 1
    data.loc[data['REOPOR2CPT1'].str.contains(CPT, na=False), 'URETER'] = 1
data['URETER'].value_counts()

0    276146
1        86
Name: URETER, dtype: int64

In [18]:
for i in range(1,11):
    for CPT in strIUI:
        data.loc[data['OTHERCPT%s' % i].str.contains(CPT, na=False), 'URETER'] = 1
        data.loc[data['CONCPT%s' % i].str.contains(CPT, na=False), 'URETER'] = 1

In [19]:
data['URETER'].value_counts(dropna=False, normalize=True)

0    0.99411
1    0.00589
Name: URETER, dtype: float64

In [20]:
data.loc[data['URETER'].isna(), 'URETER'] = 0
data['URETER'].value_counts(dropna=False, normalize=True)

0    0.99411
1    0.00589
Name: URETER, dtype: float64

In [21]:
data['STENT'] = 0

In [22]:
for i in range(1,11):
    data.loc[data['OTHERCPT%s' % i] == 52005, 'STENT'] = 1
    data.loc[data['CONCPT%s' % i] == 52005, 'STENT'] = 1

In [23]:
for i in range(1,11):
    data.loc[data['OTHERCPT%s' % i].str.contains('52005', na=False), 'STENT'] = 1
    data.loc[data['CONCPT%s' % i].str.contains('52005', na=False), 'STENT'] = 1
data['STENT'].value_counts()

0    264003
1     12229
Name: STENT, dtype: int64

In [24]:
data.loc[data['STENT'].isna(), 'STENT'] = 0
data['STENT'].value_counts(dropna=False, normalize=True)

0    0.955729
1    0.044271
Name: STENT, dtype: float64

In [25]:
data.groupby('OPERYR')['URETER'].value_counts(normalize=True)

OPERYR  URETER
2012.0  0         0.994936
        1         0.005064
2013.0  0         0.992699
        1         0.007301
2014.0  0         0.993864
        1         0.006136
2015.0  0         0.992941
        1         0.007059
2016.0  0         0.994293
        1         0.005707
2017.0  0         0.994381
        1         0.005619
2018.0  0         0.994827
        1         0.005173
2019.0  0         0.994127
        1         0.005873
Name: URETER, dtype: float64

In [26]:
uirate = pd.DataFrame(data.groupby('PRNCPTX')['URETER'].value_counts(ascending=False))

In [27]:
uirate.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,URETER
PRNCPTX,URETER,Unnamed: 2_level_1
COLCT TOT ABDL W/O PRCTECT W/CONTINENT ILEOST,0,304
COLCT TOT ABDL W/O PRCTECT W/CONTINENT ILEOST,1,2
COLCT TOT ABDL W/O PRCTECT W/ILEOST/ILEOPXTS,0,6687
COLCT TOT ABDL W/O PRCTECT W/ILEOST/ILEOPXTS,1,24
COLCT TTL ABD W/PRCTECT ILEOANAL ANAST & RSVR,0,420
COLECTOMY PARTIAL W/ANASTOMOSIS,0,31932
COLECTOMY PARTIAL W/ANASTOMOSIS,1,147
COLECTOMY PRTL ABDOMINAL & TRANSANAL APPR,0,54
COLECTOMY PRTL ABDOMINAL & TRANSANAL APPROACH,0,335
COLECTOMY PRTL ABDOMINAL & TRANSANAL APPROACH,1,3


In [28]:
#Create BMI column
lbs_to_kg_ratio = 0.453592
inch_to_meter_ratio = 0.0254

data['HEIGHT'] *= inch_to_meter_ratio
data['WEIGHT'] *= lbs_to_kg_ratio

data['BMI'] = data['WEIGHT']/(data['HEIGHT']**2)

#BMI <10 set to 10, BMI >50 set to 50
data.loc[data['BMI'] > 60, 'BMI'] = 60
data.loc[data['BMI'] < 12, 'BMI'] = 12


In [29]:
#Convert 90+ to 91 and AGE to int32
data.loc[data['AGE'] == '90+', 'AGE'] = 91
pd.to_numeric(data['AGE'], downcast='integer')

CASEID
6629599     44.0
6629786     38.0
6630805     84.0
6630866     71.0
6631518     50.0
            ... 
10363850    53.0
10363894     NaN
10363959    65.0
10363964    41.0
10363966    72.0
Name: AGE, Length: 276232, dtype: float64

In [30]:
#condense 'CPT' categories to fewer/simpler categories
data.loc[data['CPT'] == 44204, 'COMBCPT'] = 'Laparoscopic partial colectomy'
data.loc[data['CPT'] == 44207, 'COMBCPT'] = 'Laparoscopic L colectomy'
data.loc[data['CPT'] == 44208, 'COMBCPT'] = 'Laparoscopic L colectomy'
data.loc[data['CPT'] == 44206, 'COMBCPT'] = 'Laparoscopic L colectomy'
data.loc[data['CPT'] == 44205, 'COMBCPT'] = 'Laparoscopic R colectomy'
data.loc[data['CPT'] == 44140, 'COMBCPT'] = 'Partial colectomy'
data.loc[data['CPT'] == 44144, 'COMBCPT'] = 'Partial colectomy'
data.loc[data['CPT'] == 44141, 'COMBCPT'] = 'Partial colectomy'
data.loc[data['CPT'] == 44160, 'COMBCPT'] = 'R colectomy'
data.loc[data['CPT'] == 44145, 'COMBCPT'] = 'L colectomy'
data.loc[data['CPT'] == 44143, 'COMBCPT'] = 'L colectomy'
data.loc[data['CPT'] == 44146, 'COMBCPT'] = 'L colectomy'
data.loc[data['CPT'] == 44146, 'COMBCPT'] = 'L colectomy'
data.loc[data['CPT'] == 44210, 'COMBCPT'] = 'Laparoscopic total colectomy'
data.loc[data['CPT'] == 44150, 'COMBCPT'] = 'Total colectomy'
data.loc[data['CPT'] == 44151, 'COMBCPT'] = 'Total colectomy'
data.loc[data['CPT'] == 44156, 'COMBCPT'] = 'Total colectomy'
data.loc[data['CPT'] == 44157, 'COMBCPT'] = 'Total colectomy'
data.loc[data['CPT'] == 44158, 'COMBCPT'] = 'Total colectomy'
data.loc[data['CPT'] == 45110, 'COMBCPT'] = 'APR'
data.loc[data['CPT'] == 45112, 'COMBCPT'] = 'APR'
data.loc[data['CPT'] == 45395, 'COMBCPT'] = 'Laparoscopic APR'
data.loc[data['CPT'] == 45130, 'COMBCPT'] = 'Proctectomy, perineal approach'
data.loc[data['CPT'] == 45123, 'COMBCPT'] = 'Proctectomy, perineal approach'
data.loc[data['CPT'] == 45135, 'COMBCPT'] = 'Proctectomy, perineal approach'
data.loc[data['CPT'] == 45121, 'COMBCPT'] = 'Proctectomy, perineal approach'
data.loc[data['CPT'] == 45395, 'COMBCPT'] = 'Laparoscopic LAR'
data.loc[data['CPT'] == 45111, 'COMBCPT'] = 'LAR'
data.loc[data['CPT'] == 45113, 'COMBCPT'] = 'LAR'
data.loc[data['CPT'] == 45119, 'COMBCPT'] = 'LAR'
data.loc[data['CPT'] == 45120, 'COMBCPT'] = 'LAR'
data.loc[data['CPT'] == 45397, 'COMBCPT'] = 'LAR'

data.loc[data['CPT'] == 44212, 'COMBCPT'] = 'Laparoscopic proctocolectomy'
data.loc[data['CPT'] == 44211, 'COMBCPT'] = 'Laparoscopic proctocolectomy'
data.loc[data['CPT'] == 44155, 'COMBCPT'] = 'Proctocolectomy'
data.loc[data['CPT'] == 44156, 'COMBCPT'] = 'Proctocolectomy'


data.loc[data['CPT'] == 45126, 'COMBCPT'] = 'Pelvic exenteration'

data.loc[data['CPT'] == 44147, 'COMBCPT'] = 'Colectomy, combined transanal approach'
data.loc[data['CPT'] == 45550, 'COMBCPT'] = 'Laparoscopic rectopexy'
data.loc[data['CPT'] == 45402, 'COMBCPT'] = 'Laparoscopic rectopexy'

data.loc[data['CPT'] == 45114, 'COMBCPT'] = 'Proctectomy, transsacral approach'
data.loc[data['CPT'] == 45160, 'COMBCPT'] = 'Proctectomy, transsacral approach'
data.loc[data['CPT'] == 45116, 'COMBCPT'] = 'Proctectomy, transsacral approach'

In [31]:
data.loc[data['COL_APPROACH'] == 'SILS', 'COL_APPROACH'] = 'Laparoscopic'
data.loc[data['COL_APPROACH'] == 'SILS w/ open assist', 'COL_APPROACH'] = 'Laparoscopic'
data.loc[data['COL_APPROACH'] == 'SILS w/ unplanned conversion to open', 'COL_APPROACH'] = 'Laparoscopic'

data.loc[data['COL_APPROACH'] == 'Endoscopic', 'COL_APPROACH'] = 'Laparoscopic'
data.loc[data['COL_APPROACH'] == 'Endoscopic w/ open assist', 'COL_APPROACH'] = 'Laparoscopic'
data.loc[data['COL_APPROACH'] == 'Endoscopic w/ unplanned conversion to open', 'COL_APPROACH'] = 'Laparoscopic'

data.loc[data['COL_APPROACH'] == 'NOTES', 'COL_APPROACH'] = 'Laparoscopic'
data.loc[data['COL_APPROACH'] == 'NOTES w/ open assist', 'COL_APPROACH'] = 'Laparoscopic'
data.loc[data['COL_APPROACH'] == 'NOTES w/ unplanned conversion to open', 'COL_APPROACH'] = 'Laparoscopic'

data.loc[data['COL_APPROACH'] == 'Other MIS approach', 'COL_APPROACH'] = 'Laparoscopic'
data.loc[data['COL_APPROACH'] == 'Other MIS approach w/ open assist', 'COL_APPROACH'] = 'Laparoscopic'
data.loc[data['COL_APPROACH'] == 'Other MIS approach w/ unplanned conversion to open', 'COL_APPROACH'] = 'Laparoscopic'

data.loc[data['COL_APPROACH'] == 'Hybrid', 'COL_APPROACH'] = 'Laparoscopic'
data.loc[data['COL_APPROACH'] == 'Hybrid w/ open assist', 'COL_APPROACH'] = 'Laparoscopic'
data.loc[data['COL_APPROACH'] == 'Hybrid w/ unplanned conversion to open', 'COL_APPROACH'] = 'Laparoscopic'

data.loc[data['COL_APPROACH'] == 'Robotic', 'COL_APPROACH'] = 'Robotic'
data.loc[data['COL_APPROACH'] == 'Robotic w/ open assist', 'COL_APPROACH'] = 'Robotic'
data.loc[data['COL_APPROACH'] == 'Robotic w/ unplanned conversion to open', 'COL_APPROACH'] = 'Robotic'

data.loc[data['COL_APPROACH'] == 'Laparoscopic w/ open assist', 'COL_APPROACH'] = 'Laparoscopic'
data.loc[data['COL_APPROACH'] == 'Laparoscopic Hand Assisted', 'COL_APPROACH'] = 'Laparoscopic'
data.loc[data['COL_APPROACH'] == 'Laparoscopic w/ unplanned conversion to Open', 'COL_APPROACH'] = 'Laparoscopic'

data.loc[data['COL_APPROACH'] == 'Open', 'COL_APPROACH'] = 'Open (planned)'

data['COL_APPROACH'].value_counts()

Laparoscopic                                    132108
Open (planned)                                  100463
Robotic                                          25555
Laparoscopic w/ unplanned conversion to open     17614
Other                                              405
Name: COL_APPROACH, dtype: int64

In [32]:
data.groupby('RACE_NEW')['ETHNICITY_HISPANIC'].value_counts()

RACE_NEW                             ETHNICITY_HISPANIC
American Indian or Alaska Native     No                      1029
                                     Yes                       92
Asian                                No                      8158
                                     Yes                       53
Black or African American            No                     23525
                                     Yes                      234
Native Hawaiian or Pacific Islander  No                       551
                                     Yes                       32
Unknown/Not Reported                 No                      5116
                                     Yes                     4394
White                                No                    190646
                                     Yes                     8937
Name: ETHNICITY_HISPANIC, dtype: int64

In [33]:
data['RACE'] = data['RACE_NEW']

In [34]:
data.loc[data['ETHNICITY_HISPANIC'] == 'Yes', 'RACE'] = 'Hispanic'
data['RACE'].value_counts(dropna=False)

White                                  194329
Unknown/Not Reported                    32965
Black or African American               24277
Hispanic                                13742
Asian                                    8244
NaN                                      1080
American Indian or Alaska Native         1039
Native Hawaiian or Pacific Islander       556
Name: RACE, dtype: int64

In [35]:
data['RACE'] = data['RACE'].fillna('Unknown/Not Reported')

#Replace 'Unknown/Not Reported' with 'Unknown'

data['RACE'] = data['RACE'].replace('Unknown/Not Reported', 'Unknown')
data['RACE'].value_counts(dropna=False)

White                                  194329
Unknown                                 34045
Black or African American               24277
Hispanic                                13742
Asian                                    8244
American Indian or Alaska Native         1039
Native Hawaiian or Pacific Islander       556
Name: RACE, dtype: int64

In [36]:
data.drop(['RACE_NEW', 'ETHNICITY_HISPANIC'], axis=1, inplace=True)

In [37]:
data['COL_INDICATION'].value_counts()

Colon cancer                         90518
Chronic diverticular disease         36189
Other-Enter ICD-10 for diagnosis     30063
Acute diverticulitis                 21806
Non-malignant polyp                  20128
Other-Enter ICD-9 for diagnosis      16393
Crohn's Disease                      16026
Colon cancer w/ obstruction          11920
Ulcerative colitis                    8688
Rectal cancer                         8571
Volvulus                              7141
Rectal prolapse                       2472
Bleeding                              1795
Enterocolitis (e.g. C. Difficile)     1471
Anal cancer                            420
Name: COL_INDICATION, dtype: int64

In [38]:
data.loc[data['COL_ICD9_INDICATION'] == '211.3', 'COL_INDICATION'] = 'Non-malignant polyp'
data['COL_INDICATION'].value_counts()

Colon cancer                         90515
Chronic diverticular disease         36189
Other-Enter ICD-10 for diagnosis     30063
Acute diverticulitis                 21806
Non-malignant polyp                  20851
Crohn's Disease                      16026
Other-Enter ICD-9 for diagnosis      15674
Colon cancer w/ obstruction          11919
Ulcerative colitis                    8688
Rectal cancer                         8571
Volvulus                              7141
Rectal prolapse                       2472
Bleeding                              1795
Enterocolitis (e.g. C. Difficile)     1471
Anal cancer                            420
Name: COL_INDICATION, dtype: int64

569.83 - perforation
560.9 - obstruction
557 - ischemic colitis
211.3 - polyp
596.1 - colovesicular fistula

K63.1 - perforation
K32.1 - ???
K57.20 - diverticulitis w/ perforation
K56.609 - obstruction, unspecified
K56.699 - obstruction, unspecified


In [39]:
data['COL_EMERGENT'].value_counts(dropna=False)

NaN                                                                             237496
Perforation                                                                      18366
Obstruction                                                                      10548
Other (enter ICD-10 code)                                                         3357
Toxic colitis (Toxic Megacolon, C. diff w/out perforation, Ischemic Colitis)      2668
Other (enter ICD-9 code)                                                          2014
Bleeding                                                                          1783
Name: COL_EMERGENT, dtype: int64

In [40]:
outcomes = ['RETURNOR','REOPORCPT1','REOPERATION1','REOPORICD91','REOPOR1ICD101','REOPERATION2','REOPOR2ICD91','REOPOR2ICD101','REOPOR2CPT1','PODIAG','PODIAGTX','PODIAG10','PODIAGTX10', 'OTHERCPT1', 'OTHERCPT2', 'OTHERCPT3', 'OTHERCPT4', 'OTHERCPT5', 'OTHERCPT6', 'OTHERCPT7', 'OTHERCPT8', 'OTHERCPT9', 'OTHERCPT10', 'CONCPT1', 'CONCPT2','CONCPT3','CONCPT4','CONCPT5','CONCPT6','CONCPT7','CONCPT8','CONCPT9','CONCPT10', 'PODIAG_OTHER', 'PODIAG_OTHER10','STENT']

In [41]:
data.drop(outcomes, axis=1, inplace=True)

In [42]:
num_cols = ['AGE','HEIGHT','WEIGHT','BMI','PRSODM','PRBUN','PRCREAT','PRALBUM','PRBILI','PRSGOT','PRALKPH','PRWBC','PRHCT','PRPLATE','PRPTT','PRINR','PRPT','HTOODAY']
cat_cols = list(set(data.columns) - set(num_cols) - set(outcomes) - set(['CASEID','ETHNICITY_HISPANIC','RACE_NEW']))
cat_cols.append('RACE')

In [43]:
data_imputed = data.copy()

In [44]:
data_imputed.to_csv('table1_data.csv')

In [51]:
for col in cat_cols:
    data_imputed[col].fillna(value='Unknown', inplace=True)

In [52]:
data_imputed['OPERYR'].value_counts()

2019.0     51250
2018.0     46007
2017.0     42715
2016.0     40125
2015.0     31307
2014.0     25262
2013.0     21505
2012.0     16981
Unknown     1080
Name: OPERYR, dtype: int64

In [53]:
col_missing = data_imputed.isnull().sum()/data.shape[0]
col_missing = col_missing.sort_values(ascending=False)
col_missing = col_missing[col_missing > 0]
(pd.DataFrame(col_missing)).head(20)

Unnamed: 0,0
PRPT,0.992861
PRPTT,0.641273
PRINR,0.509622
PRSGOT,0.300458
PRALKPH,0.280985
PRBILI,0.280445
PRALBUM,0.278701
PRBUN,0.102696
PRSODM,0.07115
PRCREAT,0.064909


In [54]:
drop = ['PRPT', 'PRPTT','PRSGOT', 'PRALKPH','PRBILI','PRINR','PRALBUM']
data_imputed.drop(drop, axis=1, inplace=True)

In [55]:
num_cols = list(set(num_cols) - set(drop))

In [56]:
num_imputer = SimpleImputer(strategy = 'median')

# Imputation for numeric:
for ncol in num_cols:
    data_imputed[ncol] = num_imputer.fit_transform(data_imputed[ncol].values.reshape(-1, 1))

In [58]:
for col in num_cols:
    data_imputed[col] = RobustScaler().fit_transform(data_imputed[col].values.reshape(-1,1))

In [59]:
le=LabelEncoder()
for col in cat_cols:
        data_imputed[col] = le.fit_transform(data_imputed[col].astype(str))

In [60]:
data_imputed.groupby('OPERYR')['URETER'].value_counts()

OPERYR  URETER
0       0         16895
        1            86
1       0         21348
        1           157
2       0         25107
        1           155
3       0         31086
        1           221
4       0         39896
        1           229
5       0         42475
        1           240
6       0         45769
        1           238
7       0         50949
        1           301
8       0          1080
Name: URETER, dtype: int64

In [61]:
data_imputed = data_imputed[data_imputed['OPERYR'] != 8]
data_imputed.groupby('OPERYR')['URETER'].value_counts()

OPERYR  URETER
0       0         16895
        1            86
1       0         21348
        1           157
2       0         25107
        1           155
3       0         31086
        1           221
4       0         39896
        1           229
5       0         42475
        1           240
6       0         45769
        1           238
7       0         50949
        1           301
Name: URETER, dtype: int64

In [63]:
data_imputed.to_csv(r'ui_procol.csv')

In [64]:
data_imputed['OPERYR'].value_counts()
data19 = data_imputed[data_imputed['OPERYR'] == 7]

drop19 = data_imputed[data_imputed['OPERYR'] != 7]

data19 = data19.drop(['OPERYR'], axis=1)

drop19 = drop19.drop(['OPERYR'], axis=1)

In [65]:
data19.to_csv(r'procol_test.csv')
drop19.to_csv(r'procol_train.csv')


In [92]:
#dum_cols is cat_cols minus columns including variables including string CPT or ICD
dum_cols = cat_cols
for x in cat_cols:
    if 'CPT' in x or 'ICD' in x:
        dum_cols.remove(x)


In [95]:
dum_cols.remove('URETER')

In [98]:
dum_data = pd.get_dummies(data_imputed, columns=dum_cols)

In [99]:
test19 = dum_data[dum_data['OPERYR'] == 7]
train = dum_data[dum_data['OPERYR'] != 7]
test19 = test19.drop(['OPERYR'], axis=1)
train = train.drop(['OPERYR'], axis=1)

In [100]:
print(train.shape, test19.shape)

(223902, 218) (51250, 218)


In [101]:
test19.to_csv('procol_test_oh.csv')
train.to_csv('procol_train_oh.csv')