In [1]:
import pandas as pd
import numpy as np
from tableone import TableOne
#show more columns
pd.set_option('display.max_columns', None)

In [2]:
#create a dataframe with the counts and percentages of a variable by year
def by_year(df, col):
    cts = pd.crosstab(df['YEAR_OF_DIAGNOSIS'], df[col])
    percs = pd.crosstab(df['YEAR_OF_DIAGNOSIS'], df[col], normalize='index')
    percs = percs.round(2) * 100
    percs = percs.astype(str)
    cts = cts.astype(str)
    comb = cts + ' (' + percs + '%)'
    comb['total'] = df['YEAR_OF_DIAGNOSIS'].value_counts().sort_index()
    return comb
def by_stage(df, col):
    cts = pd.crosstab(df['stage'], df[col])
    percs = pd.crosstab(df['stage'], df[col], normalize='index')
    percs = percs.round(2) * 100
    percs = percs.astype(str)
    cts = cts.astype(str)
    comb = cts + ' (' + percs + '%)'
    comb['total'] = df['stage'].value_counts().sort_index()
    return comb

In [3]:
data = pd.read_csv('data/table1_data.csv', index_col=0)

In [4]:
data.head()

Unnamed: 0_level_0,FACILITY_TYPE_CD,FACILITY_LOCATION_CD,AGE,SEX,RACE,SPANISH_HISPANIC_ORIGIN,INSURANCE_STATUS,UR_CD_13,CROWFLY,CDCC_TOTAL_BEST,YEAR_OF_DIAGNOSIS,HISTOLOGY,GRADE,TUMOR_SIZE,LYMPH_VASCULAR_INVASION,DX_RAD_STARTED_DAYS,RAD_ELAPSED_RX_DAYS,RX_SUMM_CHEMO,NO_HSD_QUAR_2016,MED_INC_QUAR_2016,PUF_MEDICAID_EXPN_CODE,TOTAL_DOSE,APR,"Radiation Dose, >30 Gy",stage,t_stage,n_stage,m_stage,facs_quart
Case Key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1
D66f458f9-bdc0-4c95-94a3-a5f8c5bb74f9,3.0,5.0,52,2,1.0,0,0,2.0,3.6,0,2016,8070,2.0,38.0,0.0,86.0,,3.0,1.0,1.0,0,0.0,0.0,0.0,2A,2.0,0.0,0,1.0
Dfc68ea23-71aa-4b0d-87c0-587b4ce13d04,3.0,9.0,49,2,1.0,0,1,1.0,11.4,0,2004,8083,2.0,44.0,,20.0,,3.0,3.0,3.0,2,,0.0,,3A,2.0,1.0,0,1.0
D7a846888-d7d5-42ab-bd7f-8dd4e268adba,2.0,3.0,88,2,1.0,0,3,4.0,32.0,0,2014,8070,9.0,,9.0,49.0,70.0,1.0,1.0,1.0,0,,0.0,,2B,3.0,0.0,0,3.0
D6a94c44a-86b9-4b34-8f6b-bcc0da65cc59,4.0,9.0,53,2,1.0,0,3,5.0,,3,2011,8070,3.0,,0.0,19.0,46.0,3.0,,,2,,0.0,,2B,3.0,0.0,0,1.0
D9733f795-06f9-44bd-a94c-cfff0110c969,1.0,7.0,53,2,1.0,0,1,3.0,3.9,0,2015,8070,2.0,69.0,9.0,45.0,44.0,3.0,1.0,3.0,3,5400.0,0.0,1.0,3C,3.0,1.0,0,4.0


In [5]:
data.shape

(18978, 29)

In [6]:
cols = data.columns.to_list()

In [7]:
cols

['FACILITY_TYPE_CD',
 'FACILITY_LOCATION_CD',
 'AGE',
 'SEX',
 'RACE',
 'SPANISH_HISPANIC_ORIGIN',
 'INSURANCE_STATUS',
 'UR_CD_13',
 'CROWFLY',
 'CDCC_TOTAL_BEST',
 'YEAR_OF_DIAGNOSIS',
 'HISTOLOGY',
 'GRADE',
 'TUMOR_SIZE',
 'LYMPH_VASCULAR_INVASION',
 'DX_RAD_STARTED_DAYS',
 'RAD_ELAPSED_RX_DAYS',
 'RX_SUMM_CHEMO',
 'NO_HSD_QUAR_2016',
 'MED_INC_QUAR_2016',
 'PUF_MEDICAID_EXPN_CODE',
 'TOTAL_DOSE',
 'APR',
 'Radiation Dose, >30 Gy',
 'stage',
 't_stage',
 'n_stage',
 'm_stage',
 'facs_quart']

In [8]:
cols = data.columns.to_list()
num_cols = ['AGE', 'CROWFLY', 'TUMOR_SIZE', 'Radiation Dose, Rads', 'DX_RAD_STARTED_DAYS', 'RAD_ELAPSED_DAYS', 'TOTAL_DOSE']
cat_cols = [x for x in cols if x not in num_cols]

In [9]:
#convert all values in cat_cols to strings except for NaN
for col in cat_cols:
    data[col] = data[col].astype(str)

data.loc[data['X'] == 'Y', 'X'] = 'Z'

data['X'].value_counts(dropna=False)

data.loc[data['X'] == 'nan', 'X'] = np.NaN
data['X'].fillna('Unknown', inplace=True)
data.loc[data['X'] == "Unknown", 'X'] = np.NaN
data['X'].value_counts(dropna=False)

In [10]:
data.rename(columns={'RX_SUMM_CHEMO':'Chemotherapy'}, inplace=True)

In [11]:
data.loc[data['Chemotherapy'] == '3.0', 'Chemotherapy'] = 'Multi-agent'
data.loc[data['Chemotherapy'] == '2.0', 'Chemotherapy'] = 'Single-agent'
data.loc[data['Chemotherapy'] == '1.0', 'Chemotherapy'] = np.NaN
data.loc[data['Chemotherapy'] == '0.0', 'Chemotherapy'] = 'None'
data.loc[data['Chemotherapy'] == '88.0', 'Chemotherapy'] = np.NaN
data.loc[data['Chemotherapy'] == '82.0', 'Chemotherapy'] = 'None'
data.loc[data['Chemotherapy'] == '87.0', 'Chemotherapy'] = 'None'
data.loc[data['Chemotherapy'] == '86.0', 'Chemotherapy'] = 'None'
data.loc[data['Chemotherapy'] == 'nan', 'Chemotherapy'] = np.NaN
data = data[data['Chemotherapy'] != '85.0']
data['Chemotherapy'].value_counts(dropna=False)

Multi-agent     15995
Single-agent     1598
NaN               841
None              544
Name: Chemotherapy, dtype: int64

In [12]:
data.rename({'FACILITY_LOCATION_CD': 'Facility Location'}, axis=1, inplace=True)

In [13]:
data['Facility Location'].value_counts()

3.0    4124
4.0    3235
9.0    2763
2.0    2666
6.0    1483
5.0    1316
1.0    1178
7.0     970
8.0     777
nan     466
Name: Facility Location, dtype: int64

In [14]:
data.loc[data['Facility Location'] == '1.0', 'Facility Location'] = 'NE'
data.loc[data['Facility Location'] == '2.0', 'Facility Location'] = 'NE'
data.loc[data['Facility Location'] == '3.0', 'Facility Location'] = 'South'
data.loc[data['Facility Location'] == '4.0', 'Facility Location'] = 'Midwest'
data.loc[data['Facility Location'] == '5.0', 'Facility Location'] = 'Midwest'
data.loc[data['Facility Location'] == '6.0', 'Facility Location'] = 'Midwest'
data.loc[data['Facility Location'] == '7.0', 'Facility Location'] = 'South'
data.loc[data['Facility Location'] == '8.0', 'Facility Location'] = 'West'
data.loc[data['Facility Location'] == '9.0', 'Facility Location'] = 'West'


In [15]:
data.loc[data['Facility Location'] == 'nan', 'Facility Location'] = np.NaN
data['Facility Location'].fillna('Unknown', inplace=True)
data.loc[data['Facility Location'] == "Unknown", 'Facility Location'] = np.NaN
data['Facility Location'].value_counts(dropna=False)

Midwest    6034
South      5094
NE         3844
West       3540
NaN         466
Name: Facility Location, dtype: int64

In [16]:
data.rename({'SEX':'Sex'}, axis=1, inplace=True)

In [17]:
data['Sex'].value_counts()


2    13872
1     5106
Name: Sex, dtype: int64

In [18]:
data.loc[data['Sex'] == '1', 'Sex'] = 'Male'
data.loc[data['Sex'] == '2', 'Sex'] = 'Female'
data.loc[data['Sex'] == 'nan', 'Sex'] = np.NaN
data['Sex'].fillna('Unknown', inplace=True)
data.loc[data['Sex'] == "Unknown", 'Sex'] = np.NaN
data['Sex'].value_counts(dropna=False)

Female    13872
Male       5106
Name: Sex, dtype: int64

In [19]:
data.rename({'RACE':'Race'}, axis=1, inplace=True)

In [20]:
data['Race'].value_counts()

1.0     16770
2.0      1732
nan       142
98.0      125
3.0        60
96.0       31
5.0        22
15.0       19
4.0        19
8.0        15
6.0        15
10.0        9
7.0         6
97.0        4
13.0        4
16.0        3
21.0        1
11.0        1
Name: Race, dtype: int64

#Specific labels from the PUF Data Dictionary

data.loc[data['Race'] == '1.0', 'Race'] = 'White'

data.loc[data['Race'] == '2.0', 'Race'] = 'Black'

data.loc[data['Race'] == '3.0', 'Race'] = 'American Indian, Aleutian, or Eskimo'

data.loc[data['Race'] == '4.0', 'Race'] = 'Chinese'
data.loc[data['Race'] == '5.0', 'Race'] = 'Japanese'
data.loc[data['Race'] == '8.0', 'Race'] = 'Korean'

data.loc[data['Race'] == '6.0', 'Race'] = 'Filipino'
data.loc[data['Race'] == '7.0', 'Race'] = 'Hawaiian'
data.loc[data['Race'] == '20.0', 'Race'] = 'Micronesian, NOS'
data.loc[data['Race'] == '22.0', 'Race'] = 'Guamanian, NOS'
data.loc[data['Race'] == '27.0', 'Race'] = 'Samoan'
data.loc[data['Race'] == '28.0', 'Race'] = 'Tongan'
data.loc[data['Race'] == '31.0', 'Race'] = 'Fiji Islander'
data.loc[data['Race'] == '32.0', 'Race'] = 'New Guinean'
data.loc[data['Race'] == '97.0', 'Race'] = 'Pacific Islander, NOS'

data.loc[data['Race'] == '10.0', 'Race'] = 'Vietnamese'
data.loc[data['Race'] == '11.0', 'Race'] = 'Laotian'
data.loc[data['Race'] == '12.0', 'Race'] = 'Hmong'
data.loc[data['Race'] == '13.0', 'Race'] = 'Kampuchean (including Khmer and Cambodian)'
data.loc[data['Race'] == '14.0', 'Race'] = 'Thai'
data.loc[data['Race'] == '30.0', 'Race'] = 'Melanesian, NOS'

data.loc[data['Race'] == '15.0', 'Race'] = 'Asian Indian or Pakistani, NOS (formerly code 09)'
data.loc[data['Race'] == '16.0', 'Race'] = 'Asian Indian'
data.loc[data['Race'] == '17.0', 'Race'] = 'Pakistani'

data.loc[data['Race'] == '96.0', 'Race'] = 'Other Asian, including Asian, NOS andOriental, NOS'
data.loc[data['Race'] == '98.0', 'Race'] = 'Other'

In [21]:
data.loc[data['Race'] == '1.0', 'Race'] = 'White'

data.loc[data['Race'] == '2.0', 'Race'] = 'Black'

data.loc[data['Race'] == '3.0', 'Race'] = 'Asian'

data.loc[data['Race'] == '4.0', 'Race'] = 'Asian'
data.loc[data['Race'] == '5.0', 'Race'] = 'Asian'
data.loc[data['Race'] == '8.0', 'Race'] = 'Asian'

data.loc[data['Race'] == '6.0', 'Race'] = 'Asian'
data.loc[data['Race'] == '7.0', 'Race'] = 'Asian'
data.loc[data['Race'] == '20.0', 'Race'] = 'Asian'
data.loc[data['Race'] == '21.0', 'Race'] = 'Asian'
data.loc[data['Race'] == '22.0', 'Race'] = 'Asian'
data.loc[data['Race'] == '27.0', 'Race'] = 'Asian'
data.loc[data['Race'] == '28.0', 'Race'] = 'Asian'
data.loc[data['Race'] == '31.0', 'Race'] = 'Asian'
data.loc[data['Race'] == '32.0', 'Race'] = 'Asian'
data.loc[data['Race'] == '97.0', 'Race'] = 'Asian'

data.loc[data['Race'] == '10.0', 'Race'] = 'Asian'
data.loc[data['Race'] == '11.0', 'Race'] = 'Asian'
data.loc[data['Race'] == '12.0', 'Race'] = 'Asian'
data.loc[data['Race'] == '13.0', 'Race'] = 'Asian'
data.loc[data['Race'] == '14.0', 'Race'] = 'Asian'
data.loc[data['Race'] == '30.0', 'Race'] = 'Asian'

data.loc[data['Race'] == '15.0', 'Race'] = 'Asian'
data.loc[data['Race'] == '16.0', 'Race'] = 'Asian'
data.loc[data['Race'] == '17.0', 'Race'] = 'Asian'

data.loc[data['Race'] == '96.0', 'Race'] = 'Other'
data.loc[data['Race'] == '98.0', 'Race'] = 'Other'


data.loc[data['Race'] == 'nan', 'Race'] = np.NaN
data['Race'].fillna('Unknown', inplace=True)
data.loc[data['Race'] == "Unknown", 'Race'] = np.NaN
data['Race'].value_counts(dropna=False)

White    16770
Black     1732
Asian      178
Other      156
NaN        142
Name: Race, dtype: int64

In [22]:
data.rename({'SPANISH_HISPANIC_ORIGIN': 'Spanish Hispanic Origin'}, axis=1, inplace=True)

In [23]:
data['Spanish Hispanic Origin'].value_counts()

0    17333
9      802
6      484
1       88
2       84
4       77
3       38
5       28
7       27
8       17
Name: Spanish Hispanic Origin, dtype: int64

In [24]:
data.loc[data['Spanish Hispanic Origin'] == '0', 'Spanish Hispanic Origin'] = 'Non-Hispanic'
data.loc[data['Spanish Hispanic Origin'] == '1', 'Spanish Hispanic Origin'] = 'Hispanic'
data.loc[data['Spanish Hispanic Origin'] == '2', 'Spanish Hispanic Origin'] = 'Hispanic'
data.loc[data['Spanish Hispanic Origin'] == '3', 'Spanish Hispanic Origin'] = 'Hispanic'
data.loc[data['Spanish Hispanic Origin'] == '4', 'Spanish Hispanic Origin'] = 'Hispanic'
data.loc[data['Spanish Hispanic Origin'] == '5', 'Spanish Hispanic Origin'] = 'Hispanic'
data.loc[data['Spanish Hispanic Origin'] == '6', 'Spanish Hispanic Origin'] = 'Hispanic'
data.loc[data['Spanish Hispanic Origin'] == '7', 'Spanish Hispanic Origin'] = 'Hispanic'
data.loc[data['Spanish Hispanic Origin'] == '8', 'Spanish Hispanic Origin'] = 'Hispanic'
data.loc[data['Spanish Hispanic Origin'] == '9', 'Spanish Hispanic Origin'] = 'Unknown'

data.loc[data['Spanish Hispanic Origin'] == 'nan', 'Spanish Hispanic Origin'] = np.NaN
data['Spanish Hispanic Origin'].fillna('Unknown', inplace=True)
data.loc[data['Spanish Hispanic Origin'] == "Unknown", 'Spanish Hispanic Origin'] = np.NaN
data['Spanish Hispanic Origin'].value_counts(dropna=False)

Non-Hispanic    17333
Hispanic          843
NaN               802
Name: Spanish Hispanic Origin, dtype: int64

In [25]:
data.rename({'INSURANCE_STATUS': 'Primary Payor'}, axis=1, inplace=True)

In [26]:
data['Primary Payor'].value_counts()

1    9296
3    6548
2    1632
0     884
4     337
9     281
Name: Primary Payor, dtype: int64

In [27]:
data.loc[data['Primary Payor'] == '0', 'Primary Payor'] = 'Not Insured'
data.loc[data['Primary Payor'] == '1', 'Primary Payor'] = 'Private Insurance or Managed Care'
data.loc[data['Primary Payor'] == '2', 'Primary Payor'] = 'Medicaid'
data.loc[data['Primary Payor'] == '3', 'Primary Payor'] = 'Medicare/Public'
data.loc[data['Primary Payor'] == '4', 'Primary Payor'] = 'Medicare/Public'
data.loc[data['Primary Payor'] == '9', 'Primary Payor'] = 'Unknown'

data.loc[data['Primary Payor'] == 'nan', 'Primary Payor'] = np.NaN
data['Primary Payor'].fillna('Unknown', inplace=True)
data.loc[data['Primary Payor'] == "Unknown", 'Primary Payor'] = np.NaN
data['Primary Payor'].value_counts(dropna=False)

Private Insurance or Managed Care    9296
Medicare/Public                      6885
Medicaid                             1632
Not Insured                           884
NaN                                   281
Name: Primary Payor, dtype: int64

In [28]:
data.rename({'MED_INC_QUAR_2016': 'Median Income Quartile'}, axis=1, inplace=True)

In [29]:
data['Median Income Quartile'].value_counts()

4.0    5860
3.0    3981
2.0    3931
1.0    3146
nan    2060
Name: Median Income Quartile, dtype: int64

In [30]:
data.loc[data['Median Income Quartile'] == '1.0', 'Median Income Quartile'] = '< $30,000'
data.loc[data['Median Income Quartile'] == '2.0', 'Median Income Quartile'] = '$30,000 - $34,999'
data.loc[data['Median Income Quartile'] == '3.0', 'Median Income Quartile'] = '$35,000 - $45,999'
data.loc[data['Median Income Quartile'] == '4.0', 'Median Income Quartile'] = '>=$46,000'

data.loc[data['Median Income Quartile'] == 'nan', 'Median Income Quartile'] = np.NaN
data['Median Income Quartile'].fillna('Unknown', inplace=True)
data.loc[data['Median Income Quartile'] == "Unknown", 'Median Income Quartile'] = np.NaN
data['Median Income Quartile'].value_counts(dropna=False)

>=$46,000            5860
$35,000 - $45,999    3981
$30,000 - $34,999    3931
< $30,000            3146
NaN                  2060
Name: Median Income Quartile, dtype: int64

In [31]:
data.rename({'NO_HSD_QUAR_2016':'No High School Degree (%)'}, axis=1, inplace=True)

In [32]:
data['No High School Degree (%)'].value_counts()

3.0    4742
2.0    4545
4.0    4265
1.0    3396
nan    2030
Name: No High School Degree (%), dtype: int64

In [33]:
data.loc[data['No High School Degree (%)'] == '1.0', 'No High School Degree (%)'] = '>=21.0%'
data.loc[data['No High School Degree (%)'] == '2.0', 'No High School Degree (%)'] = '13.0-20.9%'
data.loc[data['No High School Degree (%)'] == '3.0', 'No High School Degree (%)'] = '7.0-12.9%'
data.loc[data['No High School Degree (%)'] == '4.0', 'No High School Degree (%)'] = '<7.0%'

data.loc[data['No High School Degree (%)'] == 'nan', 'No High School Degree (%)'] = np.NaN
data['No High School Degree (%)'].fillna('Unknown', inplace=True)
data.loc[data['No High School Degree (%)'] == "Unknown", 'No High School Degree (%)'] = np.NaN
data['No High School Degree (%)'].value_counts(dropna=False)

7.0-12.9%     4742
13.0-20.9%    4545
<7.0%         4265
>=21.0%       3396
NaN           2030
Name: No High School Degree (%), dtype: int64

In [34]:
data.rename({'UR_CD_13':'Urban/Rural Classification'}, axis=1, inplace=True)

In [35]:
data['Urban/Rural Classification'].value_counts()

1.0    9855
2.0    3878
3.0    1944
6.0     967
4.0     817
nan     530
7.0     360
5.0     317
8.0     161
9.0     149
Name: Urban/Rural Classification, dtype: int64

In [36]:
data.loc[data['Urban/Rural Classification'] == '1.0', 'Urban/Rural Classification'] = 'Urban'
data.loc[data['Urban/Rural Classification'] == '2.0', 'Urban/Rural Classification'] = 'Urban'
data.loc[data['Urban/Rural Classification'] == '3.0', 'Urban/Rural Classification'] = 'Urban'
data.loc[data['Urban/Rural Classification'] == '4.0', 'Urban/Rural Classification'] = 'Suburban'
data.loc[data['Urban/Rural Classification'] == '5.0', 'Urban/Rural Classification'] = 'Suburban'
data.loc[data['Urban/Rural Classification'] == '6.0', 'Urban/Rural Classification'] = 'Suburban'
data.loc[data['Urban/Rural Classification'] == '7.0', 'Urban/Rural Classification'] = 'Rural'
data.loc[data['Urban/Rural Classification'] == '8.0', 'Urban/Rural Classification'] = 'Rural'
data.loc[data['Urban/Rural Classification'] == '9.0', 'Urban/Rural Classification'] = 'Rural'

data.loc[data['Urban/Rural Classification'] == 'nan', 'Urban/Rural Classification'] = np.NaN
data['Urban/Rural Classification'].fillna('Unknown', inplace=True)
data.loc[data['Urban/Rural Classification'] == "Unknown", 'Urban/Rural Classification'] = np.NaN
data['Urban/Rural Classification'].value_counts(dropna=False)

Urban       15677
Suburban     2101
Rural         670
NaN           530
Name: Urban/Rural Classification, dtype: int64

In [37]:
data.rename({'CDCC_TOTAL_BEST':'Charlson-Deyo Score'}, axis=1, inplace=True)

In [38]:
data['Charlson-Deyo Score'].value_counts()


0    15669
1     2187
3      626
2      496
Name: Charlson-Deyo Score, dtype: int64

In [39]:
data.loc[data['Charlson-Deyo Score'] == '0', 'Charlson-Deyo Score'] = '0'
data.loc[data['Charlson-Deyo Score'] == '1', 'Charlson-Deyo Score'] = '1'
data.loc[data['Charlson-Deyo Score'] == '2', 'Charlson-Deyo Score'] = '2'
data.loc[data['Charlson-Deyo Score'] == '3', 'Charlson-Deyo Score'] = '3 or more'


data.loc[data['Charlson-Deyo Score'] == 'nan', 'Charlson-Deyo Score'] = np.NaN
data['Charlson-Deyo Score'].fillna('Unknown', inplace=True)
data.loc[data['Charlson-Deyo Score'] == "Unknown", 'Charlson-Deyo Score'] = np.NaN
data['Charlson-Deyo Score'].value_counts(dropna=False)

0            15669
1             2187
3 or more      626
2              496
Name: Charlson-Deyo Score, dtype: int64

In [40]:
data.rename({'HISTOLOGY': 'Histology'}, axis=1, inplace=True)

In [41]:
data['Histology'].value_counts()


8070    14607
8083     1478
8071     1272
8072      683
8124      482
8123      226
8010       81
8560       31
8051       27
8481       25
8076       23
8073       18
8000       14
8074        6
8120        3
8075        2
Name: Histology, dtype: int64

In [42]:
data.loc[data['Histology'] == '8000', 'Histology'] = 'Neoplasm, benign'
data.loc[data['Histology'] == '8010', 'Histology'] = 'Carcinoma, NOS'
data.loc[data['Histology'] == '8012', 'Histology'] = 'Large cell carcinoma, NOS'
data.loc[data['Histology'] == '8013', 'Histology'] = 'Large cell neuroendocrine carcinoma'
data.loc[data['Histology'] == '8014', 'Histology'] = 'Large cell carcinoma with rhabdoid phenotype'
data.loc[data['Histology'] == '8020', 'Histology'] = 'Dedifferentiated carcinoma'
data.loc[data['Histology'] == '8021', 'Histology'] = 'Carcinoma, anaplastic, NOS'
data.loc[data['Histology'] == '8022', 'Histology'] = 'Pleomorphic carcinoma'
data.loc[data['Histology'] == '8033', 'Histology'] = 'Pseudosarcomatous carcinoma'
data.loc[data['Histology'] == '8041', 'Histology'] = 'Small cell carcinoma, NOS'
data.loc[data['Histology'] == '8046', 'Histology'] = 'Non-small cell carcinoma'
data.loc[data['Histology'] == '8070', 'Histology'] = 'Squamous cell carcinoma in situ, NOS'
data.loc[data['Histology'] == '8071', 'Histology'] = 'Sq. cell carcinoma, keratinizing, NOS'
data.loc[data['Histology'] == '8072', 'Histology'] = 'Basaloid squamous cell carcinoma'
data.loc[data['Histology'] == '8074', 'Histology'] = 'Squamous cell carcinoma, spindle cell'
data.loc[data['Histology'] == '8083', 'Histology'] = 'Basaloid squamous cell carcinoma'
data.loc[data['Histology'] == '8084', 'Histology'] = 'Clear cell acanthoma'
data.loc[data['Histology'] == '8120', 'Histology'] = 'Urothelial papilloma, NOS'
data.loc[data['Histology'] == '8123', 'Histology'] = 'Basaloid carcinoma'
data.loc[data['Histology'] == '8124', 'Histology'] = 'Cloacogenic carcinoma'
data.loc[data['Histology'] == '8140', 'Histology'] = 'Adenoma, NOS'
data.loc[data['Histology'] == '8144', 'Histology'] = 'Adenoma, intestinal type'
data.loc[data['Histology'] == '8201', 'Histology'] = 'Cribriform carcinoma in situ'
data.loc[data['Histology'] == '8210', 'Histology'] = 'Adenomatous polyp, NOS'
data.loc[data['Histology'] == '8211', 'Histology'] = 'Tubular adenoma, NOS'
data.loc[data['Histology'] == '8213', 'Histology'] = 'Serrated adenoma, NOS'
data.loc[data['Histology'] == '8220', 'Histology'] = 'Adenomatous polyposis coli'
data.loc[data['Histology'] == '8221', 'Histology'] = 'Multiple adenomatous polyps'
data.loc[data['Histology'] == '8230', 'Histology'] = 'Ductal carcinoma in situ, solid type'
data.loc[data['Histology'] == '8240', 'Histology'] = 'Neuroendocrine tumor, NOS'
data.loc[data['Histology'] == '8243', 'Histology'] = 'Goblet cell carcinoid'
data.loc[data['Histology'] == '8244', 'Histology'] = 'Mixed adenoneuroendocrine carcinoma'
data.loc[data['Histology'] == '8245', 'Histology'] = 'Tubular carcinoid'
data.loc[data['Histology'] == '8246', 'Histology'] = 'Neuroendocrine carcinoma, NOS'
data.loc[data['Histology'] == '8249', 'Histology'] = 'Neuroendocrine tumor, grade 2'
data.loc[data['Histology'] == '8255', 'Histology'] = 'Adenocarcinoma with mixed subtypes'
data.loc[data['Histology'] == '8260', 'Histology'] = 'Papillary adenoma, NOS'
data.loc[data['Histology'] == '8261', 'Histology'] = 'Villous adenoma, NOS'
data.loc[data['Histology'] == '8262', 'Histology'] = 'Villous adenocarcinoma'
data.loc[data['Histology'] == '8263', 'Histology'] = 'Tubulovillous adenoma, NOS'
data.loc[data['Histology'] == '8310', 'Histology'] = 'Clear cell adenoma'
data.loc[data['Histology'] == '8323', 'Histology'] = 'Mixed cell adenoma'
data.loc[data['Histology'] == '8401', 'Histology'] = 'Apocrine adenoma'
data.loc[data['Histology'] == '8410', 'Histology'] = 'Sebaceoma'
data.loc[data['Histology'] == '8470', 'Histology'] = 'Mucinous cystadenoma, NOS'
data.loc[data['Histology'] == '8480', 'Histology'] = 'Mucinous adenoma'
data.loc[data['Histology'] == '8481', 'Histology'] = 'Mucin-producing adenocarcinoma'
data.loc[data['Histology'] == '8482', 'Histology'] = 'Mucinous carcinoma, gastric type'
data.loc[data['Histology'] == '8490', 'Histology'] = 'Signet ring cell carcinoma'
data.loc[data['Histology'] == '8507', 'Histology'] = 'Intraductal micropapillary carcinoma'
data.loc[data['Histology'] == '8510', 'Histology'] = 'Medullary carcinoma, NOS'
data.loc[data['Histology'] == '8542', 'Histology'] = 'Paget disease, extramammary'
data.loc[data['Histology'] == '8560', 'Histology'] = 'Adenosquamous carcinoma'
data.loc[data['Histology'] == '8570', 'Histology'] = 'Adenocarcinoma with squamous metaplasia'
data.loc[data['Histology'] == '8571', 'Histology'] = 'Adenocarcinoma with cartilaginous and osseous metaplasia'
data.loc[data['Histology'] == '8574', 'Histology'] = 'Adenocarcinoma with neuroendocrine differentiation'
data.loc[data['Histology'] == '8576', 'Histology'] = 'Hepatoid adenocarcinoma'
data.loc[data['Histology'] == '8936', 'Histology'] = 'Gastrointestinal stromal tumor'


data.loc[data['Histology'] == 'nan', 'Histology'] = np.NaN
data['Histology'].fillna('Unknown', inplace=True)
data.loc[data['Histology'] == "Unknown", 'Histology'] = np.NaN
data['Histology'].value_counts(dropna=False)

Squamous cell carcinoma in situ, NOS     14607
Basaloid squamous cell carcinoma          2161
Sq. cell carcinoma, keratinizing, NOS     1272
Cloacogenic carcinoma                      482
Basaloid carcinoma                         226
Carcinoma, NOS                              81
Adenosquamous carcinoma                     31
8051                                        27
Mucin-producing adenocarcinoma              25
8076                                        23
8073                                        18
Neoplasm, benign                            14
Squamous cell carcinoma, spindle cell        6
Urothelial papilloma, NOS                    3
8075                                         2
Name: Histology, dtype: int64

In [43]:
data['Keratinizing'] = data['Histology']
data.loc[data['Keratinizing'].str.contains('Keratinizing', case=False, na=False), 'Keratinizing'] = 1
data.loc[data['Keratinizing'] != 1, 'Keratinizing'] = 0
data['Keratinizing'].value_counts(dropna=False, normalize=True)

0    0.932975
1    0.067025
Name: Keratinizing, dtype: float64

In [44]:
data['Basaloid'] = data['Histology']
data.loc[data['Basaloid'].str.contains('Basaloid', case=False, na=False), 'Basaloid'] = 1
data.loc[data['Basaloid'] != 1, 'Basaloid'] = 0
data['Basaloid'].value_counts(dropna=False, normalize=True)

0    0.874223
1    0.125777
Name: Basaloid, dtype: float64

In [45]:
data.drop('Histology', axis=1, inplace=True)

In [46]:
data.rename(columns=({'GRADE': 'Grade'}), inplace=True)

In [47]:
data['Grade'].value_counts()

2.0    6324
9.0    5931
3.0    5294
1.0    1196
9       116
4.0     110
H         7
Name: Grade, dtype: int64

In [48]:
data.loc[data['Grade'] == '1.0', 'Grade'] = 'Grade I, Well differentiated'
data.loc[data['Grade'] == '2.0', 'Grade'] = 'Grade II, Moderately differentiated'
data.loc[data['Grade'] == '3.0', 'Grade'] = 'Grade III, Poorly differentiated'
data.loc[data['Grade'] == '4.0', 'Grade'] = 'Grade IV, Undifferentiated or anaplastic'
data.loc[data['Grade'] == '9.0', 'Grade'] = 'Cell type not determined, not stated or not applicable'
data.loc[data['Grade'] == '9', 'Grade'] = 'Cell type not determined, not stated or not applicable'

data.loc[data['Grade'] == 'H', 'Grade'] = np.NaN
data.loc[data['Grade'] == 'nan', 'Grade'] = np.NaN
data['Grade'].fillna('Unknown', inplace=True)
data.loc[data['Grade'] == "Unknown", 'Grade'] = np.NaN
data['Grade'].value_counts(dropna=False)

Grade II, Moderately differentiated                       6324
Cell type not determined, not stated or not applicable    6047
Grade III, Poorly differentiated                          5294
Grade I, Well differentiated                              1196
Grade IV, Undifferentiated or anaplastic                   110
NaN                                                          7
Name: Grade, dtype: int64

In [49]:
data.rename(columns=({'LYMPH_VASCULAR_INVASION': 'Lymphovascular Invasion'}), inplace=True)

In [50]:
data['Lymphovascular Invasion'].value_counts()


9.0    10300
nan     5946
0.0     2293
1.0      438
8.0        1
Name: Lymphovascular Invasion, dtype: int64

In [51]:
data.loc[data['Lymphovascular Invasion'] == '0.0', 'Lymphovascular Invasion'] = 'No'
data.loc[data['Lymphovascular Invasion'] == '1.0', 'Lymphovascular Invasion'] = 'Yes'
data.loc[data['Lymphovascular Invasion'] == '8.0', 'Lymphovascular Invasion'] = 'Unknown'
data.loc[data['Lymphovascular Invasion'] == '9.0', 'Lymphovascular Invasion'] = 'Unknown'
data.loc[data['Lymphovascular Invasion'] == 'nan', 'Lymphovascular Invasion'] = 'Unknown'


data.loc[data['Lymphovascular Invasion'] == 'nan', 'Lymphovascular Invasion'] = np.NaN
data['Lymphovascular Invasion'].fillna('Unknown', inplace=True)
data.loc[data['Lymphovascular Invasion'] == "Unknown", 'Lymphovascular Invasion'] = np.NaN
data['Lymphovascular Invasion'].value_counts(dropna=False)

NaN    16247
No      2293
Yes      438
Name: Lymphovascular Invasion, dtype: int64

In [52]:
data.rename(columns=({'PUF_MEDICAID_EXPN_CODE': 'Medicaid Expansion'}), inplace=True)

In [53]:
data.loc[data['Medicaid Expansion'] == '0', 'Medicaid Expansion'] = 'Non-Expansion State'
data.loc[data['Medicaid Expansion'] == '1', 'Medicaid Expansion'] = 'Early Expansion (before 1/2014)'
data.loc[data['Medicaid Expansion'] == '2', 'Medicaid Expansion'] = 'Early Expansion (before 1/2014)'
data.loc[data['Medicaid Expansion'] == '3', 'Medicaid Expansion'] = 'Late Expansion States (after 1/2014)'
data.loc[data['Medicaid Expansion'] == '9', 'Medicaid Expansion'] = 'Suppressed for Ages 0-39'

data.loc[data['Medicaid Expansion'] == 'nan', 'Medicaid Expansion'] = np.NaN
data.loc[data['Medicaid Expansion'] == 'None assigned', 'Medicaid Expansion'] = np.NaN
data['Medicaid Expansion'].fillna('Unknown', inplace=True)
data.loc[data['Medicaid Expansion'] == "Unknown", 'Medicaid Expansion'] = np.NaN
data['Medicaid Expansion'].value_counts(dropna=False)

Early Expansion (before 1/2014)         9420
Non-Expansion State                     6701
Late Expansion States (after 1/2014)    2391
Suppressed for Ages 0-39                 466
Name: Medicaid Expansion, dtype: int64

In [54]:
data.rename(columns=({'FACILITY_TYPE_CD': 'Facility Type'}), inplace=True)

In [55]:
data.loc[data['Facility Type'] == '1.0', 'Facility Type'] = 'Community Cancer Program'
data.loc[data['Facility Type'] == '2.0', 'Facility Type'] = 'Comprehensive Community Cancer Program'
data.loc[data['Facility Type'] == '3.0', 'Facility Type'] = 'Academic/Research Program'
data.loc[data['Facility Type'] == '4.0', 'Facility Type'] = 'Integrated Network Cancer Program'

data.loc[data['Facility Type'] == 'nan', 'Facility Type'] = 'Unknown'
data.loc[data['Facility Type'] == 'None assigned', 'Facility Type'] = np.NaN
data['Facility Type'].fillna('Unknown', inplace=True)
data.loc[data['Facility Type'] == "Unknown", 'Facility Type'] = np.NaN
data['Facility Type'].value_counts(dropna=False)

Comprehensive Community Cancer Program    7399
Academic/Research Program                 6183
Integrated Network Cancer Program         3601
Community Cancer Program                  1329
NaN                                        466
Name: Facility Type, dtype: int64

In [56]:
data.loc[data['Facility Type'] == '1.0', 'Facility Type'] = 'Community Cancer Program'
data.loc[data['Facility Type'] == '2.0', 'Facility Type'] = 'Comprehensive Community Cancer Program'
data.loc[data['Facility Type'] == '3.0', 'Facility Type'] = 'Academic/Research Program'
data.loc[data['Facility Type'] == '4.0', 'Facility Type'] = 'Integrated Network Cancer Program'

data.loc[data['Facility Type'] == 'nan', 'Facility Type'] = 'Unknown'
data.loc[data['Facility Type'] == 'None assigned', 'Facility Type'] = np.NaN
data['Facility Type'].fillna('Unknown', inplace=True)
data.loc[data['Facility Type'] == "Unknown", 'Facility Type'] = np.NaN
data['Facility Type'].value_counts(dropna=False)

Comprehensive Community Cancer Program    7399
Academic/Research Program                 6183
Integrated Network Cancer Program         3601
Community Cancer Program                  1329
NaN                                        466
Name: Facility Type, dtype: int64

In [57]:
data.rename(columns=({'DX_RAD_STARTED_DAYS': 'Diagnosis/Radiation Interval, Days', 'RAD_ELAPSED_RX_DAYS': 'Duration of Radiation, Days', 'TOTAL_DOSE': 'Radiation Dose, Rads'}), inplace=True)

In [58]:
data.head()

Unnamed: 0_level_0,Facility Type,Facility Location,AGE,Sex,Race,Spanish Hispanic Origin,Primary Payor,Urban/Rural Classification,CROWFLY,Charlson-Deyo Score,YEAR_OF_DIAGNOSIS,Grade,TUMOR_SIZE,Lymphovascular Invasion,"Diagnosis/Radiation Interval, Days","Duration of Radiation, Days",Chemotherapy,No High School Degree (%),Median Income Quartile,Medicaid Expansion,"Radiation Dose, Rads",APR,"Radiation Dose, >30 Gy",stage,t_stage,n_stage,m_stage,facs_quart,Keratinizing,Basaloid
Case Key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1
D66f458f9-bdc0-4c95-94a3-a5f8c5bb74f9,Academic/Research Program,Midwest,52,Female,White,Non-Hispanic,Not Insured,Urban,3.6,0,2016,"Grade II, Moderately differentiated",38.0,No,86.0,,Multi-agent,>=21.0%,"< $30,000",Non-Expansion State,0.0,0.0,0.0,2A,2.0,0.0,0,1.0,0,0
Dfc68ea23-71aa-4b0d-87c0-587b4ce13d04,Academic/Research Program,West,49,Female,White,Non-Hispanic,Private Insurance or Managed Care,Urban,11.4,0,2004,"Grade II, Moderately differentiated",44.0,,20.0,,Multi-agent,7.0-12.9%,"$35,000 - $45,999",Early Expansion (before 1/2014),,0.0,,3A,2.0,1.0,0,1.0,0,1
D7a846888-d7d5-42ab-bd7f-8dd4e268adba,Comprehensive Community Cancer Program,South,88,Female,White,Non-Hispanic,Medicare/Public,Suburban,32.0,0,2014,"Cell type not determined, not stated or not ap...",,,49.0,70.0,,>=21.0%,"< $30,000",Non-Expansion State,,0.0,,2B,3.0,0.0,0,3.0,0,0
D6a94c44a-86b9-4b34-8f6b-bcc0da65cc59,Integrated Network Cancer Program,West,53,Female,White,Non-Hispanic,Medicare/Public,Suburban,,3 or more,2011,"Grade III, Poorly differentiated",,No,19.0,46.0,Multi-agent,,,Early Expansion (before 1/2014),,0.0,,2B,3.0,0.0,0,1.0,0,0
D9733f795-06f9-44bd-a94c-cfff0110c969,Community Cancer Program,South,53,Female,White,Non-Hispanic,Private Insurance or Managed Care,Urban,3.9,0,2015,"Grade II, Moderately differentiated",69.0,,45.0,44.0,Multi-agent,>=21.0%,"$35,000 - $45,999",Late Expansion States (after 1/2014),5400.0,0.0,1.0,3C,3.0,1.0,0,4.0,0,0


In [59]:
num_cols

['AGE',
 'CROWFLY',
 'TUMOR_SIZE',
 'Radiation Dose, Rads',
 'DX_RAD_STARTED_DAYS',
 'RAD_ELAPSED_DAYS',
 'TOTAL_DOSE']

In [60]:
data.rename({'AGE': 'Age', 'CROWFLY': 'Distance From Facility', 'TUMOR_SIZE': 'Tumor Size'}, axis=1, inplace=True)

In [61]:
num_cols = ['Age', 'Distance From Facility', 'Tumor Size', 'Diagnosis/Radiation Interval, Days', 'Duration of Radiation, Days', 'Radiation Dose, Rads']

In [62]:
for x in num_cols:
    data.loc[data[x] == 'Unknown', x] = np.NaN

In [63]:
for x in data.columns.to_list():
    data.loc[data[x] == 'Unknown', x] = np.NaN
    data.loc[data[x] == 'nan', x] = np.NaN

In [64]:
data.rename(columns=({'stage': 'Stage',
                      't_stage': 'T Stage',
                      'n_stage': 'N Stage',
                      'facs_quart' : 'Facility Volume, Quartile'
                      }), inplace=True)

In [65]:
data.drop(['m_stage'], axis=1, inplace=True)

In [66]:
data['YEAR_OF_DIAGNOSIS'] = data['YEAR_OF_DIAGNOSIS'].astype(int)

In [67]:
data.shape

(18978, 29)

In [68]:
data.to_csv('data/data_rename.csv', index=False)

In [69]:
data['APR'].value_counts()

0.0    17963
1.0     1015
Name: APR, dtype: int64

In [70]:
data['APR'].value_counts(normalize=True)

0.0    0.946517
1.0    0.053483
Name: APR, dtype: float64

In [71]:
cols = data.columns.to_list()
num_cols = ['Age', 'Distance From Facility', 'Tumor Size', 'Diagnosis/Radiation Interval, Days', 'Duration of Radiation, Days', 'Radiation Dose, Rads']
cat_cols = [x for x in cols if x not in num_cols]

In [72]:
incl = data.columns.to_list()
incl.remove('APR')
cat_cols.remove('APR')

In [73]:
tableone = TableOne(data, columns=incl, categorical=cat_cols, groupby='APR', pval=True)
tableone.to_excel('results/table1_apr.xlsx')

  df['percent'] = df['freq'].div(df.freq.sum(level=0),
  df['percent'] = df['freq'].div(df.freq.sum(level=0),
