In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.impute import SimpleImputer

In [2]:
#send data to rename.ipynb to rename all categories from numbers into words
data = pd.read_csv('data/data_rename.csv')
print(data.shape)
data.head()


(18978, 29)


Unnamed: 0,Facility Type,Facility Location,Age,Sex,Race,Spanish Hispanic Origin,Primary Payor,Urban/Rural Classification,Distance From Facility,Charlson-Deyo Score,...,Medicaid Expansion,"Radiation Dose, Rads",APR,"Radiation Dose, >30 Gy",Stage,T Stage,N Stage,"Facility Volume, Quartile",Keratinizing,Basaloid
0,Academic/Research Program,Midwest,52.0,Female,White,Non-Hispanic,Not Insured,Urban,3.6,0,...,Non-Expansion State,0.0,0.0,0.0,2A,2.0,0.0,1.0,0,0
1,Academic/Research Program,West,49.0,Female,White,Non-Hispanic,Private Insurance or Managed Care,Urban,11.4,0,...,Early Expansion (before 1/2014),,0.0,,3A,2.0,1.0,1.0,0,1
2,Comprehensive Community Cancer Program,South,88.0,Female,White,Non-Hispanic,Medicare/Public,Suburban,32.0,0,...,Non-Expansion State,,0.0,,2B,3.0,0.0,3.0,0,0
3,Integrated Network Cancer Program,West,53.0,Female,White,Non-Hispanic,Medicare/Public,Suburban,,3 or more,...,Early Expansion (before 1/2014),,0.0,,2B,3.0,0.0,1.0,0,0
4,Community Cancer Program,South,53.0,Female,White,Non-Hispanic,Private Insurance or Managed Care,Urban,3.9,0,...,Late Expansion States (after 1/2014),5400.0,0.0,1.0,3C,3.0,1.0,4.0,0,0


In [3]:
data['APR'].value_counts(normalize=True)

0.0    0.946517
1.0    0.053483
Name: APR, dtype: float64

In [4]:
data['Stage'].value_counts()

2A    6916
3A    3498
3C    3058
1     2518
2B    2107
3B     878
2        3
Name: Stage, dtype: int64

In [5]:
#ordinal encode the 'Stage' column
data.loc[data['Stage'] == '2A', 'Stage'] = 2
data.loc[data['Stage'] == '2B', 'Stage'] = 3
data.loc[data['Stage'] == '2C', 'Stage'] = 4
data.loc[data['Stage'] == '3A', 'Stage'] = 5
data.loc[data['Stage'] == '3B', 'Stage'] = 6
data.loc[data['Stage'] == '3C', 'Stage'] = 7
data['Stage'].value_counts()


2    6916
5    3498
7    3058
1    2518
3    2107
6     878
2       3
Name: Stage, dtype: int64

In [6]:
data['Charlson-Deyo Score'].value_counts()

0            15669
1             2187
3 or more      626
2              496
Name: Charlson-Deyo Score, dtype: int64

In [7]:
data.loc[data['Charlson-Deyo Score'] == '0', 'Charlson-Deyo Score'] = 0
data.loc[data['Charlson-Deyo Score'] == '1', 'Charlson-Deyo Score'] = 1
data.loc[data['Charlson-Deyo Score'] == '2', 'Charlson-Deyo Score'] = 2
data.loc[data['Charlson-Deyo Score'] == '3 or more', 'Charlson-Deyo Score'] = 2
data['Charlson-Deyo Score'].value_counts()

0    15669
1     2187
2     1122
Name: Charlson-Deyo Score, dtype: int64

In [8]:
#sort columns by proportion of missing values
pd.DataFrame(data.isnull().sum()/data.shape[0]).sort_values(by=0, ascending=False).head(20)

Unnamed: 0,0
Lymphovascular Invasion,0.856097
"Radiation Dose, >30 Gy",0.588576
"Radiation Dose, Rads",0.588576
Tumor Size,0.260512
"Duration of Radiation, Days",0.240752
Median Income Quartile,0.108547
No High School Degree (%),0.106966
Distance From Facility,0.096638
Chemotherapy,0.044314
Spanish Hispanic Origin,0.042259


In [9]:
num_cols = ['Age', 'Distance From Facility', 'Tumor Size', 'Diagnosis/Radiation Interval, Days', 'Duration of Radiation, Days', 'Radiation Dose, Rads']
cat_cols = [x for x in data.columns if x not in num_cols]
cat_cols.remove('APR')
cat_cols.remove('YEAR_OF_DIAGNOSIS')

In [10]:
#set datatype of categorical columns to object
cat_imputer = SimpleImputer(strategy='constant', fill_value='unknown')
for col in cat_cols:
    data[col] = data[col].astype(str)
for x in cat_cols:
    data[x] = cat_imputer.fit_transform(data[x].values.reshape(-1,1))
num_imputer = SimpleImputer(strategy='median')
for x in num_cols:
    data[x] = num_imputer.fit_transform(data[x].values.reshape(-1,1))
#sort columns by proportion of missing values
pd.DataFrame(data.isnull().sum()/data.shape[0]).sort_values(by=0, ascending=False).head(5)

Unnamed: 0,0
Facility Type,0.0
"Duration of Radiation, Days",0.0
Keratinizing,0.0
"Facility Volume, Quartile",0.0
N Stage,0.0


In [11]:
data.head()

Unnamed: 0,Facility Type,Facility Location,Age,Sex,Race,Spanish Hispanic Origin,Primary Payor,Urban/Rural Classification,Distance From Facility,Charlson-Deyo Score,...,Medicaid Expansion,"Radiation Dose, Rads",APR,"Radiation Dose, >30 Gy",Stage,T Stage,N Stage,"Facility Volume, Quartile",Keratinizing,Basaloid
0,Academic/Research Program,Midwest,52.0,Female,White,Non-Hispanic,Not Insured,Urban,3.6,0,...,Non-Expansion State,0.0,0.0,0.0,2,2.0,0.0,1.0,0,0
1,Academic/Research Program,West,49.0,Female,White,Non-Hispanic,Private Insurance or Managed Care,Urban,11.4,0,...,Early Expansion (before 1/2014),5040.0,0.0,,5,2.0,1.0,1.0,0,1
2,Comprehensive Community Cancer Program,South,88.0,Female,White,Non-Hispanic,Medicare/Public,Suburban,32.0,0,...,Non-Expansion State,5040.0,0.0,,3,3.0,0.0,3.0,0,0
3,Integrated Network Cancer Program,West,53.0,Female,White,Non-Hispanic,Medicare/Public,Suburban,8.7,2,...,Early Expansion (before 1/2014),5040.0,0.0,,3,3.0,0.0,1.0,0,0
4,Community Cancer Program,South,53.0,Female,White,Non-Hispanic,Private Insurance or Managed Care,Urban,3.9,0,...,Late Expansion States (after 1/2014),5400.0,0.0,1.0,7,3.0,1.0,4.0,0,0


In [12]:
pd.crosstab(data['Lymphovascular Invasion'], data['YEAR_OF_DIAGNOSIS'])

YEAR_OF_DIAGNOSIS,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
Lymphovascular Invasion,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
No,0,0,0,0,0,0,288,221,278,296,294,287,294,251,58,26
Yes,0,0,0,0,0,0,39,44,37,52,65,54,56,52,26,13
,798,829,934,1013,1134,1238,932,1132,1196,1296,1448,1418,1554,1102,197,26


In [13]:
pd.crosstab(data['Lymphovascular Invasion'], data['YEAR_OF_DIAGNOSIS'], normalize='columns')

YEAR_OF_DIAGNOSIS,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
Lymphovascular Invasion,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
No,0.0,0.0,0.0,0.0,0.0,0.0,0.228753,0.158196,0.183984,0.180049,0.162701,0.163161,0.154412,0.178648,0.206406,0.4
Yes,0.0,0.0,0.0,0.0,0.0,0.0,0.030977,0.031496,0.024487,0.03163,0.035971,0.030699,0.029412,0.037011,0.092527,0.2
,1.0,1.0,1.0,1.0,1.0,1.0,0.74027,0.810308,0.791529,0.788321,0.801328,0.80614,0.816176,0.784342,0.701068,0.4


In [14]:
data.loc[data['Lymphovascular Invasion'] == 'No', 'Lymphovascular Invasion'] = 0
data.loc[data['Lymphovascular Invasion'] == 'Yes', 'Lymphovascular Invasion'] = 2
data.loc[data['Lymphovascular Invasion'] == 'nan', 'Lymphovascular Invasion'] = 1
data['Lymphovascular Invasion'].value_counts()

1    16247
0     2293
2      438
Name: Lymphovascular Invasion, dtype: int64

In [15]:
data['Median Income Quartile'].value_counts()

>=$46,000            5860
$35,000 - $45,999    3981
$30,000 - $34,999    3931
< $30,000            3146
nan                  2060
Name: Median Income Quartile, dtype: int64

In [16]:
data.loc[data['Median Income Quartile'] == 'nan', 'Median Income Quartile'] = 0
data.loc[data['Median Income Quartile'] == '< $30,000', 'Median Income Quartile'] = 1
data.loc[data['Median Income Quartile'] == '$30,000 - $34,999', 'Median Income Quartile'] = 2
data.loc[data['Median Income Quartile'] == '$35,000 - $45,999', 'Median Income Quartile'] = 3
data.loc[data['Median Income Quartile'] == '>=$46,000', 'Median Income Quartile'] = 4
data['Median Income Quartile'].value_counts()

4    5860
3    3981
2    3931
1    3146
0    2060
Name: Median Income Quartile, dtype: int64

In [17]:
data['No High School Degree (%)'].value_counts()

7.0-12.9%     4742
13.0-20.9%    4545
<7.0%         4265
>=21.0%       3396
nan           2030
Name: No High School Degree (%), dtype: int64

In [18]:
data.loc[data['No High School Degree (%)'] == 'nan', 'No High School Degree (%)'] = 0
data.loc[data['No High School Degree (%)'] == '<7.0%', 'No High School Degree (%)'] = 1
data.loc[data['No High School Degree (%)'] == '7.0-12.9%', 'No High School Degree (%)'] = 2
data.loc[data['No High School Degree (%)'] == '13.0-20.9%', 'No High School Degree (%)'] = 3
data.loc[data['No High School Degree (%)'] == '>=21.0%', 'No High School Degree (%)'] = 4
data['No High School Degree (%)'].value_counts()


2    4742
3    4545
1    4265
4    3396
0    2030
Name: No High School Degree (%), dtype: int64

In [19]:
no_encode = ['Charlseon-Deyo Score', 'Lymphovascular Invasion', 'Stage', 'YEAR_OF_DIAGNOSIS', 'Median Income Quartile', 'No High School Degree (%)']
for x in no_encode:
    try:
        cat_cols.remove(x)
    except ValueError:
        pass
for x in cat_cols:
    le = LabelEncoder()
    le.fit(data[x])
    keys = le.classes_
    values = le.transform(le.classes_)
    label_dict = dict(zip(keys, values))
    print('col:', x)
    print(label_dict)
    data[x] = le.transform(data[x])

col: Facility Type
{'Academic/Research Program': 0, 'Community Cancer Program': 1, 'Comprehensive Community Cancer Program': 2, 'Integrated Network Cancer Program': 3, 'nan': 4}
col: Facility Location
{'Midwest': 0, 'NE': 1, 'South': 2, 'West': 3, 'nan': 4}
col: Sex
{'Female': 0, 'Male': 1}
col: Race
{'Asian': 0, 'Black': 1, 'Other': 2, 'White': 3, 'nan': 4}
col: Spanish Hispanic Origin
{'Hispanic': 0, 'Non-Hispanic': 1, 'nan': 2}
col: Primary Payor
{'Medicaid': 0, 'Medicare/Public': 1, 'Not Insured': 2, 'Private Insurance or Managed Care': 3, 'nan': 4}
col: Urban/Rural Classification
{'Rural': 0, 'Suburban': 1, 'Urban': 2, 'nan': 3}
col: Charlson-Deyo Score
{'0': 0, '1': 1, '2': 2}
col: Grade
{'Cell type not determined, not stated or not applicable': 0, 'Grade I, Well differentiated': 1, 'Grade II, Moderately differentiated': 2, 'Grade III, Poorly differentiated': 3, 'Grade IV, Undifferentiated or anaplastic': 4, 'nan': 5}
col: Chemotherapy
{'Multi-agent': 0, 'None': 1, 'Single-agent'

In [20]:
#apply the standard scaler to the numerical columns
scaler = StandardScaler()
for x in num_cols:
    data[x] = scaler.fit_transform(data[x].values.reshape(-1,1))

In [21]:
print(data.shape[0] * 0.8)
print(data.shape[0] * 0.2)

15182.400000000001
3795.6000000000004


In [22]:
pd.crosstab(data['YEAR_OF_DIAGNOSIS'], data['APR'], normalize='index')

APR,0.0,1.0
YEAR_OF_DIAGNOSIS,Unnamed: 1_level_1,Unnamed: 2_level_1
2004,0.929825,0.070175
2005,0.942099,0.057901
2006,0.946467,0.053533
2007,0.943731,0.056269
2008,0.939153,0.060847
2009,0.947496,0.052504
2010,0.945989,0.054011
2011,0.963493,0.036507
2012,0.965586,0.034414
2013,0.961679,0.038321


In [23]:
pd.crosstab(data['YEAR_OF_DIAGNOSIS'], data['APR'])

APR,0.0,1.0
YEAR_OF_DIAGNOSIS,Unnamed: 1_level_1,Unnamed: 2_level_1
2004,742,56
2005,781,48
2006,884,50
2007,956,57
2008,1065,69
2009,1173,65
2010,1191,68
2011,1346,51
2012,1459,52
2013,1581,63


In [24]:
data['YEAR_OF_DIAGNOSIS'] = data['YEAR_OF_DIAGNOSIS'].astype(int)

In [25]:
data = data.loc[data['YEAR_OF_DIAGNOSIS'] != 2019]
# data = data.loc[data['YEAR_OF_DIAGNOSIS'] != 2018]

In [26]:
test = data.loc[data['YEAR_OF_DIAGNOSIS'] >= 2016]
train = data.loc[data['YEAR_OF_DIAGNOSIS'] < 2016]
print(train.shape, test.shape)

(15323, 29) (3590, 29)


In [27]:
print(test.shape[0] / data.shape[0])

0.1898165283138582


In [28]:
data.iloc[:,0:15].head()

Unnamed: 0,Facility Type,Facility Location,Age,Sex,Race,Spanish Hispanic Origin,Primary Payor,Urban/Rural Classification,Distance From Facility,Charlson-Deyo Score,YEAR_OF_DIAGNOSIS,Grade,Tumor Size,Lymphovascular Invasion,"Diagnosis/Radiation Interval, Days"
0,0,0,-0.700299,0,3,1,2,2,-0.190978,0,2016,2,-0.084534,0,1.993489
1,0,3,-0.96907,0,3,1,3,2,-0.113262,0,2004,2,0.221891,1,-0.936185
2,2,2,2.524952,0,3,1,1,1,0.091987,0,2014,0,-0.135605,1,0.351096
3,3,3,-0.610709,0,3,1,1,1,-0.140164,2,2011,3,-0.135605,0,-0.980574
4,1,2,-0.610709,0,3,1,3,2,-0.187989,0,2015,2,1.498662,1,0.17354


In [29]:
data.iloc[:,15:30].head()

Unnamed: 0,"Duration of Radiation, Days",Chemotherapy,No High School Degree (%),Median Income Quartile,Medicaid Expansion,"Radiation Dose, Rads",APR,"Radiation Dose, >30 Gy",Stage,T Stage,N Stage,"Facility Volume, Quartile",Keratinizing,Basaloid
0,-0.16199,0,4,1,2,-2.181779,0.0,0,2,2,0,0,0,0
1,-0.16199,0,2,3,0,0.367529,0.0,2,5,2,1,0,0,1
2,2.392217,3,4,1,2,0.367529,0.0,2,3,3,0,2,0,0
3,-0.059822,0,0,0,0,0.367529,0.0,2,3,3,0,0,0,0
4,-0.264158,0,4,3,1,0.549623,0.0,1,7,3,1,3,0,0


In [30]:
test.to_csv('data/test.csv', index=False)
train.to_csv('data/train.csv', index=False)

In [31]:
train['YEAR_OF_DIAGNOSIS'].value_counts()

2014    1807
2015    1759
2013    1644
2012    1511
2011    1397
2010    1259
2009    1238
2008    1134
2007    1013
2006     934
2005     829
2004     798
Name: YEAR_OF_DIAGNOSIS, dtype: int64

In [32]:
print(train['APR'].value_counts())
print(train['APR'].value_counts(normalize=True))

0.0    14604
1.0      719
Name: APR, dtype: int64
0.0    0.953077
1.0    0.046923
Name: APR, dtype: float64


In [33]:
print(test['APR'].value_counts())
print(test['APR'].value_counts(normalize=True))

0.0    3359
1.0     231
Name: APR, dtype: int64
0.0    0.935655
1.0    0.064345
Name: APR, dtype: float64
