In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.impute import SimpleImputer

In [2]:
# Load the cleaned dataset; the first CSV column is used as the row index.
# Original author's note: the data goes through rename.ipynb, which renames all
# categories from numbers into words — TODO confirm that step precedes this file.
data = pd.read_csv('data/data_os_clean.csv', index_col=0)
print(data.shape)
data.head()


(33790, 35)


Unnamed: 0_level_0,Facility Type,Facility Location,Age,Sex,Race,Hispanic Ethnicity,Insurance Status,Urban/Rural,Distance from Hospital,Charlson-Deyo Score,...,Stage,T Stage,N Stage,M Stage,Pathologic Stage,Facility Volume Quartile,3-yr Mortality,Keratinizing,Basaloid,Cloacogenic
PUF_CASE_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
D66f458f9-bdc0-4c95-94a3-a5f8c5bb74f9,Academic/Research Program,Midwest,52.0,Female,White,Non-Hispanic,Not Insured,Urban,3.6,0,...,2A,2.0,0.0,0,No surgery,3.0,0.0,Not Keratinizing,Not Basaloid,Not Cloacogenic
Dfc68ea23-71aa-4b0d-87c0-587b4ce13d04,Academic/Research Program,West,49.0,Female,White,Non-Hispanic,Private Insurance or Managed Care,Urban,11.4,0,...,3A,2.0,1.0,0,No surgery,1.0,0.0,Not Keratinizing,Basaloid,Not Cloacogenic
D54a1d0da-4328-4ca5-8481-87cd73b33dd4,Academic/Research Program,Midwest,58.0,Male,White,Non-Hispanic,Medicaid,Urban,14.0,0,...,2B,3.0,0.0,0,No surgery,3.0,0.0,Not Keratinizing,Not Basaloid,Not Cloacogenic
D62170955-5fc5-434e-8e6a-6a9e139851c4,Academic/Research Program,Midwest,80.0,Female,White,Non-Hispanic,Medicare/Public,Urban,,0,...,1,1.0,0.0,0,No surgery,3.0,0.0,Not Keratinizing,Not Basaloid,Not Cloacogenic
Dbfa7903c-bd06-4e1c-9d22-a3d39ed96aee,Integrated Network Cancer Program,Midwest,42.0,Female,White,Non-Hispanic,Private Insurance or Managed Care,Urban,12.7,0,...,1,1.0,0.0,0,No surgery,4.0,0.0,Not Keratinizing,Not Basaloid,Not Cloacogenic


In [3]:
# Row counts per raw 'Stage' label, before the labels are recoded to integers.
data['Stage'].value_counts()

2A    11551
3A     5748
3C     5601
1      5557
2B     3685
3B     1648
Name: Stage, dtype: int64

In [4]:
# Recode the string stage labels to integer codes 0-5, in the order listed.
# Labels not listed here are left untouched.
stage_codes = {'1': 0, '2A': 1, '2B': 2, '3A': 3, '3B': 4, '3C': 5}
for raw_label, code in stage_codes.items():
    data.loc[data['Stage'] == raw_label, 'Stage'] = code
data['Stage'].value_counts()

1    11551
3     5748
5     5601
0     5557
2     3685
4     1648
Name: Stage, dtype: int64

In [5]:
# Row counts per raw 'Charlson-Deyo Score' label, before recoding to integers.
data['Charlson-Deyo Score'].value_counts()

0            26922
1             4201
3 or more     1575
2             1092
Name: Charlson-Deyo Score, dtype: int64

In [6]:
# Recode the string Charlson-Deyo labels to integers; '3 or more' collapses to 3.
# Labels not listed here are left untouched.
charlson_codes = {'0': 0, '1': 1, '2': 2, '3 or more': 3}
for raw_label, code in charlson_codes.items():
    data.loc[data['Charlson-Deyo Score'] == raw_label, 'Charlson-Deyo Score'] = code
data['Charlson-Deyo Score'].value_counts()

0    26922
1     4201
3     1575
2     1092
Name: Charlson-Deyo Score, dtype: int64

In [7]:
# Fraction of missing values per column, largest first (top 20 shown).
missing_share = data.isnull().sum() / data.shape[0]
pd.DataFrame(missing_share).sort_values(by=0, ascending=False).head(20)


Unnamed: 0,0
Reason for No Surgery,0.970761
Lymphovascular Invasion,0.825806
Post-op Length of Stay,0.798875
Total Dose,0.627493
Tumor Size,0.269074
"Income, Quartile",0.107073
"No High School Degree, Quartile",0.105564
Distance from Hospital,0.096123
Surgical Margins,0.050222
Chemotherapy,0.047292


In [8]:
# Fill the two surgery-related columns explicitly instead of leaving them to the
# generic imputers further down.
# The result is assigned back rather than calling fillna(inplace=True) on
# data[col]: the inplace form acts on a column selection, which pandas 2.x warns
# about and which does not update `data` once Copy-on-Write is enabled.
data['Reason for No Surgery'] = data['Reason for No Surgery'].fillna('Not indicated')
# Missing length of stay becomes 0 — presumably rows without surgery; TODO confirm.
data['Post-op Length of Stay'] = data['Post-op Length of Stay'].fillna(0)

In [9]:
# Missing surgical margins are labelled 'No Surgery'. The result is assigned back
# instead of using fillna(inplace=True) on a column selection, which pandas 2.x
# warns about and which does not update `data` under Copy-on-Write.
data['Surgical Margins'] = data['Surgical Margins'].fillna('No Surgery')

In [10]:
# Split the columns into numeric features (listed by hand) and everything else,
# which is treated as categorical from here on.
cols = list(data.columns)
num_cols = [
    'Age',
    'Distance from Hospital',
    'Tumor Size',
    'Treatment Started, Days from Dx',
    'Post-op Length of Stay',
    'Total Dose',
]
cat_cols = [name for name in cols if name not in num_cols]

In [11]:
# Repeats the 'Post-op Length of Stay' fill done in an earlier cell, so it is a
# no-op once that cell has run. Written as an assignment because
# fillna(inplace=True) on a column selection does not update `data` under
# pandas Copy-on-Write.
data['Post-op Length of Stay'] = data['Post-op Length of Stay'].fillna(0)

In [12]:
# These two columns already had their missing values filled explicitly, so they
# are held out of the string cast / imputation step below; a later cell adds
# them back before label encoding.
# A filter is used instead of list.remove() so that re-running this cell does
# not raise ValueError.
held_out_cols = ('Reason for No Surgery', 'Surgical Margins')
cat_cols = [name for name in cat_cols if name not in held_out_cols]

In [13]:
# Cast each categorical column to str, then pass it through the constant imputer.
# NOTE(review): astype(str) turns NaN into the literal string 'nan' before the
# imputer sees it, so the imputer finds nothing to fill and 'unknown' is never
# written. Missing categoricals stay as 'nan', and later cells rely on that
# (the Lymphovascular Invasion recode compares against 'nan'). Changing the
# sentinel must be coordinated with those cells.
cat_imputer = SimpleImputer(strategy='constant', fill_value='unknown')
for col in cat_cols:
    data[col] = data[col].astype(str)
    as_column = data[col].to_numpy().reshape(-1, 1)
    data[col] = cat_imputer.fit_transform(as_column)[:, 0]

# Numeric columns: fill missing values with the column median.
# NOTE(review): the median is computed over all rows, i.e. before the
# year-based train/test split further down — confirm this is acceptable.
num_imputer = SimpleImputer(strategy='median')
for col in num_cols:
    as_column = data[col].to_numpy().reshape(-1, 1)
    data[col] = num_imputer.fit_transform(as_column)[:, 0]

# Re-check the fraction of missing values per column, largest first.
missing_share = data.isnull().sum() / data.shape[0]
pd.DataFrame(missing_share).sort_values(by=0, ascending=False).head(5)

Unnamed: 0,0
Facility Type,0.0
T Stage,0.0
"No High School Degree, Quartile",0.0
"Income, Quartile",0.0
Medicaid Expansion,0.0


In [14]:
# Preview the frame after the string cast and imputation step.
data.head()


Unnamed: 0_level_0,Facility Type,Facility Location,Age,Sex,Race,Hispanic Ethnicity,Insurance Status,Urban/Rural,Distance from Hospital,Charlson-Deyo Score,...,Stage,T Stage,N Stage,M Stage,Pathologic Stage,Facility Volume Quartile,3-yr Mortality,Keratinizing,Basaloid,Cloacogenic
PUF_CASE_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
D66f458f9-bdc0-4c95-94a3-a5f8c5bb74f9,Academic/Research Program,Midwest,52.0,Female,White,Non-Hispanic,Not Insured,Urban,3.6,0,...,1,2.0,0.0,0,No surgery,3.0,0.0,Not Keratinizing,Not Basaloid,Not Cloacogenic
Dfc68ea23-71aa-4b0d-87c0-587b4ce13d04,Academic/Research Program,West,49.0,Female,White,Non-Hispanic,Private Insurance or Managed Care,Urban,11.4,0,...,3,2.0,1.0,0,No surgery,1.0,0.0,Not Keratinizing,Basaloid,Not Cloacogenic
D54a1d0da-4328-4ca5-8481-87cd73b33dd4,Academic/Research Program,Midwest,58.0,Male,White,Non-Hispanic,Medicaid,Urban,14.0,0,...,2,3.0,0.0,0,No surgery,3.0,0.0,Not Keratinizing,Not Basaloid,Not Cloacogenic
D62170955-5fc5-434e-8e6a-6a9e139851c4,Academic/Research Program,Midwest,80.0,Female,White,Non-Hispanic,Medicare/Public,Urban,8.4,0,...,0,1.0,0.0,0,No surgery,3.0,0.0,Not Keratinizing,Not Basaloid,Not Cloacogenic
Dbfa7903c-bd06-4e1c-9d22-a3d39ed96aee,Integrated Network Cancer Program,Midwest,42.0,Female,White,Non-Hispanic,Private Insurance or Managed Care,Urban,12.7,0,...,0,1.0,0.0,0,No surgery,4.0,0.0,Not Keratinizing,Not Basaloid,Not Cloacogenic


In [15]:
# Turn the three histology columns into 0/1 flags: values containing 'Not'
# become 0, every other value becomes 1.
# NOTE(review): any value without the substring 'Not' is coded 1, which would
# include the string 'nan' left by the earlier str cast — confirm these columns
# have no missing values.
histo = ['Keratinizing', 'Basaloid', 'Cloacogenic']
for col in histo:
    is_negative = data[col].str.contains('Not')
    data.loc[is_negative, col] = 0
    data.loc[~is_negative, col] = 1

In [16]:
# Recode Lymphovascular Invasion: 'No' -> 0, 'Yes' -> 2, and the string 'nan'
# (missing values after the earlier str cast) -> 1.
lvi_codes = {'No': 0, 'Yes': 2, 'nan': 1}
for raw_label, code in lvi_codes.items():
    data.loc[data['Lymphovascular Invasion'] == raw_label, 'Lymphovascular Invasion'] = code
data['Lymphovascular Invasion'].value_counts()

1    27904
0     4818
2     1068
Name: Lymphovascular Invasion, dtype: int64

In [17]:
# Columns that keep their current values and are NOT label-encoded.
no_encode = ['Stage', 'Facility Volume Quartile', 'APR', 'Keratinizing', 'Basaloid', 'Cloacogenic', 'Lymphovascular Invasion', 'Charlson-Deyo Score', 'Year of Diagnosis']

# Add back the two columns that were held out of the string cast / imputation
# step. Guarded so that re-running the cell does not append duplicates.
for held_out in ('Reason for No Surgery', 'Surgical Margins'):
    if held_out not in cat_cols:
        cat_cols.append(held_out)

# Report no_encode names that match no column instead of silently ignoring
# them. NOTE(review): 'APR' matches nothing — the frame's column is
# 'APR performed', which therefore IS label-encoded below. Decide whether that
# is intended before renaming the entry; the behaviour is unchanged here.
unmatched = [name for name in no_encode if name not in data.columns]
if unmatched:
    print('no_encode entries that match no column (ignored):', unmatched)

# Remove no_encode from cat_cols.
cat_cols = [name for name in cat_cols if name not in no_encode]

# Integer-encode each remaining categorical column and print its
# label -> code mapping so the codes can be interpreted later.
for x in cat_cols:
    le = LabelEncoder()
    codes = le.fit_transform(data[x])
    # classes_ is sorted, and a class's position is the code assigned to it
    label_dict = {label: code for code, label in enumerate(le.classes_)}
    print('col:', x)
    print(label_dict)
    data[x] = codes


col: Facility Type
{'Academic/Research Program': 0, 'Community Cancer Program': 1, 'Comprehensive Community Cancer Program': 2, 'Integrated Network Cancer Program': 3, 'nan': 4}
col: Facility Location
{'Midwest': 0, 'NE': 1, 'South': 2, 'West': 3, 'nan': 4}
col: Sex
{'Female': 0, 'Male': 1}
col: Race
{'Asian': 0, 'Black': 1, 'Other': 2, 'White': 3, 'nan': 4}
col: Hispanic Ethnicity
{'Hispanic': 0, 'Non-Hispanic': 1, 'nan': 2}
col: Insurance Status
{'Medicaid': 0, 'Medicare/Public': 1, 'Not Insured': 2, 'Private Insurance or Managed Care': 3, 'nan': 4}
col: Urban/Rural
{'Rural': 0, 'Suburban': 1, 'Urban': 2, 'nan': 3}
col: Grade
{'Cell type not determined, not stated or not applicable': 0, 'Grade I, Well differentiated': 1, 'Grade II, Moderately differentiated': 2, 'Grade III, Poorly differentiated': 3, 'Grade IV, Undifferentiated or anaplastic': 4}
col: Post-op Readmission
{'No surgery/readmission': 0, 'Readmission': 1, 'nan': 2}
col: Chemotherapy
{'Multi-agent': 0, 'None': 1, 'Single-

In [18]:
# Preview the frame after label encoding.
data.head()


Unnamed: 0_level_0,Facility Type,Facility Location,Age,Sex,Race,Hispanic Ethnicity,Insurance Status,Urban/Rural,Distance from Hospital,Charlson-Deyo Score,...,Stage,T Stage,N Stage,M Stage,Pathologic Stage,Facility Volume Quartile,3-yr Mortality,Keratinizing,Basaloid,Cloacogenic
PUF_CASE_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
D66f458f9-bdc0-4c95-94a3-a5f8c5bb74f9,0,0,52.0,0,3,1,2,2,3.6,0,...,1,2,0,0,3,3.0,0,0,0,0
Dfc68ea23-71aa-4b0d-87c0-587b4ce13d04,0,3,49.0,0,3,1,3,2,11.4,0,...,3,2,1,0,3,1.0,0,0,1,0
D54a1d0da-4328-4ca5-8481-87cd73b33dd4,0,0,58.0,1,3,1,0,2,14.0,0,...,2,3,0,0,3,3.0,0,0,0,0
D62170955-5fc5-434e-8e6a-6a9e139851c4,0,0,80.0,0,3,1,1,2,8.4,0,...,0,1,0,0,3,3.0,0,0,0,0
Dbfa7903c-bd06-4e1c-9d22-a3d39ed96aee,3,0,42.0,0,3,1,3,2,12.7,0,...,0,1,0,0,3,4.0,0,0,0,0


In [19]:
# Standardise each numeric column separately; the scaler is re-fit per column.
# NOTE(review): the scaler is fit on all rows, before the year-based train/test
# split further down, so test-period rows contribute to the scaling statistics —
# confirm this is acceptable.
scaler = StandardScaler()
for col in num_cols:
    as_column = data[col].to_numpy().reshape(-1, 1)
    data[col] = scaler.fit_transform(as_column)[:, 0]


In [20]:
# Row counts per year of diagnosis.
data['Year of Diagnosis'].value_counts()


2016    3177
2014    3054
2015    2975
2013    2899
2012    2634
2011    2458
2017    2405
2009    2307
2010    2276
2008    2145
2007    1870
2006    1672
2005    1566
2004    1562
2018     790
Name: Year of Diagnosis, dtype: int64

In [21]:
# Share of each '3-yr Mortality' value within each diagnosis year
# (normalize='index' makes every row sum to 1).
pd.crosstab(data['Year of Diagnosis'], data['3-yr Mortality'], normalize='index')

3-yr Mortality,0,1
Year of Diagnosis,Unnamed: 1_level_1,Unnamed: 2_level_1
2004,0.731754,0.268246
2005,0.737548,0.262452
2006,0.76555,0.23445
2007,0.76631,0.23369
2008,0.769231,0.230769
2009,0.767664,0.232336
2010,0.77768,0.22232
2011,0.785598,0.214402
2012,0.786257,0.213743
2013,0.773025,0.226975


In [22]:
# Row counts that an 80/20 split of the full frame would give, for reference.
n_rows = data.shape[0]
print(n_rows * 0.8)
print(n_rows * 0.2)

27032.0
6758.0


In [23]:
# Proportion of each '3-yr Mortality' value over the full dataset.
data['3-yr Mortality'].value_counts(normalize=True)

0    0.763599
1    0.236401
Name: 3-yr Mortality, dtype: float64

In [24]:
# Save the full processed frame (imputed, encoded, scaled). The row index is
# written too, since index=False is not passed.
data.to_csv('data/data_os_clean_imputed.csv')

In [25]:
# 'Year of Diagnosis' was cast to str with the other categorical columns;
# convert it to int so the year comparisons in the split work numerically.
data = data.astype({'Year of Diagnosis': int})

In [26]:
# Temporal split: diagnoses after 2014 form the test set, 2014 and earlier the
# training set.
is_test_year = data['Year of Diagnosis'] > 2014
test = data[is_test_year]
print(test.shape)
train = data[~is_test_year]
print(train.shape)


(9347, 35)
(24443, 35)


In [27]:
# train_10 = train[train['Year of Diagnosis'] > 2010]

In [28]:
# test_17 = test[test['Year of Diagnosis'] < 2017]

In [29]:
# test.drop('Year of Diagnosis', axis=1, inplace=True)
# test_17.drop('Year of Diagnosis', axis=1, inplace=True)
# train.drop('Year of Diagnosis', axis=1, inplace=True)
# train_10.drop('Year of Diagnosis', axis=1, inplace=True)

In [30]:
# Fraction of all rows that fall in the test set.
test_fraction = len(test) / len(data)
print(test_fraction)

0.27662030186445696


In [31]:
# Write the test and train splits to disk. index=False drops the row index
# from these files, unlike the full imputed export, which keeps it.
# The test_17 / train_10 variants are not exported: the cells that define them
# are commented out.
for frame, out_path in ((test, 'data/test_os.csv'), (train, 'data/train_os.csv')):
    frame.to_csv(out_path, index=False)

In [32]:
# Row counts per diagnosis year in the test split.
test['Year of Diagnosis'].value_counts()

2016    3177
2015    2975
2017    2405
2018     790
Name: Year of Diagnosis, dtype: int64

In [34]:
# Proportion of each '3-yr Mortality' value in the training split.
train['3-yr Mortality'].value_counts(normalize=True)

0    0.772573
1    0.227427
Name: 3-yr Mortality, dtype: float64

In [33]:
# Proportion of each '3-yr Mortality' value in the test split.
test['3-yr Mortality'].value_counts(normalize=True)

0    0.740131
1    0.259869
Name: 3-yr Mortality, dtype: float64

In [32]:
# Row counts per diagnosis year in the training split.
train['Year of Diagnosis'].value_counts()

2014    3054
2013    2899
2012    2634
2011    2458
2009    2307
2010    2276
2008    2145
2007    1870
2006    1672
2005    1566
2004    1562
Name: Year of Diagnosis, dtype: int64

In [30]:
# Preview the first 15 columns (positions 0-14) of the processed frame.
data.iloc[:, 0:15].head()

Unnamed: 0_level_0,Facility Type,Facility Location,Age,Sex,Race,Hispanic Ethnicity,Insurance Status,Urban/Rural,Distance from Hospital,Charlson-Deyo Score,Year of Diagnosis,Grade,Tumor Size,Lymphovascular Invasion,Surgical Margins
PUF_CASE_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
D66f458f9-bdc0-4c95-94a3-a5f8c5bb74f9,0,0,-0.707231,0,3,1,2,2,-0.678043,0,2016,2,-0.038291,0,1
Dfc68ea23-71aa-4b0d-87c0-587b4ce13d04,0,3,-0.955281,0,3,1,3,2,-0.173059,0,2004,2,0.246424,1,1
D54a1d0da-4328-4ca5-8481-87cd73b33dd4,0,0,-0.211132,1,3,1,0,2,-0.004731,0,2014,2,2.239425,0,1
D62170955-5fc5-434e-8e6a-6a9e139851c4,0,0,1.607898,0,3,1,1,2,-0.367283,0,2017,3,-0.655172,1,0
Dbfa7903c-bd06-4e1c-9d22-a3d39ed96aee,3,0,-1.534063,0,3,1,3,2,-0.088895,0,2014,3,-0.892434,0,2


In [31]:
# Preview columns at positions 15-29 of the processed frame.
data.iloc[:, 15:30].head()

Unnamed: 0_level_0,Post-op Length of Stay,Post-op Readmission,Reason for No Surgery,"Treatment Started, Days from Dx",Chemotherapy,"No High School Degree, Quartile","Income, Quartile",Medicaid Expansion,Total Dose,APR performed,Stage,T Stage,N Stage,M Stage,Pathologic Stage
PUF_CASE_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
D66f458f9-bdc0-4c95-94a3-a5f8c5bb74f9,-0.101252,0,1,1.659776,0,3,2,2,-2.347018,0,1,2,0,0,3
Dfc68ea23-71aa-4b0d-87c0-587b4ce13d04,-0.101252,0,1,-0.967383,0,1,1,0,0.3404,0,3,2,1,0,3
D54a1d0da-4328-4ca5-8481-87cd73b33dd4,-0.101252,0,1,-0.370301,0,2,3,0,0.845635,0,2,3,0,0,3
D62170955-5fc5-434e-8e6a-6a9e139851c4,-0.101252,0,1,-0.410107,0,4,4,0,0.458646,0,0,1,0,0,3
Dbfa7903c-bd06-4e1c-9d22-a3d39ed96aee,-0.101252,0,1,0.027753,0,2,3,0,0.3404,0,0,1,0,0,3


In [32]:
# Preview the remaining columns from position 30 on. The slice end (45) is past
# the last column; iloc slicing simply stops at the final column.
data.iloc[:, 30:45].head()

Unnamed: 0_level_0,Facility Volume Quartile,3-yr Mortality,Keratinizing,Basaloid,Cloacogenic
PUF_CASE_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
D66f458f9-bdc0-4c95-94a3-a5f8c5bb74f9,3.0,0,0,0,0
Dfc68ea23-71aa-4b0d-87c0-587b4ce13d04,1.0,0,0,1,0
D54a1d0da-4328-4ca5-8481-87cd73b33dd4,3.0,0,0,0,0
D62170955-5fc5-434e-8e6a-6a9e139851c4,3.0,0,0,0,0
Dbfa7903c-bd06-4e1c-9d22-a3d39ed96aee,4.0,0,0,0,0


In [33]:
# Proportion of each '3-yr Mortality' value over the full dataset
# (repeats an earlier cell).
data['3-yr Mortality'].value_counts(normalize=True)

0    0.763599
1    0.236401
Name: 3-yr Mortality, dtype: float64

In [34]:
# Absolute row counts per '3-yr Mortality' value over the full dataset.
data['3-yr Mortality'].value_counts()

0    25802
1     7988
Name: 3-yr Mortality, dtype: int64