In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sweetviz as sv

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, make_scorer
import smogn

from sklearn.ensemble import RandomForestRegressor
# Single random seed reused by train_test_split and RandomForestRegressor further down.
SEED=42

In [2]:
# function for evaluating model performance
def evaluate_model(model, y_test, y_pred, y_train, X_train, X_test):
    """Print error metrics and fit scores for an already-fitted estimator.

    Parameters
    ----------
    model
        Estimator exposing ``score(X, y)``.
    y_test, y_pred
        True and predicted targets, handed to ``mean_absolute_error`` and
        ``mean_squared_error``.
    y_train, X_train, X_test
        Training targets, training features and test features, forwarded to
        ``model.score``.

    Returns
    -------
    None
        The four values are written to stdout, one per line.
    """
    # Each value is computed immediately before it is printed so that a failure
    # part-way through still leaves the earlier lines on screen.
    mae = mean_absolute_error(y_test, y_pred)
    print('MAE: ', mae)

    mse = mean_squared_error(y_test, y_pred)
    print('MSE: ', mse)

    train_score = model.score(X_train, y_train)
    print(f'Training set score: {train_score:.4f}')

    test_score = model.score(X_test, y_test)
    print(f'Test set score: {test_score:.4f}')

In [3]:
# Load the spreadsheet (first row is the header), discard every row that has a
# missing value, and renumber the remaining rows from 0.
df = (
    pd.read_excel('output.xlsx', header=0)
    .dropna()
    .reset_index(drop=True)
)

In [4]:
# Print the distinct raw values of each column that is label-encoded later on,
# in the same order as the encoding cell.
for column_name in (
    "HISTOLOGIC_TYPE_ICD_O_3",
    "PRIMARY_SITE",
    "ER_STATUS",
    "PR_STATUS",
    "HER2_STATUS",
    "MEDIAN_INCOME_dollars",
    "GRADE",
    "STAGE",
    "SURGERY_PERFORMED",
    "BONE_METASTASES",
    "LIVER_METASTASES",
    "BRAIN_METASTASES",
):
    print(df[column_name].unique())

[8500 8541 8522 8523 8520 8480 8575 8507 8524 8050 8530 8504 8343 8032
 8540 8501 8401 8510 8503 8323 8140 8521 8200 8074 8070 8033 8201 8000
 8022 8255 8010 8211 8246 8315 8490 8041 8982 8983 8013 8543 8230 8004
 8574 8020 8572 8123 8021 8260 8513 8571 8071 8570 8310 8560 8550 8720]
['C50.8-Overlapping lesion of breast' 'C50.1-Central portion of breast'
 'C50.4-Upper-outer quadrant of breast' 'C50.9-Breast, NOS'
 'C50.2-Upper-inner quadrant of breast' 'C50.6-Axillary tail of breast'
 'C50.5-Lower-outer quadrant of breast'
 'C50.3-Lower-inner quadrant of breast' 'C50.0-Nipple']
['Negative' 'Positive' 'Borderline']
['Negative' 'Positive' 'Borderline']
['Negative' 'Positive' 'Borderline']
['$75,000+' '$70,000 - $74,999' '$60,000 - $64,999' '$65,000 - $69,999'
 '$40,000 - $44,999' '$45,000 - $49,999' '$55,000 - $59,999'
 '$50,000 - $54,999' '$35,000 - $39,999' '< $35,000'
 'Unknown/missing/no match/Not 1990-2018']
['Poorly differentiated; Grade III' 'Moderately differentiated; Grade II'
 

In [5]:
def _fit_label_encoder(frame, column):
    """Label-encode ``frame[column]`` in place and return the fitted encoder.

    A fresh ``LabelEncoder`` is fitted on the column's current values, the
    column is overwritten with the resulting integer codes, and the encoder is
    returned so that it can be stored alongside the model.
    """
    encoder = LabelEncoder()
    frame[column] = encoder.fit_transform(frame[column])
    return encoder


# One encoder per categorical column. LabelEncoder is already imported in the
# first cell, so it is not re-imported here.
# NOTE(review): this cell overwrites the raw columns, so running it a second
# time would refit every encoder on integer codes instead of the original
# labels — re-run from the data-loading cell if it needs to be repeated.
# NOTE(review): LabelEncoder numbers classes in sorted order, so the codes for
# ordered fields such as income bracket, grade and stage presumably do not
# follow their natural order — confirm this is acceptable for the model.
le_hist = _fit_label_encoder(df, 'HISTOLOGIC_TYPE_ICD_O_3')
le_site = _fit_label_encoder(df, 'PRIMARY_SITE')
le_ER = _fit_label_encoder(df, 'ER_STATUS')
le_PR = _fit_label_encoder(df, 'PR_STATUS')
le_HER2 = _fit_label_encoder(df, 'HER2_STATUS')
le_inc = _fit_label_encoder(df, 'MEDIAN_INCOME_dollars')
le_gr = _fit_label_encoder(df, 'GRADE')
le_st = _fit_label_encoder(df, 'STAGE')
le_surg = _fit_label_encoder(df, 'SURGERY_PERFORMED')
le_bone = _fit_label_encoder(df, 'BONE_METASTASES')
le_liver = _fit_label_encoder(df, 'LIVER_METASTASES')
le_brain = _fit_label_encoder(df, 'BRAIN_METASTASES')

# Snapshot of the column index after encoding.
cols = df.columns

In [6]:
def _fit_standard_scaler(frame, column):
    """Standardise ``frame[column]`` in place and return the fitted scaler.

    The column is passed to ``StandardScaler.fit_transform`` as an ``(n, 1)``
    array and then overwritten with the transformed values; the scaler is
    returned so that it can be stored alongside the model.
    """
    scaler = StandardScaler()
    frame[column] = scaler.fit_transform(frame[column].values.reshape(-1, 1))
    return scaler


# SCALING: AGE_AT_DIAGNOSIS_years; TUMOR_SIZE_mm; TOTAL_NUM_OF_MALIGNANT_TUMORS;
#          REGIONAL_NODES_EXAMINED; REGIONAL_NODES_POSITIVE
# (HISTOLOGIC_TYPE_ICD_O_3 and PRIMARY_SITE are label-encoded only, not scaled.)
# NOTE(review): the scalers are fitted on the full frame, before any train/test
# split, so their statistics include rows that later end up in the test set.
sc_age = _fit_standard_scaler(df, 'AGE_AT_DIAGNOSIS_years')
sc_size = _fit_standard_scaler(df, 'TUMOR_SIZE_mm')
sc_mal = _fit_standard_scaler(df, 'TOTAL_NUM_OF_MALIGNANT_TUMORS')
sc_exa = _fit_standard_scaler(df, 'REGIONAL_NODES_EXAMINED')
sc_pos = _fit_standard_scaler(df, 'REGIONAL_NODES_POSITIVE')

In [7]:
# Move the regression target to the last column, selecting it by name.
# The previous positional list ([0..15, 17, 16]) swapped the final two columns
# on every execution, so running the cell a second time undid the reordering;
# the name-based form gives the same layout no matter how often it runs.
# NOTE(review): the positional list kept exactly 18 columns; if the frame ever
# holds more than that, the extra columns are now kept rather than dropped —
# confirm that is intended.
target_column = 'SURVIVAL_TIME_months'
df = df[[name for name in df.columns if name != target_column] + [target_column]]

In [8]:
# Inspect the encoded, scaled and reordered frame; the notebook renders a truncated head/tail view.
df

Unnamed: 0,AGE_AT_DIAGNOSIS_years,HISTOLOGIC_TYPE_ICD_O_3,ER_STATUS,PR_STATUS,HER2_STATUS,PRIMARY_SITE,SURGERY_PERFORMED,MEDIAN_INCOME_dollars,GRADE,STAGE,TOTAL_NUM_OF_MALIGNANT_TUMORS,REGIONAL_NODES_EXAMINED,REGIONAL_NODES_POSITIVE,BONE_METASTASES,LIVER_METASTASES,BRAIN_METASTASES,TUMOR_SIZE_mm,SURVIVAL_TIME_months
0,-0.324283,30,1,1,1,7,0,8,1,1,0.959197,-1.076489,-0.655549,0,0,0,0.307511,16
1,1.472151,44,2,2,1,7,0,8,0,2,0.959197,0.131749,0.324868,0,0,0,0.804574,31
2,0.573934,39,2,2,1,7,0,8,0,0,2.492864,-0.834841,-0.655549,0,0,0,-0.189553,45
3,0.781215,30,2,2,1,1,0,8,0,1,2.492864,-0.955665,-0.655549,0,0,0,-0.738938,68
4,1.610338,40,2,2,1,7,0,8,1,1,0.959197,-0.593194,-0.655549,0,0,0,-0.503487,18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17535,-2.258904,0,2,2,1,8,0,2,1,2,-0.574469,0.252573,-0.492146,0,0,0,0.255188,55
17536,1.195777,30,2,2,1,2,0,1,4,0,2.492864,-1.076489,-0.655549,2,0,0,-0.398842,18
17537,-0.047909,30,1,1,1,4,0,0,1,1,-0.574469,-0.714018,-0.655549,0,0,0,-0.529648,59
17538,1.264870,30,2,2,1,3,0,8,1,2,-0.574469,0.131749,-0.165340,0,0,0,-0.451165,69


In [9]:
# Manual relevance control points handed to SMOGN, one row per point.
# NOTE(review): each row has three entries although the argument is described
# as [x, y]; the third value is presumably the slope at that point — confirm
# against the smogn documentation.
rg_mtrx = [
    [0, 1, 0],
    [2, 0, 0],
    [60, 0.8, 0],
    [100, 0.95, 0],
    [150, 1, 0],
]

# NOTE(review): resampling runs on the whole frame and the train/test split
# only happens afterwards, so the held-out rows are drawn from resampled data.
# Consider splitting first and resampling the training part only.
df_smogn = smogn.smoter(
    # --- main arguments ---
    data=df,                    # pandas dataframe
    y='SURVIVAL_TIME_months',   # header name of the target column
    k=8,                        # positive integer, k < n
    pert=0.4,                   # real number in (0, 1)
    samp_method='extreme',      # 'balance' or 'extreme'
    drop_na_col=True,           # boolean
    drop_na_row=True,           # boolean
    replace=False,              # boolean

    # --- phi relevance arguments ---
    rel_thres=0.6,              # real number in (0, 1)
    rel_method='manual',        # 'auto' or 'manual'
    # rel_xtrm_type ('both') and rel_coef (1.50) are intentionally not passed:
    # they are unused when rel_method = 'manual'.
    rel_ctrl_pts_rg=rg_mtrx,    # 2d array of control points
)

  b_index.update({i: y_sort[bumps[i]:bumps[i + 1]]})
dist_matrix: 100%|##########| 36/36 [00:00<00:00, 94.01it/s]
synth_matrix: 100%|##########| 36/36 [00:46<00:00,  1.28s/it]
r_index: 100%|##########| 32/32 [00:00<00:00, 290.48it/s]
  data_new.iloc[:, j] = data_new.iloc[:, j].astype(feat_dtypes_orig[j])


In [10]:
# Separate predictors and target of the SMOGN-resampled frame.
X = df_smogn.drop(columns=['SURVIVAL_TIME_months'])
y = df_smogn['SURVIVAL_TIME_months']

# Hold out a quarter of the rows for evaluation.
# NOTE(review): the split is taken from the resampled frame, so the test rows
# may include synthetic samples and are not independent of the training rows.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=SEED,
)

In [11]:
# Random-forest regressor with fixed hyper-parameters and the shared seed.
RFR_model = RandomForestRegressor(
    n_estimators=200,
    min_samples_split=5,
    min_samples_leaf=1,
    max_features=0.5,
    max_depth=None,
    bootstrap=False,
    random_state=SEED,
)
RFR_model.fit(X_train, y_train)

# Predict the held-out rows and print MAE, MSE and the train/test scores.
y_pred = RFR_model.predict(X_test)
evaluate_model(RFR_model, y_test, y_pred, y_train, X_train, X_test)

MAE:  4.18853294041352
MSE:  80.88674482648955
Training set score: 0.9948
Test set score: 0.8704


In [12]:
import pickle
import bz2file as bz2

def compressed_pickle(title, data):
    """Pickle ``data`` into a bz2-compressed file named ``title + '.pbz2'``.

    Parameters
    ----------
    title
        Output path without the extension; ``'.pbz2'`` is appended.
    data
        Any picklable object.

    Returns
    -------
    None
    """
    target_path = title + '.pbz2'
    # The context manager closes the compressed stream once the dump finishes.
    with bz2.BZ2File(target_path, 'wb') as handle:
        pickle.dump(data, handle)

# Bundle the fitted model together with every encoder and scaler used during
# preprocessing, then write the bundle to 'seer_steps.pbz2'.
data = {
    "model": RFR_model,
    "sc_age": sc_age,
    "sc_size": sc_size,
    "le_hist": le_hist,
    "le_site": le_site,
    "sc_mal": sc_mal,
    "le_ER": le_ER,
    "le_PR": le_PR,
    "le_HER2": le_HER2,
    "le_inc": le_inc,
    "le_gr": le_gr,
    "le_st": le_st,
    "le_surg": le_surg,
    "le_bone": le_bone,
    "le_liver": le_liver,
    "le_brain": le_brain,
    "sc_exa": sc_exa,
    "sc_pos": sc_pos,
}
compressed_pickle('seer_steps', data)

def decompress_pickle(file):
    """Load and return the object stored in a bz2-compressed pickle.

    Parameters
    ----------
    file
        Forwarded unchanged to ``bz2.BZ2File(file, 'rb')``.

    Returns
    -------
    object
        Whatever ``pickle.load`` reconstructs from the decompressed stream.

    Notes
    -----
    Unpickling can execute arbitrary code, so only open files from a trusted
    source.
    """
    # Use a context manager so the compressed stream is closed even when
    # unpickling raises; previously the handle was opened and never closed.
    with bz2.BZ2File(file, 'rb') as handle:
        return pickle.load(handle)

# Read the bundle back from disk and rebind every artefact under its usual name.
data = decompress_pickle('seer_steps.pbz2')

regr_loaded = data["model"]

# Label encoders.
(
    le_hist, le_site, le_ER, le_PR, le_HER2, le_inc,
    le_gr, le_st, le_surg, le_bone, le_liver, le_brain,
) = (
    data[key]
    for key in (
        "le_hist", "le_site", "le_ER", "le_PR", "le_HER2", "le_inc",
        "le_gr", "le_st", "le_surg", "le_bone", "le_liver", "le_brain",
    )
)

# Standard scalers.
sc_age, sc_size, sc_mal, sc_exa, sc_pos = (
    data[key] for key in ("sc_age", "sc_size", "sc_mal", "sc_exa", "sc_pos")
)

# Check that the reloaded model predicts. X is the full resampled feature
# frame, so these predictions cover the training rows as well as the test rows.
y_pred = regr_loaded.predict(X)
y_pred

array([16.24041667, 38.42333333, 43.82708333, ...,  0.        ,
        0.        ,  0.        ])