In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# to process breast_cancer_diagnosis_desc column
from preprocess_text import MedicalTermsCleaner
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

# for preprocessing 
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import RobustScaler # chose this because less sensitive to extreme outliers 
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# regression models to test
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import SGDRegressor # checking that linear models aren't the best option given over 100 features
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

# for putting everything together 
from sklearn.pipeline import Pipeline

# for metrics
from sklearn.metrics import mean_squared_error

In [19]:
pd.set_option('display.max_columns', None)

In [21]:
# load data 
data = pd.read_csv('train.csv')
data.head()

Unnamed: 0,patient_id,patient_race,payer_type,patient_state,patient_zip3,Region,Division,patient_age,patient_gender,bmi,breast_cancer_diagnosis_code,breast_cancer_diagnosis_desc,metastatic_cancer_diagnosis_code,metastatic_first_novel_treatment,metastatic_first_novel_treatment_type,population,density,age_median,age_under_10,age_10_to_19,age_20s,age_30s,age_40s,age_50s,age_60s,age_70s,age_over_80,male,female,married,divorced,never_married,widowed,family_size,family_dual_income,income_household_median,income_household_under_5,income_household_5_to_10,income_household_10_to_15,income_household_15_to_20,income_household_20_to_25,income_household_25_to_35,income_household_35_to_50,income_household_50_to_75,income_household_75_to_100,income_household_100_to_150,income_household_150_over,income_household_six_figure,income_individual_median,home_ownership,housing_units,home_value,rent_median,rent_burden,education_less_highschool,education_highschool,education_some_college,education_bachelors,education_graduate,education_college_or_above,education_stem_degree,labor_force_participation,unemployment_rate,self_employed,farmer,race_white,race_black,race_asian,race_native,race_pacific,race_other,race_multiple,hispanic,disabled,poverty,limited_english,commute_time,health_uninsured,veteran,Average of Jan-13,Average of Feb-13,Average of Mar-13,Average of Apr-13,Average of May-13,Average of Jun-13,Average of Jul-13,Average of Aug-13,Average of Sep-13,Average of Oct-13,Average of Nov-13,Average of Dec-13,Average of Jan-14,Average of Feb-14,Average of Mar-14,Average of Apr-14,Average of May-14,Average of Jun-14,Average of Jul-14,Average of Aug-14,Average of Sep-14,Average of Oct-14,Average of Nov-14,Average of Dec-14,Average of Jan-15,Average of Feb-15,Average of Mar-15,Average of Apr-15,Average of May-15,Average of Jun-15,Average of Jul-15,Average of Aug-15,Average of Sep-15,Average of Oct-15,Average of Nov-15,Average of Dec-15,Average of Jan-16,Average of Feb-16,Average of Mar-16,Average of Apr-16,Average of May-16,Average of Jun-16,Average of Jul-16,Average of Aug-16,Average of Sep-16,Average of Oct-16,Average of Nov-16,Average of Dec-16,Average of Jan-17,Average of Feb-17,Average of Mar-17,Average of Apr-17,Average of May-17,Average of Jun-17,Average of Jul-17,Average of Aug-17,Average of Sep-17,Average of Oct-17,Average of Nov-17,Average of Dec-17,Average of Jan-18,Average of Feb-18,Average of Mar-18,Average of Apr-18,Average of May-18,Average of Jun-18,Average of Jul-18,Average of Aug-18,Average of Sep-18,Average of Oct-18,Average of Nov-18,Average of Dec-18,metastatic_diagnosis_period
0,268700,,COMMERCIAL,AR,724,South,West South Central,39,F,,C50912,Malignant neoplasm of unspecified site of left...,C773,,,3924.87,82.63,42.58,11.61,13.03,10.87,11.8,12.29,13.22,13.47,10.07,3.64,51.43,48.57,51.05,16.72,23.57,8.66,3.01,43.99,44483.35,2.21,3.97,8.52,7.08,7.67,13.82,15.14,17.51,11.26,8.9,3.93,12.83,24048.55,72.11,1513.75,87384.33,641.39,27.52,16.55,41.83,28.31,9.21,4.11,13.32,38.78,53.6,5.85,11.82,5.31,92.95,1.73,0.33,0.2,0.03,0.83,3.94,3.03,22.24,19.27,0.42,25.35,8.06,8.11,38.55,39.88,42.75,55.16,65.17,75.98,76.75,76.45,73.67,59.73,45.18,37.43,31.67,33.83,42.35,57.72,67.35,75.92,74.28,79.59,70.84,62.39,41.89,41.46,35.24,31.1,45.5,60.94,68.49,77.69,81.35,76.28,73.32,60.32,51.0,48.37,35.76,42.89,53.21,61.07,66.29,82.78,81.52,79.52,75.61,65.76,53.85,39.56,41.59,49.01,52.28,63.4,67.11,75.78,80.58,76.16,72.91,62.68,49.73,39.8,31.71,41.69,50.22,52.55,74.77,79.96,81.69,78.3,74.56,59.98,42.98,41.18,191
1,484983,White,,IL,629,Midwest,East North Central,55,F,35.36,C50412,Malig neoplasm of upper-outer quadrant of left...,C773,,,2745.39,51.79,43.54,11.22,12.19,11.45,11.01,11.35,14.39,14.15,9.17,5.05,49.32,50.68,49.48,15.42,26.93,8.17,3.17,41.41,51796.79,3.67,3.86,6.58,5.58,5.38,11.02,13.09,19.56,11.76,11.4,8.11,19.51,28028.04,76.71,1113.35,92026.84,638.6,29.37,10.93,35.26,35.33,12.46,6.04,18.49,36.35,52.51,7.45,9.19,5.21,88.75,6.44,0.53,0.19,0.05,0.61,3.42,2.78,20.16,16.94,0.43,26.26,6.93,9.71,34.85,36.15,39.41,54.63,65.41,73.89,74.07,74.37,70.44,57.37,42.15,33.16,26.88,28.36,40.32,56.85,66.84,75.12,72.18,77.08,67.15,58.34,38.55,38.54,32.33,25.62,43.52,58.5,66.81,75.41,78.43,73.88,70.78,58.7,50.5,46.71,32.36,38.76,51.68,58.41,64.1,78.15,79.7,78.33,72.8,63.84,50.35,34.9,39.05,45.8,48.45,60.77,65.89,74.46,79.18,73.13,70.13,59.68,46.52,35.15,28.43,40.01,44.18,49.3,72.87,77.4,77.43,75.83,72.64,58.36,39.68,39.71,33
2,277055,,COMMERCIAL,CA,925,West,Pacific,59,F,,1749,"Malignant neoplasm of breast (female), unspeci...",C773,,,38343.18,700.34,36.28,13.27,15.66,13.49,13.45,12.4,11.58,10.47,6.38,3.28,49.99,50.01,48.81,11.9,34.35,4.95,3.8,52.89,78696.87,2.59,1.81,3.16,3.71,3.23,7.4,10.42,16.83,13.45,19.21,18.23,37.44,32818.54,66.82,10825.83,392600.4,1631.64,35.56,16.25,27.55,33.88,13.92,8.39,22.32,43.37,59.47,7.28,13.21,0.44,53.95,6.41,5.83,0.81,0.38,21.35,11.27,46.88,12.83,12.72,4.58,37.07,8.07,7.75,53.14,55.28,64.75,67.38,73.31,79.49,84.01,83.28,79.88,67.84,61.92,55.69,60.56,60.99,65.16,68.01,74.24,78.87,84.65,82.23,81.75,74.25,63.29,56.28,58.32,62.97,67.61,67.39,68.06,80.43,80.98,85.08,82.01,75.16,59.59,53.24,54.54,63.52,63.72,67.45,69.58,82.02,84.83,84.06,77.85,72.5,64.24,55.52,53.54,58.18,65.45,68.51,71.13,80.71,85.64,84.85,78.14,74.03,65.7,59.28,60.34,57.43,60.97,68.5,70.31,78.61,87.24,85.52,80.75,70.81,62.67,55.58,157
3,320055,Hispanic,MEDICAID,CA,900,West,Pacific,59,F,,C50911,Malignant neoplasm of unsp site of right femal...,C773,,,36054.12,5294.33,36.65,9.76,11.27,17.23,17.44,13.09,12.3,9.41,5.67,3.82,50.51,49.49,33.48,11.3,50.46,4.77,3.44,55.53,69266.69,6.32,2.95,6.81,4.18,4.13,7.84,10.16,14.42,10.48,13.73,18.96,32.69,36053.4,31.5,12949.12,873756.0,1651.15,37.37,22.92,18.24,21.27,23.89,13.69,37.58,41.75,64.39,8.68,21.23,0.01,42.82,12.22,12.7,1.12,0.15,22.14,8.85,45.53,11.9,20.76,14.74,30.71,10.34,3.03,57.88,57.65,60.86,62.77,67.07,68.41,70.69,71.19,72.74,66.41,65.09,60.87,64.3,60.77,63.01,64.37,69.73,68.46,73.62,73.65,75.96,71.83,67.04,59.73,63.27,63.83,67.72,64.39,62.56,69.66,72.21,77.47,76.77,74.5,63.48,57.98,57.69,66.53,61.89,64.86,62.63,70.45,73.09,71.94,72.26,69.56,66.85,58.29,56.08,57.88,63.84,65.03,63.9,68.95,73.58,73.62,72.88,73.84,68.87,66.63,63.75,59.89,59.35,63.34,63.1,67.45,75.86,75.24,71.1,68.95,65.46,59.46,146
4,190386,,COMMERCIAL,CA,934,West,Pacific,71,F,,1748,Malignant neoplasm of other specified sites of...,C7951,,,13700.37,400.48,41.78,10.03,16.43,12.97,11.29,10.09,11.56,13.28,8.78,5.53,51.99,48.01,48.21,11.16,35.6,5.01,3.09,51.88,82314.64,2.74,1.21,2.56,2.59,4.37,6.65,11.11,16.45,12.73,17.57,22.02,39.59,34754.64,58.81,4692.18,602986.8,1631.84,34.58,13.08,21.22,34.58,19.14,12.0,31.14,48.86,55.2,6.39,14.53,1.46,75.58,3.29,3.13,0.92,0.28,7.25,9.56,34.11,11.96,10.34,5.54,23.48,6.54,8.44,51.08,52.29,58.31,60.43,63.65,67.41,68.21,67.95,67.95,61.73,58.04,53.04,58.12,55.83,58.73,59.53,64.2,64.02,68.84,68.18,67.29,67.08,59.49,54.23,55.19,57.03,63.73,60.69,58.19,65.61,67.79,69.57,71.37,68.4,55.45,49.49,54.15,60.98,58.44,62.06,61.68,70.15,69.49,66.71,68.1,64.41,59.72,52.52,51.8,54.2,59.36,60.44,62.41,67.57,71.29,70.83,69.43,65.56,59.6,54.21,57.7,53.6,55.78,59.45,60.24,64.77,69.81,70.13,68.1,65.38,60.72,54.08,286


In [23]:
data.shape

(13173, 152)

In [25]:
data.describe()

Unnamed: 0,patient_id,patient_zip3,patient_age,bmi,population,density,age_median,age_under_10,age_10_to_19,age_20s,age_30s,age_40s,age_50s,age_60s,age_70s,age_over_80,male,female,married,divorced,never_married,widowed,family_size,family_dual_income,income_household_median,income_household_under_5,income_household_5_to_10,income_household_10_to_15,income_household_15_to_20,income_household_20_to_25,income_household_25_to_35,income_household_35_to_50,income_household_50_to_75,income_household_75_to_100,income_household_100_to_150,income_household_150_over,income_household_six_figure,income_individual_median,home_ownership,housing_units,home_value,rent_median,rent_burden,education_less_highschool,education_highschool,education_some_college,education_bachelors,education_graduate,education_college_or_above,education_stem_degree,labor_force_participation,unemployment_rate,self_employed,farmer,race_white,race_black,race_asian,race_native,race_pacific,race_other,race_multiple,hispanic,disabled,poverty,limited_english,commute_time,health_uninsured,veteran,Average of Jan-13,Average of Feb-13,Average of Mar-13,Average of Apr-13,Average of May-13,Average of Jun-13,Average of Jul-13,Average of Aug-13,Average of Sep-13,Average of Oct-13,Average of Nov-13,Average of Dec-13,Average of Jan-14,Average of Feb-14,Average of Mar-14,Average of Apr-14,Average of May-14,Average of Jun-14,Average of Jul-14,Average of Aug-14,Average of Sep-14,Average of Oct-14,Average of Nov-14,Average of Dec-14,Average of Jan-15,Average of Feb-15,Average of Mar-15,Average of Apr-15,Average of May-15,Average of Jun-15,Average of Jul-15,Average of Aug-15,Average of Sep-15,Average of Oct-15,Average of Nov-15,Average of Dec-15,Average of Jan-16,Average of Feb-16,Average of Mar-16,Average of Apr-16,Average of May-16,Average of Jun-16,Average of Jul-16,Average of Aug-16,Average of Sep-16,Average of Oct-16,Average of Nov-16,Average of Dec-16,Average of Jan-17,Average of Feb-17,Average of Mar-17,Average of Apr-17,Average of May-17,Average of Jun-17,Average of Jul-17,Average of Aug-17,Average of Sep-17,Average of Oct-17,Average of Nov-17,Average of Dec-17,Average of Jan-18,Average of Feb-18,Average of Mar-18,Average of Apr-18,Average of May-18,Average of Jun-18,Average of Jul-18,Average of Aug-18,Average of Sep-18,Average of Oct-18,Average of Nov-18,Average of Dec-18,metastatic_diagnosis_period
count,13173.0,13173.0,13173.0,4102.0,13173.0,13173.0,13173.0,13173.0,13173.0,13173.0,13173.0,13173.0,13173.0,13173.0,13173.0,13173.0,13173.0,13173.0,13173.0,13173.0,13173.0,13173.0,13168.0,13168.0,13168.0,13168.0,13168.0,13168.0,13168.0,13168.0,13168.0,13168.0,13168.0,13168.0,13168.0,13168.0,13168.0,13173.0,13168.0,13173.0,13168.0,13168.0,13168.0,13173.0,13173.0,13173.0,13173.0,13173.0,13173.0,13173.0,13173.0,13173.0,13168.0,13168.0,13173.0,13173.0,13173.0,13173.0,13173.0,13173.0,13173.0,13173.0,13173.0,13168.0,13168.0,13173.0,13173.0,13173.0,13140.0,13170.0,13173.0,13173.0,13170.0,13153.0,13173.0,13156.0,13146.0,13114.0,13170.0,13170.0,13169.0,13164.0,13144.0,12993.0,13173.0,13021.0,13173.0,13173.0,13173.0,13173.0,13149.0,13173.0,13167.0,13161.0,13161.0,13145.0,13173.0,13173.0,13173.0,13151.0,13173.0,13157.0,13157.0,13155.0,13157.0,13157.0,13173.0,13173.0,13154.0,13173.0,13157.0,13173.0,13173.0,13173.0,13170.0,13160.0,13164.0,13173.0,13173.0,13173.0,13173.0,13172.0,13142.0,13173.0,13163.0,13152.0,13168.0,13173.0,13173.0,13168.0,13167.0,13173.0,13173.0,13164.0,13127.0,13157.0,13166.0,13166.0,13161.0,13140.0,13173.0
mean,555441.784939,568.530859,59.271313,29.168808,20651.373928,1776.872376,40.542676,11.104988,12.857587,13.297375,12.893557,12.063957,13.458827,12.631076,7.670396,4.021445,50.09431,49.905878,47.645569,12.688142,33.819417,5.846155,3.196401,51.800184,74149.173616,3.286397,2.535715,4.159681,3.943212,4.081337,8.428095,11.586349,16.906539,12.671315,15.806551,16.595873,32.402666,36606.228237,65.895238,7589.209077,337379.1,1230.443146,31.233619,11.93339,27.687036,28.874659,19.263585,12.243182,31.506428,43.30204,61.633658,5.943257,13.186546,1.93152,69.975747,11.440878,5.367018,0.880295,0.139133,5.628344,6.569015,18.144554,13.418155,13.417748,4.401387,27.975049,8.511348,7.097288,38.959688,39.218355,44.483714,54.841429,64.484898,72.307275,75.560733,74.047466,69.473466,58.80862,47.098565,38.919349,34.256369,36.708865,43.766486,55.890057,64.994554,72.397563,74.343185,74.317533,69.491319,60.632386,45.586518,42.954875,36.974767,34.759469,47.52108,57.004955,65.244507,72.620689,75.560679,75.12551,72.271894,61.211773,51.595938,47.010274,37.472639,43.529698,51.631609,56.419806,63.375224,73.64423,77.270195,76.562339,71.43642,61.991704,52.61266,40.281865,40.719157,46.279066,48.901873,59.008388,63.444372,72.434105,76.757869,73.938746,70.144358,61.822501,49.864691,40.152299,36.800997,43.061795,46.074308,52.411982,67.932236,73.074635,77.120929,76.281156,71.417027,59.323286,46.10015,42.096213,96.515221
std,259476.503094,275.758485,13.218883,5.75282,13840.379638,3876.061897,4.031027,1.511534,1.952248,3.390472,2.410845,1.248652,1.671822,2.571409,2.138788,1.257552,1.694808,1.694915,7.528704,2.055966,8.126657,1.556496,0.222907,6.696196,20425.92483,1.430226,1.328466,1.751091,1.402426,1.327044,2.212647,2.576229,2.726104,1.849686,3.139121,8.893866,10.980561,8513.23505,14.491534,4967.148033,253061.1,428.828282,4.772054,5.036011,8.05816,5.072259,6.255266,5.984538,11.801702,4.567257,5.977344,1.92064,3.388123,3.194201,17.941299,12.403575,6.635362,2.31838,0.529791,6.194343,3.539595,16.829748,3.698844,5.105035,4.782378,5.089195,4.169557,3.101826,13.339969,13.701362,12.311727,9.525471,6.098297,6.307366,5.293818,5.951861,6.894506,8.166959,11.252402,14.38355,17.630565,17.617165,14.365601,9.154345,6.697788,6.068137,5.978399,5.859982,7.315835,9.001259,12.318052,11.516327,15.403094,19.205324,14.03528,9.204489,6.306477,6.313503,5.747061,6.168244,6.390686,9.372343,9.506538,10.443744,13.126782,13.53273,9.779115,9.225037,6.424771,6.170639,5.909783,5.246549,6.236389,8.112799,9.176596,13.667811,12.611809,11.402153,12.622846,8.165056,7.218949,5.969036,6.064217,6.152335,5.909376,8.173483,12.007462,14.337058,14.943633,14.423773,11.974793,11.0699,6.30174,6.58163,5.335091,5.282156,6.502875,9.261934,12.245594,11.209011,108.969873
min,100043.0,100.0,18.0,15.0,635.55,0.92,20.6,0.0,6.31,5.93,1.5,0.8,0.0,0.2,0.0,0.0,39.73,38.4,0.9,0.2,13.44,0.0,2.55,19.31,29222.0,0.75,0.36,1.02,1.03,1.1,2.65,1.7,4.95,4.73,4.29,0.84,5.69,4316.0,15.85,0.0,60629.0,448.4,17.79,0.0,0.0,7.2,2.47,2.09,7.05,23.91,30.7,0.82,2.26,0.0,14.5,0.08,0.0,0.0,0.0,0.0,0.43,0.06,4.6,3.43,0.0,12.46,2.44,1.2,6.79,8.93,14.0,29.3,43.26,56.63,60.11,56.87,48.11,39.81,24.24,-1.12,-2.86,0.39,13.96,32.84,46.65,51.61,57.6,56.56,42.48,34.8,19.0,15.78,9.65,0.39,21.48,38.37,44.95,55.88,58.11,56.37,46.96,41.01,26.88,16.14,9.63,14.55,29.16,35.26,45.33,55.9,60.4,58.12,50.67,37.08,25.95,9.87,10.25,17.49,20.44,38.86,46.06,53.4,58.14,55.43,49.35,38.41,23.17,8.61,5.93,4.1,22.72,28.79,45.88,53.46,58.54,56.2,51.83,37.54,19.15,15.38,0.0
25%,335100.0,330.0,50.0,24.825,9160.34,163.15,37.19,10.15,11.65,11.0,11.27,11.35,12.3,10.62,6.01,3.32,49.15,49.05,42.93,11.16,27.27,4.77,3.03,47.7325,61255.11,2.27,1.54,2.9,2.94,3.17,6.99,9.95,15.37,11.35,13.62,10.07,24.575,31255.74,56.48,3381.31,166811.1,888.92,28.18,8.37,21.64,25.73,13.98,7.53,21.91,40.19,57.96,4.68,10.99,0.04,56.21,2.28,1.05,0.21,0.02,1.31,3.86,4.71,10.33,9.87,0.94,24.87,5.6,4.97,28.04,27.25,33.84,47.13,60.18,68.18,71.75,69.98,64.27,52.56,37.76,27.62,19.24,20.33,32.42,48.1,59.75,68.41,69.35,70.21,63.18,52.64,35.71,33.29,24.26,18.0,34.39,49.53,61.46,68.28,71.01,70.01,67.79,53.78,44.67,39.8,26.62,32.08,43.44,48.85,58.73,69.6,73.36,73.21,66.91,55.55,45.66,28.86,30.35,37.15,37.24,52.39,57.3,68.6,72.78,69.09,65.46,56.13,39.6,28.01,24.1,31.51,36.24,42.9,63.22,68.05,73.17,72.55,67.02,51.97,35.56,33.37,3.0
50%,555769.0,557.0,59.0,28.58,18952.78,700.34,40.64,11.0,12.9,12.53,12.4,12.12,13.57,12.52,7.33,3.82,49.98,50.02,49.43,12.72,32.01,5.55,3.16,52.59,69729.95,2.88,2.2,3.79,3.79,4.04,8.43,11.83,17.08,12.68,15.94,14.65,30.52,35211.02,69.91,6994.41,241157.1,1155.43,30.83,10.75,27.48,29.29,18.87,10.78,29.79,42.99,62.78,5.49,12.73,0.45,70.9,6.41,2.82,0.43,0.05,3.52,5.65,11.98,12.96,12.21,2.75,27.79,7.36,6.99,35.41,36.71,40.59,53.65,63.89,71.18,74.46,72.51,68.27,57.17,43.37,36.49,31.1,34.69,41.96,55.35,64.03,71.41,73.96,73.23,67.59,58.05,41.86,39.63,34.3,33.39,45.21,55.41,64.96,71.14,74.72,74.45,71.18,57.61,48.96,46.32,33.12,39.46,50.11,55.78,61.86,72.58,76.48,76.37,70.89,60.21,49.15,36.82,37.94,44.27,47.79,57.6,62.72,71.21,75.78,72.31,69.37,60.65,46.5,35.9,33.93,42.02,43.24,50.29,66.12,71.64,76.65,76.08,70.88,57.45,42.43,38.5,44.0
75%,780967.0,832.0,67.0,33.0,30021.28,1666.52,42.94,12.14,13.91,14.97,14.13,12.86,14.61,13.99,8.84,4.56,50.95,50.85,52.99,14.19,39.29,6.61,3.33,56.16,83230.64,4.08,3.14,5.09,4.64,4.84,9.92,13.34,18.53,13.96,18.42,21.21,38.79,39705.73,77.22,10939.08,405353.4,1478.24,33.63,15.02,33.54,32.37,23.89,15.83,38.72,46.11,65.68,6.81,15.06,2.39,85.75,16.27,7.0,0.78,0.15,7.85,8.85,27.49,15.7,16.41,5.98,30.71,10.39,8.75,49.98,51.18,57.29,62.77,67.93,75.98,78.64,76.42,73.94,65.53,55.37,48.64,49.53,52.51,56.85,63.91,70.16,76.28,77.66,77.52,75.98,69.71,55.53,52.79,49.08,52.72,59.73,64.39,68.8,77.72,79.83,78.64,76.93,69.9,58.49,54.15,49.31,55.27,60.35,64.76,67.13,78.15,81.48,80.29,75.27,69.19,61.0,52.04,52.46,57.34,61.85,65.45,68.57,75.69,80.37,77.56,74.04,67.66,59.89,50.38,47.64,55.1225,58.42,62.22,72.59,77.66,80.29,79.13,75.64,67.36,55.73,50.5,181.0
max,999982.0,995.0,91.0,97.0,71374.13,29851.69,54.57,17.68,35.3,62.1,25.47,17.82,21.66,24.51,19.0,18.83,61.6,60.28,66.9,21.03,98.9,20.65,4.17,65.64,164119.2,19.62,11.87,14.28,12.4,14.35,26.55,24.08,27.13,24.8,27.48,52.82,69.03,88910.5,90.37,25922.55,1853109.0,2965.25,108.6,34.33,53.96,50.13,41.7,51.84,77.82,73.0,78.67,18.8,25.54,25.27,98.44,69.66,49.85,76.93,14.76,33.19,26.43,91.01,35.16,38.35,26.76,48.02,27.57,25.2,72.37,71.0,70.71,76.73,81.45,91.64,96.45,92.33,86.44,80.18,76.61,74.47,70.77,73.25,72.13,76.21,80.57,90.22,95.53,90.17,87.83,82.1,74.56,72.17,70.6,72.17,75.84,79.59,80.9,92.34,92.89,95.26,98.95,82.79,79.13,77.38,71.9,77.7,74.82,76.57,79.61,94.29,95.63,96.09,85.49,79.63,75.55,75.63,71.95,72.4,73.78,80.7,82.13,92.76,106.73,94.48,85.72,79.56,75.31,71.74,73.31,75.04,71.64,76.49,86.57,90.66,96.43,95.77,89.19,81.46,76.3,73.54,365.0


In [37]:
# columns with missing values 
missing = data.isna().sum().reset_index()

In [41]:
missing = missing.rename(columns={0:'count_na'})
missing.head()

Unnamed: 0,index,count_na
0,patient_id,0
1,patient_race,6657
2,payer_type,1765
3,patient_state,0
4,patient_zip3,0


In [43]:
missing_counts = dict()

for idx, row in missing.iterrows():
    feature = row['index']
    if row['count_na'] > 0:
        missing_counts[feature] = row['count_na']

In [45]:
missing_counts

{'patient_race': 6657,
 'payer_type': 1765,
 'bmi': 9071,
 'metastatic_first_novel_treatment': 13162,
 'metastatic_first_novel_treatment_type': 13162,
 'family_size': 5,
 'family_dual_income': 5,
 'income_household_median': 5,
 'income_household_under_5': 5,
 'income_household_5_to_10': 5,
 'income_household_10_to_15': 5,
 'income_household_15_to_20': 5,
 'income_household_20_to_25': 5,
 'income_household_25_to_35': 5,
 'income_household_35_to_50': 5,
 'income_household_50_to_75': 5,
 'income_household_75_to_100': 5,
 'income_household_100_to_150': 5,
 'income_household_150_over': 5,
 'income_household_six_figure': 5,
 'home_ownership': 5,
 'home_value': 5,
 'rent_median': 5,
 'rent_burden': 5,
 'self_employed': 5,
 'farmer': 5,
 'poverty': 5,
 'limited_english': 5,
 'Average of Jan-13': 33,
 'Average of Feb-13': 3,
 'Average of May-13': 3,
 'Average of Jun-13': 20,
 'Average of Aug-13': 17,
 'Average of Sep-13': 27,
 'Average of Oct-13': 59,
 'Average of Nov-13': 3,
 'Average of Dec-1

In [51]:
# get rid of columns where the majority of values are missing 
drop_cols = ['patient_race', 'bmi', 'metastatic_first_novel_treatment', 'metastatic_first_novel_treatment_type']

data.drop(columns=drop_cols, axis=1, inplace=True)
data.head()

Unnamed: 0,patient_id,payer_type,patient_state,patient_zip3,Region,Division,patient_age,patient_gender,breast_cancer_diagnosis_code,breast_cancer_diagnosis_desc,metastatic_cancer_diagnosis_code,population,density,age_median,age_under_10,age_10_to_19,age_20s,age_30s,age_40s,age_50s,age_60s,age_70s,age_over_80,male,female,married,divorced,never_married,widowed,family_size,family_dual_income,income_household_median,income_household_under_5,income_household_5_to_10,income_household_10_to_15,income_household_15_to_20,income_household_20_to_25,income_household_25_to_35,income_household_35_to_50,income_household_50_to_75,income_household_75_to_100,income_household_100_to_150,income_household_150_over,income_household_six_figure,income_individual_median,home_ownership,housing_units,home_value,rent_median,rent_burden,education_less_highschool,education_highschool,education_some_college,education_bachelors,education_graduate,education_college_or_above,education_stem_degree,labor_force_participation,unemployment_rate,self_employed,farmer,race_white,race_black,race_asian,race_native,race_pacific,race_other,race_multiple,hispanic,disabled,poverty,limited_english,commute_time,health_uninsured,veteran,Average of Jan-13,Average of Feb-13,Average of Mar-13,Average of Apr-13,Average of May-13,Average of Jun-13,Average of Jul-13,Average of Aug-13,Average of Sep-13,Average of Oct-13,Average of Nov-13,Average of Dec-13,Average of Jan-14,Average of Feb-14,Average of Mar-14,Average of Apr-14,Average of May-14,Average of Jun-14,Average of Jul-14,Average of Aug-14,Average of Sep-14,Average of Oct-14,Average of Nov-14,Average of Dec-14,Average of Jan-15,Average of Feb-15,Average of Mar-15,Average of Apr-15,Average of May-15,Average of Jun-15,Average of Jul-15,Average of Aug-15,Average of Sep-15,Average of Oct-15,Average of Nov-15,Average of Dec-15,Average of Jan-16,Average of Feb-16,Average of Mar-16,Average of Apr-16,Average of May-16,Average of Jun-16,Average of Jul-16,Average of Aug-16,Average of Sep-16,Average of Oct-16,Average of Nov-16,Average of Dec-16,Average of Jan-17,Average of Feb-17,Average of Mar-17,Average of Apr-17,Average of May-17,Average of Jun-17,Average of Jul-17,Average of Aug-17,Average of Sep-17,Average of Oct-17,Average of Nov-17,Average of Dec-17,Average of Jan-18,Average of Feb-18,Average of Mar-18,Average of Apr-18,Average of May-18,Average of Jun-18,Average of Jul-18,Average of Aug-18,Average of Sep-18,Average of Oct-18,Average of Nov-18,Average of Dec-18,metastatic_diagnosis_period
0,268700,COMMERCIAL,AR,724,South,West South Central,39,F,C50912,Malignant neoplasm of unspecified site of left...,C773,3924.87,82.63,42.58,11.61,13.03,10.87,11.8,12.29,13.22,13.47,10.07,3.64,51.43,48.57,51.05,16.72,23.57,8.66,3.01,43.99,44483.35,2.21,3.97,8.52,7.08,7.67,13.82,15.14,17.51,11.26,8.9,3.93,12.83,24048.55,72.11,1513.75,87384.33,641.39,27.52,16.55,41.83,28.31,9.21,4.11,13.32,38.78,53.6,5.85,11.82,5.31,92.95,1.73,0.33,0.2,0.03,0.83,3.94,3.03,22.24,19.27,0.42,25.35,8.06,8.11,38.55,39.88,42.75,55.16,65.17,75.98,76.75,76.45,73.67,59.73,45.18,37.43,31.67,33.83,42.35,57.72,67.35,75.92,74.28,79.59,70.84,62.39,41.89,41.46,35.24,31.1,45.5,60.94,68.49,77.69,81.35,76.28,73.32,60.32,51.0,48.37,35.76,42.89,53.21,61.07,66.29,82.78,81.52,79.52,75.61,65.76,53.85,39.56,41.59,49.01,52.28,63.4,67.11,75.78,80.58,76.16,72.91,62.68,49.73,39.8,31.71,41.69,50.22,52.55,74.77,79.96,81.69,78.3,74.56,59.98,42.98,41.18,191
1,484983,,IL,629,Midwest,East North Central,55,F,C50412,Malig neoplasm of upper-outer quadrant of left...,C773,2745.39,51.79,43.54,11.22,12.19,11.45,11.01,11.35,14.39,14.15,9.17,5.05,49.32,50.68,49.48,15.42,26.93,8.17,3.17,41.41,51796.79,3.67,3.86,6.58,5.58,5.38,11.02,13.09,19.56,11.76,11.4,8.11,19.51,28028.04,76.71,1113.35,92026.84,638.6,29.37,10.93,35.26,35.33,12.46,6.04,18.49,36.35,52.51,7.45,9.19,5.21,88.75,6.44,0.53,0.19,0.05,0.61,3.42,2.78,20.16,16.94,0.43,26.26,6.93,9.71,34.85,36.15,39.41,54.63,65.41,73.89,74.07,74.37,70.44,57.37,42.15,33.16,26.88,28.36,40.32,56.85,66.84,75.12,72.18,77.08,67.15,58.34,38.55,38.54,32.33,25.62,43.52,58.5,66.81,75.41,78.43,73.88,70.78,58.7,50.5,46.71,32.36,38.76,51.68,58.41,64.1,78.15,79.7,78.33,72.8,63.84,50.35,34.9,39.05,45.8,48.45,60.77,65.89,74.46,79.18,73.13,70.13,59.68,46.52,35.15,28.43,40.01,44.18,49.3,72.87,77.4,77.43,75.83,72.64,58.36,39.68,39.71,33
2,277055,COMMERCIAL,CA,925,West,Pacific,59,F,1749,"Malignant neoplasm of breast (female), unspeci...",C773,38343.18,700.34,36.28,13.27,15.66,13.49,13.45,12.4,11.58,10.47,6.38,3.28,49.99,50.01,48.81,11.9,34.35,4.95,3.8,52.89,78696.87,2.59,1.81,3.16,3.71,3.23,7.4,10.42,16.83,13.45,19.21,18.23,37.44,32818.54,66.82,10825.83,392600.4,1631.64,35.56,16.25,27.55,33.88,13.92,8.39,22.32,43.37,59.47,7.28,13.21,0.44,53.95,6.41,5.83,0.81,0.38,21.35,11.27,46.88,12.83,12.72,4.58,37.07,8.07,7.75,53.14,55.28,64.75,67.38,73.31,79.49,84.01,83.28,79.88,67.84,61.92,55.69,60.56,60.99,65.16,68.01,74.24,78.87,84.65,82.23,81.75,74.25,63.29,56.28,58.32,62.97,67.61,67.39,68.06,80.43,80.98,85.08,82.01,75.16,59.59,53.24,54.54,63.52,63.72,67.45,69.58,82.02,84.83,84.06,77.85,72.5,64.24,55.52,53.54,58.18,65.45,68.51,71.13,80.71,85.64,84.85,78.14,74.03,65.7,59.28,60.34,57.43,60.97,68.5,70.31,78.61,87.24,85.52,80.75,70.81,62.67,55.58,157
3,320055,MEDICAID,CA,900,West,Pacific,59,F,C50911,Malignant neoplasm of unsp site of right femal...,C773,36054.12,5294.33,36.65,9.76,11.27,17.23,17.44,13.09,12.3,9.41,5.67,3.82,50.51,49.49,33.48,11.3,50.46,4.77,3.44,55.53,69266.69,6.32,2.95,6.81,4.18,4.13,7.84,10.16,14.42,10.48,13.73,18.96,32.69,36053.4,31.5,12949.12,873756.0,1651.15,37.37,22.92,18.24,21.27,23.89,13.69,37.58,41.75,64.39,8.68,21.23,0.01,42.82,12.22,12.7,1.12,0.15,22.14,8.85,45.53,11.9,20.76,14.74,30.71,10.34,3.03,57.88,57.65,60.86,62.77,67.07,68.41,70.69,71.19,72.74,66.41,65.09,60.87,64.3,60.77,63.01,64.37,69.73,68.46,73.62,73.65,75.96,71.83,67.04,59.73,63.27,63.83,67.72,64.39,62.56,69.66,72.21,77.47,76.77,74.5,63.48,57.98,57.69,66.53,61.89,64.86,62.63,70.45,73.09,71.94,72.26,69.56,66.85,58.29,56.08,57.88,63.84,65.03,63.9,68.95,73.58,73.62,72.88,73.84,68.87,66.63,63.75,59.89,59.35,63.34,63.1,67.45,75.86,75.24,71.1,68.95,65.46,59.46,146
4,190386,COMMERCIAL,CA,934,West,Pacific,71,F,1748,Malignant neoplasm of other specified sites of...,C7951,13700.37,400.48,41.78,10.03,16.43,12.97,11.29,10.09,11.56,13.28,8.78,5.53,51.99,48.01,48.21,11.16,35.6,5.01,3.09,51.88,82314.64,2.74,1.21,2.56,2.59,4.37,6.65,11.11,16.45,12.73,17.57,22.02,39.59,34754.64,58.81,4692.18,602986.8,1631.84,34.58,13.08,21.22,34.58,19.14,12.0,31.14,48.86,55.2,6.39,14.53,1.46,75.58,3.29,3.13,0.92,0.28,7.25,9.56,34.11,11.96,10.34,5.54,23.48,6.54,8.44,51.08,52.29,58.31,60.43,63.65,67.41,68.21,67.95,67.95,61.73,58.04,53.04,58.12,55.83,58.73,59.53,64.2,64.02,68.84,68.18,67.29,67.08,59.49,54.23,55.19,57.03,63.73,60.69,58.19,65.61,67.79,69.57,71.37,68.4,55.45,49.49,54.15,60.98,58.44,62.06,61.68,70.15,69.49,66.71,68.1,64.41,59.72,52.52,51.8,54.2,59.36,60.44,62.41,67.57,71.29,70.83,69.43,65.56,59.6,54.21,57.7,53.6,55.78,59.45,60.24,64.77,69.81,70.13,68.1,65.38,60.72,54.08,286


In [53]:
# curious if the rows with missing values for demographic data are the same 5 rows 

missing_demo = data[data['family_size'].isna()]
missing_demo

Unnamed: 0,patient_id,payer_type,patient_state,patient_zip3,Region,Division,patient_age,patient_gender,breast_cancer_diagnosis_code,breast_cancer_diagnosis_desc,metastatic_cancer_diagnosis_code,population,density,age_median,age_under_10,age_10_to_19,age_20s,age_30s,age_40s,age_50s,age_60s,age_70s,age_over_80,male,female,married,divorced,never_married,widowed,family_size,family_dual_income,income_household_median,income_household_under_5,income_household_5_to_10,income_household_10_to_15,income_household_15_to_20,income_household_20_to_25,income_household_25_to_35,income_household_35_to_50,income_household_50_to_75,income_household_75_to_100,income_household_100_to_150,income_household_150_over,income_household_six_figure,income_individual_median,home_ownership,housing_units,home_value,rent_median,rent_burden,education_less_highschool,education_highschool,education_some_college,education_bachelors,education_graduate,education_college_or_above,education_stem_degree,labor_force_participation,unemployment_rate,self_employed,farmer,race_white,race_black,race_asian,race_native,race_pacific,race_other,race_multiple,hispanic,disabled,poverty,limited_english,commute_time,health_uninsured,veteran,Average of Jan-13,Average of Feb-13,Average of Mar-13,Average of Apr-13,Average of May-13,Average of Jun-13,Average of Jul-13,Average of Aug-13,Average of Sep-13,Average of Oct-13,Average of Nov-13,Average of Dec-13,Average of Jan-14,Average of Feb-14,Average of Mar-14,Average of Apr-14,Average of May-14,Average of Jun-14,Average of Jul-14,Average of Aug-14,Average of Sep-14,Average of Oct-14,Average of Nov-14,Average of Dec-14,Average of Jan-15,Average of Feb-15,Average of Mar-15,Average of Apr-15,Average of May-15,Average of Jun-15,Average of Jul-15,Average of Aug-15,Average of Sep-15,Average of Oct-15,Average of Nov-15,Average of Dec-15,Average of Jan-16,Average of Feb-16,Average of Mar-16,Average of Apr-16,Average of May-16,Average of Jun-16,Average of Jul-16,Average of Aug-16,Average of Sep-16,Average of Oct-16,Average of Nov-16,Average of Dec-16,Average of Jan-17,Average of Feb-17,Average of Mar-17,Average of Apr-17,Average of May-17,Average of Jun-17,Average of Jul-17,Average of Aug-17,Average of Sep-17,Average of Oct-17,Average of Nov-17,Average of Dec-17,Average of Jan-18,Average of Feb-18,Average of Mar-18,Average of Apr-18,Average of May-18,Average of Jun-18,Average of Jul-18,Average of Aug-18,Average of Sep-18,Average of Oct-18,Average of Nov-18,Average of Dec-18,metastatic_diagnosis_period
3985,757384,COMMERCIAL,TX,772,South,West South Central,43,F,C50911,Malignant neoplasm of unsp site of right femal...,C773,4459.0,3376.1,20.6,0.0,35.3,62.1,1.5,0.8,0.0,0.2,0.0,0.0,47.1,52.9,0.9,0.2,98.9,0.0,,,,,,,,,,,,,,,,4316.0,,0.0,,,,0.0,0.0,44.8,41.7,13.5,55.2,73.0,30.7,18.8,,,47.6,23.0,20.1,0.0,0.0,1.7,7.6,18.2,4.6,,,16.2,4.5,1.6,52.36,56.4,59.25,66.05,73.87,82.56,82.95,84.99,81.4,70.52,56.23,49.51,47.31,53.35,57.7,67.66,73.92,84.1,82.81,83.57,79.69,71.54,56.36,55.84,49.87,51.02,59.94,71.24,74.85,80.84,85.18,84.81,80.69,73.54,62.15,58.43,51.56,58.81,64.94,69.85,75.39,82.04,87.24,83.8,82.73,75.83,67.04,56.38,58.15,65.85,67.7,71.45,75.51,81.86,85.63,83.71,79.54,70.17,65.8,51.79,46.29,57.91,66.5,65.66,79.57,84.9,85.55,85.06,80.93,71.28,56.46,53.85,40
4404,367375,COMMERCIAL,TX,772,South,West South Central,61,F,1743,Malignant neoplasm of lower-inner quadrant of ...,C7951,4459.0,3376.1,20.6,0.0,35.3,62.1,1.5,0.8,0.0,0.2,0.0,0.0,47.1,52.9,0.9,0.2,98.9,0.0,,,,,,,,,,,,,,,,4316.0,,0.0,,,,0.0,0.0,44.8,41.7,13.5,55.2,73.0,30.7,18.8,,,47.6,23.0,20.1,0.0,0.0,1.7,7.6,18.2,4.6,,,16.2,4.5,1.6,52.36,56.4,59.25,66.05,73.87,82.56,82.95,84.99,81.4,70.52,56.23,49.51,47.31,53.35,57.7,67.66,73.92,84.1,82.81,83.57,79.69,71.54,56.36,55.84,49.87,51.02,59.94,71.24,74.85,80.84,85.18,84.81,80.69,73.54,62.15,58.43,51.56,58.81,64.94,69.85,75.39,82.04,87.24,83.8,82.73,75.83,67.04,56.38,58.15,65.85,67.7,71.45,75.51,81.86,85.63,83.71,79.54,70.17,65.8,51.79,46.29,57.91,66.5,65.66,79.57,84.9,85.55,85.06,80.93,71.28,56.46,53.85,36
6397,749144,COMMERCIAL,TX,772,South,West South Central,61,F,C50412,Malig neoplasm of upper-outer quadrant of left...,C773,4459.0,3376.1,20.6,0.0,35.3,62.1,1.5,0.8,0.0,0.2,0.0,0.0,47.1,52.9,0.9,0.2,98.9,0.0,,,,,,,,,,,,,,,,4316.0,,0.0,,,,0.0,0.0,44.8,41.7,13.5,55.2,73.0,30.7,18.8,,,47.6,23.0,20.1,0.0,0.0,1.7,7.6,18.2,4.6,,,16.2,4.5,1.6,52.36,56.4,59.25,66.05,73.87,82.56,82.95,84.99,81.4,70.52,56.23,49.51,47.31,53.35,57.7,67.66,73.92,84.1,82.81,83.57,79.69,71.54,56.36,55.84,49.87,51.02,59.94,71.24,74.85,80.84,85.18,84.81,80.69,73.54,62.15,58.43,51.56,58.81,64.94,69.85,75.39,82.04,87.24,83.8,82.73,75.83,67.04,56.38,58.15,65.85,67.7,71.45,75.51,81.86,85.63,83.71,79.54,70.17,65.8,51.79,46.29,57.91,66.5,65.66,79.57,84.9,85.55,85.06,80.93,71.28,56.46,53.85,0
8028,433905,COMMERCIAL,TX,772,South,West South Central,62,F,C50911,Malignant neoplasm of unsp site of right femal...,C773,4459.0,3376.1,20.6,0.0,35.3,62.1,1.5,0.8,0.0,0.2,0.0,0.0,47.1,52.9,0.9,0.2,98.9,0.0,,,,,,,,,,,,,,,,4316.0,,0.0,,,,0.0,0.0,44.8,41.7,13.5,55.2,73.0,30.7,18.8,,,47.6,23.0,20.1,0.0,0.0,1.7,7.6,18.2,4.6,,,16.2,4.5,1.6,52.36,56.4,59.25,66.05,73.87,82.56,82.95,84.99,81.4,70.52,56.23,49.51,47.31,53.35,57.7,67.66,73.92,84.1,82.81,83.57,79.69,71.54,56.36,55.84,49.87,51.02,59.94,71.24,74.85,80.84,85.18,84.81,80.69,73.54,62.15,58.43,51.56,58.81,64.94,69.85,75.39,82.04,87.24,83.8,82.73,75.83,67.04,56.38,58.15,65.85,67.7,71.45,75.51,81.86,85.63,83.71,79.54,70.17,65.8,51.79,46.29,57.91,66.5,65.66,79.57,84.9,85.55,85.06,80.93,71.28,56.46,53.85,36
12209,377675,COMMERCIAL,TX,772,South,West South Central,71,F,C50912,Malignant neoplasm of unspecified site of left...,C7800,4459.0,3376.1,20.6,0.0,35.3,62.1,1.5,0.8,0.0,0.2,0.0,0.0,47.1,52.9,0.9,0.2,98.9,0.0,,,,,,,,,,,,,,,,4316.0,,0.0,,,,0.0,0.0,44.8,41.7,13.5,55.2,73.0,30.7,18.8,,,47.6,23.0,20.1,0.0,0.0,1.7,7.6,18.2,4.6,,,16.2,4.5,1.6,52.36,56.4,59.25,66.05,73.87,82.56,82.95,84.99,81.4,70.52,56.23,49.51,47.31,53.35,57.7,67.66,73.92,84.1,82.81,83.57,79.69,71.54,56.36,55.84,49.87,51.02,59.94,71.24,74.85,80.84,85.18,84.81,80.69,73.54,62.15,58.43,51.56,58.81,64.94,69.85,75.39,82.04,87.24,83.8,82.73,75.83,67.04,56.38,58.15,65.85,67.7,71.45,75.51,81.86,85.63,83.71,79.54,70.17,65.8,51.79,46.29,57.91,66.5,65.66,79.57,84.9,85.55,85.06,80.93,71.28,56.46,53.85,47


In [55]:
# interesting, it's all for patient_zip3 == 772 
# are there other patients with zip3 == 772? if so, we can just fill the NaNs for this row with the same info

data_772 = data[data['patient_zip3']==772]
data_772.head()

Unnamed: 0,patient_id,payer_type,patient_state,patient_zip3,Region,Division,patient_age,patient_gender,breast_cancer_diagnosis_code,breast_cancer_diagnosis_desc,metastatic_cancer_diagnosis_code,population,density,age_median,age_under_10,age_10_to_19,age_20s,age_30s,age_40s,age_50s,age_60s,age_70s,age_over_80,male,female,married,divorced,never_married,widowed,family_size,family_dual_income,income_household_median,income_household_under_5,income_household_5_to_10,income_household_10_to_15,income_household_15_to_20,income_household_20_to_25,income_household_25_to_35,income_household_35_to_50,income_household_50_to_75,income_household_75_to_100,income_household_100_to_150,income_household_150_over,income_household_six_figure,income_individual_median,home_ownership,housing_units,home_value,rent_median,rent_burden,education_less_highschool,education_highschool,education_some_college,education_bachelors,education_graduate,education_college_or_above,education_stem_degree,labor_force_participation,unemployment_rate,self_employed,farmer,race_white,race_black,race_asian,race_native,race_pacific,race_other,race_multiple,hispanic,disabled,poverty,limited_english,commute_time,health_uninsured,veteran,Average of Jan-13,Average of Feb-13,Average of Mar-13,Average of Apr-13,Average of May-13,Average of Jun-13,Average of Jul-13,Average of Aug-13,Average of Sep-13,Average of Oct-13,Average of Nov-13,Average of Dec-13,Average of Jan-14,Average of Feb-14,Average of Mar-14,Average of Apr-14,Average of May-14,Average of Jun-14,Average of Jul-14,Average of Aug-14,Average of Sep-14,Average of Oct-14,Average of Nov-14,Average of Dec-14,Average of Jan-15,Average of Feb-15,Average of Mar-15,Average of Apr-15,Average of May-15,Average of Jun-15,Average of Jul-15,Average of Aug-15,Average of Sep-15,Average of Oct-15,Average of Nov-15,Average of Dec-15,Average of Jan-16,Average of Feb-16,Average of Mar-16,Average of Apr-16,Average of May-16,Average of Jun-16,Average of Jul-16,Average of Aug-16,Average of Sep-16,Average of Oct-16,Average of Nov-16,Average of Dec-16,Average of Jan-17,Average of Feb-17,Average of Mar-17,Average of Apr-17,Average of May-17,Average of Jun-17,Average of Jul-17,Average of Aug-17,Average of Sep-17,Average of Oct-17,Average of Nov-17,Average of Dec-17,Average of Jan-18,Average of Feb-18,Average of Mar-18,Average of Apr-18,Average of May-18,Average of Jun-18,Average of Jul-18,Average of Aug-18,Average of Sep-18,Average of Oct-18,Average of Nov-18,Average of Dec-18,metastatic_diagnosis_period
3985,757384,COMMERCIAL,TX,772,South,West South Central,43,F,C50911,Malignant neoplasm of unsp site of right femal...,C773,4459.0,3376.1,20.6,0.0,35.3,62.1,1.5,0.8,0.0,0.2,0.0,0.0,47.1,52.9,0.9,0.2,98.9,0.0,,,,,,,,,,,,,,,,4316.0,,0.0,,,,0.0,0.0,44.8,41.7,13.5,55.2,73.0,30.7,18.8,,,47.6,23.0,20.1,0.0,0.0,1.7,7.6,18.2,4.6,,,16.2,4.5,1.6,52.36,56.4,59.25,66.05,73.87,82.56,82.95,84.99,81.4,70.52,56.23,49.51,47.31,53.35,57.7,67.66,73.92,84.1,82.81,83.57,79.69,71.54,56.36,55.84,49.87,51.02,59.94,71.24,74.85,80.84,85.18,84.81,80.69,73.54,62.15,58.43,51.56,58.81,64.94,69.85,75.39,82.04,87.24,83.8,82.73,75.83,67.04,56.38,58.15,65.85,67.7,71.45,75.51,81.86,85.63,83.71,79.54,70.17,65.8,51.79,46.29,57.91,66.5,65.66,79.57,84.9,85.55,85.06,80.93,71.28,56.46,53.85,40
4404,367375,COMMERCIAL,TX,772,South,West South Central,61,F,1743,Malignant neoplasm of lower-inner quadrant of ...,C7951,4459.0,3376.1,20.6,0.0,35.3,62.1,1.5,0.8,0.0,0.2,0.0,0.0,47.1,52.9,0.9,0.2,98.9,0.0,,,,,,,,,,,,,,,,4316.0,,0.0,,,,0.0,0.0,44.8,41.7,13.5,55.2,73.0,30.7,18.8,,,47.6,23.0,20.1,0.0,0.0,1.7,7.6,18.2,4.6,,,16.2,4.5,1.6,52.36,56.4,59.25,66.05,73.87,82.56,82.95,84.99,81.4,70.52,56.23,49.51,47.31,53.35,57.7,67.66,73.92,84.1,82.81,83.57,79.69,71.54,56.36,55.84,49.87,51.02,59.94,71.24,74.85,80.84,85.18,84.81,80.69,73.54,62.15,58.43,51.56,58.81,64.94,69.85,75.39,82.04,87.24,83.8,82.73,75.83,67.04,56.38,58.15,65.85,67.7,71.45,75.51,81.86,85.63,83.71,79.54,70.17,65.8,51.79,46.29,57.91,66.5,65.66,79.57,84.9,85.55,85.06,80.93,71.28,56.46,53.85,36
6397,749144,COMMERCIAL,TX,772,South,West South Central,61,F,C50412,Malig neoplasm of upper-outer quadrant of left...,C773,4459.0,3376.1,20.6,0.0,35.3,62.1,1.5,0.8,0.0,0.2,0.0,0.0,47.1,52.9,0.9,0.2,98.9,0.0,,,,,,,,,,,,,,,,4316.0,,0.0,,,,0.0,0.0,44.8,41.7,13.5,55.2,73.0,30.7,18.8,,,47.6,23.0,20.1,0.0,0.0,1.7,7.6,18.2,4.6,,,16.2,4.5,1.6,52.36,56.4,59.25,66.05,73.87,82.56,82.95,84.99,81.4,70.52,56.23,49.51,47.31,53.35,57.7,67.66,73.92,84.1,82.81,83.57,79.69,71.54,56.36,55.84,49.87,51.02,59.94,71.24,74.85,80.84,85.18,84.81,80.69,73.54,62.15,58.43,51.56,58.81,64.94,69.85,75.39,82.04,87.24,83.8,82.73,75.83,67.04,56.38,58.15,65.85,67.7,71.45,75.51,81.86,85.63,83.71,79.54,70.17,65.8,51.79,46.29,57.91,66.5,65.66,79.57,84.9,85.55,85.06,80.93,71.28,56.46,53.85,0
8028,433905,COMMERCIAL,TX,772,South,West South Central,62,F,C50911,Malignant neoplasm of unsp site of right femal...,C773,4459.0,3376.1,20.6,0.0,35.3,62.1,1.5,0.8,0.0,0.2,0.0,0.0,47.1,52.9,0.9,0.2,98.9,0.0,,,,,,,,,,,,,,,,4316.0,,0.0,,,,0.0,0.0,44.8,41.7,13.5,55.2,73.0,30.7,18.8,,,47.6,23.0,20.1,0.0,0.0,1.7,7.6,18.2,4.6,,,16.2,4.5,1.6,52.36,56.4,59.25,66.05,73.87,82.56,82.95,84.99,81.4,70.52,56.23,49.51,47.31,53.35,57.7,67.66,73.92,84.1,82.81,83.57,79.69,71.54,56.36,55.84,49.87,51.02,59.94,71.24,74.85,80.84,85.18,84.81,80.69,73.54,62.15,58.43,51.56,58.81,64.94,69.85,75.39,82.04,87.24,83.8,82.73,75.83,67.04,56.38,58.15,65.85,67.7,71.45,75.51,81.86,85.63,83.71,79.54,70.17,65.8,51.79,46.29,57.91,66.5,65.66,79.57,84.9,85.55,85.06,80.93,71.28,56.46,53.85,36
12209,377675,COMMERCIAL,TX,772,South,West South Central,71,F,C50912,Malignant neoplasm of unspecified site of left...,C7800,4459.0,3376.1,20.6,0.0,35.3,62.1,1.5,0.8,0.0,0.2,0.0,0.0,47.1,52.9,0.9,0.2,98.9,0.0,,,,,,,,,,,,,,,,4316.0,,0.0,,,,0.0,0.0,44.8,41.7,13.5,55.2,73.0,30.7,18.8,,,47.6,23.0,20.1,0.0,0.0,1.7,7.6,18.2,4.6,,,16.2,4.5,1.6,52.36,56.4,59.25,66.05,73.87,82.56,82.95,84.99,81.4,70.52,56.23,49.51,47.31,53.35,57.7,67.66,73.92,84.1,82.81,83.57,79.69,71.54,56.36,55.84,49.87,51.02,59.94,71.24,74.85,80.84,85.18,84.81,80.69,73.54,62.15,58.43,51.56,58.81,64.94,69.85,75.39,82.04,87.24,83.8,82.73,75.83,67.04,56.38,58.15,65.85,67.7,71.45,75.51,81.86,85.63,83.71,79.54,70.17,65.8,51.79,46.29,57.91,66.5,65.66,79.57,84.9,85.55,85.06,80.93,71.28,56.46,53.85,47


In [59]:
data_772.shape

(5, 148)

In [61]:
# however, 772 is for Houston, and 770 is also for Houston, so maybe we can use this zip3 to fill in the NAs instead?

data_770 = data[data['patient_zip3']==770]
data_770.shape

(121, 148)

In [63]:
data_770.head()

Unnamed: 0,patient_id,payer_type,patient_state,patient_zip3,Region,Division,patient_age,patient_gender,breast_cancer_diagnosis_code,breast_cancer_diagnosis_desc,metastatic_cancer_diagnosis_code,population,density,age_median,age_under_10,age_10_to_19,age_20s,age_30s,age_40s,age_50s,age_60s,age_70s,age_over_80,male,female,married,divorced,never_married,widowed,family_size,family_dual_income,income_household_median,income_household_under_5,income_household_5_to_10,income_household_10_to_15,income_household_15_to_20,income_household_20_to_25,income_household_25_to_35,income_household_35_to_50,income_household_50_to_75,income_household_75_to_100,income_household_100_to_150,income_household_150_over,income_household_six_figure,income_individual_median,home_ownership,housing_units,home_value,rent_median,rent_burden,education_less_highschool,education_highschool,education_some_college,education_bachelors,education_graduate,education_college_or_above,education_stem_degree,labor_force_participation,unemployment_rate,self_employed,farmer,race_white,race_black,race_asian,race_native,race_pacific,race_other,race_multiple,hispanic,disabled,poverty,limited_english,commute_time,health_uninsured,veteran,Average of Jan-13,Average of Feb-13,Average of Mar-13,Average of Apr-13,Average of May-13,Average of Jun-13,Average of Jul-13,Average of Aug-13,Average of Sep-13,Average of Oct-13,Average of Nov-13,Average of Dec-13,Average of Jan-14,Average of Feb-14,Average of Mar-14,Average of Apr-14,Average of May-14,Average of Jun-14,Average of Jul-14,Average of Aug-14,Average of Sep-14,Average of Oct-14,Average of Nov-14,Average of Dec-14,Average of Jan-15,Average of Feb-15,Average of Mar-15,Average of Apr-15,Average of May-15,Average of Jun-15,Average of Jul-15,Average of Aug-15,Average of Sep-15,Average of Oct-15,Average of Nov-15,Average of Dec-15,Average of Jan-16,Average of Feb-16,Average of Mar-16,Average of Apr-16,Average of May-16,Average of Jun-16,Average of Jul-16,Average of Aug-16,Average of Sep-16,Average of Oct-16,Average of Nov-16,Average of Dec-16,Average of Jan-17,Average of Feb-17,Average of Mar-17,Average of Apr-17,Average of May-17,Average of Jun-17,Average of Jul-17,Average of Aug-17,Average of Sep-17,Average of Oct-17,Average of Nov-17,Average of Dec-17,Average of Jan-18,Average of Feb-18,Average of Mar-18,Average of Apr-18,Average of May-18,Average of Jun-18,Average of Jul-18,Average of Aug-18,Average of Sep-18,Average of Oct-18,Average of Nov-18,Average of Dec-18,metastatic_diagnosis_period
24,752174,MEDICARE ADVANTAGE,TX,770,South,West South Central,77,F,C50919,Malignant neoplasm of unsp site of unspecified...,C773,33353.72,1836.73,34.71,13.51,13.54,15.52,15.84,12.8,11.74,9.7,5.05,2.28,49.98,50.02,42.93,12.65,39.89,4.53,3.39,51.24,66076.27,4.08,3.09,4.35,4.47,4.71,10.18,12.72,17.11,10.93,12.79,15.56,28.36,37965.46,49.7,12096.22,247842.7,1193.59,32.92,21.3,22.5,23.68,19.41,13.1,32.51,47.45,66.17,6.71,17.26,0.27,47.42,22.5,7.07,0.52,0.04,11.34,11.12,44.48,9.89,18.73,13.97,27.93,22.54,3.77,53.62,57.57,60.04,65.86,74.01,82.82,83.19,84.39,81.61,71.29,57.74,50.59,48.54,54.11,58.25,68.19,73.46,83.03,82.81,83.96,79.68,72.08,57.28,56.25,49.87,51.89,61.35,71.9,76.12,81.3,85.22,84.29,79.9,73.49,63.61,59.46,51.92,58.94,65.89,69.98,75.39,82.04,86.22,83.8,82.57,75.58,67.2,58.23,59.44,65.98,67.99,71.59,75.64,81.66,85.04,83.61,79.6,71.53,66.63,53.12,47.43,61.48,66.91,66.5,79.54,84.57,84.85,85.09,80.61,72.46,58.14,55.77,118
82,465345,COMMERCIAL,TX,770,South,West South Central,64,F,1749,"Malignant neoplasm of breast (female), unspeci...",C7951,33353.72,1836.73,34.71,13.51,13.54,15.52,15.84,12.8,11.74,9.7,5.05,2.28,49.98,50.02,42.93,12.65,39.89,4.53,3.39,51.24,66076.27,4.08,3.09,4.35,4.47,4.71,10.18,12.72,17.11,10.93,12.79,15.56,28.36,37965.46,49.7,12096.22,247842.7,1193.59,32.92,21.3,22.5,23.68,19.41,13.1,32.51,47.45,66.17,6.71,17.26,0.27,47.42,22.5,7.07,0.52,0.04,11.34,11.12,44.48,9.89,18.73,13.97,27.93,22.54,3.77,53.62,57.57,60.04,65.86,74.01,82.82,83.19,84.39,81.61,71.29,57.74,50.59,48.54,54.11,58.25,68.19,73.46,83.03,82.81,83.96,79.68,72.08,57.28,56.25,49.87,51.89,61.35,71.9,76.12,81.3,85.22,84.29,79.9,73.49,63.61,59.46,51.92,58.94,65.89,69.98,75.39,82.04,86.22,83.8,82.57,75.58,67.2,58.23,59.44,65.98,67.99,71.59,75.64,81.66,85.04,83.61,79.6,71.53,66.63,53.12,47.43,61.48,66.91,66.5,79.54,84.57,84.85,85.09,80.61,72.46,58.14,55.77,81
270,263002,COMMERCIAL,TX,770,South,West South Central,66,F,C50112,Malignant neoplasm of central portion of left ...,C773,33353.72,1836.73,34.71,13.51,13.54,15.52,15.84,12.8,11.74,9.7,5.05,2.28,49.98,50.02,42.93,12.65,39.89,4.53,3.39,51.24,66076.27,4.08,3.09,4.35,4.47,4.71,10.18,12.72,17.11,10.93,12.79,15.56,28.36,37965.46,49.7,12096.22,247842.7,1193.59,32.92,21.3,22.5,23.68,19.41,13.1,32.51,47.45,66.17,6.71,17.26,0.27,47.42,22.5,7.07,0.52,0.04,11.34,11.12,44.48,9.89,18.73,13.97,27.93,22.54,3.77,53.62,57.57,60.04,65.86,74.01,82.82,83.19,84.39,81.61,71.29,57.74,50.59,48.54,54.11,58.25,68.19,73.46,83.03,82.81,83.96,79.68,72.08,57.28,56.25,49.87,51.89,61.35,71.9,76.12,81.3,85.22,84.29,79.9,73.49,63.61,59.46,51.92,58.94,65.89,69.98,75.39,82.04,86.22,83.8,82.57,75.58,67.2,58.23,59.44,65.98,67.99,71.59,75.64,81.66,85.04,83.61,79.6,71.53,66.63,53.12,47.43,61.48,66.91,66.5,79.54,84.57,84.85,85.09,80.61,72.46,58.14,55.77,303
376,964266,MEDICAID,TX,770,South,West South Central,56,F,C50811,Malignant neoplasm of ovrlp sites of right fem...,C799,33353.72,1836.73,34.71,13.51,13.54,15.52,15.84,12.8,11.74,9.7,5.05,2.28,49.98,50.02,42.93,12.65,39.89,4.53,3.39,51.24,66076.27,4.08,3.09,4.35,4.47,4.71,10.18,12.72,17.11,10.93,12.79,15.56,28.36,37965.46,49.7,12096.22,247842.7,1193.59,32.92,21.3,22.5,23.68,19.41,13.1,32.51,47.45,66.17,6.71,17.26,0.27,47.42,22.5,7.07,0.52,0.04,11.34,11.12,44.48,9.89,18.73,13.97,27.93,22.54,3.77,53.62,57.57,60.04,65.86,74.01,82.82,83.19,84.39,81.61,71.29,57.74,50.59,48.54,54.11,58.25,68.19,73.46,83.03,82.81,83.96,79.68,72.08,57.28,56.25,49.87,51.89,61.35,71.9,76.12,81.3,85.22,84.29,79.9,73.49,63.61,59.46,51.92,58.94,65.89,69.98,75.39,82.04,86.22,83.8,82.57,75.58,67.2,58.23,59.44,65.98,67.99,71.59,75.64,81.66,85.04,83.61,79.6,71.53,66.63,53.12,47.43,61.48,66.91,66.5,79.54,84.57,84.85,85.09,80.61,72.46,58.14,55.77,315
474,238793,MEDICARE ADVANTAGE,TX,770,South,West South Central,91,F,C50911,Malignant neoplasm of unsp site of right femal...,C779,33353.72,1836.73,34.71,13.51,13.54,15.52,15.84,12.8,11.74,9.7,5.05,2.28,49.98,50.02,42.93,12.65,39.89,4.53,3.39,51.24,66076.27,4.08,3.09,4.35,4.47,4.71,10.18,12.72,17.11,10.93,12.79,15.56,28.36,37965.46,49.7,12096.22,247842.7,1193.59,32.92,21.3,22.5,23.68,19.41,13.1,32.51,47.45,66.17,6.71,17.26,0.27,47.42,22.5,7.07,0.52,0.04,11.34,11.12,44.48,9.89,18.73,13.97,27.93,22.54,3.77,53.62,57.57,60.04,65.86,74.01,82.82,83.19,84.39,81.61,71.29,57.74,50.59,48.54,54.11,58.25,68.19,73.46,83.03,82.81,83.96,79.68,72.08,57.28,56.25,49.87,51.89,61.35,71.9,76.12,81.3,85.22,84.29,79.9,73.49,63.61,59.46,51.92,58.94,65.89,69.98,75.39,82.04,86.22,83.8,82.57,75.58,67.2,58.23,59.44,65.98,67.99,71.59,75.64,81.66,85.04,83.61,79.6,71.53,66.63,53.12,47.43,61.48,66.91,66.5,79.54,84.57,84.85,85.09,80.61,72.46,58.14,55.77,0


In [67]:
# ok this is a possibility 
# there's also just this one zip3 that is weird *in this particular regard* 
# to be continued 

### Visualizations with Yellowbrick: Feature Correlations 

In [71]:
from yellowbrick.target import FeatureCorrelation

In [73]:
# as a simple example, let's look at temperatures vs. metastatic_diagnosis_period
# for later, can also add a column for the delta for each month between 2013 and 2018 
# this might be more useful, e.g., will let you know which areas are getting warmer (and potentially at a faster rate)

# first, figure out the relevant column indices
data.shape

(13173, 148)

In [83]:
# we have 72 columns of temperature data
# the last column (147) should be the metastatic_diagnosis_period 

data_last2 = data.iloc[:, 146:148] # slice is not inclusive of the end index, good to know 
data_last2.head()

Unnamed: 0,Average of Dec-18,metastatic_diagnosis_period
0,41.18,191
1,39.71,33
2,55.58,157
3,59.46,146
4,54.08,286


In [87]:
data_first2 = data.iloc[:, 75:77] # expect this to be Jan-13 and Feb-13
data_first2.head()

Unnamed: 0,Average of Jan-13,Average of Feb-13
0,38.55,39.88
1,34.85,36.15
2,53.14,55.28
3,57.88,57.65
4,51.08,52.29


In [89]:
data_temps = data.iloc[:, 75:147]
data_temps.head()

Unnamed: 0,Average of Jan-13,Average of Feb-13,Average of Mar-13,Average of Apr-13,Average of May-13,Average of Jun-13,Average of Jul-13,Average of Aug-13,Average of Sep-13,Average of Oct-13,Average of Nov-13,Average of Dec-13,Average of Jan-14,Average of Feb-14,Average of Mar-14,Average of Apr-14,Average of May-14,Average of Jun-14,Average of Jul-14,Average of Aug-14,Average of Sep-14,Average of Oct-14,Average of Nov-14,Average of Dec-14,Average of Jan-15,Average of Feb-15,Average of Mar-15,Average of Apr-15,Average of May-15,Average of Jun-15,Average of Jul-15,Average of Aug-15,Average of Sep-15,Average of Oct-15,Average of Nov-15,Average of Dec-15,Average of Jan-16,Average of Feb-16,Average of Mar-16,Average of Apr-16,Average of May-16,Average of Jun-16,Average of Jul-16,Average of Aug-16,Average of Sep-16,Average of Oct-16,Average of Nov-16,Average of Dec-16,Average of Jan-17,Average of Feb-17,Average of Mar-17,Average of Apr-17,Average of May-17,Average of Jun-17,Average of Jul-17,Average of Aug-17,Average of Sep-17,Average of Oct-17,Average of Nov-17,Average of Dec-17,Average of Jan-18,Average of Feb-18,Average of Mar-18,Average of Apr-18,Average of May-18,Average of Jun-18,Average of Jul-18,Average of Aug-18,Average of Sep-18,Average of Oct-18,Average of Nov-18,Average of Dec-18
0,38.55,39.88,42.75,55.16,65.17,75.98,76.75,76.45,73.67,59.73,45.18,37.43,31.67,33.83,42.35,57.72,67.35,75.92,74.28,79.59,70.84,62.39,41.89,41.46,35.24,31.1,45.5,60.94,68.49,77.69,81.35,76.28,73.32,60.32,51.0,48.37,35.76,42.89,53.21,61.07,66.29,82.78,81.52,79.52,75.61,65.76,53.85,39.56,41.59,49.01,52.28,63.4,67.11,75.78,80.58,76.16,72.91,62.68,49.73,39.8,31.71,41.69,50.22,52.55,74.77,79.96,81.69,78.3,74.56,59.98,42.98,41.18
1,34.85,36.15,39.41,54.63,65.41,73.89,74.07,74.37,70.44,57.37,42.15,33.16,26.88,28.36,40.32,56.85,66.84,75.12,72.18,77.08,67.15,58.34,38.55,38.54,32.33,25.62,43.52,58.5,66.81,75.41,78.43,73.88,70.78,58.7,50.5,46.71,32.36,38.76,51.68,58.41,64.1,78.15,79.7,78.33,72.8,63.84,50.35,34.9,39.05,45.8,48.45,60.77,65.89,74.46,79.18,73.13,70.13,59.68,46.52,35.15,28.43,40.01,44.18,49.3,72.87,77.4,77.43,75.83,72.64,58.36,39.68,39.71
2,53.14,55.28,64.75,67.38,73.31,79.49,84.01,83.28,79.88,67.84,61.92,55.69,60.56,60.99,65.16,68.01,74.24,78.87,84.65,82.23,81.75,74.25,63.29,56.28,58.32,62.97,67.61,67.39,68.06,80.43,80.98,85.08,82.01,75.16,59.59,53.24,54.54,63.52,63.72,67.45,69.58,82.02,84.83,84.06,77.85,72.5,64.24,55.52,53.54,58.18,65.45,68.51,71.13,80.71,85.64,84.85,78.14,74.03,65.7,59.28,60.34,57.43,60.97,68.5,70.31,78.61,87.24,85.52,80.75,70.81,62.67,55.58
3,57.88,57.65,60.86,62.77,67.07,68.41,70.69,71.19,72.74,66.41,65.09,60.87,64.3,60.77,63.01,64.37,69.73,68.46,73.62,73.65,75.96,71.83,67.04,59.73,63.27,63.83,67.72,64.39,62.56,69.66,72.21,77.47,76.77,74.5,63.48,57.98,57.69,66.53,61.89,64.86,62.63,70.45,73.09,71.94,72.26,69.56,66.85,58.29,56.08,57.88,63.84,65.03,63.9,68.95,73.58,73.62,72.88,73.84,68.87,66.63,63.75,59.89,59.35,63.34,63.1,67.45,75.86,75.24,71.1,68.95,65.46,59.46
4,51.08,52.29,58.31,60.43,63.65,67.41,68.21,67.95,67.95,61.73,58.04,53.04,58.12,55.83,58.73,59.53,64.2,64.02,68.84,68.18,67.29,67.08,59.49,54.23,55.19,57.03,63.73,60.69,58.19,65.61,67.79,69.57,71.37,68.4,55.45,49.49,54.15,60.98,58.44,62.06,61.68,70.15,69.49,66.71,68.1,64.41,59.72,52.52,51.8,54.2,59.36,60.44,62.41,67.57,71.29,70.83,69.43,65.56,59.6,54.21,57.7,53.6,55.78,59.45,60.24,64.77,69.81,70.13,68.1,65.38,60.72,54.08


In [91]:
data_temps.shape

(13173, 72)

In [93]:
y = data['metastatic_diagnosis_period']
y.head()

0    191
1     33
2    157
3    146
4    286
Name: metastatic_diagnosis_period, dtype: int64

In [97]:
labels_temps = np.array(data_temps.columns)

In [103]:
# yellowbrick doesn't like missing values 

data_temps_clean = data.iloc[:, 75:]
data_temps_clean.shape

(13173, 73)

In [105]:
data_temps_clean.dropna(axis=0, inplace=True)
data_temps_clean.shape

(12767, 73)

In [111]:
X = data_temps_clean.iloc[:, :72]
X.shape

(12767, 72)

In [113]:
X.head()

Unnamed: 0,Average of Jan-13,Average of Feb-13,Average of Mar-13,Average of Apr-13,Average of May-13,Average of Jun-13,Average of Jul-13,Average of Aug-13,Average of Sep-13,Average of Oct-13,Average of Nov-13,Average of Dec-13,Average of Jan-14,Average of Feb-14,Average of Mar-14,Average of Apr-14,Average of May-14,Average of Jun-14,Average of Jul-14,Average of Aug-14,Average of Sep-14,Average of Oct-14,Average of Nov-14,Average of Dec-14,Average of Jan-15,Average of Feb-15,Average of Mar-15,Average of Apr-15,Average of May-15,Average of Jun-15,Average of Jul-15,Average of Aug-15,Average of Sep-15,Average of Oct-15,Average of Nov-15,Average of Dec-15,Average of Jan-16,Average of Feb-16,Average of Mar-16,Average of Apr-16,Average of May-16,Average of Jun-16,Average of Jul-16,Average of Aug-16,Average of Sep-16,Average of Oct-16,Average of Nov-16,Average of Dec-16,Average of Jan-17,Average of Feb-17,Average of Mar-17,Average of Apr-17,Average of May-17,Average of Jun-17,Average of Jul-17,Average of Aug-17,Average of Sep-17,Average of Oct-17,Average of Nov-17,Average of Dec-17,Average of Jan-18,Average of Feb-18,Average of Mar-18,Average of Apr-18,Average of May-18,Average of Jun-18,Average of Jul-18,Average of Aug-18,Average of Sep-18,Average of Oct-18,Average of Nov-18,Average of Dec-18
0,38.55,39.88,42.75,55.16,65.17,75.98,76.75,76.45,73.67,59.73,45.18,37.43,31.67,33.83,42.35,57.72,67.35,75.92,74.28,79.59,70.84,62.39,41.89,41.46,35.24,31.1,45.5,60.94,68.49,77.69,81.35,76.28,73.32,60.32,51.0,48.37,35.76,42.89,53.21,61.07,66.29,82.78,81.52,79.52,75.61,65.76,53.85,39.56,41.59,49.01,52.28,63.4,67.11,75.78,80.58,76.16,72.91,62.68,49.73,39.8,31.71,41.69,50.22,52.55,74.77,79.96,81.69,78.3,74.56,59.98,42.98,41.18
1,34.85,36.15,39.41,54.63,65.41,73.89,74.07,74.37,70.44,57.37,42.15,33.16,26.88,28.36,40.32,56.85,66.84,75.12,72.18,77.08,67.15,58.34,38.55,38.54,32.33,25.62,43.52,58.5,66.81,75.41,78.43,73.88,70.78,58.7,50.5,46.71,32.36,38.76,51.68,58.41,64.1,78.15,79.7,78.33,72.8,63.84,50.35,34.9,39.05,45.8,48.45,60.77,65.89,74.46,79.18,73.13,70.13,59.68,46.52,35.15,28.43,40.01,44.18,49.3,72.87,77.4,77.43,75.83,72.64,58.36,39.68,39.71
2,53.14,55.28,64.75,67.38,73.31,79.49,84.01,83.28,79.88,67.84,61.92,55.69,60.56,60.99,65.16,68.01,74.24,78.87,84.65,82.23,81.75,74.25,63.29,56.28,58.32,62.97,67.61,67.39,68.06,80.43,80.98,85.08,82.01,75.16,59.59,53.24,54.54,63.52,63.72,67.45,69.58,82.02,84.83,84.06,77.85,72.5,64.24,55.52,53.54,58.18,65.45,68.51,71.13,80.71,85.64,84.85,78.14,74.03,65.7,59.28,60.34,57.43,60.97,68.5,70.31,78.61,87.24,85.52,80.75,70.81,62.67,55.58
3,57.88,57.65,60.86,62.77,67.07,68.41,70.69,71.19,72.74,66.41,65.09,60.87,64.3,60.77,63.01,64.37,69.73,68.46,73.62,73.65,75.96,71.83,67.04,59.73,63.27,63.83,67.72,64.39,62.56,69.66,72.21,77.47,76.77,74.5,63.48,57.98,57.69,66.53,61.89,64.86,62.63,70.45,73.09,71.94,72.26,69.56,66.85,58.29,56.08,57.88,63.84,65.03,63.9,68.95,73.58,73.62,72.88,73.84,68.87,66.63,63.75,59.89,59.35,63.34,63.1,67.45,75.86,75.24,71.1,68.95,65.46,59.46
4,51.08,52.29,58.31,60.43,63.65,67.41,68.21,67.95,67.95,61.73,58.04,53.04,58.12,55.83,58.73,59.53,64.2,64.02,68.84,68.18,67.29,67.08,59.49,54.23,55.19,57.03,63.73,60.69,58.19,65.61,67.79,69.57,71.37,68.4,55.45,49.49,54.15,60.98,58.44,62.06,61.68,70.15,69.49,66.71,68.1,64.41,59.72,52.52,51.8,54.2,59.36,60.44,62.41,67.57,71.29,70.83,69.43,65.56,59.6,54.21,57.7,53.6,55.78,59.45,60.24,64.77,69.81,70.13,68.1,65.38,60.72,54.08


In [119]:
y = data_temps_clean.iloc[:, -1:]
y.head()

Unnamed: 0,metastatic_diagnosis_period
0,191
1,33
2,157
3,146
4,286


In [121]:
# yellowbrick visualizer 

visualizer = FeatureCorrelation(labels=labels_temps)

visualizer.fit(X, y) # Fit the data to the visualizer
visualizer.show() # Finalize and render the figure

TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''