In [1]:
#initial imports
import pandas as pd
import matplotlib.pyplot as plt 
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
import numpy as np 
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor 
import numpy as np

In [2]:
#read in incidence csv from github

mortality_df = pd.read_csv("https://raw.githubusercontent.com/robyndook/Cancer_Treatment_Centers_California/main/Resources/Age_Mortality_Cancer_DB.csv")                         
#get rid of commas in columns
mortality_df = mortality_df.replace(',','', regex=True)
mortality_df.head()     

Unnamed: 0,average_annual_count,primary_state_name,cancer,race_ethnicity,sex,age,age_adjusted_rate,county_state,index_of_medical_underservice_score,popestimate2015,popestimate2016,popestimate2017,popestimate2018,popestimate2019,abbrv,median_household_income_2015,median_household_income_2016,median_household_income_2017,median_household_income_2018,median_household_income_2019
0,4,Alabama,All Cancer Sites,All Races (includes Hispanic),All Sexes,<50,38.8,Marengo County AL,54.0,19766,19525,19396,19056,18863,AL,35389.0,34794.0,37469.0,37733.0,38838.0
1,48,Alabama,All Cancer Sites,All Races (includes Hispanic),All Sexes,50+,606.5,Marengo County AL,54.0,19766,19525,19396,19056,18863,AL,35389.0,34794.0,37469.0,37733.0,38838.0
2,18,Alabama,All Cancer Sites,All Races (includes Hispanic),All Sexes,<65,84.7,Marengo County AL,54.0,19766,19525,19396,19056,18863,AL,35389.0,34794.0,37469.0,37733.0,38838.0
3,35,Alabama,All Cancer Sites,All Races (includes Hispanic),All Sexes,65+,963.0,Marengo County AL,54.0,19766,19525,19396,19056,18863,AL,35389.0,34794.0,37469.0,37733.0,38838.0
4,9,Alabama,All Cancer Sites,All Races (includes Hispanic),All Sexes,<50,30.2,Jackson County AL,53.0,52195,51988,51828,51621,51626,AL,40201.0,41407.0,42658.0,41929.0,44322.0


In [3]:
#check dtypes
mortality_df.dtypes

average_annual_count                     int64
primary_state_name                      object
cancer                                  object
race_ethnicity                          object
sex                                     object
age                                     object
age_adjusted_rate                       object
county_state                            object
index_of_medical_underservice_score    float64
popestimate2015                          int64
popestimate2016                          int64
popestimate2017                          int64
popestimate2018                          int64
popestimate2019                          int64
abbrv                                   object
median_household_income_2015           float64
median_household_income_2016           float64
median_household_income_2017           float64
median_household_income_2018           float64
median_household_income_2019           float64
dtype: object

In [4]:
#get count of rows
mortality_df.count()

average_annual_count                   8170
primary_state_name                     8170
cancer                                 8170
race_ethnicity                         8170
sex                                    8170
age                                    8170
age_adjusted_rate                      8170
county_state                           8170
index_of_medical_underservice_score    7091
popestimate2015                        8170
popestimate2016                        8170
popestimate2017                        8170
popestimate2018                        8170
popestimate2019                        8170
abbrv                                  8152
median_household_income_2015           8152
median_household_income_2016           8152
median_household_income_2017           8152
median_household_income_2018           8152
median_household_income_2019           8152
dtype: int64

In [5]:
#get count of values for age
mortality_df["age"].value_counts()

50+    2420
65+    2406
<65    2198
<50    1146
Name: age, dtype: int64

In [7]:
#create column for 5-year average median income 
mortality_df["5_year_avg_income"] = (mortality_df["median_household_income_2015"]+mortality_df["median_household_income_2016"]+mortality_df["median_household_income_2017"]+mortality_df["median_household_income_2018"]+mortality_df["median_household_income_2019"])/5
mortality_df.head()

Unnamed: 0,average_annual_count,primary_state_name,cancer,race_ethnicity,sex,age,age_adjusted_rate,county_state,index_of_medical_underservice_score,popestimate2015,...,popestimate2017,popestimate2018,popestimate2019,abbrv,median_household_income_2015,median_household_income_2016,median_household_income_2017,median_household_income_2018,median_household_income_2019,5_year_avg_income
0,4,Alabama,All Cancer Sites,All Races (includes Hispanic),All Sexes,<50,38.8,Marengo County AL,54.0,19766,...,19396,19056,18863,AL,35389.0,34794.0,37469.0,37733.0,38838.0,36844.6
1,48,Alabama,All Cancer Sites,All Races (includes Hispanic),All Sexes,50+,606.5,Marengo County AL,54.0,19766,...,19396,19056,18863,AL,35389.0,34794.0,37469.0,37733.0,38838.0,36844.6
2,18,Alabama,All Cancer Sites,All Races (includes Hispanic),All Sexes,<65,84.7,Marengo County AL,54.0,19766,...,19396,19056,18863,AL,35389.0,34794.0,37469.0,37733.0,38838.0,36844.6
3,35,Alabama,All Cancer Sites,All Races (includes Hispanic),All Sexes,65+,963.0,Marengo County AL,54.0,19766,...,19396,19056,18863,AL,35389.0,34794.0,37469.0,37733.0,38838.0,36844.6
4,9,Alabama,All Cancer Sites,All Races (includes Hispanic),All Sexes,<50,30.2,Jackson County AL,53.0,52195,...,51828,51621,51626,AL,40201.0,41407.0,42658.0,41929.0,44322.0,42103.4


In [8]:
#drop rows where avg income is null
mortality_df = mortality_df[mortality_df["5_year_avg_income"].isna() == False]

In [9]:
#get count of rows
mortality_df.count()

average_annual_count                   8152
primary_state_name                     8152
cancer                                 8152
race_ethnicity                         8152
sex                                    8152
age                                    8152
age_adjusted_rate                      8152
county_state                           8152
index_of_medical_underservice_score    7085
popestimate2015                        8152
popestimate2016                        8152
popestimate2017                        8152
popestimate2018                        8152
popestimate2019                        8152
abbrv                                  8152
median_household_income_2015           8152
median_household_income_2016           8152
median_household_income_2017           8152
median_household_income_2018           8152
median_household_income_2019           8152
5_year_avg_income                      8152
dtype: int64

In [10]:
#keep only columns with 50+ and <50
mortality_df_50 = mortality_df[mortality_df["age"] != "65+"] 
mortality_df_50 = mortality_df_50[mortality_df_50["age"] != "<65"] 
mortality_df_50["age"].value_counts()

50+    2415
<50    1142
Name: age, dtype: int64

In [11]:
mortality_df_50.count()

average_annual_count                   3557
primary_state_name                     3557
cancer                                 3557
race_ethnicity                         3557
sex                                    3557
age                                    3557
age_adjusted_rate                      3557
county_state                           3557
index_of_medical_underservice_score    3090
popestimate2015                        3557
popestimate2016                        3557
popestimate2017                        3557
popestimate2018                        3557
popestimate2019                        3557
abbrv                                  3557
median_household_income_2015           3557
median_household_income_2016           3557
median_household_income_2017           3557
median_household_income_2018           3557
median_household_income_2019           3557
5_year_avg_income                      3557
dtype: int64

In [12]:
#create target 
y = mortality_df_50[['age_adjusted_rate']]

#change dtype to numeric
y['age_adjusted_rate'] = y['age_adjusted_rate'].astype(float)
y.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0,age_adjusted_rate
0,38.8
1,606.5
4,30.2
5,626.1
8,27.6


In [14]:
#create feautures variable holding ages and income
X = mortality_df_50[["age", "5_year_avg_income"]]
X.head()

Unnamed: 0,age,5_year_avg_income
0,<50,36844.6
1,50+,36844.6
4,<50,42103.4
5,50+,42103.4
8,<50,40439.2


In [15]:
#create dummy variables for age
X = pd.get_dummies(X, columns=["age"])
X.head()

Unnamed: 0,5_year_avg_income,age_50+,age_<50
0,36844.6,0,1
1,36844.6,1,0
4,42103.4,0,1
5,42103.4,1,0
8,40439.2,0,1


In [16]:
#get shape of target array
y.shape

(3557, 1)

In [17]:
#get shape of feautures array
X.shape

(3557, 3)

In [18]:
#split data
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [19]:
#create linear regression model using sklearn
model = LinearRegression()

In [20]:
#fit the model with training data
regression_age_inc_mortality = model.fit(X_train, y_train)

In [21]:
#Create predictions array using linear regression
y_pred = regression_age_inc_mortality.predict(X_test)
y_pred

array([[ 4.13444100e+01],
       [ 5.71415279e+02],
       [ 5.59196647e+02],
       [ 5.55122224e+02],
       [ 3.78465272e+01],
       [ 4.98924285e+01],
       [ 5.46802194e+02],
       [ 5.52082637e+02],
       [ 5.39382562e+01],
       [ 5.38672670e+02],
       [-1.05199565e+02],
       [ 5.44090678e+02],
       [-5.91196357e+01],
       [-2.39997647e+01],
       [ 3.24292915e+01],
       [ 2.33820170e+01],
       [ 4.94252456e+01],
       [ 5.84001783e+02],
       [ 5.54063431e+02],
       [ 4.88658202e+02],
       [ 5.60263942e+02],
       [ 5.57618118e+02],
       [ 5.76550041e+02],
       [ 4.61210333e+02],
       [-5.86485885e+01],
       [ 5.52582666e+02],
       [ 2.64374477e+01],
       [ 5.53457523e+02],
       [ 5.62238167e+02],
       [ 4.42425673e+01],
       [ 5.59104293e+02],
       [ 5.66134450e+02],
       [ 5.80301029e+02],
       [ 5.67255457e+02],
       [ 5.73210591e+02],
       [ 5.66897631e+02],
       [ 5.63415978e+02],
       [ 5.87178936e+02],
       [ 5.6

In [22]:
#get R2 value from linear regression
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.9182424769696061

In [23]:
#run regression again with statsmodels
#use statsmodels to add constant to X
#add constant to X
X1 = sm.add_constant(X)

  x = pd.concat(x[::order], 1)


In [24]:
#fit linear regression model
model = sm.OLS(y, X1).fit()

In [25]:
#view model summary with statsmodels
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:      age_adjusted_rate   R-squared:                       0.927
Model:                            OLS   Adj. R-squared:                  0.927
Method:                 Least Squares   F-statistic:                 2.244e+04
Date:                Thu, 02 Jun 2022   Prob (F-statistic):               0.00
Time:                        18:14:01   Log-Likelihood:                -20207.
No. Observations:                3557   AIC:                         4.042e+04
Df Residuals:                    3554   BIC:                         4.044e+04
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
const               260.7137      3.11

In [26]:
#fit the model with training data
#decision tree regression
regressor_DT = DecisionTreeRegressor()
regressor_DT.fit(X_train, y_train)

DecisionTreeRegressor()

In [27]:
#predict
y_pred = regressor_DT.predict(X_test)
y_pred

array([ 20.1, 570.3, 380.7, 567.1,  21.9,  17. , 492.2, 576.7,  21.7,
       577.6,  10.8, 488.3,   9.7,  13.9,  19.4,  17.4,  23.1, 683.5,
       581.3, 378. , 548.7, 699.2, 656.1, 489.3,  14.9, 603.4,  21.4,
       604. , 580.3,  16.9, 510. , 588.3, 496.5, 651. , 648.9, 587. ,
       673.8, 609.6, 565.2, 386.9, 529. , 619.8,  17.8, 764.2, 477. ,
        11. , 434.4,  17.1, 549.3, 432.6, 531.3,  14.5, 397. , 521.6,
       705.5, 560.5, 674.3, 753.8, 586.3,  15.3,  16.3,  22.8, 521.4,
        19.7, 607.7, 561.3,  11.5,  16.9, 581.3, 790.4, 526.1, 540.7,
       616.9, 471.4, 654.1, 664.7, 476.9, 680.2, 576.3, 424.4,  20.1,
       429.9, 600.1,  12.4,  14.2, 477.2,  17.8, 372.9, 520.4, 583.4,
       610.8, 612.1,  10.9, 651.6, 811.3,  17.2,  15. , 593.7,  19.4,
        18.3, 673.8,  14.4,  14.7, 576.3, 577.2, 564.1,  16.5, 500.3,
        12.6, 683.4, 530.2,  13.4,  20.5, 688.4, 645.2, 506. ,  16.5,
        19.4, 683.1, 618.9,  43.2, 695.5, 679.7, 472.2, 734.5, 509.9,
       561.1, 366.7,

In [28]:
#calculate r2 score
r2_score(y_test, y_pred)

0.872810759735188