In [1]:
#initial imports
import pandas as pd
import matplotlib.pyplot as plt 
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
import numpy as np 
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor 
import numpy as np

In [2]:
#read in incidence csv from github

incidence_df = pd.read_csv("https://raw.githubusercontent.com/robyndook/Cancer_Treatment_Centers_California/main/Resources/Age_Incidence_Cancer_DB.csv")                         
#get rid of commas in columns
incidence_df = incidence_df.replace(',','', regex=True)
incidence_df.head()     

Unnamed: 0,average_annual_count,recent_trend,primary_state_name,cancer,race_ethnicity,sex,age,age_adjusted_rate,five_year_trend_rates,county_state,...,popestimate2016,popestimate2017,popestimate2018,popestimate2019,abbrv,median_household_income_2015,median_household_income_2016,median_household_income_2017,median_household_income_2018,median_household_income_2019
0,9,stable,Alabama,All Cancer Sites,All Races (includes Hispanic),All Sexes,<50,167.8,2.3,Lowndes County AL,...,10248,10097,9974,9726,AL,31117.0,32011.0,33130.0,33973.0,33930.0
1,72,rising,Alabama,All Cancer Sites,All Races (includes Hispanic),All Sexes,50+,1738.4,2.0,Lowndes County AL,...,10248,10097,9974,9726,AL,31117.0,32011.0,33130.0,33973.0,33930.0
2,37,rising,Alabama,All Cancer Sites,All Races (includes Hispanic),All Sexes,<65,339.4,2.4,Lowndes County AL,...,10248,10097,9974,9726,AL,31117.0,32011.0,33130.0,33973.0,33930.0
3,44,stable,Alabama,All Cancer Sites,All Races (includes Hispanic),All Sexes,65+,2416.0,1.8,Lowndes County AL,...,10248,10097,9974,9726,AL,31117.0,32011.0,33130.0,33973.0,33930.0
4,9,stable,Alabama,All Cancer Sites,All Races (includes Hispanic),All Sexes,<50,145.2,2.0,Conecuh County AL,...,12500,12431,12292,12067,AL,29981.0,29758.0,30796.0,32613.0,41539.0


In [3]:
#check dtypes
incidence_df.dtypes

average_annual_count                     int64
recent_trend                            object
primary_state_name                      object
cancer                                  object
race_ethnicity                          object
sex                                     object
age                                     object
age_adjusted_rate                       object
five_year_trend_rates                   object
county_state                            object
index_of_medical_underservice_score    float64
popestimate2015                          int64
popestimate2016                          int64
popestimate2017                          int64
popestimate2018                          int64
popestimate2019                          int64
abbrv                                   object
median_household_income_2015           float64
median_household_income_2016           float64
median_household_income_2017           float64
median_household_income_2018           float64
median_househ

In [3]:
#get count of rows
incidence_df.count()

average_annual_count                   10229
recent_trend                           10229
primary_state_name                     10229
cancer                                 10229
race_ethnicity                         10229
sex                                    10229
age                                    10229
age_adjusted_rate                      10229
five_year_trend_rates                  10229
county_state                           10229
index_of_medical_underservice_score     8793
popestimate2015                        10229
popestimate2016                        10229
popestimate2017                        10229
popestimate2018                        10229
popestimate2019                        10229
abbrv                                  10210
median_household_income_2015           10210
median_household_income_2016           10210
median_household_income_2017           10210
median_household_income_2018           10210
median_household_income_2019           10210
dtype: int

In [4]:
#get count of values for age
incidence_df["age"].value_counts()

50+    2643
65+    2628
<65    2604
<50    2354
Name: age, dtype: int64

In [5]:
#get count of NA values for MUA
incidence_df["index_of_medical_underservice_score"].isna().sum()

1436

In [6]:
#drop rows where MUA index is null
incidence_df = incidence_df[incidence_df["index_of_medical_underservice_score"].isna() == False]

In [7]:
#get count of rows
incidence_df.count()

average_annual_count                   8793
recent_trend                           8793
primary_state_name                     8793
cancer                                 8793
race_ethnicity                         8793
sex                                    8793
age                                    8793
age_adjusted_rate                      8793
five_year_trend_rates                  8793
county_state                           8793
index_of_medical_underservice_score    8793
popestimate2015                        8793
popestimate2016                        8793
popestimate2017                        8793
popestimate2018                        8793
popestimate2019                        8793
abbrv                                  8786
median_household_income_2015           8786
median_household_income_2016           8786
median_household_income_2017           8786
median_household_income_2018           8786
median_household_income_2019           8786
dtype: int64

In [8]:
#keep only columns with 50+ and <50
incidence_df_50 = incidence_df[incidence_df["age"] != "65+"] 
incidence_df_50 = incidence_df_50[incidence_df_50["age"] != "<65"] 
incidence_df_50["age"].value_counts()

50+    2274
<50    2017
Name: age, dtype: int64

In [9]:
incidence_df_50.count()

average_annual_count                   4291
recent_trend                           4291
primary_state_name                     4291
cancer                                 4291
race_ethnicity                         4291
sex                                    4291
age                                    4291
age_adjusted_rate                      4291
five_year_trend_rates                  4291
county_state                           4291
index_of_medical_underservice_score    4291
popestimate2015                        4291
popestimate2016                        4291
popestimate2017                        4291
popestimate2018                        4291
popestimate2019                        4291
abbrv                                  4288
median_household_income_2015           4288
median_household_income_2016           4288
median_household_income_2017           4288
median_household_income_2018           4288
median_household_income_2019           4288
dtype: int64

In [10]:
#create target 
y = incidence_df_50[['age_adjusted_rate']]

#change dtype to numeric
y['age_adjusted_rate'] = y['age_adjusted_rate'].astype(float)
y.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0,age_adjusted_rate
0,167.8
1,1738.4
4,145.2
5,1261.5
8,144.7


In [11]:
#create feautures variable holding ages a d mua score
X = incidence_df_50[["age", "index_of_medical_underservice_score"]]
X.head()

Unnamed: 0,age,index_of_medical_underservice_score
0,<50,54.0
1,50+,54.0
4,<50,51.0
5,50+,51.0
8,<50,56.0


In [12]:
#create dummy variables for age
X = pd.get_dummies(X, columns=["age"])
X.head()

Unnamed: 0,index_of_medical_underservice_score,age_50+,age_<50
0,54.0,0,1
1,54.0,1,0
4,51.0,0,1
5,51.0,1,0
8,56.0,0,1


In [13]:
#get shape of target array
y.shape

(4291, 1)

In [14]:
#get shape of feautures array
X.shape

(4291, 3)

In [15]:
#split data
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [16]:
#create linear regression model using sklearn
model = LinearRegression()

In [17]:
#fit the model with training data
regression_age_MUA_incidence = model.fit(X_train, y_train)

In [18]:
#Create predictions array using linear regression
y_pred = regression_age_MUA_incidence.predict(X_test)
y_pred

array([[1358.27043083],
       [1358.27043083],
       [1358.49772996],
       ...,
       [1358.72502909],
       [ 110.59863759],
       [ 113.09892803]])

In [19]:
#get R2 value from linear regression
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.9496929189002173

In [20]:
#run regression again with statsmodels
#use statsmodels to add constant to X
#add constant to X
X1 = sm.add_constant(X)

  x = pd.concat(x[::order], 1)


In [21]:
#fit linear regression model
model = sm.OLS(y, X1).fit()

In [22]:
#view model summary with statsmodels
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:      age_adjusted_rate   R-squared:                       0.956
Model:                            OLS   Adj. R-squared:                  0.956
Method:                 Least Squares   F-statistic:                 4.680e+04
Date:                Thu, 02 Jun 2022   Prob (F-statistic):               0.00
Time:                        11:08:27   Log-Likelihood:                -27081.
No. Observations:                4291   AIC:                         5.417e+04
Df Residuals:                    4288   BIC:                         5.419e+04
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                                          coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------
co

In [24]:
#fit the model with training data
#decision tree regression
regressor_DT = DecisionTreeRegressor()
regressor_DT.fit(X_train, y_train)

DecisionTreeRegressor()

In [25]:
#predict
y_pred = regressor_DT.predict(X_test)
y_pred

array([1362.14237288,  111.7877551 , 1371.28383838, ...,  111.52432432,
       1330.48314607,  111.52432432])

In [26]:
#calculate r2 score
r2_score(y_test, y_pred)

0.9450276976839047