In [1]:
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix
import pickle

In [2]:
# Load the dataset from cancer_ml6.csv, using the 'Geography' column as the row index.
df = pd.read_csv('cancer_ml6.csv', index_col=['Geography'])

In [3]:
# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0_level_0,avgAnnCount,TARGET_deathRate,incidenceRate,medIncome,popEst2015,povertyPercent,studyPerCap,MedianAge,MedianAgeMale,MedianAgeFemale,...,PctEmpPrivCoverage_log,PctPublicCoverageAlone_log,BirthRate_sqrd,INTPTLONG_sqrd,onc_min_distsl2_sqrd,city_min_distsl1_log,PCT_LACCESS_POP10_sqrd,PCT_LACCESS_HHNV10_sqrd,PC_DIRSALES07_sqrd,PCT_OBESE_ADULTS13_log
Geography,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Abbeville County, South Carolina",143.0,183.7,430.9,35525,24932,21.4,0.0,43.3,40.7,44.9,...,3.627004,3.254243,33.916455,6798.671681,37.990317,0.881278,901.00586,36.601854,13.9129,3.456317
"Acadia Parish, Louisiana",323.0,230.5,492.7,40269,62577,22.0,0.0,35.7,34.7,37.2,...,3.660994,3.139833,48.144664,8539.799759,9.259066,1.927334,2.894705,3.229274,58.9824,3.499533
"Accomack County, Virginia",221.0,216.2,479.4,38390,32973,19.4,0.0,45.3,42.7,47.3,...,3.618993,2.912351,18.920319,5739.245321,3.04026,1.547278,21.215684,59.388869,2.9584,3.303217
"Ada County, Idaho",1757.0,151.6,469.0,57908,434211,11.6,414.545002,35.8,35.0,36.6,...,3.981549,2.406945,22.882023,13512.75495,565.675639,2.326541,335.644721,0.336674,5.2441,3.387774
"Adair County, Iowa",51.0,178.9,440.7,48216,7228,10.3,138.350858,45.9,45.0,47.7,...,3.770459,2.653242,24.891658,8926.123473,11.302137,2.001201,62.037931,3.52072,48.3025,3.443618


In [4]:
# List the column labels (the repr is abbreviated because the frame is wide).
df.columns

Index(['avgAnnCount', 'TARGET_deathRate', 'incidenceRate', 'medIncome',
       'popEst2015', 'povertyPercent', 'studyPerCap', 'MedianAge',
       'MedianAgeMale', 'MedianAgeFemale',
       ...
       'PctEmpPrivCoverage_log', 'PctPublicCoverageAlone_log',
       'BirthRate_sqrd', 'INTPTLONG_sqrd', 'onc_min_distsl2_sqrd',
       'city_min_distsl1_log', 'PCT_LACCESS_POP10_sqrd',
       'PCT_LACCESS_HHNV10_sqrd', 'PC_DIRSALES07_sqrd',
       'PCT_OBESE_ADULTS13_log'],
      dtype='object', length=250)

In [5]:
# Number of distinct index labels (a missing label, if any, counts once).
df.index.nunique(dropna=False)

3047

In [6]:
# (rows, columns); compare the row count with the number of distinct index
# labels to check for duplicated Geography entries.
df.shape

(3047, 250)

In [7]:
# Print the per-column summary; verbose=True asks for the full column listing
# instead of the shortened summary used for wide frames.
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 3047 entries, Abbeville County, South Carolina to Zavala County, Texas
Data columns (total 250 columns):
avgAnnCount                       float64
TARGET_deathRate                  float64
incidenceRate                     float64
medIncome                         int64
popEst2015                        int64
povertyPercent                    float64
studyPerCap                       float64
MedianAge                         float64
MedianAgeMale                     float64
MedianAgeFemale                   float64
AvgHouseholdSize                  float64
PercentMarried                    float64
PctNoHS18_24                      float64
PctHS18_24                        float64
PctSomeCol18_24                   float64
PctBachDeg18_24                   float64
PctHS25_Over                      float64
PctBachDeg25_Over                 float64
PctEmployed16_Over                float64
PctUnemployed16_Over              float64
PctPrivateCove

In [8]:
# Linear regression estimator created with scikit-learn's default settings.
lr = linear_model.LinearRegression()
lr  # display the estimator and its parameters

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [9]:
# Regression target: the TARGET_deathRate column.
y = df['TARGET_deathRate']

In [10]:
# Feature matrix: every column of df except the target column(s).
target_name = ['TARGET_deathRate']
X = df.drop(columns=target_name)

In [11]:
# Baseline: fit on the full dataset (no hold-out split at this point).
lr.fit(X, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [12]:
# In-sample score: the model is evaluated on the same rows it was fitted on,
# so treat this as an optimistic figure.
lr.score(X, y)

0.6201550922287257

In [13]:
# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
# Refit the same estimator object on the training split only; this replaces
# the earlier fit on the full dataset.
lr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [15]:
# Score on the training split (in-sample).
lr.score(X_train, y_train)

0.617863041290833

In [16]:
# Predict the target for the held-out rows, then show the first 35 predictions.
y_pred = lr.predict(X_test)
y_pred[:35]

array([181.84635798, 173.40860156, 162.01745513, 174.72161957,
       172.67542944, 190.97351859, 181.79930301, 160.60380088,
       169.51255889, 176.87085812, 181.98332179, 205.27353373,
       159.2721136 , 155.44943569, 219.49547146, 118.66454644,
       193.10246756, 203.09106432, 201.01861605, 188.6615422 ,
       169.55718896, 150.36235791, 185.06049798, 161.90710709,
       166.08653812, 185.06290631, 173.42361325, 161.46306453,
       150.66828992, 169.16704792, 220.60185612, 186.99026417,
       152.21279829, 179.34842636, 170.80135937])

The score on the held-out test set is the one we care about:

In [20]:
# Score of the fitted estimator on the held-out test split.
# NOTE(review): this cell's execution count (20) is out of sequence with the
# cells below it (18, 19); re-run the notebook top to bottom to confirm the value.
lr.score(X_test, y_test)

0.6053656794304265

In [18]:
# 5-fold cross-validation restricted to the training split; one score per fold.
cv_scores_train = cross_val_score(estimator=lr, X=X_train, y=y_train, cv=5)
cv_scores_train

array([0.4743376 , 0.55834286, 0.52709064, 0.42484742, 0.41944285])

In [19]:
# 5-fold cross-validation run on the test split alone.
# NOTE(review): cross_val_score refits a fresh copy of the estimator on each
# fold, so this trains on slices of the held-out rows (roughly 610, i.e. 20% of
# 3047) with 249 feature columns instead of evaluating the model fitted on
# X_train. The erratic, partly negative fold scores are consistent with that.
# lr.score(X_test, y_test) is the held-out figure to report; consider dropping
# this cell or cross-validating on the training data only.
cv_scores_test = cross_val_score(lr, X_test, y_test, cv=5)
cv_scores_test

array([-0.15685264,  0.10523607,  0.37793025, -2.09356902,  0.39421308])

In [None]:
# TODO: try regularized linear models (Ridge, Lasso) and tune their regularization strength?

In [None]:
#SGDRegressor(loss='huber')

In [None]:
#deep learning

In [None]:
#random forest?

In [None]:
#GBM?

In [None]:
# TODO: inspect the coefficients of the linear regression model (and possibly SGD)?

In [None]:
# TODO: inspect the nodes with the heaviest weights in the deep learning model?