# Challenge

We will bring this all together by adapting a challenge from data.world. In the code section below you have access to a large dataset with demographic and medical data for cancer occurances in regions in the US. Your aim is to build a model that predicts the **target_deathrate** variable. You can find descriptions of all the columns [here](https://data.world/exercises/linear-regression-exercise-1)

This is a holistic challenge:


*   Make sure to clean up your data first, there are some missing elements.
*   Some data should be changed: Add a **state** column which indicates the state a person lives in(you can modify the geography column). Perform a multicollinearity test: Should **state** be included in your model or not?
* If it should be, make sure to transform it since we want to be dealing with numerical data.
* Build a model, then display the residual plot for it. Perform a Bartlett test to determine if your model is acceptable or not.




In [19]:
#Loading the libraries 

import pandas as pd

import numpy as np 

import matplotlib.pyplot as plt

import seaborn as sns 

%matplotlib inline 



In [20]:
df=pd.read_csv("../cancer_reg.csv")

In [21]:
#Preview the dataset 

df.head()

Unnamed: 0,avganncount,avgdeathsperyear,target_deathrate,incidencerate,medincome,popest2015,povertypercent,studypercap,binnedinc,medianage,...,pctprivatecoveragealone,pctempprivcoverage,pctpubliccoverage,pctpubliccoveragealone,pctwhite,pctblack,pctasian,pctotherrace,pctmarriedhouseholds,birthrate
0,1397.0,469,164.9,489.8,61898,260131,11.2,499.748204,"(61494.5, 125635]",39.3,...,,41.6,32.9,14.0,81.780529,2.594728,4.821857,1.843479,52.856076,6.118831
1,173.0,70,161.3,411.6,48127,43269,18.6,23.111234,"(48021.6, 51046.4]",33.0,...,53.8,43.6,31.1,15.3,89.228509,0.969102,2.246233,3.741352,45.3725,4.333096
2,102.0,50,174.7,349.7,49348,21026,14.6,47.560164,"(48021.6, 51046.4]",45.0,...,43.5,34.9,42.1,21.1,90.92219,0.739673,0.465898,2.747358,54.444868,3.729488
3,427.0,202,194.8,430.4,44243,75882,17.1,342.637253,"(42724.4, 45201]",42.8,...,40.3,35.0,45.3,25.0,91.744686,0.782626,1.161359,1.362643,51.021514,4.603841
4,57.0,26,144.4,350.1,49955,10321,12.5,0.0,"(48021.6, 51046.4]",48.3,...,43.9,35.1,44.0,22.7,94.104024,0.270192,0.66583,0.492135,54.02746,6.796657


In [22]:
df.corr()

Unnamed: 0,avganncount,avgdeathsperyear,target_deathrate,incidencerate,medincome,popest2015,povertypercent,studypercap,medianage,medianagemale,...,pctprivatecoveragealone,pctempprivcoverage,pctpubliccoverage,pctpubliccoveragealone,pctwhite,pctblack,pctasian,pctotherrace,pctmarriedhouseholds,birthrate
avganncount,1.0,0.939408,-0.143532,0.073553,0.269145,0.926894,-0.135694,0.082071,-0.024098,-0.124969,...,0.186045,0.202349,-0.173548,-0.093699,-0.136501,0.031376,0.435071,0.209184,-0.106221,-0.034508
avgdeathsperyear,0.939408,1.0,-0.090715,0.06269,0.223207,0.977634,-0.066918,0.063488,-0.024599,-0.148487,...,0.12582,0.160124,-0.131687,-0.027338,-0.187159,0.084607,0.443074,0.215149,-0.160266,-0.07442
target_deathrate,-0.143532,-0.090715,1.0,0.449432,-0.428615,-0.120073,0.429389,-0.022285,0.004375,-0.021929,...,-0.363704,-0.267399,0.404572,0.449358,-0.1774,0.257024,-0.186331,-0.189894,-0.293325,-0.087407
incidencerate,0.073553,0.06269,0.449432,1.0,-0.001036,0.026912,0.009046,0.077283,0.018089,-0.014733,...,0.109278,0.149825,0.046109,0.040812,-0.01451,0.113489,-0.008123,-0.208748,-0.152176,-0.118181
medincome,0.269145,0.223207,-0.428615,-0.001036,1.0,0.235523,-0.788965,0.044003,-0.013288,-0.091663,...,0.788048,0.747294,-0.754822,-0.719756,0.167225,-0.270232,0.425844,0.083635,0.446083,-0.010195
popest2015,0.926894,0.977634,-0.120073,0.026912,0.235523,1.0,-0.065299,0.055722,-0.025219,-0.176608,...,0.13266,0.15865,-0.160066,-0.041469,-0.190095,0.073044,0.464168,0.241468,-0.127979,-0.05774
povertypercent,-0.135694,-0.066918,0.429389,0.009046,-0.788965,-0.065299,1.0,-0.055652,-0.02928,-0.214001,...,-0.761672,-0.6831,0.651162,0.798642,-0.509433,0.51153,-0.157289,0.047096,-0.604953,-0.012283
studypercap,0.082071,0.063488,-0.022285,0.077283,0.044003,0.055722,-0.055652,1.0,-0.02603,-0.036647,...,0.072381,0.100063,-0.051497,-0.055512,0.023291,-0.019761,0.062543,-0.015247,-0.051736,0.010676
medianage,-0.024098,-0.024599,0.004375,0.018089,-0.013288,-0.025219,-0.02928,-0.02603,1.0,0.129119,...,-0.024272,-0.036926,0.04906,-0.003298,0.035009,-0.017173,-0.038424,-0.030277,0.014504,-0.008276
medianagemale,-0.124969,-0.148487,-0.021929,-0.014733,-0.091663,-0.176608,-0.214001,-0.036647,0.129119,1.0,...,-0.167064,-0.208664,0.398967,0.002479,0.398044,-0.242748,-0.238322,-0.266655,0.222278,-0.104105


In [24]:
#Checking for null values 

df.isnull().values.any()

True

In [25]:
#Dropping null values

df.dropna(inplace=True)

In [27]:
#Confirm drop of null values 

df.isnull().values.any()

False

In [29]:
#Check datatypes

df.dtypes

avganncount                float64
avgdeathsperyear             int64
target_deathrate           float64
incidencerate              float64
medincome                    int64
popest2015                   int64
povertypercent             float64
studypercap                float64
binnedinc                   object
medianage                  float64
medianagemale              float64
medianagefemale            float64
geography                   object
percentmarried             float64
pctnohs18_24               float64
pcths18_24                 float64
pctsomecol18_24            float64
pctbachdeg18_24            float64
pcths25_over               float64
pctbachdeg25_over          float64
pctemployed16_over         float64
pctunemployed16_over       float64
pctprivatecoverage         float64
pctprivatecoveragealone    float64
pctempprivcoverage         float64
pctpubliccoverage          float64
pctpubliccoveragealone     float64
pctwhite                   float64
pctblack            

In [31]:
#Check the columns of the table

df.columns

Index(['avganncount', 'avgdeathsperyear', 'target_deathrate', 'incidencerate',
       'medincome', 'popest2015', 'povertypercent', 'studypercap', 'binnedinc',
       'medianage', 'medianagemale', 'medianagefemale', 'geography',
       'percentmarried', 'pctnohs18_24', 'pcths18_24', 'pctsomecol18_24',
       'pctbachdeg18_24', 'pcths25_over', 'pctbachdeg25_over',
       'pctemployed16_over', 'pctunemployed16_over', 'pctprivatecoverage',
       'pctprivatecoveragealone', 'pctempprivcoverage', 'pctpubliccoverage',
       'pctpubliccoveragealone', 'pctwhite', 'pctblack', 'pctasian',
       'pctotherrace', 'pctmarriedhouseholds', 'birthrate'],
      dtype='object')

In [32]:
# Rename the geograpy column to the name of the geograpy column



df.columns = ['avganncount', 'avgdeathsperyear', 'target_deathrate', 'incidencerate',
       'medincome', 'popest2015', 'povertypercent', 'studypercap', 'binnedinc',
       'medianage', 'medianagemale', 'medianagefemale', 'state',
       'percentmarried', 'pctnohs18_24', 'pcths18_24', 'pctsomecol18_24',
       'pctbachdeg18_24', 'pcths25_over', 'pctbachdeg25_over',
       'pctemployed16_over', 'pctunemployed16_over', 'pctprivatecoverage',
       'pctprivatecoveragealone', 'pctempprivcoverage', 'pctpubliccoverage',
       'pctpubliccoveragealone', 'pctwhite', 'pctblack', 'pctasian',
       'pctotherrace', 'pctmarriedhouseholds', 'birthrate']

In [33]:
df.columns

Index(['avganncount', 'avgdeathsperyear', 'target_deathrate', 'incidencerate',
       'medincome', 'popest2015', 'povertypercent', 'studypercap', 'binnedinc',
       'medianage', 'medianagemale', 'medianagefemale', 'state',
       'percentmarried', 'pctnohs18_24', 'pcths18_24', 'pctsomecol18_24',
       'pctbachdeg18_24', 'pcths25_over', 'pctbachdeg25_over',
       'pctemployed16_over', 'pctunemployed16_over', 'pctprivatecoverage',
       'pctprivatecoveragealone', 'pctempprivcoverage', 'pctpubliccoverage',
       'pctpubliccoveragealone', 'pctwhite', 'pctblack', 'pctasian',
       'pctotherrace', 'pctmarriedhouseholds', 'birthrate'],
      dtype='object')

In [34]:
df.head()

Unnamed: 0,avganncount,avgdeathsperyear,target_deathrate,incidencerate,medincome,popest2015,povertypercent,studypercap,binnedinc,medianage,...,pctprivatecoveragealone,pctempprivcoverage,pctpubliccoverage,pctpubliccoveragealone,pctwhite,pctblack,pctasian,pctotherrace,pctmarriedhouseholds,birthrate
1,173.0,70,161.3,411.6,48127,43269,18.6,23.111234,"(48021.6, 51046.4]",33.0,...,53.8,43.6,31.1,15.3,89.228509,0.969102,2.246233,3.741352,45.3725,4.333096
3,427.0,202,194.8,430.4,44243,75882,17.1,342.637253,"(42724.4, 45201]",42.8,...,40.3,35.0,45.3,25.0,91.744686,0.782626,1.161359,1.362643,51.021514,4.603841
4,57.0,26,144.4,350.1,49955,10321,12.5,0.0,"(48021.6, 51046.4]",48.3,...,43.9,35.1,44.0,22.7,94.104024,0.270192,0.66583,0.492135,54.02746,6.796657
7,146.0,71,183.6,404.0,40189,20848,17.8,0.0,"(37413.8, 40362.7]",51.7,...,33.1,25.9,50.9,24.1,89.406636,0.305159,1.889077,2.286268,48.967033,5.889179
14,2265.0,901,171.0,440.7,50083,490945,16.3,462.373586,"(48021.6, 51046.4]",37.2,...,50.6,42.5,36.5,21.4,89.038167,1.827041,2.315986,1.033625,48.188377,5.355836


In [41]:
df['binnedinc']=pd.to_numeric(df['binnedinc'], errors='coerce')

In [43]:
#We are done! Now get the correlations 


df.corr()



Unnamed: 0,avganncount,avgdeathsperyear,target_deathrate,incidencerate,medincome,popest2015,povertypercent,studypercap,binnedinc,medianage,...,pctprivatecoveragealone,pctempprivcoverage,pctpubliccoverage,pctpubliccoveragealone,pctwhite,pctblack,pctasian,pctotherrace,pctmarriedhouseholds,birthrate
avganncount,1.0,0.963882,-0.128587,0.010871,0.219067,0.95752,-0.105566,0.07998,,-0.026047,...,0.13307,0.131033,-0.164938,-0.072116,-0.138574,0.020755,0.480718,0.276332,-0.086528,-0.03656
avgdeathsperyear,0.963882,1.0,-0.09651,0.005936,0.189414,0.988023,-0.048667,0.086374,,-0.020042,...,0.088625,0.105198,-0.137201,-0.019233,-0.179069,0.069639,0.486688,0.285097,-0.120332,-0.076831
target_deathrate,-0.128587,-0.09651,1.0,0.376033,-0.377156,-0.109513,0.366102,-0.033589,,-0.011419,...,-0.322015,-0.228443,0.346525,0.388474,-0.158205,0.258829,-0.199772,-0.209761,-0.291756,-0.051311
incidencerate,0.010871,0.005936,0.376033,1.0,0.06414,-0.018021,-0.021349,0.094944,,0.020339,...,0.173349,0.189013,-0.069795,-0.061571,-0.029996,0.102696,0.0763,-0.208808,-0.141188,-0.103238
medincome,0.219067,0.189414,-0.377156,0.06414,1.0,0.177701,-0.796091,0.115285,,-0.003541,...,0.78673,0.757948,-0.764296,-0.733614,0.190445,-0.270956,0.465658,0.078734,0.468958,-0.079201
popest2015,0.95752,0.988023,-0.109513,-0.018021,0.177701,1.0,-0.039724,0.073192,,-0.021496,...,0.08142,0.091063,-0.141193,-0.020699,-0.172378,0.055595,0.479433,0.299631,-0.09791,-0.061795
povertypercent,-0.105566,-0.048667,0.366102,-0.021349,-0.796091,-0.039724,1.0,-0.093313,,-0.016437,...,-0.743404,-0.674968,0.634122,0.79052,-0.529978,0.528894,-0.168474,0.060031,-0.598793,0.024339
studypercap,0.07998,0.086374,-0.033589,0.094944,0.115285,0.073192,-0.093313,1.0,,-0.027249,...,0.142935,0.155486,-0.113575,-0.082975,0.021907,-0.01701,0.098969,0.003875,-0.051405,-0.024016
binnedinc,,,,,,,,,,,...,,,,,,,,,,
medianage,-0.026047,-0.020042,-0.011419,0.020339,-0.003541,-0.021496,-0.016437,-0.027249,,1.0,...,-0.008464,-0.012627,0.034639,-0.006858,0.016302,0.018361,-0.05497,-0.017505,-0.005902,0.004128


In [46]:
#Plotting the correlations between the two data sets


sns.heatmap(df)

ValueError: could not convert string to float: 'Kittitas County, Washington'