In [1]:
# import dependencies
import pandas as pd
import numpy as np
import os

In [2]:
# Load the original cancer CSV exports from the CDC into dataframes.
# CSVs pulled from https://wonder.cdc.gov/
lung_df1 = pd.read_csv(os.path.join("Resources", "CDC_Cancer1.csv"))
lung_df2 = pd.read_csv(os.path.join("Resources", "CDC_Cancer2.csv"))
county_df = pd.read_csv(os.path.join("Resources", "CDC_Cancer_by_County.csv"))

# Combine the two lung dataframes into a single dataframe with a fresh 0..n-1 index.
# pd.concat is used instead of DataFrame.append, which was deprecated in
# pandas 1.4 and removed in pandas 2.0 (it would raise AttributeError there).
lung_df = pd.concat([lung_df1, lung_df2], ignore_index=True)

# Delete unnecessary columns
lung_df = lung_df.drop(["Notes", "State Code", "Year Code", "Sex Code", "Race Code", "Age Group Code"], axis=1)
county_df = county_df.drop(["Notes", "County Code", "Year Code"], axis=1)

# Export the cleaned cancer dataframes to csv files (the row index is written
# as the first, unnamed column because to_csv keeps the index by default)
lung_df.to_csv(os.path.join('Output','Cancer_by_State_Cleaned.csv'))
county_df.to_csv(os.path.join('Output','Cancer_by_County_Cleaned.csv'))

In [3]:
# Load the original poverty CSV downloaded from FRED.
# CSV pulled from https://geofred.stlouisfed.org/map/?th=pubugn&cc=5&rc=false&im=fractile&sb&lng=-49.6&lat=45.5&zm=2&sl&sv&am=Average&at=Not%20Seasonally%20Adjusted,%20Annual,%20Percent&dt=2015-01-01&fq=Annual&rt=county&sti=150203&un=lin
povData = pd.read_csv(os.path.join('Resources','Poverty%ByUSCounty.csv'))

# read_csv already returns a dataframe; this step builds a second one that
# holds only the County, State and '2012'-'2017' columns, in that order.
povertyColumns = ['County', 'State', '2012', '2013', '2014', '2015', '2016', '2017']
povDF = pd.DataFrame(povData, columns=povertyColumns)



In [4]:
# Lookup table from two-letter state abbreviation to full state name, used to
# attach full state names to the poverty data so it can be matched with the CDC data.
# NOTE(review): DC is spelled 'Washington D.C.' here - confirm this matches the
# spelling used in the CDC state column before merging.
abbrevToFullName = {
    'AL': 'Alabama', 'MT': 'Montana',
    'AK': 'Alaska', 'NE': 'Nebraska',
    'AZ': 'Arizona', 'NV': 'Nevada',
    'AR': 'Arkansas', 'NH': 'New Hampshire',
    'CA': 'California', 'NJ': 'New Jersey',
    'CO': 'Colorado', 'NM': 'New Mexico',
    'CT': 'Connecticut', 'NY': 'New York',
    'DE': 'Delaware', 'NC': 'North Carolina',
    'FL': 'Florida', 'ND': 'North Dakota',
    'GA': 'Georgia', 'OH': 'Ohio',
    'HI': 'Hawaii', 'OK': 'Oklahoma',
    'ID': 'Idaho', 'OR': 'Oregon',
    'IL': 'Illinois', 'PA': 'Pennsylvania',
    'IN': 'Indiana', 'RI': 'Rhode Island',
    'IA': 'Iowa', 'SC': 'South Carolina',
    'KS': 'Kansas', 'SD': 'South Dakota',
    'KY': 'Kentucky', 'TN': 'Tennessee',
    'LA': 'Louisiana', 'TX': 'Texas',
    'ME': 'Maine', 'UT': 'Utah',
    'MD': 'Maryland', 'VT': 'Vermont',
    'MA': 'Massachusetts', 'VA': 'Virginia',
    'MI': 'Michigan', 'WA': 'Washington',
    'MN': 'Minnesota', 'WV': 'West Virginia',
    'MS': 'Mississippi', 'WI': 'Wisconsin',
    'MO': 'Missouri', 'WY': 'Wyoming',
    'DC': 'Washington D.C.',
}

# The original CSV has a SPACE before each abbreviation, so build a second
# lookup whose keys carry that leading space (values and order are unchanged).
abbrevToFullNameWithSpaces = {
    f" {code}": name for code, name in abbrevToFullName.items()
}

# show the space-prefixed dictionary
print(abbrevToFullNameWithSpaces)

{' AL': 'Alabama', ' MT': 'Montana', ' AK': 'Alaska', ' NE': 'Nebraska', ' AZ': 'Arizona', ' NV': 'Nevada', ' AR': 'Arkansas', ' NH': 'New Hampshire', ' CA': 'California', ' NJ': 'New Jersey', ' CO': 'Colorado', ' NM': 'New Mexico', ' CT': 'Connecticut', ' NY': 'New York', ' DE': 'Delaware', ' NC': 'North Carolina', ' FL': 'Florida', ' ND': 'North Dakota', ' GA': 'Georgia', ' OH': 'Ohio', ' HI': 'Hawaii', ' OK': 'Oklahoma', ' ID': 'Idaho', ' OR': 'Oregon', ' IL': 'Illinois', ' PA': 'Pennsylvania', ' IN': 'Indiana', ' RI': 'Rhode Island', ' IA': 'Iowa', ' SC': 'South Carolina', ' KS': 'Kansas', ' SD': 'South Dakota', ' KY': 'Kentucky', ' TN': 'Tennessee', ' LA': 'Louisiana', ' TX': 'Texas', ' ME': 'Maine', ' UT': 'Utah', ' MD': 'Maryland', ' VT': 'Vermont', ' MA': 'Massachusetts', ' VA': 'Virginia', ' MI': 'Michigan', ' WA': 'Washington', ' MN': 'Minnesota', ' WV': 'West Virginia', ' MS': 'Mississippi', ' WI': 'Wisconsin', ' MO': 'Missouri', ' WY': 'Wyoming', ' DC': 'Washington D.C.'}


In [5]:
# Look up each row's 'State' value in the space-prefixed abbreviation dictionary;
# values that are not keys of the dictionary come back as NaN.
fullStateNames = povDF['State'].map(abbrevToFullNameWithSpaces)
# Store the looked-up names on the dataframe as a new 'Full State Name' column
povDF['Full State Name'] = fullStateNames
# preview the dataframe with the full state name added
povDF.head()

Unnamed: 0,County,State,2012,2013,2014,2015,2016,2017,Full State Name
0,"Aleutians East Borough, AK",AK,16.5,16.7,16.4,16.8,15.5,16.7,Alaska
1,"Aleutians West CA, AK",AK,10.5,9.2,8.9,9.1,7.7,7.5,Alaska
2,"Anchorage Muny, AK",AK,7.7,7.9,8.3,8.2,8.1,8.1,Alaska
3,"Bethel CA, AK",AK,21.8,22.8,23.7,25.2,26.6,27.0,Alaska
4,"Bristol Bay Borough, AK",AK,6.2,7.9,7.2,7.2,7.3,7.1,Alaska


In [6]:
# Target column layout for the revised dataframe: full state name first, then
# the abbreviation, the county, and the '2012'-'2017' columns.
newOrder = ['Full State Name', 'State', 'County'] + [str(year) for year in range(2012, 2018)]
# Build a new dataframe whose columns follow that layout (povDF itself is left as is)
povertyRates = povDF.reindex(newOrder, axis='columns')
# preview the final dataframe
povertyRates.head()

Unnamed: 0,Full State Name,State,County,2012,2013,2014,2015,2016,2017
0,Alaska,AK,"Aleutians East Borough, AK",16.5,16.7,16.4,16.8,15.5,16.7
1,Alaska,AK,"Aleutians West CA, AK",10.5,9.2,8.9,9.1,7.7,7.5
2,Alaska,AK,"Anchorage Muny, AK",7.7,7.9,8.3,8.2,8.1,8.1
3,Alaska,AK,"Bethel CA, AK",21.8,22.8,23.7,25.2,26.6,27.0
4,Alaska,AK,"Bristol Bay Borough, AK",6.2,7.9,7.2,7.2,7.3,7.1


In [7]:
# Group the rows by full state name and take the median of each year column
# within every state.
groupedPovDF = povertyRates.groupby('Full State Name')
medianByYear = {
    year: groupedPovDF[year].median()
    for year in ['2012', '2013', '2014', '2015', '2016', '2017']
}

# Keep the individual per-year series available under their own names
medianPovRate2012 = medianByYear['2012']
medianPovRate2013 = medianByYear['2013']
medianPovRate2014 = medianByYear['2014']
medianPovRate2015 = medianByYear['2015']
medianPovRate2016 = medianByYear['2016']
medianPovRate2017 = medianByYear['2017']

# One row per state, one column per year
groupMedianDF = pd.DataFrame(medianByYear)
# write the table of per-state median values to a csv file
groupMedianDF.to_csv(os.path.join('Output','medianStates.csv'))