In [1]:
# Dependencies and Setup
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os

In [2]:
# Convert data to dataframes
lung_df1 = pd.read_csv(os.path.join("Resources", "CDC_Cancer1.csv"))
lung_df2 = pd.read_csv(os.path.join("Resources", "CDC_Cancer2.csv"))
county_df = pd.read_csv(os.path.join("Resources", "CDC_Cancer_by_County.csv"))

# Combine lung dataframes into single dataframe
lung_df = lung_df1.append(lung_df2, ignore_index=True)

# Delete unnecessary columns
lung_df = lung_df.drop(["Notes", "State Code", "Year Code", "Sex Code", "Race Code", "Age Group Code"], axis=1)
county_df = county_df.drop(["Notes", "County Code", "Year Code"], axis =1)

In [3]:
# Check to see if there are any null values in dataframe
lung_df.isnull().sum()
county_df.isnull().sum()

County        81
Year          81
Deaths        81
Population    81
Crude Rate    81
dtype: int64

In [4]:
# Drop rows that have nothing but null values
lung_df = lung_df.dropna(how="all")
county_df = county_df.dropna(how="all")
# Verify there are no null values left
lung_df.isnull().sum()
county_df.isnull().sum()

County        0
Year          0
Deaths        0
Population    0
Crude Rate    0
dtype: int64

In [8]:
# For county dataframe, remove unreliable rows and verify no more junk rows
county_df = county_df[county_df["Crude Rate"] != "Unreliable"]
county_df = county_df[county_df["Deaths"] != "Missing"]
county_df

  result = method(y)


Unnamed: 0,County,Year,Deaths,Population,Crude Rate
3,"Abbeville County, SC",2014,24,24965,96.1
4,"Abbeville County, SC",2013,23,25007,92.0
5,"Abbeville County, SC",2016,22,24872,88.5
6,"Acadia Parish, LA",2015,52,62577,83.1
7,"Acadia Parish, LA",2013,51,62204,82.0
8,"Acadia Parish, LA",2017,47,62590,75.1
9,"Acadia Parish, LA",2012,37,61912,59.8
10,"Acadia Parish, LA",2016,36,62645,57.5
11,"Acadia Parish, LA",2014,32,62486,51.2
12,"Accomack County, VA",2013,38,33148,114.6


In [9]:
# Cast "Year", "Deaths", and "Population" columns as integers; Crude Rate as float
lung_df["Year"] = lung_df["Year"].astype(int)
lung_df["Deaths"] = lung_df["Deaths"].astype(int)
lung_df["Population"] = lung_df["Population"].astype(int)
lung_df.head()

county_df["Year"] = county_df["Year"].astype(int)
county_df["Deaths"] = county_df["Deaths"].astype(int)
county_df["Population"] = county_df["Population"].astype(int)
county_df["Crude Rate"] = county_df["Crude Rate"].astype(float)
lung_df

Unnamed: 0,State,Year,Sex,Race,Age Group,Deaths,Population,Crude Rate
0,Alabama,2006,Female,American Indian or Alaska Native,< 1 year,0,131,0.0
1,Alabama,2006,Female,American Indian or Alaska Native,1-4 years,0,788,0.0
2,Alabama,2006,Female,American Indian or Alaska Native,5-9 years,0,1093,0.0
3,Alabama,2006,Female,American Indian or Alaska Native,10-14 years,0,1237,0.0
4,Alabama,2006,Female,American Indian or Alaska Native,15-19 years,0,1305,0.0
5,Alabama,2006,Female,American Indian or Alaska Native,20-24 years,0,1235,0.0
6,Alabama,2006,Female,American Indian or Alaska Native,25-29 years,0,1150,0.0
7,Alabama,2006,Female,American Indian or Alaska Native,30-34 years,0,1061,0.0
8,Alabama,2006,Female,American Indian or Alaska Native,35-39 years,0,1035,0.0
9,Alabama,2006,Female,American Indian or Alaska Native,40-44 years,0,1342,0.0


In [7]:
# Save cleaned data
lung_df.to_csv(os.path.join("Output", "Cleaned_CDC_Data.csv"), index=False, header = True)
county_df.to_csv(os.path.join("Output", "Cleaned_CDC_County_Data.csv"), index=False, header = True)