# <span style="font-family: cursive">Notebook Imports</span>

In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

# <span style="font-family: cursive">General Objectives</span>

 * Assessing Trends to Pinpoint Regions Experiencing Varied Patterns in Death Rates Due to Cancer.
 * How many Counties met the objective (achieving Death Rate of 45.5).
 * Identify Regions where both Incidence and Death Rates coexist.
 * Temporal Trends and Time Series Analysis.
 * Group Regions based on similarities in Death and Incidence Rate.
 * Identify Regions where high Incidence Rates align with high Death Rates.
 * Identify Counties with lower or higher average Annual Count.
 * Predictive modeling

# <span style="font-family: cursive">Loading Datasets</span>

In [2]:
# Read the death-rate and incidence tables. Both paths are relative, so the
# CSV files are resolved against the current working directory.
death_data, incd_data = (
    pd.read_csv(csv_name) for csv_name in ('death.csv', 'incd.csv')
)

In [3]:
# Preview the first three rows of the death-rate table.
death_data.head(n=3)

Unnamed: 0,index,County,FIPS,Met Objective of 45.5? (1),Age-Adjusted Death Rate,Lower 95% Confidence Interval for Death Rate,Upper 95% Confidence Interval for Death Rate,Average Deaths per Year,Recent Trend (2),Recent 5-Year Trend (2) in Death Rates,Lower 95% Confidence Interval for Trend,Upper 95% Confidence Interval for Trend
0,0,United States,0,No,46.0,45.9,46.1,157376,falling,-2.4,-2.6,-2.2
1,1,"Perry County, Kentucky",21193,No,125.6,108.9,144.2,43,stable,-0.6,-2.7,1.6
2,2,"Powell County, Kentucky",21197,No,125.3,100.2,155.1,18,stable,1.7,0.0,3.4


In [4]:
# Preview the first three rows of the incidence table.
incd_data.head(n=3)

Unnamed: 0,index,County,FIPS,"Age-Adjusted Incidence Rate(Ê) - cases per 100,000",Lower 95% Confidence Interval,Upper 95% Confidence Interval,Average Annual Count,Recent Trend,Recent 5-Year Trend (ˆ) in Incidence Rates,Lower 95% Confidence Interval.1,Upper 95% Confidence Interval.1
0,0,"US (SEER+NPCR)(1,10)",0,62.4,62.3,62.6,214614,falling,-2.5,-3.0,-2.0
1,1,"Autauga County, Alabama(6,10)",1001,74.9,65.1,85.7,43,stable,0.5,-14.9,18.6
2,2,"Baldwin County, Alabama(6,10)",1003,66.9,62.4,71.7,170,stable,3.0,-10.2,18.3


In [5]:
# Summarise both raw tables (dtypes and non-null counts).
# DataFrame.info() prints its report and returns None, so each call is its own
# statement; packing the calls into a tuple made the cell display a
# meaningless `(None, None, None)` as its result.
death_data.info()
print('--'*35)
incd_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3141 entries, 0 to 3140
Data columns (total 12 columns):
 #   Column                                        Non-Null Count  Dtype 
---  ------                                        --------------  ----- 
 0   index                                         3141 non-null   int64 
 1   County                                        3141 non-null   object
 2   FIPS                                          3141 non-null   int64 
 3   Met Objective of 45.5? (1)                    3141 non-null   object
 4   Age-Adjusted Death Rate                       3141 non-null   object
 5   Lower 95% Confidence Interval for Death Rate  3141 non-null   object
 6   Upper 95% Confidence Interval for Death Rate  3141 non-null   object
 7   Average Deaths per Year                       3141 non-null   object
 8   Recent Trend (2)                              3141 non-null   object
 9   Recent 5-Year Trend (2) in Death Rates        3141 non-null   object
 10  

(None, None, None)

# <span style="font-family: cursive">Merging DataFrames</span>

We're going to merge on FIPS which serves as a common identifier.
> The merged data will be used for exploratory analysis, or other relevant analysis\
> to study the relationship between incidence rates and death rates across counties.

In [6]:
# First step: strip leading/trailing whitespace from the column names of both
# tables, so later lookups by column name are not tripped up by stray spaces.
for frame in (incd_data, death_data):
    frame.columns = frame.columns.str.strip()

de

In [7]:
# Distinct FIPS codes per table, reported as (incidence, death).
tuple(frame['FIPS'].nunique() for frame in (incd_data, death_data))

(3141, 3141)

In [8]:
# Count the incidence rows whose FIPS code also appears in the death table.
fips_in_both = incd_data['FIPS'].isin(death_data['FIPS'])
fips_in_both.sum()

3141

 **All 3141 FIPS values in incd_data are present in death_data, and both tables hold 3141 unique codes, so the two dataframes align one-to-one. We're all set to proceed with merging them!**

In [9]:
# As an added precaution, compare the County label each table stores for one
# FIPS code (21193); the incidence table comes first, then the death table.
tuple(
    frame.loc[frame['FIPS'] == 21193, 'County']
    for frame in (incd_data, death_data)
)

(1088    Perry County, Kentucky(7,9)
 Name: County, dtype: object,
 1    Perry County, Kentucky
 Name: County, dtype: object)

In [10]:
# Merge the death and incidence tables on their shared FIPS code.
# validate='one_to_one' makes pandas raise a MergeError if FIPS is duplicated
# on either side, instead of silently multiplying rows in the merged frame.
# Column names present in both tables receive pandas' default suffixes:
# `_x` for the left (death) table and `_y` for the right (incidence) table.
df = pd.merge(death_data, incd_data, on='FIPS', how='inner', validate='one_to_one')
df.head()

Unnamed: 0,index_x,County_x,FIPS,Met Objective of 45.5? (1),Age-Adjusted Death Rate,Lower 95% Confidence Interval for Death Rate,Upper 95% Confidence Interval for Death Rate,Average Deaths per Year,Recent Trend (2),Recent 5-Year Trend (2) in Death Rates,...,index_y,County_y,"Age-Adjusted Incidence Rate(Ê) - cases per 100,000",Lower 95% Confidence Interval,Upper 95% Confidence Interval,Average Annual Count,Recent Trend,Recent 5-Year Trend (ˆ) in Incidence Rates,Lower 95% Confidence Interval.1,Upper 95% Confidence Interval.1
0,0,United States,0,No,46.0,45.9,46.1,157376,falling,-2.4,...,0,"US (SEER+NPCR)(1,10)",62.4,62.3,62.6,214614,falling,-2.5,-3.0,-2.0
1,1,"Perry County, Kentucky",21193,No,125.6,108.9,144.2,43,stable,-0.6,...,1088,"Perry County, Kentucky(7,9)",139.7,122.2,159.1,49,falling,-15.0,-26.3,-1.9
2,2,"Powell County, Kentucky",21197,No,125.3,100.2,155.1,18,stable,1.7,...,1090,"Powell County, Kentucky(7,9)",152.9,125.7,184.6,23,stable,8.2,-7.3,26.2
3,3,"North Slope Borough, Alaska",2185,No,124.9,73.0,194.7,5,**,**,...,84,"North Slope Borough, Alaska(6,10)",153.4,95.2,229.4,6,stable,0.2,-34.3,52.7
4,4,"Owsley County, Kentucky",21189,No,118.5,83.1,165.5,8,stable,2.2,...,1086,"Owsley County, Kentucky(7,9)",148.1,107.8,199.8,9,stable,10.6,-13.1,40.9


In [11]:
# Keep only the death-table versions of `index` and `County`; drop the
# incidence-table copies (`_y` suffix) that the merge created.
# Rebinding `df` instead of using inplace=True keeps the step an explicit
# transformation and avoids mutating a frame an earlier cell displayed.
df = df.drop(columns=['index_y', 'County_y'])

# <span style="font-family: cursive">Clean and Prepare Data</span>
* Data Type Standardization
* Handling Missing Values
* Addressing Inconsistencies
* Renaming Columns

In [12]:
# Renaming columns: map each verbose source header to a short alias.
column_aliases = {
    # --- identifiers carried over from the death table (merge suffix `_x`) ---
    'index_x': 'index',
    'County_x': 'County',
    # --- death table ---
    'Met Objective of 45.5? (1)': 'Objective',
    # NOTE(review): 'Adjusted_Age' holds the age-adjusted *death rate*, not an age.
    'Age-Adjusted Death Rate': 'Adjusted_Age',
    'Lower 95% Confidence Interval for Death Rate': 'Lower95%_DeathRate',
    'Upper 95% Confidence Interval for Death Rate': 'Upper95%_DeathRate',
    # NOTE(review): 'Death_Rate' holds the average number of deaths per year
    # (a count), not a rate.
    'Average Deaths per Year': 'Death_Rate',
    'Recent Trend (2)': 'Recent_Trend_x',
    'Recent 5-Year Trend (2) in Death Rates': 'Recent5YearTrend_x',
    'Lower 95% Confidence Interval for Trend': 'Lower95%CI_Trend',
    'Upper 95% Confidence Interval for Trend': 'Upper95%CI_Trend',
    # --- incidence table ---
    'Age-Adjusted Incidence Rate(Ê) - cases per 100,000': 'Adjusted_Age_Incd',
    'Lower 95% Confidence Interval': 'Lower95%CI_Incd',
    'Upper 95% Confidence Interval': 'Upper95%CI_Incd',
    'Average Annual Count': 'Average_Annual_Count',
    'Recent Trend': 'RecentTrend_y',
    'Recent 5-Year Trend (ˆ) in Incidence Rates': 'Recent5YearTrend_y',
    # The '.1' headers are the second pair of confidence-interval columns in
    # the incidence table (they follow the 5-year trend column).
    'Lower 95% Confidence Interval.1': 'Lower95%CI_Incd.1',
    'Upper 95% Confidence Interval.1': 'Upper95%CI_Incd.1',
}

# Apply the aliases, then promote the death table's `index` column to the row index.
df = df.rename(columns=column_aliases).set_index('index')

In [13]:
# Preview the first two rows of the renamed, re-indexed frame.
df.head(n=2)

Unnamed: 0_level_0,County,FIPS,Objective,Adjusted_Age,Lower95%_DeathRate,Upper95%_DeathRate,Death_Rate,Recent_Trend_x,Recent5YearTrend_x,Lower95%CI_Trend,Upper95%CI_Trend,Adjusted_Age_Incd,Lower95%CI_Incd,Upper95%CI_Incd,Average_Annual_Count,RecentTrend_y,Recent5YearTrend_y,Lower95%CI_Incd.1,Upper95%CI_Incd.1
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0,United States,0,No,46.0,45.9,46.1,157376,falling,-2.4,-2.6,-2.2,62.4,62.3,62.6,214614,falling,-2.5,-3.0,-2.0
1,"Perry County, Kentucky",21193,No,125.6,108.9,144.2,43,stable,-0.6,-2.7,1.6,139.7,122.2,159.1,49,falling,-15.0,-26.3,-1.9


## <span style="font-family :cursive">Features Description</span>

>* **County** --> Name of geographical region.
>* **FIPS**       --> Federal Information Processing Standards, a standardized set of codes that uniquely identify counties.
>* **Objective**         --> Whether the county met the objective of a death rate of 45.5.
>* **Adjusted_Age**       --> Age-adjusted death rate for cancer in the county.
>* **Lower95%_DeathRate** --> Lower Boundary estimate of death rate within a 95% confidence interval.
>* **Upper95%_DeathRate** --> Upper Boundary estimate of death rate within a 95% confidence interval.
>* **Death_Rate**         --> Average number of deaths per year caused by cancer (a count, not a rate).
>* **Recent_Trend_x**     --> Increase, Decrease, or Stable of the cancer death rate.
>* **Recent5YearTrend_x** --> Trend in Death rates over last five years.
>* **Lower95%CI_Trend**   --> Lower boundary estimate of the Trend in death rates within 95% CI.
>* **Upper95%CI_Trend**   --> Upper boundary estimate of the Trend in death rates within 95% CI.
>* **Adjusted_Age_Incd**   --> The rate of new cases of cancer reported per 100,000, adjusted to account for age difference between populations.
>* **Average_Annual_Count**--> New cancer cases reported annually in the county.

In [17]:
# Standardising dtypes: convert the measurement columns to numeric.
# errors='coerce' turns every value that cannot be parsed as a number into
# NaN, so the converted columns come out as float64 (not integer).
numerical_col = [
    'Adjusted_Age', 'Lower95%_DeathRate',
    'Upper95%_DeathRate', 'Death_Rate',
    'Recent5YearTrend_x', 'Lower95%CI_Trend',
    'Upper95%CI_Trend', 'Adjusted_Age_Incd',
    'Lower95%CI_Incd', 'Upper95%CI_Incd',
    'Average_Annual_Count','Recent5YearTrend_y',
    'Lower95%CI_Incd.1', 'Upper95%CI_Incd.1'
]

# Convert each column in one vectorized pd.to_numeric call (DataFrame.apply
# passes one column at a time and forwards `errors=`), instead of invoking
# pd.to_numeric once per cell through a lambda.
# NOTE(review): after coercion 'Adjusted_Age_Incd' has more missing values than
# its own confidence-interval columns, which suggests some rate values carry
# extra non-numeric characters and are being discarded here. Confirm against
# the raw file before relying on this column.
df[numerical_col] = df[numerical_col].apply(pd.to_numeric, errors='coerce')

In [18]:
# Re-inspect dtypes and non-null counts now that the numeric columns are coerced.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3141 entries, 0 to 3140
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   County                3141 non-null   object 
 1   FIPS                  3141 non-null   int64  
 2   Objective             3141 non-null   object 
 3   Adjusted_Age          2813 non-null   float64
 4   Lower95%_DeathRate    2813 non-null   float64
 5   Upper95%_DeathRate    2813 non-null   float64
 6   Death_Rate            2806 non-null   float64
 7   Recent_Trend_x        3141 non-null   object 
 8   Recent5YearTrend_x    2694 non-null   float64
 9   Lower95%CI_Trend      2694 non-null   float64
 10  Upper95%CI_Trend      2694 non-null   float64
 11  Adjusted_Age_Incd     2640 non-null   float64
 12  Lower95%CI_Incd       2719 non-null   float64
 13  Upper95%CI_Incd       2719 non-null   float64
 14  Average_Annual_Count  2719 non-null   float64
 15  RecentTrend_y         3141

In [19]:
# (number of rows, number of columns) of the merged frame.
df.shape

(3141, 19)

In [20]:
# Missing-value count for every column.
df.isna().sum()

County                    0
FIPS                      0
Objective                 0
Adjusted_Age            328
Lower95%_DeathRate      328
Upper95%_DeathRate      328
Death_Rate              335
Recent_Trend_x            0
Recent5YearTrend_x      447
Lower95%CI_Trend        447
Upper95%CI_Trend        447
Adjusted_Age_Incd       501
Lower95%CI_Incd         422
Upper95%CI_Incd         422
Average_Annual_Count    422
RecentTrend_y             0
Recent5YearTrend_y      469
Lower95%CI_Incd.1       469
Upper95%CI_Incd.1       469
dtype: int64

In [22]:
# Summary statistics of the numeric columns, transposed so that each column
# of `df` becomes one row of the report.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
FIPS,3141.0,30392.281757,15158.200839,0.0,18179.0,29177.0,45081.0,56045.0
Adjusted_Age,2813.0,53.18514,14.033997,9.2,43.7,52.6,61.2,125.6
Lower95%_DeathRate,2813.0,42.795165,12.38798,6.4,34.4,42.9,50.8,108.9
Upper95%_DeathRate,2813.0,66.269499,18.617687,12.8,53.3,64.0,77.0,194.7
Death_Rate,2806.0,52.272274,97.78903,3.0,10.0,21.0,47.75,979.0
Recent5YearTrend_x,2694.0,-1.003526,2.371872,-33.4,-1.6,-0.8,-0.1,31.1
Lower95%CI_Trend,2694.0,-2.755939,3.444109,-60.9,-3.0,-2.1,-1.4,6.9
Upper95%CI_Trend,2694.0,0.837305,3.083378,-10.3,-0.3,0.6,1.6,78.1
Adjusted_Age_Incd,2640.0,69.93947,17.971272,13.5,58.375,69.5,80.125,203.7
Lower95%CI_Incd,2719.0,57.694226,16.405942,7.6,46.6,58.3,68.2,172.4


After having carefully gone over the datasets' provided details:
1. The `*` and `**` symbols represent missing data due to sparsity or privacy concerns.
     * It's essential to handle these symbols appropriately during data processing.
2. Classification of Trends
    * Rising --> When the 95% CI of the average annual percent change is above 0
    * Stable --> When the 95% CI of the average annual percent change includes 0
    * Falling --> When the 95% CI of the average annual percent change is below 0
3. Age Groups Modification:
    * The age groups are categorized as (<1, 1-4, 5-9, ..., 80-84, 85+).
4. The dataset lacks information for the US state of Nevada.