In [1]:
import csv

import numpy as np
import pandas as pd

def load_df(start_year, end_year):
    """Load one 'Compressed Mortality, <start>-<end>.txt' export as a DataFrame.

    The file is read as tab-separated text from the current working directory.

    Parameters
    ----------
    start_year, end_year : int
        First and last year of the span; only used to build the file name.

    Returns
    -------
    pandas.DataFrame
        Columns ``Year``, ``Cause of death``, ``Deaths``, ``Population``
        (integers), ``Crude Rate`` (float) and ``Crude Rate Unreliable``
        (bool, True where the raw rate text contained 'Unreliable').
        Rows whose ``Year`` is missing are dropped.

    Notes
    -----
    Cause-of-death labels are returned exactly as they appear in the file;
    no harmonisation of names across files is done here.
    """
    path = 'Compressed Mortality, {}-{}.txt'.format(str(start_year), str(end_year))
    # read_csv replaces the deprecated (and since removed) DataFrame.from_csv.
    raw = pd.read_csv(path, sep='\t', header=0, index_col=False)

    # clean up: drop the code/notes columns that the analysis never uses
    df = raw.drop(['Year Code', 'Cause of death Code', 'Notes'], axis=1)
    # Keep only rows that carry a year. The explicit copy makes the column
    # assignments below act on an independent frame, not a slice of `raw`.
    df = df[pd.notnull(df['Year'])].copy()
    df['Year'] = df['Year'].astype(int)
    df['Deaths'] = df['Deaths'].astype(int)
    df['Population'] = df['Population'].astype(int)

    # The rate is text such as '0.0 (Unreliable)': remember the flag, then keep
    # only the leading number.
    crude = df['Crude Rate'].astype(str)
    df['Crude Rate Unreliable'] = crude.str.contains('Unreliable')
    df['Crude Rate'] = crude.str.split(' ').str.get(0).astype(float)

    return df

# One frame per export file, oldest span first.
data_temp = [
    load_df(first_year, last_year)
    for first_year, last_year in [(1968, 1978), (1979, 1998), (1999, 2014)]
]

# Stack the three spans into a single frame; each file's own row labels are
# kept, so index labels repeat across spans.
df = pd.concat(data_temp)

df.head()

Unnamed: 0,Year,Cause of death,Deaths,Population,Crude Rate,Crude Rate Unreliable
0,1968,Typhoid fever,7,199533564,0.0,True
1,1968,Paratyphoid fever B,4,199533564,0.0,True
2,1968,Paratyphoid fever C,1,199533564,0.0,True
3,1968,With food as vehicle of infection,2,199533564,0.0,True
4,1968,Without mention of food as vehicle of infection,63,199533564,0.0,False


In [2]:
# Collect every label that mentions atherosclerosis (case-insensitive), then show them.
athero_causes = [
    cause for cause in df['Cause of death'].unique()
    if 'atheroscler' in cause.lower()
]
for cause in athero_causes:
    print(cause)

Coronary atherosclerosis
Cerebral atherosclerosis
Generalized and unspecified atherosclerosis
Atherosclerotic cardiovascular disease, so described
Atherosclerotic heart disease
Atherosclerosis of aorta
Atherosclerosis of renal artery
Atherosclerosis of arteries of extremities
Atherosclerosis of other arteries


In [3]:
# Lump together all forms of Atherosclerosis
athero_labels = [
    cause for cause in df['Cause of death'].unique()
    if 'atheroscler' in cause.lower()
]
# Assign through a single .loc on df itself: the chained form
# df[col][mask] = value may write to a temporary copy and leave df unchanged.
df.loc[df['Cause of death'].isin(athero_labels).values, 'Cause of death'] = 'Atherosclerosis'

# check: only the merged label should still match
for cause in df['Cause of death'].unique():
    if 'atheroscler' in cause.lower():
        print(cause)

Atherosclerosis


In [4]:
# Collect and show every label that mentions the bronchus, lung or "pulmonary".
bronchus_causes = []
for cause in df['Cause of death'].unique():
    name = cause.lower()
    if any(term in name for term in ('bronchus', 'lung', 'pulmonary')):
        print(cause)
        bronchus_causes.append(cause)

Miliary, lung NOS
Pulmonary tuberculosis NOS
Pulmonary
Of lung
Bronchus and lung
Lung, specified as secondary
Pulmonary heart disease
Hemorrhagic pulmonary renal syndrome
Pulmonary embolism and infarction
Abscess of lung
Pulmonary congestion and hypostasis
Pulmonary collapse
Acute edema of lung
Other diseases of lung
Stenosis or atresia of pulmonary artery
Other anomalies of larynx, trachea, and bronchus
Congenital cystic lung
Agenesis of lung
Other anomalies of lung
Chronic obstructive lung disease without mention of asthma, bronchitis, or emphysema
Hypertrophic pulmonary osteoarthropathy
Acute miliary tuberculosis specified as nonpulmonary
Tuberculosis of lung with cavitation
Tuberculosis of bronchus
Tuberculous fibrosis of lung
Other specified pulmonary tuberculosis
Pulmonary tuberculosis, unspecified
Main bronchus
Upper lobe, bronchus or lung
Middle lobe, bronchus or lung
Lower lobe, bronchus or lung
Other parts of bronchus or lung
Bronchus and lung, unspecified
Trachea, bronchus, 

In [5]:
# Lump together all forms of Bronchus/Lung disorders
def _is_bronchus_or_lung(cause):
    """True if the label mentions 'bronchus', 'lung' or 'pulmonary' (any case)."""
    lowered = cause.lower()
    return 'bronchus' in lowered or 'lung' in lowered or 'pulmonary' in lowered

# NOTE(review): plain substring matching also catches labels such as
# '... specified as nonpulmonary' — confirm that is intended.
lung_labels = [
    cause for cause in df['Cause of death'].unique()
    if _is_bronchus_or_lung(cause)
]
# Assign through a single .loc on df itself: the chained form
# df[col][mask] = value may write to a temporary copy and leave df unchanged.
df.loc[df['Cause of death'].isin(lung_labels).values, 'Cause of death'] = 'Bronchus or Lung'

# check: only the merged label should still match
for cause in df['Cause of death'].unique():
    if _is_bronchus_or_lung(cause):
        print(cause)

Bronchus or Lung


In [6]:
# dementia and alzheimer's next: list every label that mentions either term
for cause in df['Cause of death'].unique():
    name = cause.lower()
    if not ('dementia' in name or 'alzheimer' in name):
        continue
    print(cause)

Senile dementia
Presenile dementia
Senile dementia, uncomplicated
Senile dementia with delusional or depressive features
Senile dementia with delirium
Arteriosclerotic dementia
Other alcoholic dementia
Alzheimer's disease
Multi-infarct dementia
Subcortical vascular dementia
Mixed cortical and subcortical vascular dementia
Other vascular dementia
Vascular dementia, unspecified
Unspecified dementia
Delirium superimposed on dementia
Alzheimer's disease with early onset
Alzheimer's disease with late onset
Other Alzheimer's disease
Alzheimer's disease, unspecified
Vascular dementia of acute onset
Delirium not superimposed on dementia, so described


In [7]:
def _is_dementia(cause):
    """True if the label mentions 'dementia' or 'alzheimer' (any case)."""
    lowered = cause.lower()
    return 'dementia' in lowered or 'alzheimer' in lowered

# NOTE(review): this also matches 'Delirium not superimposed on dementia, so
# described' — confirm that label belongs in the merged category.
dementia_labels = [
    cause for cause in df['Cause of death'].unique()
    if _is_dementia(cause)
]
# Assign through a single .loc on df itself: the chained form
# df[col][mask] = value may write to a temporary copy and leave df unchanged.
df.loc[df['Cause of death'].isin(dementia_labels).values, 'Cause of death'] = "Dementia/Alzheimer's Disease"

# check: only the merged label should still match
for cause in df['Cause of death'].unique():
    if _is_dementia(cause):
        print(cause)

Dementia/Alzheimer's Disease


In [8]:
# A label counts as a heart attack when it contains both words (any case).
heartattack_causes = [
    cause for cause in df['Cause of death'].unique()
    if all(word in cause.lower() for word in ('myocardial', 'infarction'))
]
for cause in heartattack_causes:
    print(cause)

Acute myocardial infarction
Old myocardial infarction
Acute transmural myocardial infarction of anterior wall
Acute transmural myocardial infarction of inferior wall
Acute transmural myocardial infarction of other sites
Acute transmural myocardial infarction of unspecified site
Acute subendocardial myocardial infarction
Acute myocardial infarction, unspecified
Subsequent myocardial infarction of inferior wall
Subsequent myocardial infarction of unspecified site
Subsequent myocardial infarction of anterior wall
Subsequent myocardial infarction of other sites


In [9]:
# Merge every 'myocardial ... infarction' label into 'Heart Attack'.
infarction_labels = [
    cause for cause in df['Cause of death'].unique()
    if 'myocardial' in cause.lower() and 'infarction' in cause.lower()
]
# Assign through a single .loc on df itself: the chained form
# df[col][mask] = value may write to a temporary copy and leave df unchanged.
df.loc[df['Cause of death'].isin(infarction_labels).values, 'Cause of death'] = 'Heart Attack'

# check: the merged label should be the only match. The parentheses spell out
# the precedence the unparenthesised `or ... and ...` already had.
for cause in df['Cause of death'].unique():
    lowered = cause.lower()
    if 'heart attack' in lowered or ('myocardial' in lowered and 'infarction' in lowered):
        print(cause)

Heart Attack


In [10]:
# List stroke-like labels: anything mentioning "stroke", a cerebral
# infarction/thrombosis, or the one ill-defined cerebrovascular label.
for cause in df['Cause of death'].unique():
    name = cause.lower()
    cerebral_event = 'cerebral' in name and ('infarction' in name or 'thrombosis' in name)
    ill_defined = cause == 'Acute, but ill-defined, cerebrovascular disease'
    if 'stroke' in name or cerebral_event or ill_defined:
        print(cause)

Cerebral thrombosis
Acute, but ill-defined, cerebrovascular disease
Cerebral infarction due to thrombosis of precerebral arteries
Cerebral infarction due to embolism of precerebral arteries
Cerebral infarction due to unspecified occlusion or stenosis of precerebral arteries
Cerebral infarction due to thrombosis of cerebral arteries
Cerebral infarction due to embolism of cerebral arteries
Cerebral infarction due to unspecified occlusion or stenosis of cerebral arteries
Other cerebral infarction
Cerebral infarction, unspecified
Stroke, not specified as haemorrhage or infarction
Sequelae of cerebral infarction
Sequelae of stroke, not specified as haemorrhage or infarction
Cerebral infarction due to cerebral venous thrombosis, nonpyogenic
Cerebral venous thrombosis in pregnancy
Cerebral venous thrombosis in the puerperium


In [11]:
def _is_stroke(cause):
    """True for labels mentioning 'stroke', a cerebral infarction/thrombosis,
    or the exact label 'Acute, but ill-defined, cerebrovascular disease'."""
    lowered = cause.lower()
    return (
        'stroke' in lowered or
        ('cerebral' in lowered and ('infarction' in lowered or 'thrombosis' in lowered)) or
        cause == 'Acute, but ill-defined, cerebrovascular disease'
    )

stroke_labels = [
    cause for cause in df['Cause of death'].unique()
    if _is_stroke(cause)
]
# Assign through a single .loc on df itself: the chained form
# df[col][mask] = value may write to a temporary copy and leave df unchanged.
df.loc[df['Cause of death'].isin(stroke_labels).values, 'Cause of death'] = "Stroke"

# check: only the merged label should still match
for cause in df['Cause of death'].unique():
    if _is_stroke(cause):
        print(cause)

Stroke


In [12]:
# Collect heart/cardiovascular/hypertension labels, skipping the already-merged
# 'Heart Attack' label and phrases that only negate or resemble "heart".
heart_causes = []
for cause in df['Cause of death'].unique():
    name = cause.lower()
    wanted = any(term in name for term in ('heart', 'cardiovas', 'hyperten'))
    excluded = any(
        term in name
        for term in ('heart attack', 'without mention of heart', 'without heart', 'heartburn')
    )
    if wanted and not excluded:
        print(cause)
        heart_causes.append(cause)

Other cardiovascular syphilis
Cardiovascular
Other active rheumatic heart disease
Other heart disease, specified as rheumatic
With heart involvement
Essential benign hypertension
Hypertensive heart disease
Hypertensive renal disease
Hypertensive heart and renal disease
With hypertensive disease
Without mention of hypertensive disease
With or without cardiovascular disease with hypertension
Cardiovascular disease without mention of chronic ischemic heart disease with hypertensive disease
With or without cardiovascular disease without mention of hypertensive disease
Cardiovascular disease without mention of chronic ischemic heart disease without mention of hypertensive disease
Congestive heart failure
Other heart block
Other and unspecified disorders or heart rhythm
Other and unspecified heart disease
With hypertension (benign)
Without mention of hypertension
Anomalies of heart valve
Other specified anomalies of heart
Unspecified anomalies of heart
Chronic hypertension
Acute heart failur

In [13]:
def _is_other_heart(cause):
    """True for heart/cardiovascular/hypertension labels other than 'Heart
    Attack', excluding 'without (mention of) heart' phrases and 'heartburn'."""
    lowered = cause.lower()
    return (
        (
            'heart' in lowered or
            'cardiovas' in lowered or
            'hyperten' in lowered
        ) and
        'heart attack' not in lowered and
        'without mention of heart' not in lowered and
        'without heart' not in lowered and
        'heartburn' not in lowered
    )

# NOTE(review): 'hyperten' also matches negated labels such as 'Without
# mention of hypertension' — confirm that is intended.
other_heart_labels = [
    cause for cause in df['Cause of death'].unique()
    if _is_other_heart(cause)
]
# Assign through a single .loc on df itself: the chained form
# df[col][mask] = value may write to a temporary copy and leave df unchanged.
df.loc[df['Cause of death'].isin(other_heart_labels).values, 'Cause of death'] = "Other Heart/Cardiovascular"

# check: only the merged label should still match
for cause in df['Cause of death'].unique():
    if _is_other_heart(cause):
        print(cause)

Other Heart/Cardiovascular


In [14]:
# List every label that mentions pneumonia (case-insensitive).
for cause in df['Cause of death'].unique():
    if 'pneumonia' not in cause.lower():
        continue
    print(cause)

Influenza with pneumonia
Viral pneumonia
Pneumococcal pneumonia
Pneumonia due to other specified organism
Acute interstitial pneumonia
Bronchopneumonia, unspecified
Pneumonia, unspecified
Other chronic interstitial pneumonia
Tuberculous pneumonia [any form]
Pneumococcal septicemia [streptococcus pneumoniae septicemia]
Postmeasles pneumonia
Pneumonia due to adenovirus
Pneumonia due to respiratory syncytial virus
Pneumonia due to other virus not elsewhere classified
Viral pneumonia, unspecified
Pneumococcal pneumonia [streptococcus pneumoniae pneumonia]
Pneumonia due to klebsiella pneumoniae
Pneumonia due to pseudomonas
Pneumonia due to hemophilus influenzae [h. influenzae]
Pneumonia due to streptococcus
Pneumonia due to staphylococcus
Pneumonia due to other specified bacteria
Bacterial pneumonia unspecified
Bronchopneumonia, organism unspecified
Pneumonia, organism unspecified
With pneumonia
Congenital pneumonia
Pneumonia due to parainfluenza virus
Septicaemia due to Streptococcus pneum

In [15]:
# Merge every label that mentions pneumonia into one category.
pneumonia_labels = [
    cause for cause in df['Cause of death'].unique()
    if 'pneumonia' in cause.lower()
]
# Assign through a single .loc on df itself: the chained form
# df[col][mask] = value may write to a temporary copy and leave df unchanged.
df.loc[df['Cause of death'].isin(pneumonia_labels).values, 'Cause of death'] = "Pneumonia"

# check: only the merged label should still match
for cause in df['Cause of death'].unique():
    if 'pneumonia' in cause.lower():
        print(cause)

Pneumonia


In [16]:
# List every label that mentions the breast, cancer or not.
for cause in df['Cause of death'].unique():
    if 'breast' not in cause.lower():
        continue
    print(cause)

Malignant neoplasm of breast
Benign neoplasm of breast
Neoplasm of unspecified nature of breast
Chronic cystic disease of breast
Hypertrophy of breast
Other specified sites of female breast
Breast (female), unspecified
Malignant neoplasm of male breast
Breast
Fibroadenosis of breast
Inflammatory disease of breast
Signs and symptoms in breast
Unspecified breast disorder
Specified anomalies of breast
Noxious influences affecting fetus via placenta or breast milk
Solitary cyst of breast
Other specified disorders of breast
Breast engorgement in newborn
Fat necrosis of breast
Infections of the breast and nipple associated with childbirth
Central portion of breast - Malignant neoplasms
Upper-outer quadrant of breast - Malignant neoplasms
Breast, unspecified - Malignant neoplasms
Carcinoma in situ of breast, unspecified - In situ neoplasms
Breast - Uncertain neoplasms
Inflammatory disorders of breast
Unspecified lump in breast
Other signs and symptoms in breast
Disorder of breast, unspecified

This looks like it could be a bit tricky. Breast cancer is one of the top causes of death, so I want to separate it from other breast-related causes of death. First, though, let's look at the vague labels that may or may not be breast cancer to see whether we can leave them out.

In [17]:
# Rows whose label is exactly 'Breast'.
df.loc[df['Cause of death'] == 'Breast']

Unnamed: 0,Year,Cause of death,Deaths,Population,Crude Rate,Crude Rate Unreliable
581,1979,Breast,1,224635398,0.0,True
608,1979,Breast,3,224635398,0.0,True
617,1979,Breast,4,224635398,0.0,True
3438,1980,Breast,2,226624371,0.0,True
3465,1980,Breast,6,226624371,0.0,True
3474,1980,Breast,1,226624371,0.0,True
6279,1981,Breast,1,229487512,0.0,True
6302,1981,Breast,5,229487512,0.0,True
9142,1982,Breast,1,231701425,0.0,True
9165,1982,Breast,4,231701425,0.0,True


The death counts on that one are low enough that I think it is safe to leave it out. Let's check `'Breast (female), unspecified'`

In [18]:
# Rows whose label is exactly 'Breast (female), unspecified'.
df.loc[df['Cause of death'] == 'Breast (female), unspecified']

Unnamed: 0,Year,Cause of death,Deaths,Population,Crude Rate,Crude Rate Unreliable
391,1979,"Breast (female), unspecified",34381,224635398,15.3,False
3254,1980,"Breast (female), unspecified",35628,226624371,15.7,False
6089,1981,"Breast (female), unspecified",36472,229487512,15.9,False
8964,1982,"Breast (female), unspecified",37401,231701425,16.1,False
11787,1983,"Breast (female), unspecified",37965,233781743,16.2,False
14603,1984,"Breast (female), unspecified",39463,235922142,16.7,False
17415,1985,"Breast (female), unspecified",40090,238005715,16.8,False
20187,1986,"Breast (female), unspecified",40526,240189882,16.9,False
23012,1987,"Breast (female), unspecified",40881,242395034,16.9,False
25791,1988,"Breast (female), unspecified",42153,244651961,17.2,False


Okay, this one has much more significant numbers. All of these rows are from the 1979-1998 data set, which means the same thing is probably labeled differently in the other ones, perhaps as seen in 1999-2014, `'Breast, unspecified - Malignant neoplasms'`.

So, I'm just going to go out on a limb here and assume that these cases are breast cancer and not some other breast-related cause of death.

In [19]:
def _is_breast_cancer(cause):
    """True for breast labels mentioning 'cancer' or 'neoplasm', plus the exact
    label 'Breast (female), unspecified'."""
    lowered = cause.lower()
    return (
        ('breast' in lowered and ('cancer' in lowered or 'neoplasm' in lowered)) or
        cause == 'Breast (female), unspecified'
    )

# NOTE(review): the 'neoplasm' test also matches benign, in-situ and uncertain
# labels (e.g. 'Benign neoplasm of breast') — confirm they belong here.
breast_cancer_labels = [
    cause for cause in df['Cause of death'].unique()
    if _is_breast_cancer(cause)
]
# Assign through a single .loc on df itself: the chained form
# df[col][mask] = value may write to a temporary copy and leave df unchanged.
df.loc[df['Cause of death'].isin(breast_cancer_labels).values, 'Cause of death'] = "Breast Cancer"

# check: only the merged label should still match
for cause in df['Cause of death'].unique():
    if _is_breast_cancer(cause):
        print(cause)

Breast Cancer


In [20]:
# List every label that mentions the pancreas ('pancrea' also covers "pancreatic").
for cause in df['Cause of death'].unique():
    if 'pancrea' not in cause.lower():
        continue
    print(cause)

Pancreas
Disorders of pancreatic internal secretion other than diabetes mellitus
Acute pancreatitis
Chronic pancreatitis
Other diseases of pancreas
Anomalies of pancreas
Head of pancreas
Body of pancreas
Tail of pancreas
Pancreatic duct
Other specified sites of pancreas
Pancreas, part unspecified
Pancreas, except islets of langerhans
Other specified disorders of pancreatic internal secretion
Unspecified disorder of pancreatic internal secretion
Cyst and pseudocyst of pancreas
Other specified diseases of pancreas
Unspecified disease of pancreas
Mumps pancreatitis
Pancreatic steatorrhea
Head of pancreas - Malignant neoplasms
Body of pancreas - Malignant neoplasms
Tail of pancreas - Malignant neoplasms
Pancreatic duct - Malignant neoplasms
Endocrine pancreas - Malignant neoplasms
Overlapping lesion of pancreas - Malignant neoplasms
Pancreas, unspecified - Malignant neoplasms
Pancreas - Benign neoplasms
Endocrine pancreas - Benign neoplasms
Alcohol-induced chronic pancreatitis
Other chroni

In [21]:
def _is_pancreatic_cancer(cause):
    """True for pancreas labels that mention 'cancer' or 'neoplasm' (any case)."""
    lowered = cause.lower()
    return 'pancrea' in lowered and ('cancer' in lowered or 'neoplasm' in lowered)

# NOTE(review): this also matches 'Pancreas - Benign neoplasms', while site-only
# labels such as 'Head of pancreas' are left untouched — confirm both are intended.
pancreatic_cancer_labels = [
    cause for cause in df['Cause of death'].unique()
    if _is_pancreatic_cancer(cause)
]
# Assign through a single .loc on df itself: the chained form
# df[col][mask] = value may write to a temporary copy and leave df unchanged.
df.loc[df['Cause of death'].isin(pancreatic_cancer_labels).values, 'Cause of death'] = "Pancreatic Cancer"

# check: only the merged label should still match
for cause in df['Cause of death'].unique():
    if _is_pancreatic_cancer(cause):
        print(cause)

Pancreatic Cancer


In [22]:
# List every label that mentions the colon, with no cancer/neoplasm filter.
colon_mentions = [
    cause for cause in df['Cause of death'].unique()
    if 'colon' in cause.lower()
]
for cause in colon_mentions:
    print(cause)

Cecum, appendix and ascending colon
Transverse colon, including hepatic and splenic flexures
Descending colon
Sigmoid colon
Large intestine (including colon), part unspecified
Colon
Irritable colon
Arizona group of paracolon bacilli
Transverse colon
Ascending colon
Colon, unspecified
Megacolon, other than hirschsprung's
Hirschsprung's disease and other congenital functional disorders of colon
Ascending colon - Malignant neoplasms
Transverse colon - Malignant neoplasms
Descending colon - Malignant neoplasms
Sigmoid colon - Malignant neoplasms
Overlapping lesion of colon - Malignant neoplasms
Colon, unspecified - Malignant neoplasms
Colon - In situ neoplasms
Ascending colon - Benign neoplasms
Transverse colon - Benign neoplasms
Colon, unspecified - Benign neoplasms
Colon - Uncertain neoplasms
Angiodysplasia of colon
Megacolon, not elsewhere classified
Sigmoid colon - Benign neoplasms
Pseudopolyposis of colon
Polyp of colon
Other congenital functional disorders of colon


In [23]:
def _is_colon_cancer(cause):
    """True for colon labels mentioning 'cancer' or 'neoplasm', plus the exact
    label 'Colon, unspecified'."""
    lowered = cause.lower()
    return (
        ('colon' in lowered and ('cancer' in lowered or 'neoplasm' in lowered)) or
        cause == 'Colon, unspecified'  # for same reason as breast cancer above
    )

# NOTE(review): the 'neoplasm' test also matches benign and in-situ labels
# (e.g. 'Colon, unspecified - Benign neoplasms') — confirm they belong here.
colon_cancer_labels = [
    cause for cause in df['Cause of death'].unique()
    if _is_colon_cancer(cause)
]
# Assign through a single .loc on df itself: the chained form
# df[col][mask] = value may write to a temporary copy and leave df unchanged.
df.loc[df['Cause of death'].isin(colon_cancer_labels).values, 'Cause of death'] = "Colon Cancer"

# check: uses the same predicate as the relabel step, so nothing it merged can
# be missed; only the merged label should still match
for cause in df['Cause of death'].unique():
    if _is_colon_cancer(cause):
        print(cause)

Colon Cancer


In [24]:
# List every label that mentions sepsis under any of its three spellings.
for cause in df['Cause of death'].unique():
    name = cause.lower()
    if any(term in name for term in ('septicaemia', 'sepsis', 'septicemia')):
        print(cause)

Other bacterial septicemia
Abdominal pregnancy, with sepsis
Tubal pregnancy, with sepsis
Other or unspecified ectopic pregnancy, with sepsis
Abdominal pregnancy, without mention of sepsis
Tubal pregnancy, without mention of sepsis
Other or unspecified ectopic pregnancy, without mention of sepsis
With sepsis
Without mention of sepsis or toxemia
With sepsis and hemorrhage
With laceration of pelvic organ (with or without mention of sepsis or hemorrhage)
Without mention of sepsis, hemorrhage, or laceration
With sepsis and toxemia
Sepsis of childbirth and the puerperium
Ovarian pregnancy, without mention of sepsis
Salmonella septicemia
Streptococcal septicemia
Staphylococcal septicemia
Septicemia due to anaerobes
Septicemia due to other gram-negative organisms
Other specified septicemias
Unspecified septicemia
Herpetic septicemia
Salmonella septicaemia
Listerial septicaemia
Septicaemia due to streptococcus, group A
Septicaemia due to streptococcus, group B
Septicaemia due to streptococcus, 

In [25]:
def _is_sepsis(cause):
    """True if the label contains 'septicaemia', 'sepsis' or 'septicemia'."""
    lowered = cause.lower()
    return (
        'septicaemia' in lowered or
        'sepsis' in lowered or
        'septicemia' in lowered
    )

# NOTE(review): this also matches negated labels such as 'Tubal pregnancy,
# without mention of sepsis' — confirm they belong in the merged category.
sepsis_labels = [
    cause for cause in df['Cause of death'].unique()
    if _is_sepsis(cause)
]
# Assign through a single .loc on df itself: the chained form
# df[col][mask] = value may write to a temporary copy and leave df unchanged.
df.loc[df['Cause of death'].isin(sepsis_labels).values, 'Cause of death'] = "Sepsis"

# check: only the merged label should still match
for cause in df['Cause of death'].unique():
    if _is_sepsis(cause):
        print(cause)

Sepsis


In [26]:
# List diabetes labels, skipping phrases that contain 'diabet' but are excluded
# by the filter terms below.
for cause in df['Cause of death'].unique():
    name = cause.lower()
    if 'diabet' not in name:
        continue
    if any(
        term in name
        for term in ('insipidus', 'other than diabetes', 'antidiabet', 'nondiabet', 'mother')
    ):
        continue
    print(cause)

Diabetes mellitus
Diabetes mellitus without mention of complication
Diabetes with ketoacidosis
Diabetes with hyperosmolarity
Diabetes with other coma
Diabetes with renal manifestations
Diabetes with ophthalmic manifestations
Diabetes with neurological manifestations
Diabetes with peripheral circulatory disorders
Diabetes with unspecified complication
Neonatal diabetes mellitus
Insulin-dependent diabetes mellitus, with coma
Insulin-dependent diabetes mellitus, with ketoacidosis
Insulin-dependent diabetes mellitus, with renal complications
Insulin-dependent diabetes mellitus, with ophthalmic complications
Insulin-dependent diabetes mellitus, with neurological complications
Insulin-dependent diabetes mellitus, with peripheral circulatory complications
Insulin-dependent diabetes mellitus, with other specified complications
Insulin-dependent diabetes mellitus, with multiple complications
Insulin-dependent diabetes mellitus, with unspecified complications
Insulin-dependent diabetes mellitus,

In [27]:
def _is_diabetes_mellitus(cause):
    """True for labels containing 'diabet' that do not also contain one of the
    excluded phrases (insipidus, other than diabetes, antidiabet, nondiabet,
    mother)."""
    lowered = cause.lower()
    return (
        'diabet' in lowered and
        'insipidus' not in lowered and
        'other than diabetes' not in lowered and
        'antidiabet' not in lowered and
        'nondiabet' not in lowered and
        'mother' not in lowered
    )

diabetes_labels = [
    cause for cause in df['Cause of death'].unique()
    if _is_diabetes_mellitus(cause)
]
# Assign through a single .loc on df itself: the chained form
# df[col][mask] = value may write to a temporary copy and leave df unchanged.
df.loc[df['Cause of death'].isin(diabetes_labels).values, 'Cause of death'] = "Diabetes Mellitus"

# check: only the merged label should still match
for cause in df['Cause of death'].unique():
    if _is_diabetes_mellitus(cause):
        print(cause)

Diabetes Mellitus


In [28]:
# List prostate labels that mention cancer or a neoplasm.
for cause in df['Cause of death'].unique():
    name = cause.lower()
    if 'prostat' not in name:
        continue
    if 'cancer' in name or 'neoplasm' in name:
        print(cause)

Malignant neoplasm of prostate
Prostate - In situ neoplasms
Prostate - Uncertain neoplasms
Prostate - Benign neoplasms


In [29]:
def _is_prostate_cancer(cause):
    """True for prostate labels that mention 'cancer' or 'neoplasm' (any case)."""
    lowered = cause.lower()
    return 'prostat' in lowered and ('cancer' in lowered or 'neoplasm' in lowered)

# NOTE(review): this also matches 'Prostate - Benign neoplasms' — confirm it
# belongs in the merged category.
prostate_cancer_labels = [
    cause for cause in df['Cause of death'].unique()
    if _is_prostate_cancer(cause)
]
# Assign through a single .loc on df itself: the chained form
# df[col][mask] = value may write to a temporary copy and leave df unchanged.
df.loc[df['Cause of death'].isin(prostate_cancer_labels).values, 'Cause of death'] = "Prostate Cancer"

# check: only the merged label should still match
for cause in df['Cause of death'].unique():
    if _is_prostate_cancer(cause):
        print(cause)

Prostate Cancer


In [30]:
# Sum every numeric column per (year, cause), rank all rows by total deaths
# (largest first) and keep only the Deaths column.
yearly_totals = df.groupby(['Year', 'Cause of death']).agg(sum)
deaths_by_year = yearly_totals.sort_values(by='Deaths', ascending=False)[['Deaths']]
deaths_by_year.loc[1979].iloc[:10]

Unnamed: 0_level_0,Deaths
Cause of death,Unnamed: 1_level_1
Heart Attack,301625.0
Atherosclerosis,247654.0
Other Heart/Cardiovascular,139581.0
Bronchus or Lung,120615.0
Stroke,107717.0
Pneumonia,44992.0
Unspecified,35862.0
Colon Cancer,35078.0
Breast Cancer,34608.0
Diabetes Mellitus,33194.0


In [31]:
# Ten largest causes of 1968 (rows are already ordered by deaths, descending).
deaths_by_year.loc[1968].iloc[:10]

Unnamed: 0_level_0,Deaths
Cause of death,Unnamed: 1_level_1
Other Heart/Cardiovascular,942150.0
Bronchus or Lung,81225.0
Pneumonia,73061.0
Without mention of acidosis or coma,34504.0
Unspecified,34290.0
Generalized and unspecified,33311.0
Breast Cancer,29092.0
Emphysema,24185.0
Part unspecified,22132.0
Other and unspecified,21014.0


In [32]:
# Labels of the fifteen largest causes of 2014.
deaths_by_year.loc[2014].index[:15]

Index([u'Bronchus or Lung', u'Atherosclerosis',
       u'Dementia/Alzheimer's Disease', u'Other Heart/Cardiovascular',
       u'Heart Attack', u'Stroke', u'Diabetes Mellitus', u'Pneumonia',
       u'Breast Cancer', u'Colon Cancer', u'Pancreatic Cancer', u'Sepsis',
       u'Prostate Cancer', u'Malignant neoplasm without specification of site',
       u'Parkinson's disease'],
      dtype='object', name=u'Cause of death')

In [33]:
# Labels of the fifteen largest causes of 1979.
deaths_by_year.loc[1979].index[:15]

Index([u'Heart Attack', u'Atherosclerosis', u'Other Heart/Cardiovascular',
       u'Bronchus or Lung', u'Stroke', u'Pneumonia', u'Unspecified',
       u'Colon Cancer', u'Breast Cancer', u'Diabetes Mellitus',
       u'Chronic airway obstruction, not elsewhere classified',
       u'Other and unspecified firearm', u'Other', u'Prostate Cancer',
       u'Cardiac arrest'],
      dtype='object', name=u'Cause of death')

In [34]:
# check to see if top 10 are same for all years
def _top10_labels(year):
    """Sorted array of the ten highest-ranked cause labels for `year`."""
    return np.unique(deaths_by_year.loc[year].head(10).index)

for year in range(1968, 2014):
    # Both arrays are sorted, so element-wise equality compares the two
    # top-10 sets regardless of rank order.
    same = np.array_equal(_top10_labels(year), _top10_labels(year + 1))
    print('%d == %d: %s' % (year, year + 1, same))

1968 == 1969: True
1969 == 1970: False
1970 == 1971: True
1971 == 1972: False
1972 == 1973: True
1973 == 1974: False
1974 == 1975: True
1975 == 1976: True
1976 == 1977: False
1977 == 1978: True
1978 == 1979: False
1979 == 1980: True
1980 == 1981: False
1981 == 1982: True
1982 == 1983: False
1983 == 1984: False
1984 == 1985: True
1985 == 1986: True
1986 == 1987: True
1987 == 1988: True
1988 == 1989: False
1989 == 1990: True
1990 == 1991: True
1991 == 1992: True
1992 == 1993: True
1993 == 1994: True
1994 == 1995: False
1995 == 1996: True
1996 == 1997: True
1997 == 1998: False
1998 == 1999: False
1999 == 2000: True
2000 == 2001: True
2001 == 2002: True
2002 == 2003: True
2003 == 2004: True
2004 == 2005: True
2005 == 2006: True
2006 == 2007: True
2007 == 2008: True
2008 == 2009: True
2009 == 2010: True
2010 == 2011: True
2011 == 2012: True
2012 == 2013: True
2013 == 2014: True


In [35]:
# Spot-check a single (year, cause) entry.
spot_check_key = (1999, 'Heart Attack')
deaths_by_year.loc[spot_check_key]

Deaths    199818.0
Name: (1999, Heart Attack), dtype: float64

In [36]:
# make a csv where each row is a disease and each column is a year
# The rows are the ten leading causes of 2014 (np.unique returns them sorted).
diseases = np.unique(deaths_by_year.loc[2014].head(10).index)
# NOTE(review): presumably starts at 1979 because the 1968-1978 labels are not
# fully harmonised with the later files — confirm.
start_year = 1979
end_year = 2014
years = list(range(start_year, end_year + 1))
header = ['disease'] + [str(year) for year in years]
csv_list = [header]
for disease in diseases:
    # A full (year, cause) key selects one entry; .values[0] unwraps its count.
    deaths = [deaths_by_year.loc[(year, disease)].values[0] for year in years]
    csv_list.append([disease] + deaths)

# `csv` is imported with the other modules in the first cell.
# 'wb' is the file mode the Python 2 csv module expects.
with open('death_data_year_cols.csv', 'wb') as f:
    writer = csv.writer(f)
    writer.writerows(csv_list)

In [37]:
# make a csv where each row is a year and each column is a disease
header = ['year'] + list(diseases)
csv_list = [header]
for year in range(start_year, end_year + 1):
    row = [year]
    for disease in diseases:
        # A full (year, cause) key selects one entry; .values[0] unwraps its count.
        row.append(deaths_by_year.loc[(year, disease)].values[0])
    csv_list.append(row)

with open('death_data_cause_cols.csv', 'wb') as f:
    csv.writer(f).writerows(csv_list)