In [1]:
import pandas as pd
from collections import Counter

import re

from sqlalchemy import create_engine
import psycopg2

from config import db_password

In [45]:
# Import health dataset
pop_data = pd.read_csv('raw_data/Health_Data.csv')
pop_data.drop(columns=['Series Name', 'Series Code'], inplace=True)
pop_data.columns = pop_data.columns.str.rstrip('[YR')

pop_data.head(20)

Unnamed: 0,Country Name,Country Code,1990 [YR1990],1991 [YR1991],1992 [YR1992],1993 [YR1993],1994 [YR1994],1995 [YR1995],1996 [YR1996],1997 [YR1997],...,2012 [YR2012],2013 [YR2013],2014 [YR2014],2015 [YR2015],2016 [YR2016],2017 [YR2017],2018 [YR2018],2019 [YR2019],2020 [YR2020],2021 [YR2021]
0,Afghanistan,AFG,12412311,13299016,14485543,15816601,17075728,18110662,18853444,19357126,...,31161378,32269592,33370804,34413603,35383028,36296111,37171922,38041757,38928341,39835428
1,Albania,ALB,3286542,3266790,3247039,3227287,3207536,3187784,3168033,3148281,...,2900401,2895092,2889104,2880703,2876101,2873457,2866376,2854191,2837849,2811666
2,Algeria,DZA,25758872,26400468,27028330,27635517,28213777,28757788,29266415,29742980,...,37383899,38140135,38923688,39728020,40551398,41389174,42228415,43053054,43851043,44616626
3,American Samoa,ASM,47351,48682,49900,51025,52099,53158,54209,55227,...,55669,55717,55791,55806,55739,55617,55461,55312,55197,55103
4,Andorra,AND,54508,56666,58882,60974,62676,63860,64363,64318,...,82427,80770,79213,77993,77295,76997,77008,77146,77265,77354
5,Angola,AGO,11848385,12248901,12657361,13075044,13503753,13945205,14400722,14871572,...,25107925,26015786,26941773,27884380,28842482,29816769,30809787,31825299,32866268,33933611
6,Antigua and Barbuda,ATG,62533,63363,64459,65777,67201,68672,70176,71707,...,90407,91510,92562,93571,94520,95425,96282,97115,97928,98728
7,Argentina,ARG,32618648,33079002,33529320,33970103,34402669,34828168,35246376,35657438,...,41733271,42202935,42669500,43131966,43590368,44044811,44494502,44938712,45376763,45808747
8,Armenia,ARM,3538164,3505249,3442820,3363111,3283664,3217349,3168213,3133081,...,2884239,2897593,2912403,2925559,2936147,2944789,2951741,2957728,2963234,2968128
9,Aruba,ABW,62152,64623,68240,72495,76705,80324,83211,85450,...,102565,103165,103776,104339,104865,105361,105846,106310,106766,107195


In [3]:
# List of columns to use
cols = ['Entity',
'Code',
'Year',
'Age Group',
'Acute hepatitis',
'Cardiovascular diseases',
'Chronic kidney disease',
'Cirrhosis and other chronic liver diseases',
'Diabetes mellitus',
'Diarrheal diseases',
'Digestive diseases',
'Drowning',
'Environmental heat and cold exposure',
'Exposure to forces of nature',
'Fire, heat, and hot substances',
'HIV/AIDS',
'Interpersonal violence',
'Lower respiratory infections',
'Malaria',
'Neoplasms',
'Nutritional deficiencies',
'Parkinsons disease',
'Road injuries',
'Self',
'Tuberculosis']

In [4]:
# Import input dataset, add Age Group column, and clean header names
df_under5 = pd.read_csv('raw_data/causes-of-death-in-children-under-5.csv')
df_under5['Age Group'] = 'Under 5'

df_under5.columns = [col if len(col.split('-')) <= 1 else col.split('-')[1].strip() for col in df_under5.columns.values]
df_under5 = df_under5[df_under5.columns.intersection(cols)]
df_under5['Parkinsons disease'] = df_under5.get('Parkinsons disease', 0) 
df_under5['Self'] = df_under5.get('Self', 0) 

df_under5.columns = df_under5.columns.str.replace(" ", "_")
df_under5['Age_Group'] = pd.Series(df_under5['Age_Group'], dtype='string')
df_under5['Entity'] = pd.Series(df_under5['Entity'], dtype='string') 
df_under5['Code'] = pd.Series(df_under5['Code'], dtype='string')

column_to_move = df_under5.pop('Age_Group')
df_under5.insert(3, 'Age_Group', column_to_move)
df_under5.shape


(6840, 25)

In [5]:
# Import input dataset, add Age Group column, and clean header names
df_5to14 = pd.read_csv('raw_data/causes-of-death-in-5-14-year-olds.csv')
df_5to14['Age Group'] = '5-14'

df_5to14.columns = [col if len(col.split('-')) <= 1 else col.split('-')[1].strip() for col in df_5to14.columns.values]
df_5to14 = df_5to14[df_5to14.columns.intersection(cols)]
df_5to14['Parkinsons disease'] = df_5to14.get('Parkinsons disease', 0) 
df_5to14['Self'] = df_5to14.get('Self', 0) 

df_5to14.columns = df_5to14.columns.str.replace(" ", "_")
df_5to14['Age_Group'] = pd.Series(df_5to14['Age_Group'], dtype='string')
df_5to14['Entity'] = pd.Series(df_5to14['Entity'], dtype='string') 
df_5to14['Code'] = pd.Series(df_5to14['Code'], dtype='string') 

column_to_move = df_5to14.pop('Age_Group')
df_5to14.insert(3, 'Age_Group', column_to_move)

df_5to14.head()

Unnamed: 0,Entity,Code,Year,Age_Group,Self,Interpersonal_violence,Drowning,Malaria,"Fire,_heat,_and_hot_substances",Neoplasms,...,Nutritional_deficiencies,Diabetes_mellitus,Exposure_to_forces_of_nature,Environmental_heat_and_cold_exposure,Diarrheal_diseases,Road_injuries,Tuberculosis,HIV/AIDS,Acute_hepatitis,Parkinsons_disease
0,Afghanistan,AFG,1990,5-14,12,65,290,3,27,278,...,103,11,0,2,27,511,160,0,103,0
1,Afghanistan,AFG,1991,5-14,12,80,311,6,30,299,...,111,12,277,2,30,552,170,0,112,0
2,Afghanistan,AFG,1992,5-14,14,88,356,8,34,341,...,128,14,120,1,34,633,191,0,129,0
3,Afghanistan,AFG,1993,5-14,15,97,393,3,38,373,...,142,15,42,1,60,695,205,1,143,0
4,Afghanistan,AFG,1994,5-14,15,105,410,7,39,385,...,150,16,29,1,42,721,209,1,150,0


In [6]:
# Import input dataset, add Age Group column, and clean header names
df_15to49 = pd.read_csv('raw_data/causes-of-death-in-15-49-year-olds.csv')
df_15to49['Age Group'] = '15-49'

df_15to49.columns = [col if len(col.split('-')) <= 1 else col.split('-')[1].strip() for col in df_15to49.columns.values]
df_15to49 = df_15to49[df_15to49.columns.intersection(cols)]
df_15to49['Parkinsons disease'] = df_15to49.get('Parkinsons disease', 0) 
df_15to49['Self'] = df_15to49.get('Self', 0) 

df_15to49.columns = df_15to49.columns.str.replace(" ", "_")
df_15to49['Age_Group'] = pd.Series(df_15to49['Age_Group'], dtype='string')
df_15to49['Entity'] = pd.Series(df_15to49['Entity'], dtype='string') 
df_15to49['Code'] = pd.Series(df_15to49['Code'], dtype='string') 

column_to_move = df_15to49.pop('Age_Group')
df_15to49.insert(3, 'Age_Group', column_to_move)

df_15to49.shape

(6840, 25)

In [7]:
# Import input dataset, add Age Group column, and clean header names
df_50to69 = pd.read_csv('raw_data/causes-of-death-in-50-69-year-olds.csv')
df_50to69['Age Group'] = '50-69'

df_50to69.columns = [col if len(col.split('-')) <= 1 else col.split('-')[1].strip() for col in df_50to69.columns.values]
df_50to69 = df_50to69[df_50to69.columns.intersection(cols)]
df_50to69['Parkinsons disease'] = df_50to69.get('Parkinsons disease', 0) 
df_50to69['Self'] = df_50to69.get('Self', 0) 

df_50to69.columns = df_15to49.columns.str.replace(" ", "_")
df_50to69['Age_Group'] = pd.Series(df_50to69['Age_Group'], dtype='string')
df_50to69['Entity'] = pd.Series(df_50to69['Entity'], dtype='string') 
df_50to69['Code'] = pd.Series(df_50to69['Code'], dtype='string') 

column_to_move = df_50to69.pop('Age_Group')
df_50to69.insert(3, 'Age_Group', column_to_move)


df_50to69.shape

(6840, 25)

In [10]:
# Import input dataset, add Age Group column, and clean header names
df_70 = pd.read_csv('raw_data/causes-of-death-in-70-year-olds.csv')
df_70['Age Group'] = '70'
df_70.columns = [col if len(col.split('-')) <= 1 else col.split('-')[1].strip() for col in df_70.columns.values]
df_70['Age Group'] = pd.Series(df_70['Age Group'], dtype='string')
df_70['Entity'] = pd.Series(df_70['Entity'], dtype='string') 
df_70['Code'] = pd.Series(df_70['Code'], dtype='string') 
column_to_move = df_70.pop('Age Group')
df_70.insert(3, 'Age Group', column_to_move)
df_70 = df_70[df_70.columns.intersection(cols)]
df_70['Parkinsons disease'] = df_70.get('Parkinsons disease', 0) 
df_70['Self'] = df_70.get('Self', 0) 
df_70.columns

Index(['Entity', 'Code', 'Year', 'Age Group', 'Self', 'Interpersonal violence',
       'Exposure to forces of nature', 'Drowning',
       'Environmental heat and cold exposure', 'Diarrheal diseases',
       'Road injuries', 'Tuberculosis', 'HIV/AIDS', 'Malaria',
       'Fire, heat, and hot substances', 'Chronic kidney disease', 'Neoplasms',
       'Digestive diseases', 'Cirrhosis and other chronic liver diseases',
       'Cardiovascular diseases', 'Nutritional deficiencies',
       'Lower respiratory infections', 'Diabetes mellitus', 'Acute hepatitis',
       'Parkinsons disease'],
      dtype='object')

In [9]:
# Upload dataframes to database to be joined

db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/Capstone_DB"
engine = create_engine(db_string)

df_under5.to_sql(name='cod_under5', con=engine, if_exists='replace')
df_5to14.to_sql(name='cod_5to14', con=engine, if_exists='replace')
df_15to49.to_sql(name='cod_15to49', con=engine, if_exists='replace')
df_50to69.to_sql(name='cod_50to69', con=engine, if_exists='replace')
df_70.to_sql(name='cod_70', con=engine, if_exists='replace')

health_data.to_sql(name='health_data', con=engine, if_exists='replace')
