In [113]:
import pandas as pd
from collections import Counter

import re

from sqlalchemy import create_engine
import psycopg2

from config import db_password

In [114]:
# Import health dataset
health_data = pd.read_csv('raw_data/Health_Data.csv')
health_data.head()

Unnamed: 0,Series Name,Series Code,Country Name,Country Code,1990 [YR1990],1991 [YR1991],1992 [YR1992],1993 [YR1993],1994 [YR1994],1995 [YR1995],...,2012 [YR2012],2013 [YR2013],2014 [YR2014],2015 [YR2015],2016 [YR2016],2017 [YR2017],2018 [YR2018],2019 [YR2019],2020 [YR2020],2021 [YR2021]
0,"Population, total",SP.POP.TOTL,Afghanistan,AFG,12412311,13299016,14485543,15816601,17075728,18110662,...,31161378,32269592,33370804,34413603,35383028,36296111,37171922,38041757,38928341,39835428
1,"Population, total",SP.POP.TOTL,Albania,ALB,3286542,3266790,3247039,3227287,3207536,3187784,...,2900401,2895092,2889104,2880703,2876101,2873457,2866376,2854191,2837849,2811666
2,"Population, total",SP.POP.TOTL,Algeria,DZA,25758872,26400468,27028330,27635517,28213777,28757788,...,37383899,38140135,38923688,39728020,40551398,41389174,42228415,43053054,43851043,44616626
3,"Population, total",SP.POP.TOTL,American Samoa,ASM,47351,48682,49900,51025,52099,53158,...,55669,55717,55791,55806,55739,55617,55461,55312,55197,55103
4,"Population, total",SP.POP.TOTL,Andorra,AND,54508,56666,58882,60974,62676,63860,...,82427,80770,79213,77993,77295,76997,77008,77146,77265,77354


In [115]:
# Import input dataset, add Age Group column, and clean header names
df_under5 = pd.read_csv('raw_data/causes-of-death-in-children-under-5.csv')
df_under5['Age Group'] = 'Under 5'
df_under5.columns = [col if len(col.split('-')) <= 1 else col.split('-')[1].strip() for col in df_under5.columns.values]
df_under5.head()

Unnamed: 0,Entity,Code,Year,Invasive Non,Interpersonal violence,Nutritional deficiencies,Acute hepatitis,Neoplasms,Measles,Digestive diseases,...,Whooping cough,Diarrheal diseases,"Fire, heat, and hot substances",Road injuries,Tuberculosis,HIV/AIDS,Drowning,Malaria,Syphilis,Age Group
0,Afghanistan,AFG,1990,48,105,1779,718,431,8649,477,...,2455,3968,131,802,808,10,776,21,123,Under 5
1,Afghanistan,AFG,1991,55,130,1822,741,439,8669,495,...,2385,4650,129,781,800,12,748,41,132,Under 5
2,Afghanistan,AFG,1992,68,155,2069,836,486,8539,554,...,2370,5833,137,821,863,13,777,51,180,Under 5
3,Afghanistan,AFG,1993,78,178,2427,970,549,8949,630,...,2659,7800,155,923,979,16,872,24,239,Under 5
4,Afghanistan,AFG,1994,83,194,2649,1063,589,10642,681,...,3187,7894,170,1015,1064,19,961,52,259,Under 5


In [116]:
# Import input dataset, add Age Group column, and clean header names
df_5to14 = pd.read_csv('raw_data/causes-of-death-in-5-14-year-olds.csv')
df_5to14['Age Group'] = '5-14'
df_5to14.columns = [col if len(col.split('-')) <= 1 else col.split('-')[1].strip() for col in df_5to14.columns.values]
df_5to14.head()

Unnamed: 0,Entity,Code,Year,Self,Interpersonal violence,Drowning,Malaria,"Fire, heat, and hot substances",Neoplasms,Digestive diseases,...,Diabetes mellitus,Protein,Exposure to forces of nature,Environmental heat and cold exposure,Diarrheal diseases,Road injuries,Tuberculosis,HIV/AIDS,Acute hepatitis,Age Group
0,Afghanistan,AFG,1990,12,65,290,3,27,278,119,...,11,99,0,2,27,511,160,0,103,5-14
1,Afghanistan,AFG,1991,12,80,311,6,30,299,129,...,12,108,277,2,30,552,170,0,112,5-14
2,Afghanistan,AFG,1992,14,88,356,8,34,341,147,...,14,124,120,1,34,633,191,0,129,5-14
3,Afghanistan,AFG,1993,15,97,393,3,38,373,162,...,15,138,42,1,60,695,205,1,143,5-14
4,Afghanistan,AFG,1994,15,105,410,7,39,385,170,...,16,145,29,1,42,721,209,1,150,5-14


In [117]:
# Import input dataset, add Age Group column, and clean header names
df_15to49 = pd.read_csv('raw_data/causes-of-death-in-15-49-year-olds.csv')
df_15to49['Age Group'] = '15-49'
df_15to49.columns = [col if len(col.split('-')) <= 1 else col.split('-')[1].strip() for col in df_15to49.columns.values]
df_15to49.head()

Unnamed: 0,Entity,Code,Year,Self,Interpersonal violence,Drowning,Malaria,"Fire, heat, and hot substances",Neoplasms,Digestive diseases,...,Exposure to forces of nature,Environmental heat and cold exposure,Diarrheal diseases,Road injuries,Tuberculosis,HIV/AIDS,Alzheimer's disease and other dementias,Parkinson's disease,Acute hepatitis,Age Group
0,Afghanistan,AFG,1990,482,1126,234,46,86,2078,731,...,0,73,42,2033,1208,15,5,10,553,15-49
1,Afghanistan,AFG,1991,535,1510,262,94,94,2183,773,...,543,52,46,2317,1289,19,5,10,595,15-49
2,Afghanistan,AFG,1992,634,1782,311,125,109,2386,853,...,266,27,54,2814,1446,24,5,10,671,15-49
3,Afghanistan,AFG,1993,716,2038,350,58,121,2544,917,...,103,31,77,3204,1575,28,5,10,737,15-49
4,Afghanistan,AFG,1994,759,2260,365,110,126,2615,948,...,73,33,64,3385,1653,32,5,11,776,15-49


In [118]:
# Import input dataset, add Age Group column, and clean header names
df_50to69 = pd.read_csv('raw_data/causes-of-death-in-50-69-year-olds.csv')
df_50to69['Age Group'] = '50-69'
df_50to69.columns = [col if len(col.split('-')) <= 1 else col.split('-')[1].strip() for col in df_50to69.columns.values]
df_50to69.head()

Unnamed: 0,Entity,Code,Year,Self,Interpersonal violence,Cardiovascular diseases,Drug use disorders,Nutritional deficiencies,Alcohol use disorders,Lower respiratory infections,...,Acute hepatitis,Malaria,"Fire, heat, and hot substances",Neoplasms,Digestive diseases,Cirrhosis and other chronic liver diseases,Chronic respiratory diseases,Chronic kidney disease,Diabetes mellitus,Age Group
0,Afghanistan,AFG,1990,167,208,20666,23,49,28,932,...,1232,19,54,5446,1992,1233,2453,1534,1100,50-69
1,Afghanistan,AFG,1991,168,242,20840,24,50,28,941,...,1255,38,55,5493,2012,1247,2481,1527,1100,50-69
2,Afghanistan,AFG,1992,171,235,21174,25,52,28,956,...,1287,44,56,5577,2043,1268,2526,1521,1107,50-69
3,Afghanistan,AFG,1993,176,237,21699,26,55,29,979,...,1333,17,57,5679,2082,1293,2594,1529,1122,50-69
4,Afghanistan,AFG,1994,180,249,22273,28,58,29,1003,...,1385,33,58,5771,2121,1315,2672,1545,1140,50-69


In [119]:
# Import input dataset, add Age Group column, and clean header names
df_70 = pd.read_csv('raw_data/causes-of-death-in-70-year-olds.csv')
df_70['Age Group'] = '70'
df_70.columns = [col if len(col.split('-')) <= 1 else col.split('-')[1].strip() for col in df_70.columns.values]
df_70.head()

Unnamed: 0,Entity,Code,Year,Self,Interpersonal violence,Exposure to forces of nature,Drowning,Environmental heat and cold exposure,Diarrheal diseases,Road injuries,...,Alzheimer's disease and other dementias,Cardiovascular diseases,Nutritional deficiencies,Drug use disorders,Alcohol use disorders,Lower respiratory infections,Diabetes mellitus,Protein,Acute hepatitis,Age Group
0,Afghanistan,AFG,1990,35,33,0,14,20,140,169,...,973,18600,21,7,6,1559,730,16,378,70
1,Afghanistan,AFG,1991,35,39,101,14,12,142,173,...,991,18820,22,8,6,1576,729,16,389,70
2,Afghanistan,AFG,1992,36,38,40,14,2,142,177,...,1013,19109,23,8,6,1595,730,17,401,70
3,Afghanistan,AFG,1993,37,39,13,15,2,162,182,...,1036,19565,24,8,6,1628,736,18,419,70
4,Afghanistan,AFG,1994,38,41,9,15,2,151,187,...,1058,20084,25,9,6,1668,744,19,441,70


In [108]:
# Check data types
# cause_of_death_df.dtypes

In [109]:
# Correct data types
# cause_of_death_df['Age Group'] = pd.Series(df_under5['Age Group'], dtype='string')
# cause_of_death_df['Entity'] = pd.Series(df_under5['Entity'], dtype='string') 
# cause_of_death_df['Code'] = pd.Series(df_under5['Code'], dtype='string')  
# column_to_move = cause_of_death_df.pop('Age Group')
# cause_of_death_df.insert(3, 'Age Group', column_to_move)

In [110]:
# Move Age Group column to front of dataframe


In [111]:
# Upload dataframe to database

db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/Capstone_DB"
engine = create_engine(db_string)

df_under5.to_sql(name='cod_under5', con=engine, if_exists='replace')
df_5to14.to_sql(name='cod_5to14', con=engine, if_exists='replace')
df_15to49.to_sql(name='cod_15to49', con=engine, if_exists='replace')
df_50to69.to_sql(name='cod_50to69', con=engine, if_exists='replace')
df_70.to_sql(name='cod_70', con=engine, if_exists='replace')

health_data.to_sql(name='health_data', con=engine, if_exists='replace')
# cause_of_death_df.to_sql(name='cause_of_death', con=engine, if_exists='replace')
