In [100]:
import os
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import zscore

# Combining the Datasets 

#### Proportion of population living below the national poverty line 

In [101]:
'''
STEPS:
-> change the filename into something shorter (if sdg, only the "number")
1. load the data to a variable
2. make the first row into the column header
3. drop the first row that was made into the column header + other irrelevant rows
4. reset the index
5. change the region names para same lahat ng datasets
6. make it into a long representation
7. change the column name to the name of the new column 
8. add it to the combined data set
'''
data = pd.read_csv(os.getenv('DSDATA_PROJ') + '/1.2.1.csv')
data

Unnamed: 0,1.2.1 Proportion of population living below the national poverty line by sex age 1/,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23
0,Year,2000,2001,2002,2003,2004,2005,2006,2007,2008,...,2013,2014,2015.0,2016,2017,2018.0,2019,2020,2021,2022
1,Geolocation,,,,,,,,,,...,,,,,,,,,,
2,PHILIPPINES,..,..,..,..,..,..,..,..,..,...,..,..,23.5,..,..,16.7,..,..,...,..
3,..National Capital Region (NCR),..,..,..,..,..,..,..,..,..,...,..,..,4.1,..,..,2.2,..,..,...,..
4,..Cordillera Administrative Region (CAR),..,..,..,..,..,..,..,..,..,...,..,..,22.7,..,..,12.0,..,..,...,..
5,..Region I,..,..,..,..,..,..,..,..,..,...,..,..,18.8,..,..,9.9,..,..,...,..
6,..Region II,..,..,..,..,..,..,..,..,..,...,..,..,17.8,..,..,16.3,..,..,...,..
7,..Region III,..,..,..,..,..,..,..,..,..,...,..,..,10.5,..,..,7.0,..,..,...,..
8,..Region IV-A,..,..,..,..,..,..,..,..,..,...,..,..,12.5,..,..,7.1,..,..,...,..
9,..MIMAROPA,..,..,..,..,..,..,..,..,..,...,..,..,25.2,..,..,15.1,..,..,...,..


In [102]:
# NOTE: Before applying, make sure that the arrangement of the regions are the same as the arrangement in your table
region_names = ['PHILIPPINES', 'NCR: National Capital Region', 
                 'CAR: Cordillera Administrative Region', 
                 'Region 1: Ilocos Region', 
                 'Region 2: Cagayan Valley', 
                 'Region 3: Central Luzon', 
                 'Region 4A: CALABARZON', 
                'MIMAROPA: Southwestern Tagalog Region', 
                'Region 5: Bicol Region', 
                'Region 6: Western Visayas', 
                'Region 7: Central Visayas', 
                'Region 8: Eastern Visayas', 
                'Region 9: Zamboanga Peninsula', 
                'Region 10: Northern Mindanao', 
                'Region 11: Davao Region', 
                'Region 12: SOCCSKSARGEN', 
                'CARAGA: Cordillera Administrative Region', 
                'BARMM: Bangsamoro Autonomous Region in Muslim Mindanao']

In [103]:
# setting our column names, as we can see from above, the headers can be seen at the 0th index
data.columns = data.iloc [0] 

# dropping the 'geolocation' row as that is actually used as a header
data = data.drop (data.index [1])

# dropping the column names 
data = data.drop (data.index [0])

In [104]:
data.reset_index (drop=True, inplace=True)

# renames the column 'Year' as its actually the location column
data.rename(columns = {'Year':'Geolocation'}, inplace=True)
data = data.drop (data.index [18:]) # irrelevant rows 
# renames the data in the Geolocation for consistency
data['Geolocation'] = region_names
data.set_index('Geolocation')
data = data.reset_index(drop=True)
data

Unnamed: 0,Geolocation,2000,2001,2002,2003,2004,2005,2006,2007,2008,...,2013,2014,2015.0,2016,2017,2018.0,2019,2020,2021,2022
0,PHILIPPINES,..,..,..,..,..,..,..,..,..,...,..,..,23.5,..,..,16.7,..,..,...,..
1,NCR: National Capital Region,..,..,..,..,..,..,..,..,..,...,..,..,4.1,..,..,2.2,..,..,...,..
2,CAR: Cordillera Administrative Region,..,..,..,..,..,..,..,..,..,...,..,..,22.7,..,..,12.0,..,..,...,..
3,Region 1: Ilocos Region,..,..,..,..,..,..,..,..,..,...,..,..,18.8,..,..,9.9,..,..,...,..
4,Region 2: Cagayan Valley,..,..,..,..,..,..,..,..,..,...,..,..,17.8,..,..,16.3,..,..,...,..
5,Region 3: Central Luzon,..,..,..,..,..,..,..,..,..,...,..,..,10.5,..,..,7.0,..,..,...,..
6,Region 4A: CALABARZON,..,..,..,..,..,..,..,..,..,...,..,..,12.5,..,..,7.1,..,..,...,..
7,MIMAROPA: Southwestern Tagalog Region,..,..,..,..,..,..,..,..,..,...,..,..,25.2,..,..,15.1,..,..,...,..
8,Region 5: Bicol Region,..,..,..,..,..,..,..,..,..,...,..,..,39.8,..,..,27.0,..,..,...,..
9,Region 6: Western Visayas,..,..,..,..,..,..,..,..,..,...,..,..,24.6,..,..,16.3,..,..,...,..


In [105]:
for c in data.columns.difference(['Geolocation']):
    # cells without values are represented as either '..' or '...', so we should convert them to NaN so we could dropna()
    data [c].replace(to_replace='..', value= np.nan, inplace= True)
    data [c].replace(to_replace='...', value= np.nan, inplace= True)

# drops columns if all of the values are NaN
# data = data.dropna(axis=1)

In [106]:
# converting from a wide representation to a long representation
data = pd.melt(data, id_vars='Geolocation', value_vars=data.columns [2:]) 

# renaming the columns into a more readable anmes
data.rename(columns = {'value':'1.2.1', 0 : 'Year'}, inplace=True)

# making the year type into integer
data = data.astype({'Year':'int'})

In [107]:
data

Unnamed: 0,Geolocation,Year,1.2.1
0,PHILIPPINES,2001,
1,NCR: National Capital Region,2001,
2,CAR: Cordillera Administrative Region,2001,
3,Region 1: Ilocos Region,2001,
4,Region 2: Cagayan Valley,2001,
...,...,...,...
391,Region 10: Northern Mindanao,2022,
392,Region 11: Davao Region,2022,
393,Region 12: SOCCSKSARGEN,2022,
394,CARAGA: Cordillera Administrative Region,2022,


In [108]:
combined_data = data

#### Net Enrolment Rate in elementary

In [109]:
data = pd.read_csv(os.getenv('DSDATA_PROJ') + '/1.4.1p5.csv')
data

Unnamed: 0,1.4.1p5 Net Enrolment Rate in elementary (Indicator is also found in SDG 4.3.s1) 1/,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24
0,,Year,2000,2001,2002.00,2003.00,2004.00,2005.00,2006.00,2007.00,...,2013.00,2014.00,2015.00,2016.00,2017.00,2018.00,2019.00,2020.0000,2021,2022
1,Geolocation,Sex,,,,,,,,,...,,,,,,,,,,
2,PHILIPPINES,Both Sexes,96.77,90.1,90.29,88.74,87.11,84.44,83.22,84.93,...,97.20,97.19,96.90,96.15,94.19,94.05,93.96,89.1064,...,...
3,,Boys,96.27,89.33,89.51,87.84,86.17,83.56,82.39,84.07,...,96.74,96.87,96.66,96.17,94.12,94.25,93.79,88.9318,...,...
4,,Girls,97.28,90.91,91.10,89.68,88.08,85.35,84.08,85.83,...,97.68,97.53,97.15,96.12,94.27,93.85,94.15,89.2898,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58,Note:,,,,,,,,,,...,,,,,,,,,,
59,.. - Data not available,,,,,,,,,,...,,,,,,,,,,
60,... - Data not yet available,,,,,,,,,,...,,,,,,,,,,
61,1/ - Updates were based on the submission of D...,,,,,,,,,,...,,,,,,,,,,


In [110]:
data = data.drop (data.index [56:]) 

In [111]:
# setting the column names and removing the row that held the previous column names
data.columns = data.loc[0]
data = data.drop (data.index[0])
data = data.reset_index (drop=True)

In [112]:
data = data.rename(columns = {np.nan:'Geolocation', 'Year': 'Sex'})
data = data.drop (data.index[0])
data = data.reset_index (drop=True)

In [113]:
# Only getting the total data, then dropping Sex column as it's not needed anymore
data = data[data['Sex'] == 'Both Sexes']
data = data.drop("Sex", axis = 1)
data = data.reset_index (drop=True)

In [114]:
data['Geolocation'] = region_names

In [115]:
for c in data.columns.difference(['Geolocation']):
    # cells without values are represented as either '..' or '...', so we should convert them to NaN so we could dropna()
    data [c].replace(to_replace='..', value= np.nan, inplace= True)
    data [c].replace(to_replace='...', value= np.nan, inplace= True)

# data = data.dropna(axis=1, how = 'all')

In [116]:
data = pd.melt(data, id_vars='Geolocation', value_vars=data.columns [2:]) 

data.rename(columns = {'value':'1.4.1p5', 0 : 'Year'}, inplace=True)
data = data.astype({'Year':'int'})

In [117]:
data

Unnamed: 0,Geolocation,Year,1.4.1p5
0,PHILIPPINES,2001,90.1
1,NCR: National Capital Region,2001,97.82
2,CAR: Cordillera Administrative Region,2001,92.89
3,Region 1: Ilocos Region,2001,91.33
4,Region 2: Cagayan Valley,2001,89.45
...,...,...,...
391,Region 10: Northern Mindanao,2022,
392,Region 11: Davao Region,2022,
393,Region 12: SOCCSKSARGEN,2022,
394,CARAGA: Cordillera Administrative Region,2022,


In [118]:
combined_data = combined_data.merge(data, how = 'outer', on = ['Geolocation', 'Year'])
combined_data = combined_data.reset_index (drop=True)

In [119]:
combined_data

Unnamed: 0,Geolocation,Year,1.2.1,1.4.1p5
0,PHILIPPINES,2001,,90.1
1,NCR: National Capital Region,2001,,97.82
2,CAR: Cordillera Administrative Region,2001,,92.89
3,Region 1: Ilocos Region,2001,,91.33
4,Region 2: Cagayan Valley,2001,,89.45
...,...,...,...,...
391,Region 10: Northern Mindanao,2022,,
392,Region 11: Davao Region,2022,,
393,Region 12: SOCCSKSARGEN,2022,,
394,CARAGA: Cordillera Administrative Region,2022,,


#### Net Enrolment Rate in secondary education (Indicator is also found in SDG 4.3.s2)

In [120]:
data = pd.read_csv(os.getenv('DSDATA_PROJ') + '/1.4.1p6.csv')
data

Unnamed: 0,1.4.1p6 Net Enrolment Rate in secondary education (Indicator is also found in SDG 4.3.s2),Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25
0,,,Year,2000,2001,2002,2003,2004,2005,2006,...,2013,2014,2015,2016.00,2017.00,2018.00,2019.00,2020.0000,2021,2022
1,Level of Education,Geolocation,Sex,,,,,,,,...,,,,,,,,,,
2,Junior High School,PHILIPPINES,Both Sexes,66.06,57.55,59,60.15,59.97,58.54,58.59,...,67.89,67.19,73.57,74.19,75.99,81.41,82.89,81.4869,...,...
3,,,Boys,62.72,52.96,54.39,55.34,55.04,53.65,53.85,...,62.42,61.68,68.09,68.79,70.88,77.24,78.80,77.6557,...,...
4,,,Girls,69.49,62.24,63.72,65.07,65.01,63.53,63.44,...,73.69,73.05,79.42,79.94,81.42,85.82,87.20,85.5003,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112,.. - Data not available,,,,,,,,,,...,,,,,,,,,,
113,... - Data not yet available,,,,,,,,,,...,,,,,,,,,,
114,1/ - Updates were based on submission of DepEd...,,,,,,,,,,...,,,,,,,,,,
115,2/ - Estimation of this sub-indicator only sta...,,,,,,,,,,...,,,,,,,,,,


In [121]:
data = data.drop (data.index [110:]) 

In [122]:
data.at[0, '1.4.1p6 Net Enrolment Rate in secondary education (Indicator is also found in SDG 4.3.s2)'] = 'Level of Education'
data.at[0, 'Unnamed: 1'] = 'Geolocation'
data.at[0, 'Unnamed: 2'] = 'Sex'

In [123]:
data.columns = data.loc[0]
data = data.drop (data.index[0])
data = data.reset_index (drop=True)

data = data.drop (data.index[0])
data = data.reset_index (drop=True)

In [124]:
senior_high_data = data [54:]
junior_high_data = data [:54]

In [125]:
junior_high_data = junior_high_data [junior_high_data['Sex'] == 'Both Sexes']
junior_high_data = junior_high_data.reset_index (drop=True)

In [126]:
junior_high_data = junior_high_data.drop("Level of Education", axis = 1)
junior_high_data = junior_high_data.drop("Sex", axis = 1)
junior_high_data = junior_high_data.reset_index (drop=True)

In [127]:
junior_high_data['Geolocation'] = region_names

In [128]:
for c in junior_high_data.columns.difference(['Geolocation']):
    junior_high_data [c].replace(to_replace='..', value= np.nan, inplace= True)
    junior_high_data [c].replace(to_replace='...', value= np.nan, inplace= True)

In [129]:
junior_high_data = pd.melt(junior_high_data, id_vars='Geolocation', value_vars=junior_high_data.columns [2:]) 

junior_high_data.rename(columns = {'value':'1.4.1p6 (Junior High School)', 0 : 'Year'}, inplace=True)
junior_high_data = junior_high_data.astype({'Year':'int'})

In [130]:
senior_high_data = senior_high_data [senior_high_data['Sex'] == 'Both Sexes']
senior_high_data = senior_high_data.reset_index (drop=True)

In [131]:
senior_high_data = senior_high_data.drop("Level of Education", axis = 1)
senior_high_data = senior_high_data.drop("Sex", axis = 1)
senior_high_data = senior_high_data.reset_index (drop=True)

In [132]:
senior_high_data['Geolocation'] = region_names

In [133]:
for c in senior_high_data.columns.difference(['Geolocation']):
    senior_high_data [c].replace(to_replace='..', value= np.nan, inplace= True)
    senior_high_data [c].replace(to_replace='...', value= np.nan, inplace= True)

In [134]:
senior_high_data = pd.melt(senior_high_data, id_vars='Geolocation', value_vars=senior_high_data.columns [2:]) 

senior_high_data.rename(columns = {'value':'1.4.1p6 (Senior High School)', 0 : 'Year'}, inplace=True)
senior_high_data = senior_high_data.astype({'Year':'int'})

In [135]:
combined_data = combined_data.merge(junior_high_data, how = 'outer', on = ['Geolocation', 'Year'])
combined_data = combined_data.merge(senior_high_data, how = 'outer', on = ['Geolocation', 'Year'])
combined_data = combined_data.reset_index (drop=True)

In [136]:
combined_data

Unnamed: 0,Geolocation,Year,1.2.1,1.4.1p5,1.4.1p6 (Junior High School),1.4.1p6 (Senior High School)
0,PHILIPPINES,2001,,90.1,57.55,
1,NCR: National Capital Region,2001,,97.82,67.84,
2,CAR: Cordillera Administrative Region,2001,,92.89,59.84,
3,Region 1: Ilocos Region,2001,,91.33,68.21,
4,Region 2: Cagayan Valley,2001,,89.45,59.67,
...,...,...,...,...,...,...
391,Region 10: Northern Mindanao,2022,,,,
392,Region 11: Davao Region,2022,,,,
393,Region 12: SOCCSKSARGEN,2022,,,,
394,CARAGA: Cordillera Administrative Region,2022,,,,


#### Proportion of local governments that adopt and implement local disaster risk reduction strategies in line with national disaster risk reduction strategies

In [137]:
data = pd.read_csv(os.getenv('DSDATA_PROJ') + '/1.5.4.csv')
data

Unnamed: 0,1.5.4 Proportion of local governments that adopt and implement local disaster risk reduction strategies in line with national disaster risk reduction strategies (Indicator can also found in SDG 13.1.3 and 11.b.2),Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23
0,Year,2000,2001,2002,2003,2004,2005,2006,2007,2008,...,2013,2014,2015,2016.0,2017,2018.0,2019,2020.0,2021.0,2022
1,Geolocation,,,,,,,,,,...,,,,,,,,,,
2,National Capital Region (NCR),..,..,..,..,..,..,..,..,..,...,..,..,..,52.9,..,76.5,..,82.4,100.0,...
3,Cordillera Administrative Region (CAR),..,..,..,..,..,..,..,..,..,...,..,..,..,94.0,..,97.5,..,79.5,61.5,...
4,Region I,..,..,..,..,..,..,..,..,..,...,..,..,..,44.8,..,100.0,..,74.4,76.7,...
5,Region II,..,..,..,..,..,..,..,..,..,...,..,..,..,100.0,..,100.0,..,49.0,55.1,...
6,Region III,..,..,..,..,..,..,..,..,..,...,..,..,..,59.0,..,99.3,..,100.0,100.0,...
7,Region IV-A,..,..,..,..,..,..,..,..,..,...,..,..,..,99.8,..,100.0,..,100.0,74.8,...
8,MIMAROPA,..,..,..,..,..,..,..,..,..,...,..,..,..,82.0,..,100.0,..,100.0,100.0,...
9,Region V,..,..,..,..,..,..,..,..,..,...,..,..,..,91.0,..,93.3,..,57.5,56.7,...


In [138]:
data = data.drop (data.index [19:])

In [139]:
data.at[0, '1.5.4 Proportion of local governments that adopt and implement local disaster risk reduction strategies in line with national disaster risk reduction strategies (Indicator can also found in SDG 13.1.3 and 11.b.2)'] = 'Geolocation'

In [140]:
data.columns = data.loc[0]
data = data.drop (data.index[0])
data = data.reset_index (drop=True)

data = data.drop (data.index[0])
data = data.reset_index (drop=True)

In [141]:
data ['Geolocation'] = region_names [1:]

In [143]:
for c in data.columns.difference(['Geolocation']):
    data [c].replace(to_replace='..', value= np.nan, inplace= True)
    data [c].replace(to_replace='...', value= np.nan, inplace= True)

# data = data.dropna(axis=1, how = 'all')

In [144]:
data = pd.melt(data, id_vars='Geolocation', value_vars=data.columns [2:]) 

data.rename(columns = {'value':'1.5.4', 0 : 'Year'}, inplace=True)
data = data.astype({'Year':'int'})

In [146]:
combined_data = combined_data.merge(data, how = 'outer', on = ['Geolocation', 'Year'])
combined_data = combined_data.reset_index (drop=True)

In [150]:
combined_data[combined_data['1.5.4'] != np.nan]

Unnamed: 0,Geolocation,Year,1.2.1,1.4.1p5,1.4.1p6 (Junior High School),1.4.1p6 (Senior High School),1.5.4
0,PHILIPPINES,2001,,90.1,57.55,,
1,NCR: National Capital Region,2001,,97.82,67.84,,
2,CAR: Cordillera Administrative Region,2001,,92.89,59.84,,
3,Region 1: Ilocos Region,2001,,91.33,68.21,,
4,Region 2: Cagayan Valley,2001,,89.45,59.67,,
...,...,...,...,...,...,...,...
391,Region 10: Northern Mindanao,2022,,,,,
392,Region 11: Davao Region,2022,,,,,
393,Region 12: SOCCSKSARGEN,2022,,,,,
394,CARAGA: Cordillera Administrative Region,2022,,,,,


#### Mortality rate attributed to cardiovascular disease, cancer, diabetes or chronic respiratory disease