# Progress of the Philippines' Sustainable Development Goals

### Import

In [40]:
import os
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import zscore

## Data Collection
The following **csv** files used in this project are acquired through a request sent to the Knowledge Management and Communications Division of the Philippine Statistics Authority.

### Combining the Datasets 
In this stage, the separate datasets underwent pre-processing and cleaning before they are combined together. Some of the cleaning done on each of the datasets are: (1) fixing of column names, (2) modification of the values of the 'Geolocation' column, (3) removal of unneeded rows and columns, and (4) conversion of '..' or '...' values to NaN. After this, the dataset is converted into a long representation before they are merged together.

#### 1.2.1. Proportion of population living below the national poverty line 
To start with, let us load the data from the csv file using pandas' [`read_csv`](https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html) function.

In [41]:
data = pd.read_csv(os.getenv('DSDATA_PROJ') + '/1.2.1.csv')
data

Unnamed: 0,1.2.1 Proportion of population living below the national poverty line by sex age 1/,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23
0,Year,2000,2001,2002,2003,2004,2005,2006,2007,2008,...,2013,2014,2015.0,2016,2017,2018.0,2019,2020,2021,2022
1,Geolocation,,,,,,,,,,...,,,,,,,,,,
2,PHILIPPINES,..,..,..,..,..,..,..,..,..,...,..,..,23.5,..,..,16.7,..,..,...,..
3,..National Capital Region (NCR),..,..,..,..,..,..,..,..,..,...,..,..,4.1,..,..,2.2,..,..,...,..
4,..Cordillera Administrative Region (CAR),..,..,..,..,..,..,..,..,..,...,..,..,22.7,..,..,12.0,..,..,...,..
5,..Region I,..,..,..,..,..,..,..,..,..,...,..,..,18.8,..,..,9.9,..,..,...,..
6,..Region II,..,..,..,..,..,..,..,..,..,...,..,..,17.8,..,..,16.3,..,..,...,..
7,..Region III,..,..,..,..,..,..,..,..,..,...,..,..,10.5,..,..,7.0,..,..,...,..
8,..Region IV-A,..,..,..,..,..,..,..,..,..,...,..,..,12.5,..,..,7.1,..,..,...,..
9,..MIMAROPA,..,..,..,..,..,..,..,..,..,...,..,..,25.2,..,..,15.1,..,..,...,..


Looking at the DataFrame, we could see that the columns are unnamed and that the column names are located at the 0th row. Using [`iloc`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.iloc.html), we could get the 0th row and then assign it as the column values. 

Then, using the [`drop`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.drop.html) function, we can drop the 0th row as we have no need for it anymore. Additionally, since the row at index 1 is a row full of NaN, we can also drop it using the same function. 

To be able to fix the indexing of the rows, the [`reset_index`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.reset_index.html) function was used to reset the index from 0.

In [42]:
# setting our column names
data.columns = data.iloc [0] 

# dropping the 'geolocation' row as that is actually used as a header
data = data.drop (data.index [1])

# dropping the column names 
data = data.drop (data.index [0])

data.reset_index (drop=True, inplace=True)

Irrelevant rows that are just footers for the file are also removed.

In [43]:
# dropping irrelevant rows 
data = data.drop (data.index [18:]) 

The `Year` column must also be renamed into `Geolocation` as this row refers to the different regions in the Philippines, and not the years. This can be done through the use of the of the [`rename`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.rename.html) function.

In [44]:
# renames the column 'Year' as its actually the location column
data.rename(columns = {'Year':'Geolocation'}, inplace=True)

To easily determine which region the `Geolocation` values refer to, we can also change these values to include the names that they are commonly referred to, instead of just their region numbers. 

For consistency throughout the different datasets, the `region_names` variable was declared.

In [45]:
# NOTE: Before applying, make sure that the arrangement of the regions are the same as the arrangement in your table
region_names = ['PHILIPPINES', 'NCR: National Capital Region', 
                 'CAR: Cordillera Administrative Region', 
                 'Region 1: Ilocos Region', 
                 'Region 2: Cagayan Valley', 
                 'Region 3: Central Luzon', 
                 'Region 4A: CALABARZON', 
                'MIMAROPA: Southwestern Tagalog Region', 
                'Region 5: Bicol Region', 
                'Region 6: Western Visayas', 
                'Region 7: Central Visayas', 
                'Region 8: Eastern Visayas', 
                'Region 9: Zamboanga Peninsula', 
                'Region 10: Northern Mindanao', 
                'Region 11: Davao Region', 
                'Region 12: SOCCSKSARGEN', 
                'CARAGA: Cordillera Administrative Region', 
                'BARMM: Bangsamoro Autonomous Region in Muslim Mindanao']

In [46]:
# renames the data in the Geolocation for consistency
data['Geolocation'] = region_names
data.set_index('Geolocation')
data = data.reset_index(drop=True)
data

Unnamed: 0,Geolocation,2000,2001,2002,2003,2004,2005,2006,2007,2008,...,2013,2014,2015.0,2016,2017,2018.0,2019,2020,2021,2022
0,PHILIPPINES,..,..,..,..,..,..,..,..,..,...,..,..,23.5,..,..,16.7,..,..,...,..
1,NCR: National Capital Region,..,..,..,..,..,..,..,..,..,...,..,..,4.1,..,..,2.2,..,..,...,..
2,CAR: Cordillera Administrative Region,..,..,..,..,..,..,..,..,..,...,..,..,22.7,..,..,12.0,..,..,...,..
3,Region 1: Ilocos Region,..,..,..,..,..,..,..,..,..,...,..,..,18.8,..,..,9.9,..,..,...,..
4,Region 2: Cagayan Valley,..,..,..,..,..,..,..,..,..,...,..,..,17.8,..,..,16.3,..,..,...,..
5,Region 3: Central Luzon,..,..,..,..,..,..,..,..,..,...,..,..,10.5,..,..,7.0,..,..,...,..
6,Region 4A: CALABARZON,..,..,..,..,..,..,..,..,..,...,..,..,12.5,..,..,7.1,..,..,...,..
7,MIMAROPA: Southwestern Tagalog Region,..,..,..,..,..,..,..,..,..,...,..,..,25.2,..,..,15.1,..,..,...,..
8,Region 5: Bicol Region,..,..,..,..,..,..,..,..,..,...,..,..,39.8,..,..,27.0,..,..,...,..
9,Region 6: Western Visayas,..,..,..,..,..,..,..,..,..,...,..,..,24.6,..,..,16.3,..,..,...,..


Next, we can convert the strings of '..' and '...', which were used to represent that there were no values for these cells, to **NaN**, through the use of the [`replace`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.replace.html) function.

However, the columns that have all **NaN** values were not dropped because if this dataset would be combined with other datasets, all years would still be present as there are datasets with complete data for all the years. 

In [47]:
for c in data.columns.difference(['Geolocation']):
    # cells without values are represented as either '..' or '...', so we should convert them to NaN so we could dropna()
    data [c].replace(to_replace='..', value= np.nan, inplace= True)
    data [c].replace(to_replace='...', value= np.nan, inplace= True)

# drops columns if all of the values are NaN
# data = data.dropna(axis=1)

In [48]:
data

Unnamed: 0,Geolocation,2000,2001,2002,2003,2004,2005,2006,2007,2008,...,2013,2014,2015.0,2016,2017,2018.0,2019,2020,2021,2022
0,PHILIPPINES,,,,,,,,,,...,,,23.5,,,16.7,,,,
1,NCR: National Capital Region,,,,,,,,,,...,,,4.1,,,2.2,,,,
2,CAR: Cordillera Administrative Region,,,,,,,,,,...,,,22.7,,,12.0,,,,
3,Region 1: Ilocos Region,,,,,,,,,,...,,,18.8,,,9.9,,,,
4,Region 2: Cagayan Valley,,,,,,,,,,...,,,17.8,,,16.3,,,,
5,Region 3: Central Luzon,,,,,,,,,,...,,,10.5,,,7.0,,,,
6,Region 4A: CALABARZON,,,,,,,,,,...,,,12.5,,,7.1,,,,
7,MIMAROPA: Southwestern Tagalog Region,,,,,,,,,,...,,,25.2,,,15.1,,,,
8,Region 5: Bicol Region,,,,,,,,,,...,,,39.8,,,27.0,,,,
9,Region 6: Western Visayas,,,,,,,,,,...,,,24.6,,,16.3,,,,


As the final step, the wide representation of this dataset is converted to a long representation through the use of the [`melt`](https://pandas.pydata.org/docs/reference/api/pandas.melt.html) function. 

Then, the column that holds the value for a specific year and region is coverted, using [`rename`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.rename.html), to the ID of this Sustainable Development Goal (SDG), so that it can be distinguished when it is combined with other datasets.

In [49]:
# converting from a wide representation to a long representation
data = pd.melt(data, id_vars='Geolocation', value_vars=data.columns [2:]) 

# renaming the columns into a more readable anmes
data.rename(columns = {'value':'1.2.1', 0 : 'Year'}, inplace=True)

# making the year type into integer
data = data.astype({'Year':'int'})

data

Unnamed: 0,Geolocation,Year,1.2.1
0,PHILIPPINES,2001,
1,NCR: National Capital Region,2001,
2,CAR: Cordillera Administrative Region,2001,
3,Region 1: Ilocos Region,2001,
4,Region 2: Cagayan Valley,2001,
...,...,...,...
391,Region 10: Northern Mindanao,2022,
392,Region 11: Davao Region,2022,
393,Region 12: SOCCSKSARGEN,2022,
394,CARAGA: Cordillera Administrative Region,2022,


As this is the first dataset, we can just assign it to the `combined_data` DataFrame, which would hold the combined datasets.

In [50]:
combined_data = data

#### Net Enrolment Rate in elementary

In [10]:
data = pd.read_csv(os.getenv('DSDATA_PROJ') + '/1.4.1p5.csv')
data

Unnamed: 0,1.4.1p5 Net Enrolment Rate in elementary (Indicator is also found in SDG 4.3.s1) 1/,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24
0,,Year,2000,2001,2002.00,2003.00,2004.00,2005.00,2006.00,2007.00,...,2013.00,2014.00,2015.00,2016.00,2017.00,2018.00,2019.00,2020.0000,2021,2022
1,Geolocation,Sex,,,,,,,,,...,,,,,,,,,,
2,PHILIPPINES,Both Sexes,96.77,90.1,90.29,88.74,87.11,84.44,83.22,84.93,...,97.20,97.19,96.90,96.15,94.19,94.05,93.96,89.1064,...,...
3,,Boys,96.27,89.33,89.51,87.84,86.17,83.56,82.39,84.07,...,96.74,96.87,96.66,96.17,94.12,94.25,93.79,88.9318,...,...
4,,Girls,97.28,90.91,91.10,89.68,88.08,85.35,84.08,85.83,...,97.68,97.53,97.15,96.12,94.27,93.85,94.15,89.2898,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58,Note:,,,,,,,,,,...,,,,,,,,,,
59,.. - Data not available,,,,,,,,,,...,,,,,,,,,,
60,... - Data not yet available,,,,,,,,,,...,,,,,,,,,,
61,1/ - Updates were based on the submission of D...,,,,,,,,,,...,,,,,,,,,,


In [11]:
data = data.drop (data.index [56:]) 

In [12]:
# setting the column names and removing the row that held the previous column names
data.columns = data.loc[0]
data = data.drop (data.index[0])
data = data.reset_index (drop=True)

In [13]:
data = data.rename(columns = {np.nan:'Geolocation', 'Year': 'Sex'})
data = data.drop (data.index[0])
data = data.reset_index (drop=True)

In [14]:
# Only getting the total data, then dropping Sex column as it's not needed anymore
data = data[data['Sex'] == 'Both Sexes']
data = data.drop("Sex", axis = 1)
data = data.reset_index (drop=True)

In [15]:
data['Geolocation'] = region_names

In [16]:
for c in data.columns.difference(['Geolocation']):
    # cells without values are represented as either '..' or '...', so we should convert them to NaN so we could dropna()
    data [c].replace(to_replace='..', value= np.nan, inplace= True)
    data [c].replace(to_replace='...', value= np.nan, inplace= True)

# data = data.dropna(axis=1, how = 'all')

In [17]:
data = pd.melt(data, id_vars='Geolocation', value_vars=data.columns [2:]) 

data.rename(columns = {'value':'1.4.1p5', 0 : 'Year'}, inplace=True)
data = data.astype({'Year':'int'})

In [18]:
data

Unnamed: 0,Geolocation,Year,1.4.1p5
0,PHILIPPINES,2001,90.1
1,NCR: National Capital Region,2001,97.82
2,CAR: Cordillera Administrative Region,2001,92.89
3,Region 1: Ilocos Region,2001,91.33
4,Region 2: Cagayan Valley,2001,89.45
...,...,...,...
391,Region 10: Northern Mindanao,2022,
392,Region 11: Davao Region,2022,
393,Region 12: SOCCSKSARGEN,2022,
394,CARAGA: Cordillera Administrative Region,2022,


In [19]:
combined_data = combined_data.merge(data, how = 'outer', on = ['Geolocation', 'Year'])
combined_data = combined_data.reset_index (drop=True)

In [20]:
combined_data

Unnamed: 0,Geolocation,Year,1.2.1,1.4.1p5
0,PHILIPPINES,2001,,90.1
1,NCR: National Capital Region,2001,,97.82
2,CAR: Cordillera Administrative Region,2001,,92.89
3,Region 1: Ilocos Region,2001,,91.33
4,Region 2: Cagayan Valley,2001,,89.45
...,...,...,...,...
391,Region 10: Northern Mindanao,2022,,
392,Region 11: Davao Region,2022,,
393,Region 12: SOCCSKSARGEN,2022,,
394,CARAGA: Cordillera Administrative Region,2022,,


#### Net Enrolment Rate in secondary education (Indicator is also found in SDG 4.3.s2)

In [21]:
data = pd.read_csv(os.getenv('DSDATA_PROJ') + '/1.4.1p6.csv')
data

Unnamed: 0,1.4.1p6 Net Enrolment Rate in secondary education (Indicator is also found in SDG 4.3.s2),Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25
0,,,Year,2000,2001,2002,2003,2004,2005,2006,...,2013,2014,2015,2016.00,2017.00,2018.00,2019.00,2020.0000,2021,2022
1,Level of Education,Geolocation,Sex,,,,,,,,...,,,,,,,,,,
2,Junior High School,PHILIPPINES,Both Sexes,66.06,57.55,59,60.15,59.97,58.54,58.59,...,67.89,67.19,73.57,74.19,75.99,81.41,82.89,81.4869,...,...
3,,,Boys,62.72,52.96,54.39,55.34,55.04,53.65,53.85,...,62.42,61.68,68.09,68.79,70.88,77.24,78.80,77.6557,...,...
4,,,Girls,69.49,62.24,63.72,65.07,65.01,63.53,63.44,...,73.69,73.05,79.42,79.94,81.42,85.82,87.20,85.5003,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112,.. - Data not available,,,,,,,,,,...,,,,,,,,,,
113,... - Data not yet available,,,,,,,,,,...,,,,,,,,,,
114,1/ - Updates were based on submission of DepEd...,,,,,,,,,,...,,,,,,,,,,
115,2/ - Estimation of this sub-indicator only sta...,,,,,,,,,,...,,,,,,,,,,


In [22]:
data = data.drop (data.index [110:]) 

In [23]:
data.at[0, '1.4.1p6 Net Enrolment Rate in secondary education (Indicator is also found in SDG 4.3.s2)'] = 'Level of Education'
data.at[0, 'Unnamed: 1'] = 'Geolocation'
data.at[0, 'Unnamed: 2'] = 'Sex'

In [24]:
data.columns = data.loc[0]
data = data.drop (data.index[0])
data = data.reset_index (drop=True)

data = data.drop (data.index[0])
data = data.reset_index (drop=True)

In [25]:
senior_high_data = data [54:]
junior_high_data = data [:54]

In [26]:
junior_high_data = junior_high_data [junior_high_data['Sex'] == 'Both Sexes']
junior_high_data = junior_high_data.reset_index (drop=True)

In [27]:
junior_high_data = junior_high_data.drop("Level of Education", axis = 1)
junior_high_data = junior_high_data.drop("Sex", axis = 1)
junior_high_data = junior_high_data.reset_index (drop=True)

In [28]:
junior_high_data['Geolocation'] = region_names

In [29]:
for c in junior_high_data.columns.difference(['Geolocation']):
    junior_high_data [c].replace(to_replace='..', value= np.nan, inplace= True)
    junior_high_data [c].replace(to_replace='...', value= np.nan, inplace= True)

In [30]:
junior_high_data = pd.melt(junior_high_data, id_vars='Geolocation', value_vars=junior_high_data.columns [2:]) 

junior_high_data.rename(columns = {'value':'1.4.1p6 (Junior High School)', 0 : 'Year'}, inplace=True)
junior_high_data = junior_high_data.astype({'Year':'int'})

In [31]:
senior_high_data = senior_high_data [senior_high_data['Sex'] == 'Both Sexes']
senior_high_data = senior_high_data.reset_index (drop=True)

In [32]:
senior_high_data = senior_high_data.drop("Level of Education", axis = 1)
senior_high_data = senior_high_data.drop("Sex", axis = 1)
senior_high_data = senior_high_data.reset_index (drop=True)

In [33]:
senior_high_data['Geolocation'] = region_names

In [34]:
for c in senior_high_data.columns.difference(['Geolocation']):
    senior_high_data [c].replace(to_replace='..', value= np.nan, inplace= True)
    senior_high_data [c].replace(to_replace='...', value= np.nan, inplace= True)

In [35]:
senior_high_data = pd.melt(senior_high_data, id_vars='Geolocation', value_vars=senior_high_data.columns [2:]) 

senior_high_data.rename(columns = {'value':'1.4.1p6 (Senior High School)', 0 : 'Year'}, inplace=True)
senior_high_data = senior_high_data.astype({'Year':'int'})

In [36]:
combined_data = combined_data.merge(junior_high_data, how = 'outer', on = ['Geolocation', 'Year'])
combined_data = combined_data.merge(senior_high_data, how = 'outer', on = ['Geolocation', 'Year'])
combined_data = combined_data.reset_index (drop=True)

In [37]:
combined_data

Unnamed: 0,Geolocation,Year,1.2.1,1.4.1p5,1.4.1p6 (Junior High School),1.4.1p6 (Senior High School)
0,PHILIPPINES,2001,,90.1,57.55,
1,NCR: National Capital Region,2001,,97.82,67.84,
2,CAR: Cordillera Administrative Region,2001,,92.89,59.84,
3,Region 1: Ilocos Region,2001,,91.33,68.21,
4,Region 2: Cagayan Valley,2001,,89.45,59.67,
...,...,...,...,...,...,...
391,Region 10: Northern Mindanao,2022,,,,
392,Region 11: Davao Region,2022,,,,
393,Region 12: SOCCSKSARGEN,2022,,,,
394,CARAGA: Cordillera Administrative Region,2022,,,,


#### Proportion of local governments that adopt and implement local disaster risk reduction strategies in line with national disaster risk reduction strategies

In [38]:
data = pd.read_csv(os.getenv('DSDATA_PROJ') + '/1.5.4.csv')
data

Unnamed: 0,1.5.4 Proportion of local governments that adopt and implement local disaster risk reduction strategies in line with national disaster risk reduction strategies (Indicator can also found in SDG 13.1.3 and 11.b.2),Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23
0,Year,2000,2001,2002,2003,2004,2005,2006,2007,2008,...,2013,2014,2015,2016.0,2017,2018.0,2019,2020.0,2021.0,2022
1,Geolocation,,,,,,,,,,...,,,,,,,,,,
2,National Capital Region (NCR),..,..,..,..,..,..,..,..,..,...,..,..,..,52.9,..,76.5,..,82.4,100.0,...
3,Cordillera Administrative Region (CAR),..,..,..,..,..,..,..,..,..,...,..,..,..,94.0,..,97.5,..,79.5,61.5,...
4,Region I,..,..,..,..,..,..,..,..,..,...,..,..,..,44.8,..,100.0,..,74.4,76.7,...
5,Region II,..,..,..,..,..,..,..,..,..,...,..,..,..,100.0,..,100.0,..,49.0,55.1,...
6,Region III,..,..,..,..,..,..,..,..,..,...,..,..,..,59.0,..,99.3,..,100.0,100.0,...
7,Region IV-A,..,..,..,..,..,..,..,..,..,...,..,..,..,99.8,..,100.0,..,100.0,74.8,...
8,MIMAROPA,..,..,..,..,..,..,..,..,..,...,..,..,..,82.0,..,100.0,..,100.0,100.0,...
9,Region V,..,..,..,..,..,..,..,..,..,...,..,..,..,91.0,..,93.3,..,57.5,56.7,...


In [39]:
data = data.drop (data.index [19:])

In [40]:
data.at[0, '1.5.4 Proportion of local governments that adopt and implement local disaster risk reduction strategies in line with national disaster risk reduction strategies (Indicator can also found in SDG 13.1.3 and 11.b.2)'] = 'Geolocation'

In [41]:
data.columns = data.loc[0]
data = data.drop (data.index[0])
data = data.reset_index (drop=True)

data = data.drop (data.index[0])
data = data.reset_index (drop=True)

In [42]:
data ['Geolocation'] = region_names [1:]

In [43]:
for c in data.columns.difference(['Geolocation']):
    data [c].replace(to_replace='..', value= np.nan, inplace= True)
    data [c].replace(to_replace='...', value= np.nan, inplace= True)

# data = data.dropna(axis=1, how = 'all')

In [44]:
data = pd.melt(data, id_vars='Geolocation', value_vars=data.columns [2:]) 

data.rename(columns = {'value':'1.5.4', 0 : 'Year'}, inplace=True)
data = data.astype({'Year':'int'})

In [45]:
combined_data = combined_data.merge(data, how = 'outer', on = ['Geolocation', 'Year'])
combined_data = combined_data.reset_index (drop=True)

In [46]:
combined_data

Unnamed: 0,Geolocation,Year,1.2.1,1.4.1p5,1.4.1p6 (Junior High School),1.4.1p6 (Senior High School),1.5.4
0,PHILIPPINES,2001,,90.1,57.55,,
1,NCR: National Capital Region,2001,,97.82,67.84,,
2,CAR: Cordillera Administrative Region,2001,,92.89,59.84,,
3,Region 1: Ilocos Region,2001,,91.33,68.21,,
4,Region 2: Cagayan Valley,2001,,89.45,59.67,,
...,...,...,...,...,...,...,...
391,Region 10: Northern Mindanao,2022,,,,,
392,Region 11: Davao Region,2022,,,,,
393,Region 12: SOCCSKSARGEN,2022,,,,,
394,CARAGA: Cordillera Administrative Region,2022,,,,,


#### Mortality rate attributed to cardiovascular disease, cancer, diabetes or chronic respiratory disease

In [47]:
data = pd.read_csv(os.getenv('DSDATA_PROJ') + '/3.4.1.csv')
data

Unnamed: 0,"3.4.1 Mortality rate attributed to cardiovascular disease, cancer, diabetes or chronic respiratory disease",Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25
0,,Year,,2000,2001,2002,2003,2004,2005,2006,...,2013,2014,2015,2016,2017,2018,2019,2020.0,2021,2022
1,Indicator,Geolocation,,,,,,,,,...,,,,,,,,,,
2,3.4.1 Mortality rate attributed to cardiovascu...,PHILIPPINES,Both Sexes,..,..,..,..,..,..,4.2,...,4.5,4.6,4.7,4.6,4.5,4.5,4.7,4.6,..,...
3,,,Male,..,..,..,..,..,..,..,...,..,..,..,..,..,..,..,5.6,..,...
4,,,Female,..,..,..,..,..,..,..,...,..,..,..,..,..,..,..,3.7,..,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
267,,,,,,,,,,,...,,,,,,,,,,
268,,,,,,,,,,,...,,,,,,,,,,
269,Note:,,,,,,,,,,...,,,,,,,,,,
270,.. - Data not available,,,,,,,,,,...,,,,,,,,,,


In [48]:
data = data.drop (data.index [266:])

In [49]:
data.at[0, '3.4.1 Mortality rate attributed to cardiovascular disease, cancer, diabetes or chronic respiratory disease'] = 'Indicator'
data.at[0, 'Unnamed: 1'] = 'Geolocation'
data.at[0, 'Unnamed: 2'] = 'Sex'

In [50]:
data.columns = data.loc[0]
data = data.drop (data.index[0])
data = data.reset_index (drop=True)

data = data.drop (data.index[0])
data = data.reset_index (drop=True)

In [51]:
data = data [data ['Sex'] == 'Both Sexes']
data = data.reset_index(drop=True)

In [52]:
for c in data.columns.difference(['Geolocation']):
    data [c].replace(to_replace='..', value= np.nan, inplace= True)
    data [c].replace(to_replace='...', value= np.nan, inplace= True)

# data = data.dropna(axis=1, how = 'all')

In [53]:
all_data = data [0:16]
cardio_data = data [16:34]
cancer_data = data [34:52]
diabetes_data = data [52:70]
respi_data = data [70:]

In [54]:
all_data = all_data.drop('Indicator', axis = 1)
all_data = all_data.drop('Sex', axis = 1)

In [55]:
# no region five and six
all_data ['Geolocation'] = region_names [0:8] + region_names [10:]

In [56]:
all_data = pd.melt(all_data, id_vars='Geolocation', value_vars=all_data.columns [2:]) 

all_data.rename(columns = {'value':'3.4.1 (Total data)', 0 : 'Year'}, inplace=True)
all_data = all_data.astype({'Year':'int'})

In [57]:
combined_data = combined_data.merge(all_data, how = 'outer', on = ['Geolocation', 'Year'])
combined_data = combined_data.reset_index (drop=True)

In [58]:
combined_data

Unnamed: 0,Geolocation,Year,1.2.1,1.4.1p5,1.4.1p6 (Junior High School),1.4.1p6 (Senior High School),1.5.4,3.4.1 (Total data)
0,PHILIPPINES,2001,,90.1,57.55,,,
1,NCR: National Capital Region,2001,,97.82,67.84,,,
2,CAR: Cordillera Administrative Region,2001,,92.89,59.84,,,
3,Region 1: Ilocos Region,2001,,91.33,68.21,,,
4,Region 2: Cagayan Valley,2001,,89.45,59.67,,,
...,...,...,...,...,...,...,...,...
391,Region 10: Northern Mindanao,2022,,,,,,
392,Region 11: Davao Region,2022,,,,,,
393,Region 12: SOCCSKSARGEN,2022,,,,,,
394,CARAGA: Cordillera Administrative Region,2022,,,,,,


#### Proportion of women of reproductive age (aged 15-49 years) who have their need for family planning satisfied [provided] with modern methods

In [59]:
data = pd.read_csv(os.getenv('DSDATA_PROJ') + '/3.7.1.csv')
data

Unnamed: 0,3.7.1 Proportion of women of reproductive age (aged 15-49 years) who have their need for family planning satisfied [provided] with modern methods,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24
0,Year,,2000,2001,2002,2003.0,2004,2005,2006,2007,...,2013.0,2014,2015,2016,2017.0,2018,2019,2020,2021,2022
1,Indicator/Sub-indicators,Geolocation,,,,,,,,,...,,,,,,,,,,
2,3.7.1 Proportion of women of reproductive age ...,PHILIPPINES,..,..,..,46.7,..,..,..,..,...,51.8,..,..,..,56.9,..,..,..,..,...
3,,..National Capital Region (NCR),..,..,..,47.2,..,..,..,..,...,53.4,..,..,..,59.5,..,..,..,..,...
4,,..Cordillera Administrative Region (CAR),..,..,..,44.4,..,..,..,..,...,59.8,..,..,..,66.7,..,..,..,..,...
5,,..Region I,..,..,..,49.6,..,..,..,..,...,50.8,..,..,..,59.5,..,..,..,..,...
6,,..Region II,..,..,..,68.8,..,..,..,..,...,69.1,..,..,..,74.1,..,..,..,..,...
7,,..Region III,..,..,..,54.2,..,..,..,..,...,60.4,..,..,..,56.8,..,..,..,..,...
8,,..Region IV-A,..,..,..,46.1,..,..,..,..,...,49.1,..,..,..,49.2,..,..,..,..,...
9,,..MIMAROPA,..,..,..,48.5,..,..,..,..,...,55.1,..,..,..,61.7,..,..,..,..,...


In [60]:
data = data.drop (data.index [20:])

In [61]:
data.at[0, 'Unnamed: 1'] = 'Geolocation'

In [62]:
data.columns = data.loc[0]
data = data.drop (data.index[0])
data = data.reset_index (drop=True)

data = data.drop (data.index[0])
data = data.reset_index (drop=True)

In [63]:
data = data.drop('Year', axis=1)

In [64]:
data ['Geolocation'] = region_names

In [65]:
for c in data.columns.difference(['Geolocation']):
    data [c].replace(to_replace='..', value= np.nan, inplace= True)
    data [c].replace(to_replace='...', value= np.nan, inplace= True)

# data = data.dropna(axis=1, how = 'all')

In [66]:
data = pd.melt(data, id_vars='Geolocation', value_vars=data.columns [2:]) 

data.rename(columns = {'value':'3.7.1', 0 : 'Year'}, inplace=True)
data = data.astype({'Year':'int'})

In [67]:
data

Unnamed: 0,Geolocation,Year,3.7.1
0,PHILIPPINES,2001,
1,NCR: National Capital Region,2001,
2,CAR: Cordillera Administrative Region,2001,
3,Region 1: Ilocos Region,2001,
4,Region 2: Cagayan Valley,2001,
...,...,...,...
391,Region 10: Northern Mindanao,2022,
392,Region 11: Davao Region,2022,
393,Region 12: SOCCSKSARGEN,2022,
394,CARAGA: Cordillera Administrative Region,2022,


In [68]:
combined_data = combined_data.merge(data, how = 'outer', on = ['Geolocation', 'Year'])
combined_data = combined_data.reset_index (drop=True)

In [69]:
combined_data

Unnamed: 0,Geolocation,Year,1.2.1,1.4.1p5,1.4.1p6 (Junior High School),1.4.1p6 (Senior High School),1.5.4,3.4.1 (Total data),3.7.1
0,PHILIPPINES,2001,,90.1,57.55,,,,
1,NCR: National Capital Region,2001,,97.82,67.84,,,,
2,CAR: Cordillera Administrative Region,2001,,92.89,59.84,,,,
3,Region 1: Ilocos Region,2001,,91.33,68.21,,,,
4,Region 2: Cagayan Valley,2001,,89.45,59.67,,,,
...,...,...,...,...,...,...,...,...,...
391,Region 10: Northern Mindanao,2022,,,,,,,
392,Region 11: Davao Region,2022,,,,,,,
393,Region 12: SOCCSKSARGEN,2022,,,,,,,
394,CARAGA: Cordillera Administrative Region,2022,,,,,,,


#### Adolescent birth rate aged 15-19 years per 1,000 women in that age group

In [70]:
data = pd.read_csv(os.getenv('DSDATA_PROJ') + '/3.7.2.csv')
data

Unnamed: 0,"3.7.2 Adolescent birth rate aged 15-19 years per 1,000 women in that age group",Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23
0,Year,2000,2001,2002,2003.0,2004,2005,2006,2007,2008.0,...,2013.0,2014,2015,2016,2017.0,2018,2019,2020,2021,2022
1,Geolocation,,,,,,,,,,...,,,,,,,,,,
2,PHILIPPINES,..,..,..,53.0,..,..,..,..,54.0,...,57.0,..,..,..,47.0,..,..,..,..,...
3,..National Capital Region (NCR),..,..,..,35.0,..,..,..,..,25.0,...,48.0,..,..,..,27.0,..,..,..,..,...
4,..Cordillera Administrative Region (CAR),..,..,..,52.0,..,..,..,..,34.0,...,53.0,..,..,..,25.0,..,..,..,..,...
5,..Region I,..,..,..,55.0,..,..,..,..,52.0,...,78.0,..,..,..,46.0,..,..,..,..,...
6,..Region II,..,..,..,85.0,..,..,..,..,54.0,...,65.0,..,..,..,51.0,..,..,..,..,...
7,..Region III,..,..,..,42.0,..,..,..,..,69.0,...,63.0,..,..,..,61.0,..,..,..,..,...
8,..Region IV-A,..,..,..,44.0,..,..,..,..,63.0,...,58.0,..,..,..,37.0,..,..,..,..,...
9,..MIMAROPA,..,..,..,108.0,..,..,..,..,87.0,...,68.0,..,..,..,47.0,..,..,..,..,...


In [71]:
data = data.drop (data.index [20:])

In [72]:
data.columns = data.loc[0]
data = data.drop (data.index[0])
data = data.reset_index (drop=True)

data = data.drop (data.index[0])
data = data.reset_index (drop=True)

In [73]:
data.rename(columns = {'Year':'Geolocation'}, inplace=True)

In [74]:
data ['Geolocation'] = region_names

In [75]:
for c in data.columns.difference(['Geolocation']):
    data [c].replace(to_replace='..', value= np.nan, inplace= True)
    data [c].replace(to_replace='...', value= np.nan, inplace= True)

# data = data.dropna(axis=1, how = 'all')

In [76]:
data = pd.melt(data, id_vars='Geolocation', value_vars=data.columns [2:]) 

data.rename(columns = {'value':'3.7.2', 0 : 'Year'}, inplace=True)
data = data.astype({'Year':'int'})

In [77]:
combined_data = combined_data.merge(data, how = 'outer', on = ['Geolocation', 'Year'])
combined_data = combined_data.reset_index (drop=True)

In [78]:
combined_data

Unnamed: 0,Geolocation,Year,1.2.1,1.4.1p5,1.4.1p6 (Junior High School),1.4.1p6 (Senior High School),1.5.4,3.4.1 (Total data),3.7.1,3.7.2
0,PHILIPPINES,2001,,90.1,57.55,,,,,
1,NCR: National Capital Region,2001,,97.82,67.84,,,,,
2,CAR: Cordillera Administrative Region,2001,,92.89,59.84,,,,,
3,Region 1: Ilocos Region,2001,,91.33,68.21,,,,,
4,Region 2: Cagayan Valley,2001,,89.45,59.67,,,,,
...,...,...,...,...,...,...,...,...,...,...
391,Region 10: Northern Mindanao,2022,,,,,,,,
392,Region 11: Davao Region,2022,,,,,,,,
393,Region 12: SOCCSKSARGEN,2022,,,,,,,,
394,CARAGA: Cordillera Administrative Region,2022,,,,,,,,


#### Completion Rate of elementary and secondary students

In [79]:
data = pd.read_csv(os.getenv('DSDATA_PROJ') + '/4.1.s1.csv')
data

Unnamed: 0,4.1.s1 Completion Rate of elementary and secondary students 1/ 2/,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25
0,Year,,,2000,2001,2002,2003,2004,2005,2006,...,2013,2014,2015,2016,2017,2018.00,2019.00,2020.000000,2021,2022
1,Geolocation,Level of Education,Sex,,,,,,,,...,,,,,,,,,,
2,PHILIPPINES,Elementary,Both Sexes,62.72,68.18,71.55,70.24,69.06,68.11,71.72,...,77.67,83.74,84.02,93.06,92.41,97.15,96.56,82.510000,...,...
3,,,Female,65.53,70.7,76.32,75.63,75.2,73.46,76.7,...,81.33,86.23,87.43,95.52,94.61,99.12,98.08,84.681828,...,...
4,,,Male,60.05,65.78,67.23,65.42,63.63,63.29,67.28,...,74.38,81.45,80.97,90.83,90.41,95.26,95.10,80.500538,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
166,.. - Data not available,,,,,,,,,,...,,,,,,,,,,
167,... - Data not yet available,,,,,,,,,,...,,,,,,,,,,
168,1/ - Updates were based on the submission of D...,,,,,,,,,,...,,,,,,,,,,
169,2/ - Estimation in Senior High School only sta...,,,,,,,,,,...,,,,,,,,,,


In [80]:
data = data.drop(data.index[164:])

In [81]:
data.at[0, '4.1.s1 Completion Rate of elementary and secondary students 1/ 2/'] = 'Geolocation'
data.at[0, 'Unnamed: 1'] = 'Level of Education'
data.at[0, 'Unnamed: 2'] = 'Sex'

In [82]:
data.columns = data.loc[0]
data = data.drop (data.index[0])
data = data.reset_index (drop=True)

data = data.drop (data.index[0])
data = data.reset_index (drop=True)

In [83]:
data = data [data['Sex'] == 'Both Sexes']
data = data.drop ('Sex', axis = 1)
data = data.reset_index(drop=True)

In [84]:
# copying the geolocation value to the next two rows
i = 0
while i < len (data):
    if i % 3 == 0:
        data.at[i + 1, 'Geolocation'] = data['Geolocation'][i]
        data.at[i + 2, 'Geolocation'] = data['Geolocation'][i]
        i = i + 3

In [85]:
for c in data.columns.difference(['Geolocation']):
    data [c].replace(to_replace='..', value= np.nan, inplace= True)
    data [c].replace(to_replace='...', value= np.nan, inplace= True)

# data = data.dropna(axis=1, how = 'all')

In [86]:
elem_data = data [data['Level of Education'] == 'Elementary']
elem_data = elem_data.reset_index (drop=True)

junior_data = data [data['Level of Education'] == 'Secondary (Junior High School)']
junior_data = junior_data.reset_index (drop=True)

senior_data = data [data['Level of Education'] == 'Secondary (Senior High School)']
senior_data = senior_data.reset_index (drop=True)

In [87]:
elem_data = elem_data.drop ('Level of Education', axis = 1)
elem_data = elem_data.reset_index(drop=True)

In [88]:
elem_data ['Geolocation'] = region_names

In [89]:
elem_data = pd.melt(elem_data, id_vars='Geolocation', value_vars=elem_data.columns [2:]) 

elem_data.rename(columns = {'value':'4.1.s1 (Elementary)', 0 : 'Year'}, inplace=True)
elem_data = elem_data.astype({'Year':'int'})

In [90]:
combined_data = combined_data.merge(elem_data, how = 'outer', on = ['Geolocation', 'Year'])
combined_data = combined_data.reset_index (drop=True)

In [91]:
junior_data = junior_data.drop ('Level of Education', axis = 1)
junior_data = junior_data.reset_index(drop=True)

In [92]:
junior_data ['Geolocation'] = region_names

In [93]:
junior_data = pd.melt(junior_data, id_vars='Geolocation', value_vars=junior_data.columns [2:]) 

junior_data.rename(columns = {'value':'4.1.s1 (Junior High School)', 0 : 'Year'}, inplace=True)
junior_data = junior_data.astype({'Year':'int'})

In [94]:
combined_data = combined_data.merge(junior_data, how = 'outer', on = ['Geolocation', 'Year'])
combined_data = combined_data.reset_index (drop=True)

In [95]:
senior_data = senior_data.drop ('Level of Education', axis = 1)
senior_data = senior_data.reset_index(drop=True)

In [96]:
senior_data ['Geolocation'] = region_names

In [97]:
senior_data = pd.melt(senior_data, id_vars='Geolocation', value_vars=senior_data.columns [2:]) 

senior_data.rename(columns = {'value':'4.1.s1 (Senior High School)', 0 : 'Year'}, inplace=True)
senior_data = senior_data.astype({'Year':'int'})

In [98]:
combined_data = combined_data.merge(senior_data, how = 'outer', on = ['Geolocation', 'Year'])
combined_data = combined_data.reset_index (drop=True)

In [99]:
combined_data

Unnamed: 0,Geolocation,Year,1.2.1,1.4.1p5,1.4.1p6 (Junior High School),1.4.1p6 (Senior High School),1.5.4,3.4.1 (Total data),3.7.1,3.7.2,4.1.s1 (Elementary),4.1.s1 (Junior High School),4.1.s1 (Senior High School)
0,PHILIPPINES,2001,,90.1,57.55,,,,,,68.18,69.97,
1,NCR: National Capital Region,2001,,97.82,67.84,,,,,,74.29,68.43,
2,CAR: Cordillera Administrative Region,2001,,92.89,59.84,,,,,,59.55,61.75,
3,Region 1: Ilocos Region,2001,,91.33,68.21,,,,,,79.7,75.35,
4,Region 2: Cagayan Valley,2001,,89.45,59.67,,,,,,74.07,69.4,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
391,Region 10: Northern Mindanao,2022,,,,,,,,,,,
392,Region 11: Davao Region,2022,,,,,,,,,,,
393,Region 12: SOCCSKSARGEN,2022,,,,,,,,,,,
394,CARAGA: Cordillera Administrative Region,2022,,,,,,,,,,,


#### Number of Technical-Vocational Education and Training (TVET) trainers trained

In [100]:
data = pd.read_csv(os.getenv('DSDATA_PROJ') + '/4.c.s2.csv')
data

Unnamed: 0,4.c.s2 Number of Technical-Vocational Education and Training (TVET) trainers trained,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23
0,Year,2000,2001,2002,2003,2004,2005,2006,2007,2008,...,2013,2014,2015,2016.0,2017.0,2018.0,2019.0,2020.0,2021.0,2022
1,Geolocation,,,,,,,,,,...,,,,,,,,,,
2,PHILIPPINES,..,..,..,..,..,..,..,..,..,...,..,..,..,6518.0,11159.0,10118.0,10855.0,4023.0,7746.0,...
3,..National Capital Region (NCR),..,..,..,..,..,..,..,..,..,...,..,..,..,610.0,1028.0,1280.0,1409.0,782.0,1985.0,...
4,..Cordillera Administrative Region (CAR),..,..,..,..,..,..,..,..,..,...,..,..,..,201.0,302.0,166.0,260.0,92.0,199.0,...
5,..Region I,..,..,..,..,..,..,..,..,..,...,..,..,..,474.0,455.0,475.0,501.0,375.0,327.0,...
6,..Region II,..,..,..,..,..,..,..,..,..,...,..,..,..,270.0,612.0,447.0,686.0,215.0,240.0,...
7,..Region III,..,..,..,..,..,..,..,..,..,...,..,..,..,280.0,262.0,354.0,839.0,277.0,471.0,...
8,..Region IV-A,..,..,..,..,..,..,..,..,..,...,..,..,..,833.0,1067.0,1440.0,817.0,177.0,647.0,...
9,..MIMAROPA,..,..,..,..,..,..,..,..,..,...,..,..,..,139.0,523.0,709.0,413.0,162.0,255.0,...


In [101]:
data = data.drop(data.index[20:])

In [102]:
data.at[0, '4.c.s2 Number of Technical-Vocational Education and Training (TVET) trainers trained'] = 'Geolocation'

In [103]:
data.columns = data.loc[0]
data = data.drop (data.index[0])
data = data.reset_index (drop=True)

data = data.drop (data.index[0])
data = data.reset_index (drop=True)

In [104]:
data ['Geolocation'] = region_names

In [105]:
for c in data.columns.difference(['Geolocation']):
    data [c].replace(to_replace='..', value= np.nan, inplace= True)
    data [c].replace(to_replace='...', value= np.nan, inplace= True)

# data = data.dropna(axis=1, how = 'all')

In [106]:
data = pd.melt(data, id_vars='Geolocation', value_vars=data.columns [2:]) 

data.rename(columns = {'value':'4.c.s2', 0 : 'Year'}, inplace=True)
data = data.astype({'Year':'int'})

In [107]:
combined_data = combined_data.merge(data, how = 'outer', on = ['Geolocation', 'Year'])
combined_data = combined_data.reset_index (drop=True)

In [108]:
combined_data

Unnamed: 0,Geolocation,Year,1.2.1,1.4.1p5,1.4.1p6 (Junior High School),1.4.1p6 (Senior High School),1.5.4,3.4.1 (Total data),3.7.1,3.7.2,4.1.s1 (Elementary),4.1.s1 (Junior High School),4.1.s1 (Senior High School),4.c.s2
0,PHILIPPINES,2001,,90.1,57.55,,,,,,68.18,69.97,,
1,NCR: National Capital Region,2001,,97.82,67.84,,,,,,74.29,68.43,,
2,CAR: Cordillera Administrative Region,2001,,92.89,59.84,,,,,,59.55,61.75,,
3,Region 1: Ilocos Region,2001,,91.33,68.21,,,,,,79.7,75.35,,
4,Region 2: Cagayan Valley,2001,,89.45,59.67,,,,,,74.07,69.4,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
391,Region 10: Northern Mindanao,2022,,,,,,,,,,,,
392,Region 11: Davao Region,2022,,,,,,,,,,,,
393,Region 12: SOCCSKSARGEN,2022,,,,,,,,,,,,
394,CARAGA: Cordillera Administrative Region,2022,,,,,,,,,,,,
