In [153]:
import os
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import zscore

# Combining the Datasets 

#### Proportion of population living below the national poverty line 

In [157]:
'''
STEPS:
-> change the filename into something shorter (if sdg, only the "number")
1. load the data to a variable
2. make the first row into the column header
3. drop the first row that was made into the column header + other irrelevant rows
4. reset the index
5. change the region names para same lahat ng datasets
6. make it into a long representation
7. change the column name to the name of the new column 
8. add it to the combined data set
'''
data = pd.read_csv(os.getenv('DSDATA_PROJ') + '/1.2.1.csv')
data

Unnamed: 0,1.2.1 Proportion of population living below the national poverty line by sex age 1/,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23
0,Year,2000,2001,2002,2003,2004,2005,2006,2007,2008,...,2013,2014,2015.0,2016,2017,2018.0,2019,2020,2021,2022
1,Geolocation,,,,,,,,,,...,,,,,,,,,,
2,PHILIPPINES,..,..,..,..,..,..,..,..,..,...,..,..,23.5,..,..,16.7,..,..,...,..
3,..National Capital Region (NCR),..,..,..,..,..,..,..,..,..,...,..,..,4.1,..,..,2.2,..,..,...,..
4,..Cordillera Administrative Region (CAR),..,..,..,..,..,..,..,..,..,...,..,..,22.7,..,..,12.0,..,..,...,..
5,..Region I,..,..,..,..,..,..,..,..,..,...,..,..,18.8,..,..,9.9,..,..,...,..
6,..Region II,..,..,..,..,..,..,..,..,..,...,..,..,17.8,..,..,16.3,..,..,...,..
7,..Region III,..,..,..,..,..,..,..,..,..,...,..,..,10.5,..,..,7.0,..,..,...,..
8,..Region IV-A,..,..,..,..,..,..,..,..,..,...,..,..,12.5,..,..,7.1,..,..,...,..
9,..MIMAROPA,..,..,..,..,..,..,..,..,..,...,..,..,25.2,..,..,15.1,..,..,...,..


In [158]:
# NOTE: Before applying, make sure that the arrangement of the regions are the same as the arrangement in your table
region_names = ['PHILIPPINES', 'NCR: National Capital Region', 
                 'CAR: Cordillera Administrative Region', 
                 'Region 1: Ilocos Region', 
                 'Region 2: Cagayan Valley', 
                 'Region 3: Central Luzon', 
                 'Region 4A: CALABARZON', 
                'MIMAROPA: Southwestern Tagalog Region', 
                'Region 5: Bicol Region', 
                'Region 6: Western Visayas', 
                'Region 7: Central Visayas', 
                'Region 8: Eastern Visayas', 
                'Region 9: Zamboanga Peninsula', 
                'Region 10: Northern Mindanao', 
                'Region 11: Davao Region', 
                'Region 12: SOCCSKSARGEN', 
                'CARAGA: Cordillera Administrative Region', 
                'BARMM: Bangsamoro Autonomous Region in Muslim Mindanao']

In [159]:
# setting our column names, as we can see from above, the headers can be seen at the 0th index
data.columns = data.iloc [0] 
print(data)
# dropping the 'geolocation' row as that is actually used as a header
data = data.drop (data.index [1])
# dropping the column names 
data = data.drop (data.index [0])

0                                                Year  2000  2001  2002  2003  \
0                                                Year  2000  2001  2002  2003   
1                                         Geolocation   NaN   NaN   NaN   NaN   
2                                         PHILIPPINES    ..    ..    ..    ..   
3                     ..National Capital Region (NCR)    ..    ..    ..    ..   
4            ..Cordillera Administrative Region (CAR)    ..    ..    ..    ..   
5                                          ..Region I    ..    ..    ..    ..   
6                                         ..Region II    ..    ..    ..    ..   
7                                        ..Region III    ..    ..    ..    ..   
8                                       ..Region IV-A    ..    ..    ..    ..   
9                                          ..MIMAROPA    ..    ..    ..    ..   
10                                         ..Region V    ..    ..    ..    ..   
11                          

In [147]:
data.reset_index (drop=True, inplace=True)

# renames the column 'Year' as its actually the location column
data.rename(columns = {'Year':'Geolocation'}, inplace=True)
data = data.drop (data.index [18:]) # irrelevant rows 
# renames the data in the Geolocation for consistency
data['Geolocation'] = region_names
data.set_index('Geolocation')
data = data.reset_index(drop=True)
data

Unnamed: 0,Geolocation,2000,2001,2002,2003,2004,2005,2006,2007,2008,...,2013,2014,2015.0,2016,2017,2018.0,2019,2020,2021,2022
0,PHILIPPINES,..,..,..,..,..,..,..,..,..,...,..,..,23.5,..,..,16.7,..,..,...,..
1,NCR: National Capital Region,..,..,..,..,..,..,..,..,..,...,..,..,4.1,..,..,2.2,..,..,...,..
2,CAR: Cordillera Administrative Region,..,..,..,..,..,..,..,..,..,...,..,..,22.7,..,..,12.0,..,..,...,..
3,Region 1: Ilocos Region,..,..,..,..,..,..,..,..,..,...,..,..,18.8,..,..,9.9,..,..,...,..
4,Region 2: Cagayan Valley,..,..,..,..,..,..,..,..,..,...,..,..,17.8,..,..,16.3,..,..,...,..
5,Region 3: Central Luzon,..,..,..,..,..,..,..,..,..,...,..,..,10.5,..,..,7.0,..,..,...,..
6,Region 4A: CALABARZON,..,..,..,..,..,..,..,..,..,...,..,..,12.5,..,..,7.1,..,..,...,..
7,MIMAROPA: Southwestern Tagalog Region,..,..,..,..,..,..,..,..,..,...,..,..,25.2,..,..,15.1,..,..,...,..
8,Region 5: Bicol Region,..,..,..,..,..,..,..,..,..,...,..,..,39.8,..,..,27.0,..,..,...,..
9,Region 6: Western Visayas,..,..,..,..,..,..,..,..,..,...,..,..,24.6,..,..,16.3,..,..,...,..


In [149]:
for c in data.columns.difference(['Geolocation']):
    # cells without values are represented as either '..' or '...', so we should convert them to NaN so we could dropna()
    data [c].replace(to_replace='..', value= np.nan, inplace= True)
    data [c].replace(to_replace='...', value= np.nan, inplace= True)

# drops columns if all of the values are NaN
data = data.dropna(axis=1)

In [150]:
# converting from a wide representation to a long representation
data = pd.melt(data, id_vars='Geolocation', value_vars=data.columns [2:]) 

# renaming the columns into a more readable anmes
data.rename(columns = {'value':'1.2.1', 0 : 'Year'}, inplace=True)

# making the year type into integer
data = data.astype({'Year':'int'})

In [151]:
data

Unnamed: 0,Geolocation,Year,1.2.1
0,PHILIPPINES,2018,16.7
1,NCR: National Capital Region,2018,2.2
2,CAR: Cordillera Administrative Region,2018,12.0
3,Region 1: Ilocos Region,2018,9.9
4,Region 2: Cagayan Valley,2018,16.3
5,Region 3: Central Luzon,2018,7.0
6,Region 4A: CALABARZON,2018,7.1
7,MIMAROPA: Southwestern Tagalog Region,2018,15.1
8,Region 5: Bicol Region,2018,27.0
9,Region 6: Western Visayas,2018,16.3


In [152]:
combined_data = data

#### Net Enrolment Rate in elementary

In [7]:
data = pd.read_csv(os.getenv('DSDATA_PROJ') + '/1.4.1p5.csv')
data

Unnamed: 0,1.4.1p5 Net Enrolment Rate in elementary (Indicator is also found in SDG 4.3.s1) 1/,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24
0,,Year,2000,2001,2002.00,2003.00,2004.00,2005.00,2006.00,2007.00,...,2013.00,2014.00,2015.00,2016.00,2017.00,2018.00,2019.00,2020.0000,2021,2022
1,Geolocation,Sex,,,,,,,,,...,,,,,,,,,,
2,PHILIPPINES,Both Sexes,96.77,90.1,90.29,88.74,87.11,84.44,83.22,84.93,...,97.20,97.19,96.90,96.15,94.19,94.05,93.96,89.1064,...,...
3,,Boys,96.27,89.33,89.51,87.84,86.17,83.56,82.39,84.07,...,96.74,96.87,96.66,96.17,94.12,94.25,93.79,88.9318,...,...
4,,Girls,97.28,90.91,91.10,89.68,88.08,85.35,84.08,85.83,...,97.68,97.53,97.15,96.12,94.27,93.85,94.15,89.2898,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58,Note:,,,,,,,,,,...,,,,,,,,,,
59,.. - Data not available,,,,,,,,,,...,,,,,,,,,,
60,... - Data not yet available,,,,,,,,,,...,,,,,,,,,,
61,1/ - Updates were based on the submission of D...,,,,,,,,,,...,,,,,,,,,,


In [8]:
data.columns = data.iloc [0]
data = data.drop (data.index [1])
data = data.drop (data.index [0])
data.reset_index (drop=True, inplace=True)
data.rename(columns = {'Year':'Geolocation'}, inplace=True)
data = data.drop (data.index [18:]) # irrelevant rows 
data['Geolocation'] = region_names
data

Unnamed: 0,NaN,Geolocation,2000,2001,2002.0,2003.0,2004.0,2005.0,2006.0,2007.0,...,2013.0,2014.0,2015.0,2016.0,2017.0,2018.0,2019.0,2020.0,2021,2022
0,PHILIPPINES,PHILIPPINES,96.77,90.1,90.29,88.74,87.11,84.44,83.22,84.93,...,97.2,97.19,96.9,96.15,94.19,94.05,93.96,89.1064,...,...
1,,NCR: National Capital Region,96.27,89.33,89.51,87.84,86.17,83.56,82.39,84.07,...,96.74,96.87,96.66,96.17,94.12,94.25,93.79,88.9318,...,...
2,,CAR: Cordillera Administrative Region,97.28,90.91,91.1,89.68,88.08,85.35,84.08,85.83,...,97.68,97.53,97.15,96.12,94.27,93.85,94.15,89.2898,...,...
3,..National Capital Region (NCR),Region 1: Ilocos Region,101.0,97.82,97.38,96.81,94.82,92.61,92.89,94.42,...,99.64,99.01,99.85,95.92,92.83,92.11,89.91,81.1478,...,...
4,,Region 2: Cagayan Valley,100.13,96.57,96.52,95.81,93.75,91.65,92.0,93.21,...,98.77,98.13,98.8,95.3,92.2,91.85,89.43,80.6316,...,...
5,,Region 3: Central Luzon,101.92,99.13,98.28,97.87,95.95,93.63,93.83,95.69,...,100.57,99.95,100.95,96.58,93.5,92.38,90.42,81.6903,...,...
6,..Cordillera Administrative Region (CAR),Region 4A: CALABARZON,94.42,92.89,91.52,89.19,86.4,82.58,80.86,81.5,...,99.66,100.16,99.19,97.24,94.37,92.24,91.4,87.5276,...,...
7,,MIMAROPA: Southwestern Tagalog Region,94.26,91.96,90.53,88.36,85.52,81.75,80.19,81.01,...,99.85,100.27,99.42,97.94,95.13,93.45,92.25,88.5518,...,...
8,,Region 5: Bicol Region,94.58,93.88,92.57,90.07,87.31,83.46,81.57,82.01,...,99.47,100.05,98.95,96.51,93.59,90.99,90.51,86.4657,...,...
9,..Region I,Region 6: Western Visayas,97.73,91.33,89.64,88.52,86.98,84.87,82.74,83.14,...,97.39,97.84,96.78,94.84,92.5,90.48,89.99,86.2185,...,...


Unnamed: 0,Geolocation,2000,2001,2002,2003,2004,2005,2006,2007,2008,...,2013,2014,2015.0,2016,2017,2018.0,2019,2020,2021,2022
0,,,,,,,,,,,...,,,23.5,,,16.7,,,,
1,,,,,,,,,,,...,,,4.1,,,2.2,,,,
2,,,,,,,,,,,...,,,22.7,,,12.0,,,,
3,,,,,,,,,,,...,,,18.8,,,9.9,,,,
4,,,,,,,,,,,...,,,17.8,,,16.3,,,,
5,,,,,,,,,,,...,,,10.5,,,7.0,,,,
6,,,,,,,,,,,...,,,12.5,,,7.1,,,,
7,,,,,,,,,,,...,,,25.2,,,15.1,,,,
8,,,,,,,,,,,...,,,39.8,,,27.0,,,,
9,,,,,,,,,,,...,,,24.6,,,16.3,,,,
