In [1]:
import requests
from bs4 import BeautifulSoup 
import pandas as pd
import numpy as py
import matplotlib.pyplot as plot
import seaborn as sns
from matplotlib import rc

import matplotlib.mlab as mlab

In [None]:
# Load the household-level file ('main.csv') and the member-level file
# ('hpq_mem.csv'). low_memory=False makes pandas read each file in one pass
# so column dtypes are inferred from the whole column.
dfMain, dfMem = (
    pd.read_csv(csvPath, low_memory=False)
    for csvPath in ('main.csv', 'hpq_mem.csv')
)

In [None]:
# Narrow both frames to the columns this score needs:
#   dfMain -> household id and barangay
#   dfMem  -> household id and highest educational attainment (educal)
# Note: DataFrame.filter silently skips labels that are not present.
dfMain = dfMain.filter(items=['mainid', 'brgy'])
dfMem = dfMem.filter(items=['mainid', 'educal'])

In [None]:
# Left-join the member records onto the households by their shared 'mainid',
# producing one row per (household, member) pair. Households without any
# member record are kept, with NaN in 'educal'.
dfMainMem = dfMain.merge(dfMem, on='mainid', how='left')

dfMainMem

In [None]:
# Family size is approximated by the number of merged rows sharing a 'mainid',
# because the data has no explicit family-size field.
# Every member row is counted here, whether or not its educal value is valid;
# educal (highest educational attainment per member) is only filtered later.
membersPerFamily = dfMainMem['mainid'].value_counts()
dfMainMem['family_size'] = dfMainMem['mainid'].map(membersPerFamily)

dfMainMem

In [None]:
# Keep only members whose educal (highest educational attainment) is strictly
# greater than 31 and strictly less than 38, i.e. codes 32 (first year college)
# through 37 (PhD).
# NaN fails both comparisons, so missing responses are dropped here as well, as
# are out-of-range codes.
# ASSUMPTION: Invalid responses are considered to not be in the 1st year
# college to PhD range.
isCollegeLevel = (dfMainMem['educal'] > 31) & (dfMainMem['educal'] < 38.0)

# .copy() makes the filtered frame independent of the original, so adding a
# column below does not trigger pandas' SettingWithCopyWarning.
dfMainMem = dfMainMem.loc[isCollegeLevel].copy()

# Number of current college students and college graduates per family: the
# count of remaining (college-level) rows sharing each 'mainid'.
dfMainMem['above_PS'] = dfMainMem['mainid'].map(dfMainMem['mainid'].value_counts())

# NOTE(review): families with no college-level member have no rows left after
# this filter, so they do not contribute a score of 0 to any later average.
# Confirm that this is intended.

dfMainMem


In [None]:
# 1. Drop 'educal', which is no longer needed once 'above_PS' is computed.
# 2. Collapse the frame to one row per family. After step 1, every row of a
#    family is identical, so a family with several college-level members would
#    otherwise be counted several times in later averages.
#    drop_duplicates() returns a new frame, so its result must be kept.
# 3. Add 'score': the family's college literacy rate, i.e. the number of
#    college students/graduates ('above_PS') divided by the family size
#    ('family_size', the count of all member rows for that family).
#    .assign() returns a new frame, avoiding SettingWithCopyWarning.
dfMainMem = (
    dfMainMem
    .drop(columns=['educal'])
    .drop_duplicates()
    .assign(score=lambda families: families['above_PS'] / families['family_size'])
)

dfMainMem

In [None]:
# Average the 'score' column within each barangay ('brgy').
# The result is a pandas Series indexed by 'brgy', not a DataFrame.
arrayBrgyScores = dfMainMem['score'].groupby(dfMainMem['brgy']).mean()

arrayBrgyScores


In [None]:
# Sanity check: show the rows for barangay 1 so their mean score can be
# compared by hand with the barangay 1 entry in arrayBrgyScores.
# Author's note from an earlier run: Brgy 1 had 3 respondents with a total
# score of 1.7, therefore averaging 0.566667.
dfBrgy1 = dfMainMem[dfMainMem['brgy'] == 1]
dfBrgy1

In [None]:
# Bar chart of the mean family college literacy score, one bar per barangay.
arrayBrgyScores.plot.bar(
    figsize=(20, 10),
    grid=True,
    align='center',
    width=0.5,
    stacked=True,
)
plot.legend()
plot.show()


# Calamity Prep Score 1: The Mean College Literacy Rate of Each Barangay

Assumptions: 
1. Based on a study by INSERT AUTHORS AND LINK, having college students and college graduates in a family increases its chance of surviving calamities. We therefore used EDUCAL (educational attainment) as a means of scoring the average post-secondary literacy of each family in each barangay.
    
2. We assumed that the number of responses per family represents the total size of the family, and so we used it as the denominator against which we compared the number of college graduates and students.

3. We assumed that invalid responses, such as '300' and NaN values, fall outside the first-year-college (32) to PhD (37) range. We did not drop these responses when counting family size, because doing so would make the family size inaccurate. When we tried dropping them, a 5-member family shrank to 2 because 3 members did not answer properly, and it still scored 100% literacy because the only 2 valid responses were within the college-to-PhD range.
    
Methodology:
1. Created a filtered dataframe from the original main.csv and hpq_mem.csv files, joined on 'mainid', that only contains the variables relevant to Score 1.

2. Determined family size ('family_size') by counting the number of responses with the same 'mainid' value.

3. Determined college students and graduates ('above_PS') by counting the number of responses with EDUCAL values from 32 to 37.

4. Determined the ratio of 'above_PS' over 'family_size' to get each family's college literacy rate.

5. Grouped the families by barangay and then computed the mean college literacy rate of each barangay.

In [None]:
# Save each barangay's mean family college literacy rate to a CSV file for
# future use. The Series is first converted to a one-column DataFrame.
dfScore1 = arrayBrgyScores.to_frame()
dfScore1.to_csv('BrgyScore1.csv', encoding='utf-8')