# <u>MAIN CODE</u>

In [32]:
import os
os.chdir("/Users/mauricio/datachile-etl/childhood/dropout_rate")
os.getcwd()
os.listdir()

['.DS_Store', 'data_temp', 'dropout_rate.ipynb', '.ipynb_checkpoints']

In [42]:
import time
import os
import pandas as pd
import numpy as np
import re

# declares timer, locks absolute_path to working directory, declares file URLs and list of df
before = time.perf_counter()
absolute_path = os.getcwd()
urls = ["http://www.creciendoconderechos.gob.cl/docs/Rendimiento_Escolar_Basica.xlsx", "http://www.creciendoconderechos.gob.cl/docs/Rendimiento_Escolar_Media.xlsx"]
df_list = []

# creates data_temp folder and changes working directory
if os.path.isdir("data_temp") == False:
    os.mkdir("data_temp")    
os.chdir("data_temp")

# generates header row
hrow = ["region_code", "region_name", "province_code", "province_name", "commune_code", "commune_name"]
for yr in range(2010,2018):
    hrow.append("total_"+str(yr))
    hrow.append("prom_num_"+str(yr))
    hrow.append("prom_perc_"+str(yr))
    hrow.append("rep_num_"+str(yr))
    hrow.append("rep_perc_"+str(yr))
    hrow.append("drop_num_"+str(yr))
    hrow.append("drop_perc_"+str(yr))

# processing
df = pd.read_excel("Rendimiento_Escolar_Basica.xlsx", header = None, sheet_name = "Información Base Comunal", skiprows = list(range(5)))
df.columns = hrow
#df[hrow[:6]] = df[hrow[:6]].fillna(method = "ffill")

# selects necessary columns
sel_cols = ["commune_code"]
for yr in range(2010,2018):
    sel_cols.append("prom_num_"+str(yr))
    sel_cols.append("rep_num_"+str(yr))
    sel_cols.append("drop_num_"+str(yr))
    
df = df[sel_cols]

# melts columns to make dataframe tidy
melt_cols = [col for col in df.columns if col != "commune_code"]
df = pd.melt(df, id_vars = "commune_code", value_vars = melt_cols, var_name = "status_year", value_name = "value")

# creates year column
def get_year(row, col):
    target = row[col]
    reg = re.search("\d", target)
    first = reg.start()
    y = target[first : first + 4]
    return y

df["year"] = df.apply(get_year, col = "status_year", axis = 1)

# creates status column
def get_status(row, col):
    stat = {"prom": 1, "rep": 2, "drop": 3}
    return next((stat[k] for k in stat.keys() if k in row[col]), np.nan)

df["status"] = df.apply(get_status, col = "status_year", axis = 1)

# drops status_year column and NaN rows on commune_id
df = df[[c for c in df.columns if c != "status_year"]]
df = df.dropna(subset = ["commune_code"])

# creates education column
ed = re.search("_(.+?)_(.+?).xlsx", urls[0])
ed = ed.group(2)
df["education"] = pd.Series([ed] * df.shape[0])

print(df.iloc[8250:,:])

# comes back to original path
os.chdir(absolute_path)

      commune_code value  year  status education
8342       12401.0     4  2017       3       NaN
8343       12402.0     1  2017       3       NaN
8344       13101.0   686  2017       3       NaN
8345       13102.0    80  2017       3       NaN
8346       13103.0   152  2017       3       NaN
8347       13104.0   196  2017       3       NaN
8348       13105.0   196  2017       3       NaN
8349       13106.0   234  2017       3       NaN
8350       13107.0    91  2017       3       NaN
8351       13108.0   307  2017       3       NaN
8352       13109.0   150  2017       3       NaN
8353       13110.0   396  2017       3       NaN
8354       13111.0   125  2017       3       NaN
8355       13112.0   295  2017       3       NaN
8356       13113.0   113  2017       3       NaN
8357       13114.0   155  2017       3       NaN
8358       13115.0    61  2017       3       NaN
8359       13116.0    70  2017       3       NaN
8360       13117.0    87  2017       3       NaN
8361       13118.0  

# <u>IVE JUNAEB Code</u>

In [None]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import urllib.request
from urllib.parse import urlparse
import time
import os
import re
import urllib3

# declares timer and locks absolute path to working directory
before = time.perf_counter()
absolute_path = os.getcwd()

# solves SSL certificate issues error when retrieving files 
#import os, ssl
#if (not os.environ.get("PYTHONHTTPSVERIFY", "") and getattr(ssl, "_create_unverified_context", None)): 
#    ssl._create_default_https_context = ssl._create_unverified_context

# Solving SSL certificate issue
#urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# retrieve links from site
url = "https://www.junaeb.cl/ive"
page = requests.get(url, verify = False)
soup = BeautifulSoup(page.text, "lxml")
a_tags = soup.find_all('a')
all_links = [link.get('href') for link in a_tags]
links = []
for link in all_links:
    if ".xls" in link and "IVESINAE_COMUNA_2013" not in link:
        links.append(link)
        
years_df = []

# creates data_temp folder and changes working directory
if os.path.isdir("data_temp") == False:
    os.mkdir("data_temp")    
os.chdir("data_temp")

# downloads files
for url in links:
    encoded_url = urllib.parse.quote(url.encode('utf-8'),':/')
    filename = encoded_url[encoded_url.rfind("/")+1:]
    if filename not in os.listdir():
        urllib.request.urlretrieve(encoded_url, filename)
        print("{:.2f} s | Downloaded {}".format(time.perf_counter()-before, url))

# processing function
def clean(file, year):
    
    #defines scope of variable
    global years_df
        
    # reads Excel file and defines available sheets on file
    df = pd.read_excel(file, sheet_name = None, encoding="utf-8")
    sheets_list = list(df.keys())
    sheets = sheets_list[:2]

    for tb in sheets:
        # deletes last two columns and last row
        df[tb] = df[tb].drop(df[tb].columns[14:], axis=1)
        last_row = df[tb].shape[0]
        df[tb] = df[tb].drop(last_row-1)

        # fills NaN with 0
        df[tb].iloc[:, 9:] = df[tb].iloc[:, 9:].fillna(0)

        # creates new column to make dataframe tidy
        tb_col = tb.replace("Á","A")
        df[tb]["level"] = pd.Series([tb_col] * df[tb].shape[0])
            
        # creates new year column
        df[tb]["year"] = pd.Series([year] * df[tb].shape[0])

        # melts dataframe to make it tidy
        priorities = df[tb].columns[9:14]
        df[tb] = pd.melt(df[tb], id_vars = [x for x in df[tb].columns if x not in priorities], value_vars = priorities, var_name = "priority", value_name = "total")

        # changes column names
        df[tb].columns = ["rbd", "dv_rbd", "school_name", "dependency", "area", "region_code", "province_code", "commune_code", "commune_name", "level", "year", "priority", "total"]
        
        # drops unnecesary columns: "dv_rbd", "school_name", "region_code", "province_code" and "commune_name"
        df[tb] = df[tb][["rbd", "dependency", "area", "commune_code", "level", "year", "priority", "total"]]
        
        # changes column types
        num_cols = ["rbd", "commune_code", "total"]
        df[tb][num_cols] = df[tb][num_cols].apply(pd.to_numeric, downcast="integer")
        
        # changes string columns to uppercase
        def uppercase(row, col):
            return row[col].upper()
        
        str_cols = [x for x in df[tb].columns if x not in num_cols]
        
        for column in str_cols:
            df[tb][column] = df[tb].apply(uppercase, col = column, axis = 1) 
         
    # concatenates both dataframes
    df = pd.concat([df[sheets[0]], df[sheets[1]]], ignore_index=True)
    years_df.append(df)

# function to get year information
def get_year(string):
    reg = re.search("\d", string)
    first = reg.start()
    y = string[first : first + 4]
    return y 

# processes each link
for file in os.listdir():
    new_filename = file[file.rfind("/") + 1:]
    print("{:.2f} s | Cleaning {}".format(time.perf_counter()-before, new_filename))
    clean(new_filename, get_year(new_filename))

# concatenates each year's dataframe
df = pd.concat(years_df, ignore_index=True)
print("{:.2f} s | Concatenated each year's dataframe.".format(time.perf_counter()-before))

# classifies dependencies: administration
def classify_dependencies(row, col):
    dep_dict = {"SUB": 3, "CORP": 1, "MUNI": 2, "DELE": 5}
    return next((dep_dict[k] for k in dep_dict.keys() if k in row[col]), np.nan)
    
df["dependency"] = df.apply(classify_dependencies, col = "dependency", axis = 1)
df = df.rename(columns = {"dependency":"administration"})
print("{:.2f} s | Classified administration column.".format(time.perf_counter()-before))

# Changes IDs and column names (area: zone)
def zone(row, col):
    zone_dict = {"URBANO": 1, "RURAL": 2}
    return next((zone_dict[k] for k in zone_dict.keys() if k in row[col]), np.nan)
    
df["area"] = df.apply(zone, col = "area", axis = 1)
df = df.rename(columns = {"area":"zone_id"})
print("{:.2f} s | Changed zone IDs.".format(time.perf_counter()-before))

# classifies priorities
def classify_priorities(row, col):
    pri_dict = {"SIN INFORMACION": 0, "SIN INFORMACIÓN": 0, "PRIMERA PRIORIDAD": 1, "1ª PRIORIDAD": 1, "SEGUNDA PRIORIDAD": 2, "2ª PRIORIDAD": 2, "TERCERA PRIORIDAD": 3, "3ª PRIORIDAD": 3, "NO VULNERABLES": 4, "NO APLICA": 4}
    return pri_dict[row[col]]
    
df["priority"] = df.apply(classify_priorities, col = "priority", axis = 1)
df["priority"] = pd.to_numeric(df["priority"], downcast = "integer")
print("{:.2f} s | Classified priority column.".format(time.perf_counter()-before))

# Changes IDs and column names (level: education)
def education(row, col):
    ed_dict = {"BASICA": 1, "MEDIA": 2}
    return next((ed_dict[k] for k in ed_dict.keys() if k in row[col]), np.nan)
    
df["level"] = df.apply(education, col = "level", axis = 1)
df = df.rename(columns = {"level":"education"})
print("{:.2f} s | Changed education IDs.".format(time.perf_counter()-before))

# writes datachile official IDs for each commune and drops columns: dv_rbd and school_name
df_ids = pd.read_csv("https://raw.githubusercontent.com/datachile/datachile-etl/master/official_ids/2017_06_27_comunas_datachile_fixed.csv")
df = pd.merge(df, df_ids, left_on = "commune_code", right_on = "comuna_customs_id")
df = df[["rbd", "administration", "zone_id", "comuna_datachile_id", "education", "year", "priority", "total"]]
df = df.rename(columns = {"administration": "administration_id", "comuna_datachile_id": "comuna_id", "education": "education_id", "priority": "priority_id"})

# comes back to original path, creates data_final folder and exports as csv
os.chdir(absolute_path)
if os.path.isdir("data_final") == False:
    os.mkdir("data_final")    
os.chdir("data_final")
df.to_csv("ive_junaeb.csv", index = False)
print("{:.2f} s | Exported CSV file.".format(time.perf_counter()-before))

# creates CSV with priority IDs
pri_tb = {"id": list(range(5)), "name_es": ["Sin información", "Primera prioridad", "Segunda prioridad", "Tercera prioridad", "No vulnerables"], "name_en": ["No information", "First priority", "Second priority", "Third priority", "Not vulnerable"]}
pri_df = pd.DataFrame(pri_tb)
pri_df.to_csv("priority.csv", index = False)
print("{:.2f} s | Exported priority.csv".format(time.perf_counter()-before))

# comes back to original path
os.chdir(absolute_path)