# <u>Main Code</u>

In [2]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import urllib.request
from urllib.parse import urlparse
import time
import os
import re
import urllib3

# starts the timer used by every progress message and remembers the
# directory the script was launched from
before = time.perf_counter()
absolute_path = os.getcwd()

# works around SSL certificate errors when retrieving files: when the
# PYTHONHTTPSVERIFY environment variable is unset or empty, the default HTTPS
# context factory is swapped for the unverified one (if ssl provides it)
import os
import ssl
_unverified_context = getattr(ssl, "_create_unverified_context", None)
if _unverified_context and not os.environ.get("PYTHONHTTPSVERIFY", ""):
    ssl._create_default_https_context = _unverified_context

# silences the warning urllib3 emits for unverified HTTPS requests
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# retrieve links from site
# NOTE(review): verify=False turns off TLS certificate checking for this
# request; kept as in the original workaround.
url = "https://www.junaeb.cl/ive"
page = requests.get(url, verify=False)
soup = BeautifulSoup(page.text, "lxml")
a_tags = soup.find_all("a")
all_links = [link.get("href") for link in a_tags]
# keeps Excel links only, leaving out the IVESINAE_COMUNA_2013 file; anchors
# without an href yield None and are skipped instead of raising TypeError
links = [
    link
    for link in all_links
    if link and ".xls" in link and "IVESINAE_COMUNA_2013" not in link
]

# collects the dataframes appended by clean()
years_df = []

# creates data_temp folder (if missing) and changes working directory
os.makedirs("data_temp", exist_ok=True)
os.chdir("data_temp")

# downloads files that are not already present in data_temp
for url in links:
    # percent-encodes the link while leaving ":" and "/" untouched
    encoded_url = urllib.parse.quote(url.encode("utf-8"), ":/")
    filename = encoded_url[encoded_url.rfind("/") + 1:]
    if not os.path.exists(filename):
        urllib.request.urlretrieve(encoded_url, filename)
        print("{:.2f} s | Downloaded {}".format(time.perf_counter()-before, url))

# processing function
def _tidy_sheet(sheet_df, sheet_name, year):
    """Return one worksheet reshaped into the tidy (long) layout.

    Args:
        sheet_df: dataframe read from a single worksheet.
        sheet_name: worksheet name; stored (with "Á" replaced by "A") in the
            "level" column.
        year: value stored in the "year" column; expected to be a string
            because every non-numeric column is uppercased.

    Returns:
        A dataframe with the columns rbd, dependency, area, commune_code,
        level, year, priority and total.
    """
    # keeps the first 14 columns and discards the last row
    # (assumes the default positional index produced by read_excel)
    frame = sheet_df.iloc[:-1, :14].copy()

    # columns 9-13 hold the per-priority counts; missing counts become 0
    priority_cols = list(frame.columns[9:14])
    frame[priority_cols] = frame[priority_cols].fillna(0)

    # adds the columns that identify the sheet and the year
    frame["level"] = sheet_name.replace("Á", "A")
    frame["year"] = year

    # melts the priority columns into (priority, total) pairs
    id_cols = [c for c in frame.columns if c not in priority_cols]
    tidy = pd.melt(
        frame,
        id_vars=id_cols,
        value_vars=priority_cols,
        var_name="priority",
        value_name="total",
    )

    # changes column names
    tidy.columns = [
        "rbd", "dv_rbd", "school_name", "dependency", "area", "region_code",
        "province_code", "commune_code", "commune_name", "level", "year",
        "priority", "total",
    ]

    # drops unnecessary columns: "dv_rbd", "school_name", "region_code",
    # "province_code" and "commune_name"; the copy avoids assigning into a
    # view of the melted frame
    tidy = tidy[
        ["rbd", "dependency", "area", "commune_code", "level", "year",
         "priority", "total"]
    ].copy()

    # changes column types
    num_cols = ["rbd", "commune_code", "total"]
    tidy[num_cols] = tidy[num_cols].apply(pd.to_numeric, downcast="integer")

    # uppercases every remaining (string) column in one vectorized pass
    for column in [c for c in tidy.columns if c not in num_cols]:
        tidy[column] = tidy[column].str.upper()

    return tidy


def clean(file, year):
    """Read an Excel workbook, tidy its first two sheets and store the result.

    The ``encoding`` keyword previously passed to ``pd.read_excel`` has been
    dropped: current pandas releases reject it with a TypeError.

    Args:
        file: path of the Excel file to read.
        year: year label written into the "year" column of every row.

    Returns:
        The concatenated tidy dataframe. The same object is appended to the
        module-level ``years_df`` list, as before.
    """
    # defines scope of variable
    global years_df

    # reads every sheet of the workbook; only the first two are processed
    workbook = pd.read_excel(file, sheet_name=None)
    sheets = list(workbook)[:2]

    # concatenates the tidy sheets (also works for a single-sheet workbook)
    tidy_sheets = [_tidy_sheet(workbook[tb], tb, year) for tb in sheets]
    df = pd.concat(tidy_sheets, ignore_index=True)
    years_df.append(df)
    return df

# function to get year information
def get_year(string):
    """Return the first run of four digits found in ``string``.

    Args:
        string: text (here, a file name) that contains a four-digit year.

    Returns:
        The four digits as a string, e.g. ``"2019"``.

    Raises:
        ValueError: if ``string`` contains no run of four digits.
    """
    # raw string avoids the invalid "\d" escape; requiring four digits keeps
    # a stray leading digit from being sliced into a bogus "year"
    match = re.search(r"\d{4}", string)
    if match is None:
        raise ValueError("no four-digit year found in {!r}".format(string))
    return match.group()

# processes each downloaded workbook in the current folder
# Only names containing ".xls" (the same test used when collecting links) are
# cleaned; hidden entries and Excel lock files ("~$...") are skipped so they
# cannot crash get_year()/clean(). Sorting makes the order reproducible.
for file in sorted(os.listdir()):
    if ".xls" not in file or file.startswith((".", "~$")):
        continue
    print("{:.2f} s | Cleaning {}".format(time.perf_counter()-before, file))
    clean(file, get_year(file))

# concatenates each year's dataframe
df = pd.concat(years_df, ignore_index=True)
print("{:.2f} s | Concatenated each year's dataframe.".format(time.perf_counter()-before))

# classifies dependencies: administration
def classify_dependencies(row, col):
    """Translate the dependency text in ``row[col]`` into a numeric code.

    The keywords SUB, CORP, MUNI and DELE are tried in that order; the first
    one contained in the value decides the code. NaN is returned when none of
    them is found.
    """
    codes_by_keyword = {"SUB": 3, "CORP": 1, "MUNI": 2, "DELE": 2}
    for keyword, code in codes_by_keyword.items():
        if keyword in row[col]:
            return code
    return np.nan
    
# swaps every dependency label for its code (computed row by row) and
# renames the column accordingly
administration_codes = df.apply(classify_dependencies, axis=1, col="dependency")
df["dependency"] = administration_codes
df = df.rename(columns={"dependency": "administration"})
elapsed = time.perf_counter() - before
print("{:.2f} s | Classified administration column.".format(elapsed))

# Changes IDs and column names (area: zone)
def zone(row, col):
    """Return the zone ID for the area text stored in ``row[col]``.

    A value containing "URBANO" maps to 1 and one containing "RURAL" maps
    to 2 (checked in that order); anything else yields NaN.
    """
    area_text = row[col]
    for label, zone_id in (("URBANO", 1), ("RURAL", 2)):
        if label in area_text:
            return zone_id
    return np.nan
    
# converts the area labels into zone IDs (row by row) and renames the column
zone_ids = df.apply(zone, axis=1, col="area")
df["area"] = zone_ids
df = df.rename(columns={"area": "zone_id"})
elapsed = time.perf_counter() - before
print("{:.2f} s | Changed zone IDs.".format(elapsed))

# classifies priorities
def classify_priorities(row, col):
    """Look up the numeric priority ID for the label stored in ``row[col]``.

    The label must match one of the known spellings exactly; an unknown
    label raises KeyError.
    """
    priority_ids = {
        "SIN INFORMACION": 0,
        "SIN INFORMACIÓN": 0,
        "PRIMERA PRIORIDAD": 1,
        "1ª PRIORIDAD": 1,
        "SEGUNDA PRIORIDAD": 2,
        "2ª PRIORIDAD": 2,
        "TERCERA PRIORIDAD": 3,
        "3ª PRIORIDAD": 3,
        "NO VULNERABLES": 4,
        "NO APLICA": 4,
        "NO PRIORIZADO EN VULNERABILIDAD": 4,
    }
    label = row[col]
    return priority_ids[label]
    
# replaces each priority label with its ID (row by row), then stores the IDs
# using the smallest integer type that fits
priority_ids = df.apply(classify_priorities, axis=1, col="priority")
df["priority"] = priority_ids
df["priority"] = pd.to_numeric(df["priority"], downcast="integer")
elapsed = time.perf_counter() - before
print("{:.2f} s | Classified priority column.".format(elapsed))

# Changes IDs and column names (level: education)
def education(row, col):
    """Return the education ID for the level text stored in ``row[col]``.

    "BASICA" is checked first (ID 1), then "MEDIA" (ID 2); NaN is returned
    when the value contains neither.
    """
    ids_by_level = {"BASICA": 1, "MEDIA": 2}
    matches = [ids_by_level[name] for name in ids_by_level if name in row[col]]
    if matches:
        return matches[0]
    return np.nan
    
# converts the level labels into education IDs (row by row) and renames the
# column
education_ids = df.apply(education, axis=1, col="level")
df["level"] = education_ids
df = df.rename(columns={"level": "education"})
elapsed = time.perf_counter() - before
print("{:.2f} s | Changed education IDs.".format(elapsed))

# attaches the datachile official ID of each commune by matching
# commune_code against comuna_customs_id, then keeps and renames the
# columns that go into the final table
df_ids = pd.read_csv("https://raw.githubusercontent.com/datachile/datachile-etl/master/official_ids/2017_06_27_comunas_datachile_fixed.csv")
df = df.merge(df_ids, left_on="commune_code", right_on="comuna_customs_id")
df = df[
    ["rbd", "administration", "zone_id", "comuna_datachile_id", "education",
     "year", "priority", "total"]
].rename(
    columns={
        "administration": "administration_id",
        "comuna_datachile_id": "comuna_id",
        "education": "education_id",
        "priority": "priority_id",
    }
)

# goes back to the original path, creates the data_final folder if needed
# and exports the table as CSV
os.chdir(absolute_path)
if not os.path.isdir("data_final"):
    os.mkdir("data_final")
os.chdir("data_final")
df.to_csv("ive_junaeb.csv", index=False)
elapsed = time.perf_counter() - before
print("{:.2f} s | Exported CSV file.".format(elapsed))

# creates CSV with priority IDs and their Spanish/English names
pri_tb = {
    "id": [0, 1, 2, 3, 4],
    "name_es": [
        "Sin información",
        "Primera prioridad",
        "Segunda prioridad",
        "Tercera prioridad",
        "No vulnerables",
    ],
    "name_en": [
        "No information",
        "First priority",
        "Second priority",
        "Third priority",
        "Not vulnerable",
    ],
}
pri_df = pd.DataFrame(pri_tb)
pri_df.to_csv("priority.csv", index=False)
elapsed = time.perf_counter() - before
print("{:.2f} s | Exported priority.csv".format(elapsed))

# goes back to the original path
os.chdir(absolute_path)

5.96 s | Downloaded http://www.junaeb.cl/wp-content/uploads/2013/02/IVE-2019-1.xlsx
9.50 s | Downloaded http://www.junaeb.cl/wp-content/uploads/2018/03/IVE-POR-RBD-BASICA-MEDIA-COMUNA-2018.xlsx
12.76 s | Downloaded https://www.junaeb.cl/wp-content/uploads/2017/03/PRIORIDADES-POR-RBD-CON-IVE-SINAE-2017-BASICA-MEDIA-COMUNA.xlsx
16.17 s | Downloaded https://www.junaeb.cl/wp-content/uploads/2016/01/PRIORIDADES-2016-CON-IVE-SINAE-BASICA-MEDIA-Y-COMUNAL_1.xlsx
19.41 s | Downloaded https://www.junaeb.cl/wp-content/uploads/2015/01/PRIORIDADES-2015-CON-IVE-SINAE-BASICA-MEDIA-Y-COMUNAL.xlsx
22.66 s | Downloaded https://www.junaeb.cl/wp-content/uploads/2013/02/IVE-SINAE-BASICA-MEDIA-Y-COMUNAL-2014-OFICIAL-09012014.xlsx
25.98 s | Downloaded https://www.junaeb.cl/wp-content/uploads/2013/02/IVE-SINAE_2013-OFICIAL-14022013.xlsx
29.41 s | Downloaded https://www.junaeb.cl/wp-content/uploads/2013/02/PRIORIDADES-2012-BÁSICA-MEDIA-COMUNA-CON-IVE-SINAE-OFICIAL.xlsx
35.14 s | Downloaded https://www.junaeb.c