In [None]:
import os
import pandas as pd
import time
import openpyxl
# Run from the project folder so that the relative "./data_*" paths used by
# the following cells resolve. The original machine-specific location is kept
# as the default; set the DEIS_EMERGENCY_CARE_DIR environment variable to run
# the notebook from a different checkout.
os.chdir(os.environ.get("DEIS_EMERGENCY_CARE_DIR", "/Users/mauricio/datachile-etl/health/deis_emergency_care"))

In [None]:
# Make sure the folder that receives the intermediate workbooks exists.
os.makedirs("./data_processing", exist_ok = True)

In [None]:
# Source workbooks to split; the extraction loop expects "<region>_<year>.xlsx".
# Only real .xlsx workbooks are kept: stray directory entries such as macOS
# ".DS_Store" or Excel "~$" lock files would make openpyxl.load_workbook fail.
# Sorting makes the processing order reproducible (os.listdir order is arbitrary).
names = sorted(
    entry for entry in os.listdir("./data_temp")
    if entry.endswith(".xlsx") and not entry.startswith("~$")
)

In [None]:
# Extracting pieces of excel sheets to a whole excel file in a different folder
#
# Five fixed cell ranges (54 columns A..BB by 26 rows each) are copied out of
# the active sheet of every source workbook. Each range is written, as text,
# into its own new workbook anchored at A1 and saved as
# "./data_processing/<region>_<year>_<tag>.xlsx".

# Destination coordinates A1..BB26 in row-major order. They do not depend on
# the workbook being processed, so they are built once instead of once per file.
single_letters = list("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
double_letters = ["A{}".format(l) for l in single_letters] + ["BA","BB"]
letters = single_letters + double_letters
zero_range = ["{}{}".format(l, n) for n in range(1,27) for l in letters]

# (source cell range, tag appended to the output file name)
age_blocks = [
    ("A47:BB72", "less1"),
    ("A76:BB101", "1to4"),
    ("A105:BB130", "5to14"),
    ("A134:BB159", "15to65"),
    ("A163:BB188", "65plus"),
]

for a, name in enumerate(names, start = 1):
    print("{}) {}".format(a, name))
    wb = openpyxl.load_workbook("./data_temp/" + name)
    # File names are expected to be exactly "<region>_<year>.xlsx"; any other
    # number of "_"-separated parts raises ValueError on the unpacking below.
    wk_region, wk_year = name.replace(".xlsx","").split("_")
    sh = wb.active

    # Processing Function
    def process(cell_range, tag):
        """Copy one cell range of the current sheet into its own workbook.

        Reads `cell_range` from `sh` (the active sheet of the workbook being
        processed), writes the values row by row into a fresh workbook
        starting at A1, and saves it under "./data_processing".

        Every value is stored as text via "{}".format, so an empty cell
        becomes the string "None" and numbers become digit strings. A later
        cell of this notebook renames a column headed "None", so this
        conversion must be kept.

        Parameters
        ----------
        cell_range : str
            A1-style range on the source sheet. It is paired positionally
            with `zero_range` (54 columns x 26 rows); zip stops at the
            shorter of the two, so a larger range is silently truncated.
        tag : str
            Label appended to the output file name after region and year.
        """
        new = sh[cell_range]
        new_wb = openpyxl.Workbook()
        new_sh = new_wb.active
        data = ["{}".format(cell.value) for row in new for cell in row]
        for coordinate, value in zip(zero_range, data):
            new_sh[coordinate] = value

        new_wb.save("./data_processing/" + wk_region + "_" + wk_year + "_" + tag + ".xlsx")

    # Actual processing
    for cell_range, tag in age_blocks:
        process(cell_range, tag)


In [None]:
# Loading filenames
# Intermediate workbooks to tidy; the tidy loop expects
# "<region>_<year>_<age>.xlsx". Anything that is not an .xlsx workbook
# (for example macOS ".DS_Store" or Excel "~$" lock files) is skipped, because
# it would break pd.read_excel or the three-way file-name unpacking.
# Sorting makes the row order of the final table reproducible, since
# os.listdir returns entries in arbitrary order.
files = sorted(
    entry for entry in os.listdir("./data_processing")
    if entry.endswith(".xlsx") and not entry.startswith("~$")
)

In [None]:
# Dimension tables, loaded in this order: emergency, age range, geography.
e_dim, age_dim, geo_dim = [
    pd.read_csv(csv_path)
    for csv_path in (
        "./data_final/emergency_table.csv",
        "./data_final/age_table.csv",
        "./data_final/geography_table.csv",
    )
]

In [None]:
# Date dimension
# Every column is read as text (dtype "object"): date_index is later matched
# against "<year>-<week>" strings assembled from the file name and the week
# column labels, and date_id is only converted to int once the fact table has
# been concatenated.
date_dim = pd.read_csv("./data_final/date_table.csv", dtype = "object")

In [None]:
# Tidying dataframes
#
# Each intermediate workbook is reshaped from wide (one column per week) to
# long (one row per emergency and week), and its labels are swapped for the
# matching dimension ids. The per-file results are collected in df_list.

def _attach_id(frame, dim, key, id_col, index_col):
    """Left-join `dim` on frame[key] == dim[index_col], keeping only `id_col`.

    Both join columns (`key` and `index_col`) are dropped from the result;
    rows without a match keep a missing value in `id_col`.
    """
    joined = frame.merge(dim[[id_col, index_col]], left_on = key, right_on = index_col, how = "left")
    return joined.drop(columns = [key, index_col])


df_list = []

for file_no, file in enumerate(files, start = 1):
    print("{}) {}".format(file_no, file))
    wide = pd.read_excel("./data_processing/" + file)
    f_region, f_year, f_age = file.replace(".xlsx","").split("_")

    # Rows 0, 1, 8, 14 and 19 are discarded by position, the "Total" column is
    # removed, and the column headed "None" becomes the emergency label.
    wide = (
        wide
        .drop(labels = [0,1,8,14,19])
        .drop(columns = ["Total"])
        .rename(columns = {"None":"emergency"})
    )

    # Wide -> long, then emergency label -> emergency_id
    tidy = pd.melt(wide, id_vars = "emergency", var_name = "week", value_name = "total")
    tidy = _attach_id(tidy, e_dim, "emergency", "emergency_id", "emergency_index")

    # "<year>-<week>" -> date_id
    tidy = tidy.assign(date = f_year + "-" + tidy["week"]).drop(columns = ["week"])
    tidy = _attach_id(tidy, date_dim, "date", "date_id", "date_index")

    # Age tag and region taken from the file name -> age_id, geography_id
    tidy = _attach_id(tidy.assign(age = f_age), age_dim, "age", "age_id", "age_index")
    tidy = _attach_id(tidy.assign(region = f_region), geo_dim, "region", "geography_id", "geography_index")

    # Flag files in which at least one week could not be matched to a date
    if tidy["date_id"].isnull().any():
        print("NULL DATE VALUES")

    df_list.append(tidy)


In [None]:
# Stack the per-file frames into a single fact table.
df = pd.concat(df_list, ignore_index = True)

# Discard rows whose total is the literal "-" and rows with any missing value
# (including ids that found no match in their dimension table). The copy makes
# the result an independent frame, so the column assignments below do not act
# on a slice of the concatenated data.
df = df[df["total"] != "-"]
df = df.dropna().copy()

# date_id comes from a dimension table read as text, so it is cast explicitly.
df["date_id"] = df["date_id"].astype("int")

# A left merge that leaves unmatched rows turns an integer id column into
# float64, because NaN cannot be stored in an int column. Those rows are gone
# after dropna, so whole-number float ids are restored to int; otherwise the
# CSV written from this frame would carry keys such as "3.0".
# Columns that are not float, or hold non-integral values, are left untouched.
for id_col in ["geography_id", "emergency_id", "age_id"]:
    if df[id_col].dtype.kind == "f" and (df[id_col] % 1 == 0).all():
        df[id_col] = df[id_col].astype("int")

df = df.reset_index(drop = True)
df = df[["date_id", "geography_id", "emergency_id", "age_id", "total"]]

In [None]:
# Write the fact table, creating the output folder first when it is missing.
os.makedirs("./data_final", exist_ok = True)
df.to_csv("./data_final/fact_emergency.csv", index = False)