In [651]:
import tabula
import pandas as pd
import numpy as np
from unidecode import unidecode
import matplotlib.pyplot as plt

# Remove pandas' display truncation: a value of None means "no limit", so
# every column and every row is rendered when a DataFrame is shown.
pd.options.display.max_columns = None
pd.options.display.max_rows = None
    
# Source PDF for each report year.
# 2014 is split by semester, and each semester has separate "theory" and
# "driving" documents; every later year maps directly to a single file name.
files = dict()
files[2014] = {
    "1-sem": dict(
        theory="Taxasdeaprovacao_1_Semestre_2014_Teoricas_Integradas.pdf",
        driving="Taxasdeaprovacao_1_Semestre_2014_Praticas_Integradas.pdf",
    ),
    "2-sem": dict(
        theory="Taxasdeaprovacao_2_Semestre_2014_Teoricas_Integradas.pdf",
        driving="Taxasdeaprovacao_2_Semestre_2014_Praticas_Integradas.pdf",
    ),
}
files.update({
    2015: "TaxasApr_2015_Relatorio_Escolas_de_Condução.pdf",
    2016: "TaxasApr_2016_Relatorio_EscolasDeConducao.pdf",
    2017: "TaxasApr_2017_Relatorio_EscolasDeConducao.pdf",
    2018: "TaxasApr_2018_Relatorio_EscolasDeConducao.pdf",
    2019: "EscolasdeCondução-2019.pdf",
    2020: "EscolasdeCondução-2020.pdf",
})

def parse_rate(x):
    """Convert one rate cell (e.g. ``'85,5%'``) into a float fraction.

    Parameters
    ----------
    x : str or missing value
        Cell text using a decimal comma and, optionally, a trailing ``%``.
        Missing values (``None``, ``NaN``, ``pd.NA``) are accepted.

    Returns
    -------
    float
        The parsed number; a ``%`` suffix is turned into a ``e-2`` exponent,
        so ``'85,5%'`` becomes ``0.855``. The spreadsheet error marker
        ``'#DIV/0!'`` becomes ``np.nan``. Missing inputs are returned
        unchanged.

    Raises
    ------
    ValueError
        If the text is not a number after the substitutions.
    """
    # Test for missing values first: comparing pd.NA with a string yields
    # pd.NA, whose truth value is ambiguous and raises TypeError in an `if`.
    if pd.isna(x):
        return x

    # Tolerate padding around the cell text.
    text = x.strip()
    if text == '#DIV/0!':
        # np.nan is the canonical lowercase constant; the uppercase aliases
        # are not guaranteed to exist across NumPy versions.
        return np.nan

    return float(text.replace(',', '.').replace('%', 'e-2'))
    
def parse_int(x):
    """Convert one count cell into a NumPy 64-bit integer.

    Parameters
    ----------
    x : number or missing value
        The cell value. Missing values (``None``, ``NaN``, ``pd.NA``) are
        accepted.

    Returns
    -------
    numpy.int64 or float
        ``np.int64(x)`` for present values, ``np.nan`` for missing ones.
    """
    if pd.isna(x):
        return np.nan
    # NumPy's scalar type is spelled `int64`; `np.Int64` is not a NumPy
    # attribute ('Int64' is only the *name* of pandas' nullable dtype), so the
    # capitalised spelling raised AttributeError for every non-missing value.
    return np.int64(x)
    
def name_to_keyword(x):
    """Normalise a raw name into a lowercase, underscore-separated key.

    The text is transliterated with ``unidecode`` and lowercased, then the
    substitutions below are applied one after another, in the listed order:
    angle brackets and parentheses are removed, spaces become underscores,
    the prefixes ``escola_de_conducao_`` / ``escola_do_`` are stripped,
    ``/`` and ``-`` become underscores, and finally runs of three and then
    two underscores are each replaced by a single one.
    """
    # Order matters: the prefix removal relies on spaces already being
    # underscores, and the underscore squeezing must come last.
    substitutions = (
        ('>', ''),
        ('<', ''),
        ('(', ''),
        (')', ''),
        (' ', '_'),
        ('escola_de_conducao_', ''),
        ('escola_do_', ''),
        ('/', '_'),
        ('-', '_'),
        ('___', '_'),
        ('__', '_'),
    )

    keyword = unidecode(x).lower()
    for old, new in substitutions:
        keyword = keyword.replace(old, new)
    return keyword
    
# Column names assigned to every extracted table (eight columns, in order).
headers = [
    'n_ec',
    'name_raw',
    'theory_scheduled',
    'theory_done',
    'theory_rate',
    'driving_scheduled',
    'driving_done',
    'driving_rate',
]

In [510]:
# --- 2015 report: extract every page, clean the columns, split off the last row ---
year = 2015
path = files[year]
all_pages = tabula.read_pdf(path, stream=True, pages="all", pandas_options={'header': None})
table = pd.concat(all_pages, ignore_index=True, sort=False)
# Drop the columns labelled 8, 9 and 10 so the remaining ones line up with `headers`.
table = table.drop([8, 9, 10], axis=1)
table.columns = headers

# Rates arrive as text and are parsed to floats; driving counts use the nullable
# 'Int64' dtype so missing cells do not force a float column.
table['theory_rate'] = table['theory_rate'].apply(parse_rate)
table['driving_rate'] = table['driving_rate'].apply(parse_rate)
table["driving_done"] = table["driving_done"].astype('Int64')
table["driving_scheduled"] = table["driving_scheduled"].astype('Int64')

# The last extracted row is kept aside and used as a totals row for quality control.
summary = table.tail(1)
# .copy() makes the remaining rows an independent frame, so adding column 'k'
# below is not an assignment into a slice (which triggers SettingWithCopyWarning).
table = table.iloc[:-1, :].copy()
table['k'] = table['name_raw'].apply(name_to_keyword)

table2015 = table.set_index('k')
summary2015 = summary
# Quality control: the totals row must equal the sum over the individual rows.
assert summary2015["driving_done"].values[0] == table2015["driving_done"].sum(), \
    "2015: 'driving_done' in the last row does not match the column sum"

In [511]:
# --- 2016 report: extract every page, clean the columns, split off the last row ---
year = 2016
path = files[year]

all_pages = tabula.read_pdf(path, stream=True, pages="all", pandas_options={'header': None})
table = pd.concat(all_pages, ignore_index=True, sort=False)
# Drop the columns labelled 8, 9 and 10 so the remaining ones line up with `headers`.
table = table.drop([8, 9, 10], axis=1)
table.columns = headers

# Rates arrive as text and are parsed to floats; driving counts use the nullable
# 'Int64' dtype so missing cells do not force a float column.
table['theory_rate'] = table['theory_rate'].apply(parse_rate)
table['driving_rate'] = table['driving_rate'].apply(parse_rate)
table["driving_done"] = table["driving_done"].astype('Int64')
table["driving_scheduled"] = table["driving_scheduled"].astype('Int64')

# The last extracted row is kept aside and used as a totals row for quality control.
summary = table.tail(1)
# .copy() makes the remaining rows an independent frame, so adding column 'k'
# below is not an assignment into a slice (which triggers SettingWithCopyWarning).
table = table.iloc[:-1, :].copy()
table['k'] = table['name_raw'].apply(name_to_keyword)

table2016 = table.set_index('k')
summary2016 = summary
# Quality control: the totals row must equal the sum over the individual rows.
assert summary2016["driving_done"].values[0] == table2016["driving_done"].sum(), \
    "2016: 'driving_done' in the last row does not match the column sum"

In [512]:
# --- 2017 report: extract every page, clean the columns, split off the last row ---
year = 2017
path = files[year]

all_pages = tabula.read_pdf(path, stream=True, pages="all", pandas_options={'header': None})
table = pd.concat(all_pages, ignore_index=True, sort=False)
# Drop the columns labelled 8, 9 and 10 so the remaining ones line up with `headers`.
table = table.drop([8, 9, 10], axis=1)
table.columns = headers

# Rates arrive as text and are parsed to floats; driving counts use the nullable
# 'Int64' dtype so missing cells do not force a float column.
table['theory_rate'] = table['theory_rate'].apply(parse_rate)
table['driving_rate'] = table['driving_rate'].apply(parse_rate)
table["driving_done"] = table["driving_done"].astype('Int64')
table["driving_scheduled"] = table["driving_scheduled"].astype('Int64')

# The last extracted row is kept aside and used as a totals row for quality control.
summary = table.tail(1)
# .copy() makes the remaining rows an independent frame, so adding column 'k'
# below is not an assignment into a slice (which triggers SettingWithCopyWarning).
table = table.iloc[:-1, :].copy()
table['k'] = table['name_raw'].apply(name_to_keyword)

table2017 = table.set_index('k')
summary2017 = summary
# Quality control: the totals row must equal the sum over the individual rows.
assert summary2017["driving_done"].values[0] == table2017["driving_done"].sum(), \
    "2017: 'driving_done' in the last row does not match the column sum"

In [521]:
# --- 2018 report: extracted with explicit area/column boundaries instead of guessing ---
year = 2018
path = files[year]
# Page 1 uses a different top boundary (15 vs 9) from the remaining pages;
# the column split positions are the same for both reads.
page1 = tabula.read_pdf(
    path,
    guess=False,
    relative_area=True,
    area=[15, 5, 95, 75],
    columns=[60, 70, 255, 340, 400, 450, 500, 560],
    pages='1',
    stream=True,
    pandas_options={'header': None},
)
rest = tabula.read_pdf(
    path,
    guess=False,
    relative_area=True,
    area=[9, 5, 95, 75],
    columns=[60, 70, 255, 340, 400, 450, 500, 560],
    pages='2-23',
    stream=True,
    pandas_options={'header': None},
)

# Both reads yield lists of per-page frames; concatenate the lists directly.
all_pages = page1 + rest
table = pd.concat(all_pages, ignore_index=True, sort=False)

# Drop the column labelled 0 so the remaining ones line up with `headers`.
table = table.drop([0], axis=1)
table.columns = headers

# Rates arrive as text and are parsed to floats; driving counts use the nullable
# 'Int64' dtype so missing cells do not force a float column.
table['theory_rate'] = table['theory_rate'].apply(parse_rate)
table['driving_rate'] = table['driving_rate'].apply(parse_rate)
table["driving_done"] = table["driving_done"].astype('Int64')
table["driving_scheduled"] = table["driving_scheduled"].astype('Int64')

# The last extracted row is kept aside and used as a totals row for quality control.
summary = table.tail(1)
# .copy() makes the remaining rows an independent frame, so adding column 'k'
# below is not an assignment into a slice (which triggers SettingWithCopyWarning).
table = table.iloc[:-1, :].copy()
table['k'] = table['name_raw'].apply(name_to_keyword)

table2018 = table.set_index('k')
summary2018 = summary
# Quality control: the totals row must equal the sum over the individual rows.
assert summary2018["driving_done"].values[0] == table2018["driving_done"].sum(), \
    "2018: 'driving_done' in the last row does not match the column sum"

In [567]:
# --- 2019 report: extracted with explicit area/column boundaries instead of guessing ---
year = 2019
path = files[year]

# Page 1 uses a different top boundary (9 vs 5) from the remaining pages;
# the column split positions are the same for both reads.
page1 = tabula.read_pdf(
    path,
    guess=False,
    relative_area=True,
    area=[9, 0, 95, 77],
    columns=[35, 470, 800, 885, 1000, 1100, 1200],
    pages='1',
    stream=True,
    pandas_options={'header': None},
)
rest = tabula.read_pdf(
    path,
    guess=False,
    relative_area=True,
    area=[5, 0, 95, 77],
    columns=[35, 470, 800, 885, 1000, 1100, 1200],
    pages='2-21',
    stream=True,
    pandas_options={'header': None},
)

# Both reads yield lists of per-page frames; concatenate the lists directly.
all_pages = page1 + rest
table = pd.concat(all_pages, ignore_index=True, sort=False)

table.columns = headers

# Rates arrive as text and are parsed to floats; driving counts use the nullable
# 'Int64' dtype so missing cells do not force a float column.
table['theory_rate'] = table['theory_rate'].apply(parse_rate)
table['driving_rate'] = table['driving_rate'].apply(parse_rate)
table["driving_done"] = table["driving_done"].astype('Int64')
table["driving_scheduled"] = table["driving_scheduled"].astype('Int64')

# The last extracted row is kept aside and used as a totals row for quality control.
summary = table.tail(1)
# .copy() makes the remaining rows an independent frame, so adding column 'k'
# below is not an assignment into a slice (which triggers SettingWithCopyWarning).
table = table.iloc[:-1, :].copy()
table['k'] = table['name_raw'].apply(name_to_keyword)

table2019 = table.set_index('k')
summary2019 = summary
# Quality control: the totals row must equal the sums over the individual rows.
assert summary2019["driving_done"].values[0] == table2019["driving_done"].sum(), \
    "2019: 'driving_done' in the last row does not match the column sum"
assert summary2019["theory_done"].values[0] == table2019["theory_done"].sum(), \
    "2019: 'theory_done' in the last row does not match the column sum"

In [658]:
# --- 2020 report: explicit area/column boundaries; no trailing row is split off here ---
year = 2020
path = files[year]

# The first page is read with its own area; pages 2-20 share another one.
page1 = tabula.read_pdf(
    path,
    guess=False,
    relative_area=True,
    area=[10, 0, 95, 80],
    columns=[150, 600, 800, 885, 1000, 1100, 1230, 1300],
    pages='1',
    stream=True,
    pandas_options={'header': None},
)
rest = tabula.read_pdf(
    path,
    guess=False,
    relative_area=True,
    area=[5, 0, 95, 77],
    columns=[150, 600, 800, 885, 1000, 1100, 1230, 1300],
    pages='2-20',
    stream=True,
    pandas_options={'header': None},
)

all_pages = page1 + rest
table = pd.concat(all_pages, ignore_index=True, sort=False)

table.columns = headers

# Parse both rate columns, then cast all four count columns to nullable integers.
table = table.assign(
    theory_rate=table['theory_rate'].apply(parse_rate),
    driving_rate=table['driving_rate'].apply(parse_rate),
).astype({
    "driving_done": 'Int64',
    "driving_scheduled": 'Int64',
    "theory_done": 'Int64',
    "theory_scheduled": 'Int64',
})
table = table.assign(k=table['name_raw'].apply(name_to_keyword))

table2020 = table.set_index('k')

# Column totals; 'theory_done' is pinned to a hard-coded expected total.
print(table2020["theory_scheduled"].sum())
assert table2020["theory_done"].sum() == 175301
print(table2020["driving_scheduled"].sum())
print(table2020["driving_done"].sum())

189913
191114
177924
