In [767]:
import tabula
import pandas as pd
import numpy as np
from unidecode import unidecode
import matplotlib.pyplot as plt

# Disable pandas' display truncation: a value of None removes the limit, so
# every column and every row of a DataFrame is rendered when it is shown.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
    
# Input PDF file names, keyed by report year.
# 2014 is split by semester ("1-sem"/"2-sem") and by exam type
# ("theory"/"driving"); every later year is a single PDF whose name is
# passed directly to tabula.read_pdf in the cells below.
files = {
    2014: {
        "1-sem": {
            "theory": "Taxasdeaprovacao_1_Semestre_2014_Teoricas_Integradas.pdf",
            "driving": "Taxasdeaprovacao_1_Semestre_2014_Praticas_Integradas.pdf"
        },
        "2-sem": {
            "theory": "Taxasdeaprovacao_2_Semestre_2014_Teoricas_Integradas.pdf",
            "driving": "Taxasdeaprovacao_2_Semestre_2014_Praticas_Integradas.pdf"
        }
    },    
    2015: "TaxasApr_2015_Relatorio_Escolas_de_Condução.pdf",
    2016: "TaxasApr_2016_Relatorio_EscolasDeConducao.pdf",
    2017: "TaxasApr_2017_Relatorio_EscolasDeConducao.pdf",
    2018: "TaxasApr_2018_Relatorio_EscolasDeConducao.pdf",
    2019: "EscolasdeCondução-2019.pdf",
    2020: "EscolasdeCondução-2020.pdf"
}

def parse_rate(x):
    """Convert a percentage string from the PDF tables into a fraction.

    Parameters
    ----------
    x : str or missing value
        A rate such as ``'85,5%'`` (decimal comma, trailing percent sign),
        the spreadsheet error marker ``'#DIV/0!'``, or a missing value.

    Returns
    -------
    float or the original missing value
        ``0.855`` for ``'85,5%'``; ``np.nan`` for ``'#DIV/0!'``; a missing
        input (``None``, ``NaN``, ``pd.NA``) is returned unchanged.
    """
    # Check for missing values first: comparing pd.NA with a string yields
    # pd.NA, whose truth value is ambiguous and raises inside an `if`.
    if pd.isna(x):
        return x
    if x == '#DIV/0!':
        # The np.NAN alias was removed in NumPy 2.0; np.nan exists in every version.
        return np.nan
    # '85,5%' -> '85.5e-2' -> 0.855
    return float(x.replace(',', '.').replace('%', 'e-2'))
    
def parse_int(x):
    """Convert a table cell to a NumPy 64-bit integer.

    Parameters
    ----------
    x : number, numeric string or missing value

    Returns
    -------
    numpy.int64 or None
        ``None`` when ``x`` is missing (``None``, ``NaN``, ``pd.NA``),
        otherwise ``np.int64(x)``.
    """
    if pd.isna(x):
        return None
    # NumPy has no `np.Int64` attribute ('Int64' is only a pandas dtype
    # string); the scalar constructor is the lowercase `np.int64`.
    return np.int64(x)
    
def name_to_keyword(x):
    """Turn a raw school name into a lowercase ASCII key.

    The name is transliterated with ``unidecode`` and lowercased, then the
    substitutions below are applied one after another. Their order matters:
    spaces must already be underscores before the ``escola_...`` prefixes
    can match, and runs of underscores are collapsed last.
    """
    substitutions = (
        ('>', ''),
        ('<', ''),
        ('(', ''),
        (')', ''),
        (' ', '_'),
        ('escola_de_conducao_', ''),
        ('escola_do_', ''),
        ('/', '_'),
        ('-', '_'),
        ('___', '_'),
        ('__', '_'),
    )

    keyword = unidecode(x).lower()
    for old, new in substitutions:
        keyword = keyword.replace(old, new)
    return keyword
    
# Column names for the per-school tables: school number, raw name, then
# scheduled / done / rate for theory (t_) and driving (d_) exams.
headers = [
    'n_ec', 'name_raw',
    't_scheduled', 't_done', 't_rate',
    'd_scheduled', 'd_done', 'd_rate',
]
# Same layout followed by the three combined-total columns.
headers_full = headers + ['total_scheduled', 'total_done', 'total_rate']

In [816]:
# 2015 report: read every page with tabula's stream mode and stack the
# per-page frames into one table.
year = 2015
path = files[year]
all_pages = tabula.read_pdf(path, stream=True, pages="all", pandas_options={'header': None})
table = pd.concat(all_pages, ignore_index=True, sort=False)

table.columns = headers_full

# Rates arrive as strings like '85,5%'; counts go to the nullable Int64 dtype.
table['t_rate'] = table['t_rate'].apply(parse_rate)
table['d_rate'] = table['d_rate'].apply(parse_rate)
table["d_done"] = table["d_done"].astype('Int64')
table["d_scheduled"] = table["d_scheduled"].astype('Int64')
table["total_scheduled"] = table["total_scheduled"].astype('Int64')
table["total_done"] = table["total_done"].astype('Int64')
table['total_rate'] = table['total_rate'].apply(parse_rate)

# Quality control: the last row is kept aside as the totals row and the
# remaining rows are checked against it below.
summary = table.tail(1)
# .copy() makes the per-school rows an independent frame, so adding the 'k'
# column below is not an assignment on a slice (SettingWithCopyWarning).
table = table.iloc[:-1, :].copy()

table['k'] = table['name_raw'].apply(name_to_keyword)
table2015 = table.set_index('k')
summary2015 = summary

#####################################

# Each count column must add up to the value in the totals row.
for column in ("d_done", "t_done", "d_scheduled", "t_scheduled"):
    reported = summary2015[column].values[0]
    computed = table2015[column].sum()
    assert reported == computed, f"{year} {column}: totals row reports {reported}, rows sum to {computed}"

# Missing counts become 0 and missing rates become 1 so that every product
# in the weighted sum below is defined.
table2015[['d_done', 't_done', 'd_scheduled', 't_scheduled']] = table2015[['d_done', 't_done', 'd_scheduled', 't_scheduled']].fillna(0)
table2015[['d_rate', 't_rate']] = table2015[['d_rate', 't_rate']].fillna(1)

# Per-school done * rate, summed and divided by all exams done, gives the
# overall rate; it must match the totals row to 4 decimals.
table2015['total_passed'] = table2015['d_done']*table2015['d_rate'] + table2015['t_done']*table2015['t_rate']
total_passed = table2015['total_passed'].sum() / (table2015["d_done"].sum() + table2015["t_done"].sum())
assert round(total_passed, 4) == summary2015["total_rate"].values[0], f"{year}: computed overall rate {round(total_passed, 4)} differs from the totals row"

In [823]:
# 2016 report: read every page with tabula's stream mode and stack the
# per-page frames into one table.
year = 2016
path = files[year]

all_pages = tabula.read_pdf(path, stream=True, pages="all", pandas_options={'header': None})
table = pd.concat(all_pages, ignore_index=True, sort=False)
table.columns = headers_full

# Rates arrive as strings like '85,5%'; counts go to the nullable Int64 dtype.
table['t_rate'] = table['t_rate'].apply(parse_rate)
table['d_rate'] = table['d_rate'].apply(parse_rate)
table["d_done"] = table["d_done"].astype('Int64')
table["d_scheduled"] = table["d_scheduled"].astype('Int64')
table["total_scheduled"] = table["total_scheduled"].astype('Int64')
table["total_done"] = table["total_done"].astype('Int64')
table['total_rate'] = table['total_rate'].apply(parse_rate)

# Quality control: the last row is kept aside as the totals row and the
# remaining rows are checked against it below.
summary = table.tail(1)
# .copy() makes the per-school rows an independent frame, so adding the 'k'
# column below is not an assignment on a slice (SettingWithCopyWarning).
table = table.iloc[:-1, :].copy()
table['k'] = table['name_raw'].apply(name_to_keyword)

table2016 = table.set_index('k')
summary2016 = summary

# Each count column must add up to the value in the totals row.
for column in ("d_done", "t_done", "d_scheduled", "t_scheduled"):
    reported = summary2016[column].values[0]
    computed = table2016[column].sum()
    assert reported == computed, f"{year} {column}: totals row reports {reported}, rows sum to {computed}"

# Missing counts become 0 and missing rates become 1 so that every product
# in the weighted sum below is defined.
table2016[['d_done', 't_done', 'd_scheduled', 't_scheduled']] = table2016[['d_done', 't_done', 'd_scheduled', 't_scheduled']].fillna(0)
table2016[['d_rate', 't_rate']] = table2016[['d_rate', 't_rate']].fillna(1)

# Per-school done * rate, summed and divided by all exams done, gives the
# overall rate; it must match the totals row to 4 decimals.
table2016['total_passed'] = table2016['d_done']*table2016['d_rate'] + table2016['t_done']*table2016['t_rate']
total_passed = table2016['total_passed'].sum() / (table2016["d_done"].sum() + table2016["t_done"].sum())
assert round(total_passed, 4) == summary2016["total_rate"].values[0], f"{year}: computed overall rate {round(total_passed, 4)} differs from the totals row"

In [825]:
# 2017 report: read every page with tabula's stream mode and stack the
# per-page frames into one table.
year = 2017
path = files[year]

all_pages = tabula.read_pdf(path, stream=True, pages="all", pandas_options={'header': None})
table = pd.concat(all_pages, ignore_index=True, sort=False)
table.columns = headers_full

# Rates arrive as strings like '85,5%'; counts go to the nullable Int64 dtype.
table['t_rate'] = table['t_rate'].apply(parse_rate)
table['d_rate'] = table['d_rate'].apply(parse_rate)
table["d_done"] = table["d_done"].astype('Int64')
table["d_scheduled"] = table["d_scheduled"].astype('Int64')
table["total_scheduled"] = table["total_scheduled"].astype('Int64')
table["total_done"] = table["total_done"].astype('Int64')
table['total_rate'] = table['total_rate'].apply(parse_rate)

# Quality control: the last row is kept aside as the totals row and the
# remaining rows are checked against it below.
summary = table.tail(1)
# .copy() makes the per-school rows an independent frame, so adding the 'k'
# column below is not an assignment on a slice (SettingWithCopyWarning).
table = table.iloc[:-1, :].copy()
table['k'] = table['name_raw'].apply(name_to_keyword)

table2017 = table.set_index('k')
summary2017 = summary

# Each count column must add up to the value in the totals row.
for column in ("d_done", "t_done", "d_scheduled", "t_scheduled"):
    reported = summary2017[column].values[0]
    computed = table2017[column].sum()
    assert reported == computed, f"{year} {column}: totals row reports {reported}, rows sum to {computed}"

# Missing counts become 0 and missing rates become 1 so that every product
# in the weighted sum below is defined.
table2017[['d_done', 't_done', 'd_scheduled', 't_scheduled']] = table2017[['d_done', 't_done', 'd_scheduled', 't_scheduled']].fillna(0)
table2017[['d_rate', 't_rate']] = table2017[['d_rate', 't_rate']].fillna(1)

# Per-school done * rate, summed and divided by all exams done, gives the
# overall rate; it must match the totals row to 4 decimals.
table2017['total_passed'] = table2017['d_done']*table2017['d_rate'] + table2017['t_done']*table2017['t_rate']
total_passed = table2017['total_passed'].sum() / (table2017["d_done"].sum() + table2017["t_done"].sum())
assert round(total_passed, 4) == summary2017["total_rate"].values[0], f"{year}: computed overall rate {round(total_passed, 4)} differs from the totals row"

In [691]:
# 2018 report: table detection is switched off (guess=False) and the table
# area and column boundaries are given explicitly. Page 1 uses a different
# top edge than pages 2-23.
year = 2018
path = files[year]
page1 = tabula.read_pdf(path, guess = False, relative_area=True, area=[15, 5, 95, 75], columns=[60, 70, 300, 340, 400, 450, 500, 560], pages='1', stream=True, pandas_options={'header': None})
rest = tabula.read_pdf(path, guess = False, relative_area=True, area=[9, 5, 95, 75], columns=[60, 70, 300, 340, 400, 450, 500, 560], pages='2-23', stream=True, pandas_options={'header': None})

# Both calls return a list of per-page DataFrames; concatenate the lists.
all_pages = page1 + rest
table = pd.concat(all_pages, ignore_index=True, sort=False)

# The first extracted column is discarded before naming the remaining ones.
table = table.drop([0], axis=1)
table.columns = headers

# Rates arrive as strings like '85,5%'; counts go to the nullable Int64 dtype.
table['t_rate'] = table['t_rate'].apply(parse_rate)
table['d_rate'] = table['d_rate'].apply(parse_rate)
table["d_done"] = table["d_done"].astype('Int64')
table["d_scheduled"] = table["d_scheduled"].astype('Int64')

# Quality control: the last row is kept aside as the totals row and the
# remaining rows are checked against it below.
summary = table.tail(1)
# .copy() makes the per-school rows an independent frame, so adding the 'k'
# column below is not an assignment on a slice (SettingWithCopyWarning).
table = table.iloc[:-1, :].copy()
table['k'] = table['name_raw'].apply(name_to_keyword)

table2018 = table.set_index('k')
summary2018 = summary

# Each count column must add up to the value in the totals row.
# NOTE(review): the recorded run of this cell ended in an AssertionError, and
# the follow-up cell shows t_scheduled summing to 190326 against 190325 in the
# totals row. The cause is not visible here; the message now names the column.
for column in ("d_done", "t_done", "d_scheduled", "t_scheduled"):
    reported = summary2018[column].values[0]
    computed = table2018[column].sum()
    assert reported == computed, f"{year} {column}: totals row reports {reported}, rows sum to {computed}"

AssertionError: 

In [694]:
# Compare the summed t_scheduled column (first line) with the value in the
# 2018 totals row (second line).
print(table2018["t_scheduled"].sum(), summary2018["t_scheduled"].values[0], sep="\n")

190326
190325


In [682]:
# 2019 report: table detection is switched off (guess=False) and the table
# area and column boundaries are given explicitly. Page 1 uses a different
# top edge than pages 2-21.
year = 2019
path = files[year]

page1 = tabula.read_pdf(path, guess = False, relative_area=True, area=[9, 0, 95, 77], columns=[35, 600, 800, 885, 1000, 1100, 1200], pages='1', stream=True, pandas_options={'header': None})
rest = tabula.read_pdf(path, guess = False, relative_area=True, area=[5, 0, 95, 77], columns=[35, 600, 800, 885, 1000, 1100, 1200], pages='2-21', stream=True, pandas_options={'header': None})

# Both calls return a list of per-page DataFrames; concatenate the lists.
all_pages = page1 + rest
table = pd.concat(all_pages, ignore_index=True, sort=False)

table.columns = headers

# Rates arrive as strings like '85,5%'; counts go to the nullable Int64 dtype.
table['t_rate'] = table['t_rate'].apply(parse_rate)
table['d_rate'] = table['d_rate'].apply(parse_rate)
table["d_done"] = table["d_done"].astype('Int64')
table["d_scheduled"] = table["d_scheduled"].astype('Int64')

# Quality control: the last row is kept aside as the totals row and the
# remaining rows are checked against it below.
summary = table.tail(1)
# .copy() makes the per-school rows an independent frame, so adding the 'k'
# column below is not an assignment on a slice (SettingWithCopyWarning).
table = table.iloc[:-1, :].copy()
table['k'] = table['name_raw'].apply(name_to_keyword)

table2019 = table.set_index('k')
summary2019 = summary

# Each count column must add up to the value in the totals row.
for column in ("d_done", "t_done", "d_scheduled", "t_scheduled"):
    reported = summary2019[column].values[0]
    computed = table2019[column].sum()
    assert reported == computed, f"{year} {column}: totals row reports {reported}, rows sum to {computed}"

In [661]:
# 2020 report: explicit table area and column boundaries, with page 1 using
# a different area than pages 2-20. No totals row is split off in this cell.
year = 2020
path = files[year]

page1 = tabula.read_pdf(
    path,
    guess=False,
    relative_area=True,
    area=[10, 0, 95, 80],
    columns=[150, 600, 800, 885, 1000, 1100, 1230, 1300],
    pages='1',
    stream=True,
    pandas_options={'header': None},
)
rest = tabula.read_pdf(
    path,
    guess=False,
    relative_area=True,
    area=[5, 0, 95, 77],
    columns=[150, 600, 800, 885, 1000, 1100, 1230, 1300],
    pages='2-20',
    stream=True,
    pandas_options={'header': None},
)

all_pages = page1 + rest
table = pd.concat(all_pages, ignore_index=True, sort=False)

table.columns = headers

# Parse the percentage strings, then move all four count columns to the
# nullable Int64 dtype in a single astype call.
table = table.assign(
    t_rate=table['t_rate'].apply(parse_rate),
    d_rate=table['d_rate'].apply(parse_rate),
)
table = table.astype({
    "d_done": 'Int64',
    "d_scheduled": 'Int64',
    "t_done": 'Int64',
    "t_scheduled": 'Int64',
})
table['k'] = table['name_raw'].apply(name_to_keyword)

table2020 = table.set_index('k')

# Compare the column sums with the aggregates published at
# https://www.imt-ip.pt/sites/IMTT/Portugues/EnsinoConducao/taxasdeaprovacao/Documents/ANO%202020/categorias-2020.pdf
# The small differences may be due to a missing school.
print(table2020["t_scheduled"].sum(), "aggregates: 189918")
assert table2020["t_done"].sum() == 175301
print(table2020["d_scheduled"].sum(), "aggregates: 191123")
print(table2020["d_done"].sum(), "aggregates: 177933")

189913 aggregates: 189918
191114 aggregates: 191123
177924 aggregates: 177933
