In [720]:
import tabula
import pandas as pd
import numpy as np
from unidecode import unidecode
import matplotlib.pyplot as plt

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
    
files = {
    2014: {
        "1-sem": {
            "theory": "Taxasdeaprovacao_1_Semestre_2014_Teoricas_Integradas.pdf",
            "driving": "Taxasdeaprovacao_1_Semestre_2014_Praticas_Integradas.pdf"
        },
        "2-sem": {
            "theory": "Taxasdeaprovacao_2_Semestre_2014_Teoricas_Integradas.pdf",
            "driving": "Taxasdeaprovacao_2_Semestre_2014_Praticas_Integradas.pdf"
        }
    },    
    2015: "TaxasApr_2015_Relatorio_Escolas_de_Condução.pdf",
    2016: "TaxasApr_2016_Relatorio_EscolasDeConducao.pdf",
    2017: "TaxasApr_2017_Relatorio_EscolasDeConducao.pdf",
    2018: "TaxasApr_2018_Relatorio_EscolasDeConducao.pdf",
    2019: "EscolasdeCondução-2019.pdf",
    2020: "EscolasdeCondução-2020.pdf"
}

def parse_rate (x):
    
    if x == '#DIV/0!':
        return np.NAN
    elif pd.isna(x):
        return x
    else:
        return float((x.replace(',', '.').replace('%', 'e-2')))
    
def parse_int (x):
    
    if pd.isna(x):
        return np.NAN
    else:
        return np.Int64(x)
    
def name_to_keyword(x):
    s = unidecode(x) \
    .lower() \
    .replace('>', '') \
    .replace('<', '') \
    .replace('(', '') \
    .replace(')', '') \
    .replace(' ', '_') \
    .replace('escola_de_conducao_', '') \
    .replace('escola_do_', '') \
    .replace('/', '_') \
    .replace('-', '_') \
    .replace('___', '_') \
    .replace('__', '_') 

    return s
    
headers = ['n_ec', 'name_raw', 't_scheduled', 't_done', 't_rate', 'd_scheduled', 'd_done', 'd_rate']
headers_full = ['n_ec', 'name_raw', 't_scheduled', 't_done', 't_rate', 'd_scheduled', 'd_done', 'd_rate', 'total_scheduled', 'total_done', 'total_rate']

In [725]:
year=2015
path = files[year]
all_pages = tabula.read_pdf(path, stream=True, pages="all", pandas_options={'header': None})
table = pd.concat(all_pages, ignore_index=True, sort=False)
table
# table = table.drop([8,9,10], axis=1)
table.columns = headers_full

table['t_rate'] = table['t_rate'].apply(parse_rate)
table['d_rate'] = table['d_rate'].apply(parse_rate)
table["d_done"] = table["d_done"].astype('Int64')
table["d_scheduled"] = table["d_scheduled"].astype('Int64')
table["total_scheduled"] = table["total_scheduled"].astype('Int64')
table["total_done"] = table["total_done"].astype('Int64')
table['total_rate'] = table['total_rate'].apply(parse_rate)
# summary = table.tail(1) #quality control
# table=table.iloc[:-1,:]
# table['k'] = table['name_raw'].apply(name_to_keyword)

# table2015 = table.set_index('k')
# summary2015 = summary
# assert(summary2015["d_done"].values[0] == table2015["d_done"].sum())
# assert(summary2015["t_done"].values[0] == table2015["t_done"].sum())
# assert(summary2015["d_scheduled"].values[0] == table2015["d_scheduled"].sum())
# assert(summary2015["t_scheduled"].values[0] == table2015["t_scheduled"].sum())
table

Unnamed: 0,n_ec,name_raw,t_scheduled,t_done,t_rate,d_scheduled,d_done,d_rate,total_scheduled,total_done,total_rate
0,1.0,Escola De Condução Automovel Club De Portugal,308,282,0.805,332.0,328.0,0.686,640,610,0.741
1,2.0,Escola De Condução Acp,296,289,0.8166,376.0,369.0,0.6938,672,658,0.7477
2,5.0,Escola De Condução «A Popular»,59,57,0.614,44.0,44.0,0.6591,103,101,0.6337
3,6.0,Escola De Condução Automoveis Monumental,101,97,0.7423,108.0,106.0,0.6887,209,203,0.7143
4,7.0,Escola De Condução A Desportiva (Boavista),231,226,0.6195,192.0,186.0,0.7097,423,412,0.6602
5,8.0,Escola De Condução Vitória,218,195,0.641,266.0,246.0,0.5488,484,441,0.5896
6,9.0,Escola De Condução Enal,47,45,0.4222,32.0,31.0,0.6129,79,76,0.5
7,10.0,Escola De Condução Lusitânia De Automobilsmo,124,122,0.8852,208.0,207.0,0.5845,332,329,0.696
8,11.0,Escola De Condução Rodaqui,129,126,0.6349,132.0,131.0,0.7328,261,257,0.6848
9,12.0,Escola De Condução Bastos,134,128,0.6484,192.0,185.0,0.5459,326,313,0.5879


In [717]:
d_passed= table2015.loc['acp'].d_done*table2015.loc['acp'].d_rate
t_passed = table2015.loc['acp'].t_done*table2015.loc['acp'].t_rate
t_done = table2015.loc['acp'].t_done
d_done = table2015.loc['acp'].d_done
(d_passed + t_passed)/(t_done + d_done)
#74.77%

0.7477349544072948

In [715]:
table2015.loc['acp']

n_ec                                    2.0
name_raw             Escola De Condução Acp
t_scheduled                        296
t_done                             289
t_rate                          0.8166
d_scheduled                       376
d_done                            369
d_rate                         0.6938
Name: acp, dtype: object

In [697]:
year=2016
path = files[year]

all_pages = tabula.read_pdf(path, stream=True, pages="all", pandas_options={'header': None})
table = pd.concat(all_pages, ignore_index=True, sort=False)
table = table.drop([8,9,10], axis=1)
table.columns = headers

table['t_rate'] = table['t_rate'].apply(parse_rate)
table['d_rate'] = table['d_rate'].apply(parse_rate)
table["d_done"] = table["d_done"].astype('Int64')
table["d_scheduled"] = table["d_scheduled"].astype('Int64')

summary = table.tail(1) #quality control
table=table.iloc[:-1,:]
table['k'] = table['name_raw'].apply(name_to_keyword)

table2016 = table.set_index('k')
summary2016 = summary
assert(summary2016["d_done"].values[0] == table2016["d_done"].sum())
assert(summary2016["t_done"].values[0] == table2016["t_done"].sum())
assert(summary2016["d_scheduled"].values[0] == table2016["d_scheduled"].sum())
assert(summary2016["t_scheduled"].values[0] == table2016["t_scheduled"].sum())

In [696]:
year=2017
path = files[year]

all_pages = tabula.read_pdf(path, stream=True, pages="all", pandas_options={'header': None})
table = pd.concat(all_pages, ignore_index=True, sort=False)
table = table.drop([8,9,10], axis=1)
table.columns = headers

table['t_rate'] = table['t_rate'].apply(parse_rate)
table['d_rate'] = table['d_rate'].apply(parse_rate)
table["d_done"] = table["d_done"].astype('Int64')
table["d_scheduled"] = table["d_scheduled"].astype('Int64')

summary = table.tail(1) #quality control
table=table.iloc[:-1,:]
table['k'] = table['name_raw'].apply(name_to_keyword)

table2017 = table.set_index('k')
summary2017 = summary
assert(summary2017["d_done"].values[0] == table2017["d_done"].sum())
assert(summary2017["t_done"].values[0] == table2017["t_done"].sum())
assert(summary2017["d_scheduled"].values[0] == table2017["d_scheduled"].sum())
assert(summary2017["t_scheduled"].values[0] == table2017["t_scheduled"].sum())

In [691]:
year=2018
path = files[year]
page1 = tabula.read_pdf(path, guess = False, relative_area=True, area=[15, 5, 95, 75], columns=[60, 70, 300, 340, 400, 450, 500, 560], pages='1', stream=True, pandas_options={'header': None})
rest = tabula.read_pdf(path, guess = False, relative_area=True, area=[9, 5, 95, 75], columns=[60, 70, 300, 340, 400, 450, 500, 560], pages='2-23', stream=True, pandas_options={'header': None})

all_pages = sum([page1, rest], [])
table = pd.concat(all_pages, ignore_index=True, sort=False)

table = table.drop([0], axis=1)
table.columns = headers

table['t_rate'] = table['t_rate'].apply(parse_rate)
table['d_rate'] = table['d_rate'].apply(parse_rate)
table["d_done"] = table["d_done"].astype('Int64')
table["d_scheduled"] = table["d_scheduled"].astype('Int64')

summary = table.tail(1) #quality control
table=table.iloc[:-1,:]
table['k'] = table['name_raw'].apply(name_to_keyword)

table2018 = table.set_index('k')
summary2018 = summary
assert(summary2018["d_done"].values[0] == table2018["d_done"].sum())
assert(summary2018["t_done"].values[0] == table2018["t_done"].sum())
assert(summary2018["d_scheduled"].values[0] == table2018["d_scheduled"].sum())
assert(summary2018["t_scheduled"].values[0] == table2018["t_scheduled"].sum())

AssertionError: 

In [694]:
print(table2018["t_scheduled"].sum())
print(summary2018["t_scheduled"].values[0])

190326
190325


In [682]:
year=2019
path = files[year]

page1 = tabula.read_pdf(path, guess = False, relative_area=True, area=[9, 0, 95, 77], columns=[35, 600, 800, 885, 1000, 1100, 1200], pages='1', stream=True, pandas_options={'header': None})
rest = tabula.read_pdf(path, guess = False, relative_area=True, area=[5, 0, 95, 77], columns=[35, 600, 800, 885, 1000, 1100, 1200], pages='2-21', stream=True, pandas_options={'header': None})

all_pages = sum([page1, rest], [])
table = pd.concat(all_pages, ignore_index=True, sort=False)

table.columns = headers

table['t_rate'] = table['t_rate'].apply(parse_rate)
table['d_rate'] = table['d_rate'].apply(parse_rate)
table["d_done"] = table["d_done"].astype('Int64')
table["d_scheduled"] = table["d_scheduled"].astype('Int64')

summary = table.tail(1) #quality control
table=table.iloc[:-1,:]
table['k'] = table['name_raw'].apply(name_to_keyword)

table2019 = table.set_index('k')
summary2019 = summary
assert(summary2019["d_done"].values[0] == table2019["d_done"].sum())
assert(summary2019["t_done"].values[0] == table2019["t_done"].sum())
assert(summary2019["d_scheduled"].values[0] == table2019["d_scheduled"].sum())
assert(summary2019["t_scheduled"].values[0] == table2019["t_scheduled"].sum())

In [661]:
year=2020
path = files[year]

page1 = tabula.read_pdf(path, guess = False, relative_area=True, area=[10, 0, 95, 80], columns=[150, 600, 800, 885, 1000, 1100, 1230, 1300], pages='1', stream=True, pandas_options={'header': None})
rest = tabula.read_pdf(path, guess = False, relative_area=True, area=[5, 0, 95, 77], columns=[150, 600, 800, 885, 1000, 1100, 1230, 1300], pages='2-20', stream=True, pandas_options={'header': None})

all_pages = sum([page1, rest], [])
table = pd.concat(all_pages, ignore_index=True, sort=False)

table.columns = headers

table['t_rate'] = table['t_rate'].apply(parse_rate)
table['d_rate'] = table['d_rate'].apply(parse_rate)
table["d_done"] = table["d_done"].astype('Int64')
table["d_scheduled"] = table["d_scheduled"].astype('Int64')
table["t_done"] = table["t_done"].astype('Int64')
table["t_scheduled"] = table["t_scheduled"].astype('Int64')
table['k'] = table['name_raw'].apply(name_to_keyword)

table2020 = table.set_index('k')

# comparing with aggreagates at
# https://www.imt-ip.pt/sites/IMTT/Portugues/EnsinoConducao/taxasdeaprovacao/Documents/ANO%202020/categorias-2020.pdf
# differences are maybe to a missing school 
print(table2020["t_scheduled"].sum(), "aggregates: 189918")
assert(table2020["t_done"].sum() == 175301)
print(table2020["d_scheduled"].sum(), "aggregates: 191123")
print(table2020["d_done"].sum(), "aggregates: 177933")

189913 aggregates: 189918
191114 aggregates: 191123
177924 aggregates: 177933
