In [2]:
from bs4 import BeautifulSoup
from urllib.request import urlopen, urlretrieve, urlparse, urljoin
import re, os, tabula, pandas
from tqdm.notebook import tqdm
from datetime import date

### Build a list of files

In [None]:
# Fetch the attendance index page; the context manager closes the HTTP
# response as soon as the body has been read.
with urlopen("https://na.gov.pk/en/attendance2.php") as response:
    html = response.read()

# Name the parser explicitly so the parse does not depend on which optional
# parsers (lxml, html5lib) happen to be installed, and so bs4 does not warn.
soup = BeautifulSoup(html, "html.parser")

# Maps the link text of every matching anchor to the absolute URL it points to.
documents = {}
for link in soup.find_all('a', attrs={'style': 'font-size:14px; color:#0000FF;', 'target': '_blank'}):
    href = link.get('href')
    if not href:
        # Without an href, urljoin would hand back the bare base URL, which
        # would later be downloaded and saved as if it were a PDF.
        continue
    name = link.text
    path = urljoin('https://na.gov.pk/en/', href)
    documents[name] = path
documents

### Download PDF documents

In [None]:
# urlretrieve does not create missing directories, so make sure the target
# folder exists before the first download (no-op when it is already there).
os.makedirs("pdf", exist_ok=True)

# Save every linked document as pdf/<link text>.pdf.
for name, path in tqdm(documents.items()):
    urlretrieve(path, f"pdf/{name}.pdf")

### Extract data from PDF files to CSV 

In [3]:
def process(index:int, df:pandas.DataFrame, skip_rows:int=3) -> pandas.DataFrame:
    """Turn one extracted table into plain positional rows.

    The column labels of ``df`` are pushed down into the data as the first
    row, and the columns are relabelled with their integer positions
    (0, 1, 2, ...), so tables can be stacked regardless of what their
    labels were.

    Parameters
    ----------
    index : int
        Position of the table within its document (0 for the first table).
    df : pandas.DataFrame
        The table to normalise.
    skip_rows : int, default 3
        Number of leading rows (counting the pushed-down label row) to drop
        from the first table. Presumably these are title/header lines of the
        document -- confirm against the actual PDFs.

    Returns
    -------
    pandas.DataFrame
        For ``index == 0``: the rows after the first ``skip_rows``, keeping
        their original row labels. Otherwise: every row, including the label
        row, with a fresh 0..n-1 index.
    """
    # Transpose -> reset_index -> transpose moves the column labels into the
    # first row and leaves integer column labels behind.
    flattened = df.T.reset_index().T
    if index == 0:
        return flattened.iloc[skip_rows:]
    return flattened.reset_index(drop=True)

In [None]:
# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before the first file is written.
os.makedirs("data", exist_ok=True)

for document in tqdm(documents.keys()):
    # Read the tables tabula finds on all pages of the downloaded PDF
    # (iterated below as a sequence of DataFrames).
    dfs = tabula.read_pdf(f"pdf/{document}.pdf", pages='all')
    # Normalise each table; the first one is handled differently by process().
    dfs = [process(ix, df) for ix, df in enumerate(dfs)]
    final = pandas.concat(dfs)
    # process() leaves integer column labels; give the first four readable names.
    final = final.rename(columns={0: 'Serial', 1: 'Constituency', 2: 'Name', 3: 'Attendence'})
    final.to_csv(f'data/{document}.csv', index=False)

### Combine CSV files into one

In [None]:
# Paths of the per-document CSV files, in the same order as `documents`.
CSV = [f"data/{document}.csv" for document in documents]

# Load each CSV and tag its rows with the path of the file they came from.
dfs = []
for csv_path in tqdm(CSV):
    dfs.append(pandas.read_csv(csv_path).assign(Date=csv_path))

# Stack everything under one continuous index and write the merged file.
combined = pandas.concat(dfs, ignore_index=True)
combined.to_csv('attendence.csv')