In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [2]:
def get_pdf(url, sub_dir, doc_id, timeout=60):
    """Download `url` and save the response body as pdf_files/<sub_dir>/<doc_id>.pdf.

    Args:
        url: Address of the document to download.
        sub_dir: Sub-directory of `pdf_files` to write into (created if missing).
        doc_id: Identifier used as the file name, without the `.pdf` extension.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        None. The file on disk is the only result.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status. Nothing
            is written in that case, so an error page never ends up on disk
            under a `.pdf` name.
    """
    import os  # local import: the notebook's import cell does not bring in `os`

    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    path = f"pdf_files/{sub_dir}/{doc_id}.pdf"
    os.makedirs(os.path.dirname(path), exist_ok=True)
    # `with` closes the handle even if the write fails part-way through.
    with open(path, "wb") as file:
        file.write(response.content)

### Bills

In [3]:
# Open in binary mode so BeautifulSoup detects the page's encoding itself instead of
# decoding with whatever the platform's default text encoding happens to be.
with open("webpages/National Assembly _ Federal Republic of Nigeria.html", "rb") as fp:
    soup = BeautifulSoup(fp, 'html.parser')

In [4]:
print(soup)

<!DOCTYPE html>

<!-- saved from url=(0035)https://nass.gov.ng/documents/bills -->


In [5]:
data = []
table = soup.find('table', attrs={'class':'table table-striped table-bordered dataTable'})
table_body = table.find('tbody')
rows = table_body.find_all('tr')

In [6]:
len(rows)

445

In [7]:
cols = rows[0].find_all('td')
print(cols)
print(cols[0].text)

[<td><a href="https://nass.gov.ng/documents/bill/11071">A BILL FOR AN ACT TO REPEAL THE PRODUCE (ENFORCEMENT OF EXPORT STANDARDS) ACT, CAP. P32 LFN, 2004 AND ENACT THE FEDERAL PRODUCE INSPECTION SERVICE (ESTABLISHMENT, ENFORCEMENT OF EXPORT STANDARDS) ACT, TO PROVIDE FOR THE INSPECTION AND ENFORCEMENT OF GRADES AND QUALITY STANDARDS OF PRODUCE AND COMMODITIES INTENDED FOR IMPORT INTO OR EXPORT FROM NIGERIA AT PORTS OF SHIPMENT AND FOR RELATED MATTERS (SB. 1079) </a></td>, <td></td>, <td></td>, <td></td>, <td></td>, <td></td>]
A BILL FOR AN ACT TO REPEAL THE PRODUCE (ENFORCEMENT OF EXPORT STANDARDS) ACT, CAP. P32 LFN, 2004 AND ENACT THE FEDERAL PRODUCE INSPECTION SERVICE (ESTABLISHMENT, ENFORCEMENT OF EXPORT STANDARDS) ACT, TO PROVIDE FOR THE INSPECTION AND ENFORCEMENT OF GRADES AND QUALITY STANDARDS OF PRODUCE AND COMMODITIES INTENDED FOR IMPORT INTO OR EXPORT FROM NIGERIA AT PORTS OF SHIPMENT AND FOR RELATED MATTERS (SB. 1079) 


In [8]:
# Start from an empty list here so that re-running this cell rebuilds `data`
# instead of appending a second copy of every row.
data = []
for row in rows:
    cols = row.find_all('td')
    url = str(cols[0].find('a')['href'])
    # The last path segment of the bill URL is the numeric document id.
    _id = url.split('/')[-1]
    data.append({
        # Cell text carries stray leading/trailing whitespace; drop it.
        'title': cols[0].text.strip(),
        'url': url,
        'metadata': {
            'chamber': cols[1].text,
            'first_reading': cols[2].text,
            'second_reading': cols[3].text,
            'commitee_referred': cols[4].text,
            'third_reading': cols[5].text,
            'download_url': f'https://nass.gov.ng/documents/billdownload/{_id}.pdf',
            'doc_id': _id
        },
        'doc_type': 'bills'
    })

In [9]:
data[0]

{'title': 'A BILL FOR AN ACT TO REPEAL THE PRODUCE (ENFORCEMENT OF EXPORT STANDARDS) ACT, CAP. P32 LFN, 2004 AND ENACT THE FEDERAL PRODUCE INSPECTION SERVICE (ESTABLISHMENT, ENFORCEMENT OF EXPORT STANDARDS) ACT, TO PROVIDE FOR THE INSPECTION AND ENFORCEMENT OF GRADES AND QUALITY STANDARDS OF PRODUCE AND COMMODITIES INTENDED FOR IMPORT INTO OR EXPORT FROM NIGERIA AT PORTS OF SHIPMENT AND FOR RELATED MATTERS (SB. 1079) ',
 'url': 'https://nass.gov.ng/documents/bill/11071',
 'metadata': {'chamber': '',
  'first_reading': '',
  'second_reading': '',
  'commitee_referred': '',
  'third_reading': '',
  'download_url': 'https://nass.gov.ng/documents/billdownload/11071.pdf',
  'doc_id': '11071'},
 'doc_type': 'bills'}

In [10]:
bills_df = pd.DataFrame(data)

In [11]:
bills_df.shape

(445, 4)

In [12]:
bills_df.head()

Unnamed: 0,title,url,metadata,doc_type
0,A BILL FOR AN ACT TO REPEAL THE PRODUCE (ENFOR...,https://nass.gov.ng/documents/bill/11071,"{'chamber': '', 'first_reading': '', 'second_r...",bills
1,"FEDERAL COLLEGE OF AGRICULTURE ISE-ORUN, EKITI...",https://nass.gov.ng/documents/bill/11052,"{'chamber': 'Senate', 'first_reading': '', 'se...",bills
2,CASSAVA FLOUR (MANDATORY INCLUSION IN FLOUR PR...,https://nass.gov.ng/documents/bill/11050,"{'chamber': 'Senate', 'first_reading': '', 'se...",bills
3,A BILL FOR AN ACT TO ESTABLISH THE FEDERAL COL...,https://nass.gov.ng/documents/bill/11053,"{'chamber': 'Senate', 'first_reading': '', 'se...",bills
4,A Bill for an Act to Repeal the Patents and De...,https://nass.gov.ng/documents/bill/11067,"{'chamber': 'Senate', 'first_reading': '', 'se...",bills


In [13]:
bills_df['title'] = bills_df['title'].str.lower()

In [14]:
bills_df.metadata.iloc[0]

{'chamber': '',
 'first_reading': '',
 'second_reading': '',
 'commitee_referred': '',
 'third_reading': '',
 'download_url': 'https://nass.gov.ng/documents/billdownload/11071.pdf',
 'doc_id': '11071'}

In [15]:
bills_df.head()

Unnamed: 0,title,url,metadata,doc_type
0,a bill for an act to repeal the produce (enfor...,https://nass.gov.ng/documents/bill/11071,"{'chamber': '', 'first_reading': '', 'second_r...",bills
1,"federal college of agriculture ise-orun, ekiti...",https://nass.gov.ng/documents/bill/11052,"{'chamber': 'Senate', 'first_reading': '', 'se...",bills
2,cassava flour (mandatory inclusion in flour pr...,https://nass.gov.ng/documents/bill/11050,"{'chamber': 'Senate', 'first_reading': '', 'se...",bills
3,a bill for an act to establish the federal col...,https://nass.gov.ng/documents/bill/11053,"{'chamber': 'Senate', 'first_reading': '', 'se...",bills
4,a bill for an act to repeal the patents and de...,https://nass.gov.ng/documents/bill/11067,"{'chamber': 'Senate', 'first_reading': '', 'se...",bills


In [16]:
bills_df.metadata.iloc[0]['download_url']

'https://nass.gov.ng/documents/billdownload/11071.pdf'

**Download PDFs**

In [None]:
# `download_url` and `doc_id` live inside the nested `metadata` dict (this frame has
# no top-level `download_url` column), and get_pdf also needs the sub-directory and id.
bills_df.apply(
    lambda row: get_pdf(row['metadata']['download_url'], 'bills', row['metadata']['doc_id']),
    axis=1,
);

### Hansard

In [17]:
# Open in binary mode so BeautifulSoup detects the page's encoding itself instead of
# decoding with whatever the platform's default text encoding happens to be.
with open("webpages/National Assembly _ Federal Republic of Nigeria-hansard.html", "rb") as fp:
    soup = BeautifulSoup(fp, 'html.parser')

In [18]:
print(soup)

<!DOCTYPE html>

<!-- saved from url=(0037)https://nass.gov.ng/documents/hansard -->
</a></td><td>2009-07-16</td><td>Senate</td><td>9th Parliament</td><td>1st Session</td></tr><tr class="odd" role="row"><td><a href="https://nass.gov.ng/documents/download/6130">SENATE HANSARD FOR TUESDAY 21ST JULY, 2009</a></td><td>2009-07-21</td><td>Senate</td><td>9th Parliament</td><td>1st Session</td></tr><tr class="even" role="row"><td><a href="https://nass.gov.ng/documents/download/6131">SENATE HANSARD FOR WEDNESDAY 22ND JULY, 2009</a></td><td>2009-07-22</td><td>Senate</td><td>9th Parliament</td><td>1st Session</td></tr><tr class="odd" role="row"><td><a href="https://nass.gov.ng/documents/download/6132">SENATE HANSARD FOR WEDNESDAY 29TH JULY, 2009</a></td><td>2009-07-29</td><td>Senate</td><td>9th Parliament</td><td>1st Session</td></tr><tr class="even" role="row"><td><a href="https://nass.gov.ng/documents/download/6133">SENATE HANSARD FOR THURSDAY 30TH JULY, 2009</a></td><td>2009-07-30</td><td>Sena

In [19]:
data = []
table = soup.find('table', attrs={'class':'table table-striped table-bordered dataTable'})
table_body = table.find('tbody')
rows = table_body.find_all('tr')

In [20]:
len(rows)

824

In [21]:
cols = rows[0].find_all('td')
print(cols)
print(cols[0].text)

[<td><a href="https://nass.gov.ng/documents/download/5975"><font size="1"><font face="arial"><font face="arial">Senate</font> </font><font face="arial">Hansard for Thusrday 1st July, 2010</font></font></a></td>, <td>2010-07-01</td>, <td>Senate</td>, <td>9th Parliament</td>, <td>1st Session</td>]
Senate Hansard for Thusrday 1st July, 2010


In [22]:
# Start from an empty list here so that re-running this cell rebuilds `data`
# instead of appending a second copy of every row.
data = []
for row in rows:
    cols = row.find_all('td')
    url = str(cols[0].find('a')['href'])
    # The last path segment of the download URL is the numeric document id.
    _id = url.split('/')[-1]
    data.append({
        # Cell text can carry stray leading/trailing whitespace; drop it.
        'title': cols[0].text.strip(),
        'url': url,
        'metadata': {
            'chamber': cols[2].text,
            'document_date': cols[1].text,
            'parliament': cols[3].text,
            'session': cols[4].text,
            # For hansards the link in the table is already the download link.
            'download_url': url,
            'doc_id': _id
        },
        'doc_type': 'hansard'
    })

In [23]:
data[0]

{'title': 'Senate Hansard for Thusrday 1st July, 2010',
 'url': 'https://nass.gov.ng/documents/download/5975',
 'metadata': {'chamber': 'Senate',
  'document_date': '2010-07-01',
  'parliament': '9th Parliament',
  'session': '1st Session',
  'download_url': 'https://nass.gov.ng/documents/download/5975',
  'doc_id': '5975'},
 'doc_type': 'hansard'}

In [24]:
hansard_df = pd.DataFrame(data)

In [27]:
hansard_df.shape, bills_df.shape

((824, 4), (445, 4))

In [28]:
hansard_df.head()

Unnamed: 0,title,url,metadata,doc_type
0,"Senate Hansard for Thusrday 1st July, 2010",https://nass.gov.ng/documents/download/5975,"{'chamber': 'Senate', 'document_date': '2010-0...",hansard
1,"SENATE HANSARD FOR WEDNESDAY 30TH JUNE, 2010",https://nass.gov.ng/documents/download/5976,"{'chamber': 'Senate', 'document_date': '2010-0...",hansard
2,"SENATE HANSARD FOR TUESDAY 29TH JUNE, 2010",https://nass.gov.ng/documents/download/5977,"{'chamber': 'Senate', 'document_date': '2010-0...",hansard
3,"senate hansard for thursday 24th june, 2010",https://nass.gov.ng/documents/download/5978,"{'chamber': 'Senate', 'document_date': '2010-0...",hansard
4,"senate hansard for thursday 3rd june, 2010",https://nass.gov.ng/documents/download/5979,"{'chamber': 'Senate', 'document_date': '2010-0...",hansard


In [29]:
hansard_df['title'] = hansard_df['title'].str.lower()

In [30]:
hansard_df.metadata.iloc[0]

{'chamber': 'Senate',
 'document_date': '2010-07-01',
 'parliament': '9th Parliament',
 'session': '1st Session',
 'download_url': 'https://nass.gov.ng/documents/download/5975',
 'doc_id': '5975'}

In [31]:
hansard_df.head()

Unnamed: 0,title,url,metadata,doc_type
0,"senate hansard for thusrday 1st july, 2010",https://nass.gov.ng/documents/download/5975,"{'chamber': 'Senate', 'document_date': '2010-0...",hansard
1,"senate hansard for wednesday 30th june, 2010",https://nass.gov.ng/documents/download/5976,"{'chamber': 'Senate', 'document_date': '2010-0...",hansard
2,"senate hansard for tuesday 29th june, 2010",https://nass.gov.ng/documents/download/5977,"{'chamber': 'Senate', 'document_date': '2010-0...",hansard
3,"senate hansard for thursday 24th june, 2010",https://nass.gov.ng/documents/download/5978,"{'chamber': 'Senate', 'document_date': '2010-0...",hansard
4,"senate hansard for thursday 3rd june, 2010",https://nass.gov.ng/documents/download/5979,"{'chamber': 'Senate', 'document_date': '2010-0...",hansard


In [32]:
hansard_df.metadata.iloc[0]['download_url']

'https://nass.gov.ng/documents/download/5975'

**Download PDFs**

In [34]:
hansard_df.apply(lambda row: get_pdf(row['metadata']['download_url'], 'hansard', row['metadata']['doc_id']), axis=1)

KeyboardInterrupt: 

### Order Paper

In [2]:
# Open in binary mode so BeautifulSoup detects the page's encoding itself instead of
# decoding with whatever the platform's default text encoding happens to be.
with open("National Assembly _ Federal Republic of Nigeria.html", "rb") as fp:
    soup = BeautifulSoup(fp, 'html.parser')

In [3]:
print(soup)

<!DOCTYPE html>

<!-- saved from url=(0035)https://nass.gov.ng/documents/bills -->


In [4]:
col_names = ['title','url','chamber','first_reading','second_reading','commitee_referred','third_reading']

In [6]:
data = []
table = soup.find('table', attrs={'class':'table table-striped table-bordered dataTable'})
table_body = table.find('tbody')
rows = table_body.find_all('tr')

In [7]:
len(rows)

445

In [18]:
cols = rows[0].find_all('td')
print(cols)
print(cols[0].text)

[<td><a href="https://nass.gov.ng/documents/bill/11071">A BILL FOR AN ACT TO REPEAL THE PRODUCE (ENFORCEMENT OF EXPORT STANDARDS) ACT, CAP. P32 LFN, 2004 AND ENACT THE FEDERAL PRODUCE INSPECTION SERVICE (ESTABLISHMENT, ENFORCEMENT OF EXPORT STANDARDS) ACT, TO PROVIDE FOR THE INSPECTION AND ENFORCEMENT OF GRADES AND QUALITY STANDARDS OF PRODUCE AND COMMODITIES INTENDED FOR IMPORT INTO OR EXPORT FROM NIGERIA AT PORTS OF SHIPMENT AND FOR RELATED MATTERS (SB. 1079) </a></td>, <td></td>, <td></td>, <td></td>, <td></td>, <td></td>]
A BILL FOR AN ACT TO REPEAL THE PRODUCE (ENFORCEMENT OF EXPORT STANDARDS) ACT, CAP. P32 LFN, 2004 AND ENACT THE FEDERAL PRODUCE INSPECTION SERVICE (ESTABLISHMENT, ENFORCEMENT OF EXPORT STANDARDS) ACT, TO PROVIDE FOR THE INSPECTION AND ENFORCEMENT OF GRADES AND QUALITY STANDARDS OF PRODUCE AND COMMODITIES INTENDED FOR IMPORT INTO OR EXPORT FROM NIGERIA AT PORTS OF SHIPMENT AND FOR RELATED MATTERS (SB. 1079) 


In [19]:
# Start from an empty list here so that re-running this cell rebuilds `data`
# instead of appending a second copy of every row.
data = []
for row in rows:
    cols = row.find_all('td')
    data.append({
        # Cell text carries stray leading/trailing whitespace; drop it.
        'title': cols[0].text.strip(),
        'url': str(cols[0].find('a')['href']),
        'chamber': cols[1].text,
        'first_reading': cols[2].text,
        'second_reading': cols[3].text,
        'commitee_referred': cols[4].text,
        'third_reading': cols[5].text
    })

In [20]:
data[0]

{'title': 'A BILL FOR AN ACT TO REPEAL THE PRODUCE (ENFORCEMENT OF EXPORT STANDARDS) ACT, CAP. P32 LFN, 2004 AND ENACT THE FEDERAL PRODUCE INSPECTION SERVICE (ESTABLISHMENT, ENFORCEMENT OF EXPORT STANDARDS) ACT, TO PROVIDE FOR THE INSPECTION AND ENFORCEMENT OF GRADES AND QUALITY STANDARDS OF PRODUCE AND COMMODITIES INTENDED FOR IMPORT INTO OR EXPORT FROM NIGERIA AT PORTS OF SHIPMENT AND FOR RELATED MATTERS (SB. 1079) ',
 'url': 'https://nass.gov.ng/documents/bill/11071',
 'chamber': '',
 'first_reading': '',
 'second_reading': '',
 'commitee_referred': '',
 'third_reading': ''}

In [21]:
import pandas as pd

In [22]:
bills_df = pd.DataFrame(data)

In [23]:
bills_df.shape

(445, 7)

In [24]:
bills_df.head()

Unnamed: 0,title,url,chamber,first_reading,second_reading,commitee_referred,third_reading
0,A BILL FOR AN ACT TO REPEAL THE PRODUCE (ENFOR...,https://nass.gov.ng/documents/bill/11071,,,,,
1,"FEDERAL COLLEGE OF AGRICULTURE ISE-ORUN, EKITI...",https://nass.gov.ng/documents/bill/11052,Senate,,,,
2,CASSAVA FLOUR (MANDATORY INCLUSION IN FLOUR PR...,https://nass.gov.ng/documents/bill/11050,Senate,,,,
3,A BILL FOR AN ACT TO ESTABLISH THE FEDERAL COL...,https://nass.gov.ng/documents/bill/11053,Senate,,,,
4,A Bill for an Act to Repeal the Patents and De...,https://nass.gov.ng/documents/bill/11067,Senate,,,,


In [25]:
bills_df['bill_id'] = bills_df['url'].apply(lambda x: x.split('/')[-1])

In [26]:
bills_df.bill_id.nunique()

445

In [27]:
bills_df['download_url'] = bills_df['bill_id'].apply(lambda x: f'https://nass.gov.ng/documents/billdownload/{x}.pdf')

In [28]:
import requests

In [30]:
bills_df.head()

Unnamed: 0,title,url,chamber,first_reading,second_reading,commitee_referred,third_reading,bill_id,download_url
0,A BILL FOR AN ACT TO REPEAL THE PRODUCE (ENFOR...,https://nass.gov.ng/documents/bill/11071,,,,,,11071,https://nass.gov.ng/documents/billdownload/110...
1,"FEDERAL COLLEGE OF AGRICULTURE ISE-ORUN, EKITI...",https://nass.gov.ng/documents/bill/11052,Senate,,,,,11052,https://nass.gov.ng/documents/billdownload/110...
2,CASSAVA FLOUR (MANDATORY INCLUSION IN FLOUR PR...,https://nass.gov.ng/documents/bill/11050,Senate,,,,,11050,https://nass.gov.ng/documents/billdownload/110...
3,A BILL FOR AN ACT TO ESTABLISH THE FEDERAL COL...,https://nass.gov.ng/documents/bill/11053,Senate,,,,,11053,https://nass.gov.ng/documents/billdownload/110...
4,A Bill for an Act to Repeal the Patents and De...,https://nass.gov.ng/documents/bill/11067,Senate,,,,,11067,https://nass.gov.ng/documents/billdownload/110...


In [32]:
bills_df.download_url.iloc[0]

'https://nass.gov.ng/documents/billdownload/11071.pdf'

### Votes and Proceedings

In [2]:
# Open in binary mode so BeautifulSoup detects the page's encoding itself instead of
# decoding with whatever the platform's default text encoding happens to be.
with open("National Assembly _ Federal Republic of Nigeria.html", "rb") as fp:
    soup = BeautifulSoup(fp, 'html.parser')

In [3]:
print(soup)

<!DOCTYPE html>

<!-- saved from url=(0035)https://nass.gov.ng/documents/bills -->


In [4]:
col_names = ['title','url','chamber','first_reading','second_reading','commitee_referred','third_reading']

In [6]:
data = []
table = soup.find('table', attrs={'class':'table table-striped table-bordered dataTable'})
table_body = table.find('tbody')
rows = table_body.find_all('tr')

In [7]:
len(rows)

445

In [18]:
cols = rows[0].find_all('td')
print(cols)
print(cols[0].text)

[<td><a href="https://nass.gov.ng/documents/bill/11071">A BILL FOR AN ACT TO REPEAL THE PRODUCE (ENFORCEMENT OF EXPORT STANDARDS) ACT, CAP. P32 LFN, 2004 AND ENACT THE FEDERAL PRODUCE INSPECTION SERVICE (ESTABLISHMENT, ENFORCEMENT OF EXPORT STANDARDS) ACT, TO PROVIDE FOR THE INSPECTION AND ENFORCEMENT OF GRADES AND QUALITY STANDARDS OF PRODUCE AND COMMODITIES INTENDED FOR IMPORT INTO OR EXPORT FROM NIGERIA AT PORTS OF SHIPMENT AND FOR RELATED MATTERS (SB. 1079) </a></td>, <td></td>, <td></td>, <td></td>, <td></td>, <td></td>]
A BILL FOR AN ACT TO REPEAL THE PRODUCE (ENFORCEMENT OF EXPORT STANDARDS) ACT, CAP. P32 LFN, 2004 AND ENACT THE FEDERAL PRODUCE INSPECTION SERVICE (ESTABLISHMENT, ENFORCEMENT OF EXPORT STANDARDS) ACT, TO PROVIDE FOR THE INSPECTION AND ENFORCEMENT OF GRADES AND QUALITY STANDARDS OF PRODUCE AND COMMODITIES INTENDED FOR IMPORT INTO OR EXPORT FROM NIGERIA AT PORTS OF SHIPMENT AND FOR RELATED MATTERS (SB. 1079) 


In [19]:
# Start from an empty list here so that re-running this cell rebuilds `data`
# instead of appending a second copy of every row.
data = []
for row in rows:
    cols = row.find_all('td')
    data.append({
        # Cell text carries stray leading/trailing whitespace; drop it.
        'title': cols[0].text.strip(),
        'url': str(cols[0].find('a')['href']),
        'chamber': cols[1].text,
        'first_reading': cols[2].text,
        'second_reading': cols[3].text,
        'commitee_referred': cols[4].text,
        'third_reading': cols[5].text
    })

In [20]:
data[0]

{'title': 'A BILL FOR AN ACT TO REPEAL THE PRODUCE (ENFORCEMENT OF EXPORT STANDARDS) ACT, CAP. P32 LFN, 2004 AND ENACT THE FEDERAL PRODUCE INSPECTION SERVICE (ESTABLISHMENT, ENFORCEMENT OF EXPORT STANDARDS) ACT, TO PROVIDE FOR THE INSPECTION AND ENFORCEMENT OF GRADES AND QUALITY STANDARDS OF PRODUCE AND COMMODITIES INTENDED FOR IMPORT INTO OR EXPORT FROM NIGERIA AT PORTS OF SHIPMENT AND FOR RELATED MATTERS (SB. 1079) ',
 'url': 'https://nass.gov.ng/documents/bill/11071',
 'chamber': '',
 'first_reading': '',
 'second_reading': '',
 'commitee_referred': '',
 'third_reading': ''}

In [21]:
import pandas as pd

In [22]:
bills_df = pd.DataFrame(data)

In [23]:
bills_df.shape

(445, 7)

In [24]:
bills_df.head()

Unnamed: 0,title,url,chamber,first_reading,second_reading,commitee_referred,third_reading
0,A BILL FOR AN ACT TO REPEAL THE PRODUCE (ENFOR...,https://nass.gov.ng/documents/bill/11071,,,,,
1,"FEDERAL COLLEGE OF AGRICULTURE ISE-ORUN, EKITI...",https://nass.gov.ng/documents/bill/11052,Senate,,,,
2,CASSAVA FLOUR (MANDATORY INCLUSION IN FLOUR PR...,https://nass.gov.ng/documents/bill/11050,Senate,,,,
3,A BILL FOR AN ACT TO ESTABLISH THE FEDERAL COL...,https://nass.gov.ng/documents/bill/11053,Senate,,,,
4,A Bill for an Act to Repeal the Patents and De...,https://nass.gov.ng/documents/bill/11067,Senate,,,,


In [25]:
bills_df['bill_id'] = bills_df['url'].apply(lambda x: x.split('/')[-1])

In [26]:
bills_df.bill_id.nunique()

445

In [27]:
bills_df['download_url'] = bills_df['bill_id'].apply(lambda x: f'https://nass.gov.ng/documents/billdownload/{x}.pdf')

In [28]:
import requests

In [30]:
bills_df.head()

Unnamed: 0,title,url,chamber,first_reading,second_reading,commitee_referred,third_reading,bill_id,download_url
0,A BILL FOR AN ACT TO REPEAL THE PRODUCE (ENFOR...,https://nass.gov.ng/documents/bill/11071,,,,,,11071,https://nass.gov.ng/documents/billdownload/110...
1,"FEDERAL COLLEGE OF AGRICULTURE ISE-ORUN, EKITI...",https://nass.gov.ng/documents/bill/11052,Senate,,,,,11052,https://nass.gov.ng/documents/billdownload/110...
2,CASSAVA FLOUR (MANDATORY INCLUSION IN FLOUR PR...,https://nass.gov.ng/documents/bill/11050,Senate,,,,,11050,https://nass.gov.ng/documents/billdownload/110...
3,A BILL FOR AN ACT TO ESTABLISH THE FEDERAL COL...,https://nass.gov.ng/documents/bill/11053,Senate,,,,,11053,https://nass.gov.ng/documents/billdownload/110...
4,A Bill for an Act to Repeal the Patents and De...,https://nass.gov.ng/documents/bill/11067,Senate,,,,,11067,https://nass.gov.ng/documents/billdownload/110...


In [32]:
bills_df.download_url.iloc[0]

'https://nass.gov.ng/documents/billdownload/11071.pdf'

In [29]:
def get_pdf(url, timeout=60):
    """Download a bill PDF and save it as pdf_files/<bill_id>.pdf.

    The bill id is the last path segment of `url` with everything from the
    first dot onwards removed (".../billdownload/11071.pdf" -> "11071").

    Args:
        url: Download address of the bill.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        None. The file on disk is the only result.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status. Nothing
            is written in that case, so an error page never ends up on disk
            under a `.pdf` name.
    """
    import os  # local import: `os` is not imported before this cell

    bill_id = url.split('/')[-1].split('.')[0]
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    os.makedirs("pdf_files", exist_ok=True)
    # `with` closes the handle even if the write fails part-way through.
    with open(f"pdf_files/{bill_id}.pdf", "wb") as file:
        file.write(response.content)

In [33]:
get_pdf('https://nass.gov.ng/documents/billdownload/11071.pdf')

In [34]:
bills_df['download_url'].apply(get_pdf)

0      None
1      None
2      None
3      None
4      None
       ... 
440    None
441    None
442    None
443    None
444    None
Name: download_url, Length: 445, dtype: object

## Persist to Disk

As a first step to persisting our corpus,
let's save it to disk and reload it.

The data involved is relatively simple --
basically all strings --
so we don't need to `pickle` the `DataFrame`,
which comes with its own woes.

Instead, we just format it as `JSON` --
the web's favorite serialization format.

In [35]:
bills_json = bills_df.to_json(orient="index", index=True)
with open("nass_documents.json", "w") as f:
    f.write(bills_json)

In [36]:
import json

# Read the saved corpus back and pick out its first (index, record) pair
# as a quick round-trip check.
with open("nass_documents.json") as f:
    s = f.read()

records = json.loads(s)
key = list(records)[0]
document = records[key]

In [37]:
print(document["bill_id"], document['download_url'])

11071 https://nass.gov.ng/documents/billdownload/11071.pdf


## Put into MongoDB

But a local filesystem isn't a good method for persistence.

We want these documents to be available via an API,
with the ability to scale reads and writes if needed.

So let's put them in a database.

We choose MongoDB simply for convenience --
we don't want to define a schema just yet,
since these tools are evolving rapidly,
and there are nice free hosting options.

> MongoDB is, in NoSQL terms, a "document database",
but the term document means something different
than it does in "Document Q&A".
In Mongoland, a "document" is just a blob of JSON.
We format our Q&A documents as JSON
and store them in Mongo,
so the distinction is not obvious here.

If you're running this yourself,
you'll need to create a hosted MongoDB instance
and add a database called `nass_bot`
with a collection called `corpus`.

You can find instructions
[here](https://www.mongodb.com/basics/mongodb-atlas-tutorial).

You'll need the cluster URL, username, and password
from that setup process to connect.

Add them to the `.env` file as `MONGODB_URI`, `MONGODB_USER`, and `MONGODB_PASSWORD`.

In [4]:
!pip install pymongo

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com


In [3]:
import json
import os

from dotenv import load_dotenv
import pymongo
from pymongo import InsertOne

load_dotenv()

mongodb_url = os.environ["MONGODB_URI"]
mongodb_user = os.environ["MONGODB_USER"]
mongodb_password = os.environ["MONGODB_PASSWORD"]

In [4]:
mongodb_url

'cluster0.duqv1vl.mongodb.net'

In [5]:
from urllib.parse import quote_plus

# Username and password must be percent-escaped before being placed in a MongoDB
# URI; otherwise characters such as '@', ':' or '/' in a password corrupt the URI.
CONNECTION_STRING = (
    f"mongodb+srv://{quote_plus(mongodb_user)}:{quote_plus(mongodb_password)}"
    f"@{mongodb_url}/?retryWrites=true&w=majority"
)

# connect to the database server
client = pymongo.MongoClient(CONNECTION_STRING)
# connect to the database
db = client.get_database("nass_bot")
# get a representation of the collection
collection = db.get_collection("corpus")

collection

Collection(Database(MongoClient(host=['ac-dumwpdp-shard-00-00.duqv1vl.mongodb.net:27017', 'ac-dumwpdp-shard-00-02.duqv1vl.mongodb.net:27017', 'ac-dumwpdp-shard-00-01.duqv1vl.mongodb.net:27017'], document_class=dict, tz_aware=False, connect=True, retrywrites=True, w='majority', authsource='admin', replicaset='atlas-mxcabh-shard-0', tls=True), 'nass_bot'), 'corpus')

In [53]:
document

{'title': 'Federal Character Commission Act (Amendment) Bill, 2019 (HB. 228) ',
 'url': 'https://nass.gov.ng/documents/bill/10595',
 'chamber': 'House of Representatives',
 'first_reading': '2019-07-23',
 'second_reading': '',
 'commitee_referred': '',
 'third_reading': '',
 'bill_id': '10595',
 'download_url': 'https://nass.gov.ng/documents/billdownload/10595.pdf',
 'metadata': {'bill_id': '10595',
  'document_url': 'https://nass.gov.ng/documents/billdownload/10595.pdf'},
 '_id': ObjectId('646e381ea707b37ed6576238')}

In [54]:
CHUNK_SIZE = 150
requesting = []

with open("nass_documents.json") as f:
    documents = json.load(f)

for record in documents.values():
    # Each stored document is the saved record plus a small `metadata`
    # sub-document and a `doc_type` tag.
    document = {
        **record,
        "metadata": {
            "bill_id": record['bill_id'],
            "document_url": record['download_url'],
        },
        "doc_type": "bills",
    }
    requesting.append(InsertOne(document))

    # Keep accumulating until a full batch of CHUNK_SIZE operations is ready.
    if len(requesting) < CHUNK_SIZE:
        continue
    collection.bulk_write(requesting)
    requesting = []

# Flush the final, partial batch (if any).
if requesting:
    collection.bulk_write(requesting)
    requesting = []

In [55]:
!pip install pdfplumber

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting pdfplumber
  Downloading pdfplumber-0.9.0-py3-none-any.whl (46 kB)
[K     |████████████████████████████████| 46 kB 192 kB/s eta 0:00:01
[?25hCollecting Wand>=0.6.10
  Downloading Wand-0.6.11-py2.py3-none-any.whl (143 kB)
[K     |████████████████████████████████| 143 kB 230 kB/s eta 0:00:01
[?25hCollecting Pillow>=9.1
  Downloading Pillow-9.5.0-cp38-cp38-macosx_10_10_x86_64.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 1.1 MB/s eta 0:00:01
[?25hCollecting pdfminer.six==20221105
  Downloading pdfminer.six-20221105-py3-none-any.whl (5.6 MB)
[K     |████████████████████████████████| 5.6 MB 1.3 MB/s eta 0:00:01
[?25hCollecting cryptography>=36.0.0
  Downloading cryptography-40.0.2-cp36-abi3-macosx_10_12_x86_64.whl (2.8 MB)
[K     |████████████████████████████████| 2.8 MB 786 kB/s eta 0:00:01
[?25hCollecting charset-normalizer>=2.0.0
  Downloading charset_normalizer-3.1.0-

In [6]:
import pdfplumber

In [64]:
def get_pdf_text(pdf_path):
    """Extract the text of every page of a PDF, joined with newlines.

    Args:
        pdf_path: Path (or file-like object) handed to `pdfplumber.open`.

    Returns:
        A single string with the pages' text separated by "\\n".
    """
    with pdfplumber.open(pdf_path) as pdf:
        # NOTE(review): some pdfplumber releases return None rather than "" for a
        # page with no extractable text, which would make str.join raise;
        # confirm against the installed version.
        return "\n".join(page.extract_text() or "" for page in pdf.pages)

In [66]:
get_pdf_text("pdf_files/10595.pdf")

PDFSyntaxError: No /Root object! - Is this really a PDF?

In [30]:
from io import BytesIO
def get_pdf_text(sub_dir, doc_id, bucket="nass-bot"):
    """Extract the text of a PDF stored in S3 under pdf_files/<sub_dir>/<doc_id>.pdf.

    Uses the notebook-level boto3 `session`, which must be defined before this
    function is called.

    Args:
        sub_dir: Sub-directory below `pdf_files/` in the bucket (e.g. "hansard").
        doc_id: Document id used as the object's file name, without ".pdf".
        bucket: Name of the S3 bucket to read from.

    Returns:
        A single string with the pages' text separated by "\\n".
    """
    s3 = session.resource('s3')
    obj = s3.Object(bucket, f"pdf_files/{sub_dir}/{doc_id}.pdf")
    # Read the whole object into memory; pdfplumber needs a seekable file-like object.
    pdf_bytes = obj.get()['Body'].read()
    with pdfplumber.open(BytesIO(pdf_bytes)) as pdf:
        # NOTE(review): some pdfplumber releases return None rather than "" for a
        # page with no extractable text, which would make str.join raise;
        # confirm against the installed version.
        return "\n".join(page.extract_text() or "" for page in pdf.pages)

In [26]:
import os
import boto3

# Read the AWS credentials from the environment instead of hard-coding them in
# the notebook, where they are exposed to anyone who can read the file.
# NOTE(review): the access key that was previously committed in this cell must be
# treated as compromised and rotated.
session = boto3.Session(
    aws_access_key_id=os.environ["AWS_ACCESS_KEY_ID"],
    aws_secret_access_key=os.environ["AWS_SECRET_ACCESS_KEY"],
)

In [35]:
s3 = session.resource('s3')
obj = s3.Object("nass-bot", f"pdf_files/hansard/9210.pdf")

In [36]:
obj.get()

KeyboardInterrupt: 

In [33]:
from pdfminer.pdfparser import PDFSyntaxError

In [37]:
try:
    get_pdf_text("hansard", "9210")
except PDFSyntaxError:
    print("caught!!")

caught!!


In [38]:
get_pdf_text("hansard", "9210")

PDFSyntaxError: No /Root object! - Is this really a PDF?