In [2]:
import pandas as pd
import pdb
import urllib.request
from io import StringIO, BytesIO
import re

from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBox, LTTextLine, LTTextBoxHorizontal

In [None]:
# Read unfccc_data.csv into a DataFrame and preview its first two rows.
unfccc_csv_path = "unfccc_data.csv"
raw_unfccc_data = pd.read_csv(unfccc_csv_path)
raw_unfccc_data.head(n=2)

In [None]:
# Number of rows in the raw table.
raw_unfccc_data.shape[0]

In [None]:
# Sanitize links: percent-encode spaces so the URLs can be requested as-is.
# Widen the column display so long URLs are not truncated in cell output.
pd.set_option("display.max_colwidth", 300)

escaped_urls = raw_unfccc_data["document_url"].str.replace(" ", "%20")
raw_unfccc_data["document_url"] = escaped_urls
links = raw_unfccc_data["document_url"]
links

## Extract PDF text from `document_url`

In [None]:
# HTTP headers sent with every PDF request; the user-agent mimics a desktop
# Chrome browser.
headers = {
    "user-agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) "
        "AppleWebKit/537.36(KHTML, like Gecko) "
        "Chrome/61.0.3163.100 Safari/537.36"
    ),
}

In [None]:
def _split_into_words(text):
    """Split text on single spaces, drop tokens that are exactly a newline,
    and flatten newlines inside the remaining tokens to spaces."""
    return [
        token.strip().replace("\n", " ")
        for token in text.split(" ")
        if token != "\n"
    ]


def get_beginning_of_pdf(link, max_words=500, timeout=60):
    """Download the PDF at `link` and return the beginning of its text.

    The request is sent with the module-level `headers`. Text is collected
    from the LTTextBox / LTTextLine layout objects of each page, split into
    space-separated tokens, and the first `max_words` tokens are joined back
    together with single spaces.

    Parameters
    ----------
    link : str
        URL of the PDF document.
    max_words : int, default 500
        Maximum number of tokens kept in the returned string.
    timeout : float, default 60
        Timeout in seconds passed to `urllib.request.urlopen`, so a stalled
        server raises instead of blocking the whole run.

    Returns
    -------
    str
        The leading tokens of the extracted text, joined by spaces.

    Raises
    ------
    Exception
        Errors from urllib (network, HTTP, timeout) and from pdfminer
        (malformed or protected PDFs) propagate to the caller.
    """
    request = urllib.request.Request(link, headers=headers)
    # Read the whole body into memory and close the connection right away:
    # the parser needs a seekable stream, which a raw HTTP response is not.
    with urllib.request.urlopen(request, timeout=timeout) as response:
        pdf_buffer = BytesIO(response.read())

    parser = PDFParser(pdf_buffer)
    doc = PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize('')
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    laparams.char_margin = 1.0
    laparams.word_margin = 1.0
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    text_chunks = []

    for page in doc.get_pages():
        interpreter.process_page(page)
        layout = device.get_result()
        text_chunks.extend(
            lt_obj.get_text()
            for lt_obj in layout
            if isinstance(lt_obj, (LTTextBox, LTTextLine))
        )
        # Stop as soon as there is one token more than needed: text from later
        # pages can only extend the final token, so the first `max_words`
        # tokens are already final and the remaining pages need no parsing.
        if len(_split_into_words("".join(text_chunks))) > max_words:
            break

    words = _split_into_words("".join(text_chunks))
    return " ".join(words[:max_words])

In [None]:
# Links at positions 260 through 269 (end-exclusive slice), singled out so
# they can be handled separately.
skip_start, skip_stop = 260, 270
timeout_links = links.iloc[skip_start:skip_stop]
timeout_links

In [None]:
summaries = []

# skip 260 to 270 because of weird timeout issues
#
# `value in series` tests the Series *index labels*, not its values, so a URL
# string is never "in" timeout_links. Compare row labels explicitly instead,
# which skips exactly the rows that timeout_links was sliced from.
skipped_labels = timeout_links.index

for row_label, pdf_link in links.items():
    if row_label in skipped_labels:
        summaries.append("Unable to parse")
        continue
    try:
        summaries.append(get_beginning_of_pdf(pdf_link))
    except Exception:
        # Best effort: one broken download or PDF must not abort the whole
        # run. Catching Exception (not a bare except) keeps Ctrl-C working.
        summaries.append("Unable to parse")

In [None]:
# Persist the collected summaries so the slow download step need not be rerun.
summaries_series = pd.Series(summaries)
summaries_series.to_csv("all_summaries.csv")

In [None]:
# The two most frequent summary strings and how often each occurs.
pd.Series(summaries).value_counts().head(2)

## Concatenate the summary files and merge them into the data

In [None]:
# Ten placeholder rows, inserted between the second and third summary files
# (presumably standing in for the links skipped because of timeouts).
timeouts = pd.DataFrame(["Unable to parse" for i in range(0, 10)])

raw_1 = pd.read_csv("summaries1.csv", header=None)
raw_2 = pd.read_csv("summaries2.csv", header=None)
raw_3 = pd.read_csv("summaries3.csv", header=None)
raw_4 = pd.read_csv("summaries4.csv", header=None)

# The first column of the combined frame is deleted below and the remaining
# one is named "summary". `timeouts` holds its text in column 0, so it would
# be deleted and leave NaN summaries; move the text to the files' last column.
timeout_rows = timeouts.rename(columns={timeouts.columns[0]: raw_1.columns[-1]})

# ignore_index=True gives unique 0..n-1 row labels; without it every file
# restarts at 0 and later index-aligned assignments pair rows incorrectly.
raw_summaries = pd.concat(
    [raw_1, raw_2, timeout_rows, raw_3, raw_4], ignore_index=True
)
del raw_summaries[raw_summaries.columns[0]]
raw_summaries.columns = ["summary"]
raw_summaries.head()

In [None]:
# Pair each summary with its link by row position. Column assignment aligns
# on index labels, so both sides are renumbered 0..n-1 first; otherwise any
# repeated labels in raw_summaries would attach the wrong URLs.
# If the lengths differ, surplus summaries get NaN URLs and surplus links
# are dropped.
summaries = raw_summaries.reset_index(drop=True)
summaries["document_url"] = links.reset_index(drop=True)
summaries.head()

In [None]:
# (rows, columns) of the summaries frame; the row count is expected to match
# the number of links (verify against len(links)).
summaries.shape

In [None]:
# Work on a new frame without the first column of the raw table, leaving
# raw_unfccc_data itself untouched.
first_column = raw_unfccc_data.columns[0]
unfccc_data = raw_unfccc_data.drop(first_column, axis=1)
unfccc_data["document_url"] = links  # to account for bad links
unfccc_data.head(1)

In [None]:
# Left join on the URL so every row of unfccc_data is kept, with its summary
# attached where one exists.
unfccc_data = pd.merge(unfccc_data, summaries, how="left", on="document_url")
unfccc_data.sample(n=2)