In [15]:
import seaborn as sns
import pdfplumber
from pandas import DataFrame
from keybert import KeyBERT

In [10]:
text = []
with pdfplumber.open(r'150723_Kenya_First NDC.pdf') as pdf:
    for page in pdf.pages:
        text.append(page.extract_text())
        
print(len(text))

7


In [32]:
text_str = ' '.join([page for page in text])
text_500 = ' '.join([word for word in text_str.split()[:500]]) 

In [34]:
print(len(text_str),len(text_str.split()),len(text_500.split()))

16524 2256 500


In [12]:
kw_model = KeyBERT()

In [39]:
keywords = kw_model.extract_keywords(
            text_str,
            keyphrase_ngram_range=(1, 2),
            use_mmr=True,
            stop_words="english",
            top_n=10,
            diversity=0.5,
            )

In [36]:
df = (DataFrame(keywords, columns=["Keyword/Keyphrase", "Relevancy"])
                .sort_values(by="Relevancy", ascending=False)
                .reset_index(drop=True)   )
df.index += 1

In [37]:
# Add styling
cmGreen = sns.light_palette("green", as_cmap=True)
cmRed = sns.light_palette("red", as_cmap=True)
df = df.style.background_gradient(
    cmap=cmGreen,
    subset=[
        "Relevancy",
    ],
)

In [38]:

format_dictionary = {
    "Relevancy": "{:.1%}",
}

df = df.format(format_dictionary)
df

Unnamed: 0,Keyword/Keyphrase,Relevancy
1,change kenya,58.4%
2,impacts climate,47.5%
3,national adaptation,40.8%
4,natural resources,39.9%
5,change unfccc,31.4%
6,indc response,30.5%
7,stakeholder cross,30.0%
8,policies plans,27.6%
9,nccap 2013,23.8%
10,framework convention,17.2%


In [40]:
from transformers import pipeline 

finetuned_checkpoint = "peter2000/roberta-base-finetuned-osdg"
classifier = pipeline("text-classification", model=finetuned_checkpoint)


Downloading:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/476M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/780k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.01M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [42]:
word_list = text_str.split()
len_word_list = len(word_list)
    

In [52]:
par_list = []
par_len = 150
for i in range(0,len_word_list // par_len):
    string_part = ' '.join(word_list[i*par_len:(i+1)*par_len])
    par_list.append(string_part)

In [53]:
par_list

['MINISTRY OF ENVIRONMENT AND NATURAL RESOURCES Kenya’s Intended Nationally Determined Contribution (INDC) 23 July 2015 1. Introduction Kenya, like other countries in the region, is bearing the brunt of climate change impacts and the associated socio-economic losses. The situation is exacerbated by the high dependence on climate sensitive natural resources. In response to the challenges posed by Climate Change, Kenya has developed a National Climate Change Response Strategy (NCCRS 2010), National Climate Change Action Plan (NCCAP 2013), and a National Adaptation Plan (NAP) - under preparation which provides a vision for low carbon and climate resilient development pathway, while a National Climate Change Framework Policy and legislation are in their final stages of enactment to facilitate effective response to climate change. Kenya is operationalising these policies and plans through the implementation of climate change actions in various areas such as afforestation and reforestation, 

In [54]:
classifier(par_list)

[{'label': 'sdg_13', 'score': 0.9833493232727051},
 {'label': 'sdg_13', 'score': 0.9780138731002808},
 {'label': 'sdg_13', 'score': 0.956961989402771},
 {'label': 'sdg_13', 'score': 0.9745646119117737},
 {'label': 'sdg_13', 'score': 0.9198050498962402},
 {'label': 'sdg_13', 'score': 0.6893544793128967},
 {'label': 'sdg_13', 'score': 0.6445251107215881},
 {'label': 'sdg_13', 'score': 0.8096828460693359},
 {'label': 'sdg_13', 'score': 0.9861595034599304},
 {'label': 'sdg_13', 'score': 0.9820348024368286},
 {'label': 'sdg_13', 'score': 0.986470639705658},
 {'label': 'sdg_13', 'score': 0.9632980823516846},
 {'label': 'sdg_13', 'score': 0.9878729581832886},
 {'label': 'sdg_13', 'score': 0.9856063723564148},
 {'label': 'sdg_13', 'score': 0.9831697940826416}]

In [1]:
'''from pdfminer3.layout import LAParams, LTTextBox
from pdfminer3.pdfpage import PDFPage
from pdfminer3.pdfinterp import PDFResourceManager
from pdfminer3.pdfinterp import PDFPageInterpreter
from pdfminer3.converter import PDFPageAggregator
from pdfminer3.converter import TextConverter
import io

resource_manager = PDFResourceManager()
fake_file_handle = io.StringIO()
converter = TextConverter(resource_manager, fake_file_handle, laparams=LAParams())
page_interpreter = PDFPageInterpreter(resource_manager, converter)

with open('150723_Kenya_First NDC.pdf', 'rb') as fh:

    for page in PDFPage.get_pages(fh,
                                  caching=True,
                                  check_extractable=True):
        page_interpreter.process_page(page)

    text = fake_file_handle.getvalue()

# close open handles
converter.close()
fake_file_handle.close()

print(text)'''

"from pdfminer3.layout import LAParams, LTTextBox\nfrom pdfminer3.pdfpage import PDFPage\nfrom pdfminer3.pdfinterp import PDFResourceManager\nfrom pdfminer3.pdfinterp import PDFPageInterpreter\nfrom pdfminer3.converter import PDFPageAggregator\nfrom pdfminer3.converter import TextConverter\nimport io\n\nresource_manager = PDFResourceManager()\nfake_file_handle = io.StringIO()\nconverter = TextConverter(resource_manager, fake_file_handle, laparams=LAParams())\npage_interpreter = PDFPageInterpreter(resource_manager, converter)\n\nwith open('150723_Kenya_First NDC.pdf', 'rb') as fh:\n\n    for page in PDFPage.get_pages(fh,\n                                  caching=True,\n                                  check_extractable=True):\n        page_interpreter.process_page(page)\n\n    text = fake_file_handle.getvalue()\n\n# close open handles\nconverter.close()\nfake_file_handle.close()\n\nprint(text)"

In [2]:
#convert_pdf_to_txt('150723_Kenya_First NDC.pdf')