In this notebook we gather the NDC/LTS documents as provided by Climate Watch (HTML files) and also the original PDF documents from the UNFCCC website.

- Subsection (CW): Processes the HTML files to create paragraphs using split_by = 'word', split_length = [60,85,150] and a split_overlap that depends on the country.
- Subsection (IKI): Downloads the documents using the link information; if a download fails, it tries to find the document in other date sub-directories of the website. It then extracts the text and creates paragraphs.

# Packages Installation

In [None]:
# # %%capture
# # linux packages
!wget --no-check-certificate https://dl.xpdfreader.com/xpdf-tools-linux-4.04.tar.gz
!tar -xvf xpdf-tools-linux-4.04.tar.gz && sudo cp xpdf-tools-linux-4.04/bin64/pdftotext /usr/local/bin
!apt-get install tesseract-ocr libtesseract-dev poppler-utils

In [None]:
!pip3 install pycountry
!pip install rank_bm25
!pip install xlsxwriter
!pip install -e "git+https://github.com/gizdatalab/haystack_utils.git@main#egg=utils"
!pip install langdetect

In [None]:
import pandas as pd
import numpy as np
from typing import Callable, Dict, List, Optional, Text, Tuple, Union
import pycountry
from bs4 import BeautifulSoup
from IPython.display import display
import os
import glob
import utils
from utils.preprocessing import UdfPreProcessor
from haystack.schema import Document
from tqdm import tqdm
import requests
import json
import hashlib
import os
import re
import urllib.parse
from pathlib import Path
import ast
import requests

# Processing Functions

In [None]:
# Folder holding the Step 1 outputs (Climate Watch response statistics)
path_to_step1 = "/content/drive/MyDrive/Colab Notebooks/CPU/Step1/"

# Country-level statistics of the response text lengths
responsestats = pd.read_json(path_to_step1 + 'output/responsestats.json')

# The row labelled 'Global' carries the statistics over all countries;
# its median is reused later as the overlap for short-response countries.
global_row = responsestats[responsestats.Country == 'Global']
display(global_row)
responseLengthMedian = global_row['median_val'].values[0]
print('\n',"Global Median value of Response Length:", responseLengthMedian)

Unnamed: 0,Alpha3,Country,value_count,mean_val,median_val,min_val,max_val,std_val,ninetyfifth_percentile
217,Global,Global,,17.069639,10.0,,85.7875,,



 Global Median value of Response Length: 10.0


In [None]:
def get_split_strategy(country_median):
    """
    Builds the three split strategies (60, 85 and 150 words) for a country.

    Some overlap between paragraphs is useful, but too much of it leads to
    duplicated content. The overlap is therefore chosen from the country median
    of the response length: below 20 the module-level global median
    (``responseLengthMedian``, cast to int) is used, otherwise 20 words.

    Params
    -----------------
    country_median: median response length of the country


    Return
    -----------------
    split_strategy:List[Dict], one dictionary per split length holding
                   split_length, split_by, split_overlap and
                   split_respect_sentence_boundary

    """
    if country_median < 20:
        overlap = int(responseLengthMedian)
    else:
        overlap = 20

    # key order is kept stable because str(strategy) is used as a dict key
    # by the paragraph creation step
    return [{'split_length': length, "split_by": 'word',
             'split_overlap': overlap,
             "split_respect_sentence_boundary": True}
            for length in (60, 85, 150)]

def get_country_split_strategy(country_: "Optional[pd.DataFrame]", country_code):
    """
    Looks up the median response length of a country and asks
    get_split_strategy for the matching splitting strategies. If the statistics
    are missing (no frame, missing column, unknown country code) the default
    strategies with an overlap of 20 words are returned.

    Params
    -----------------
    country_: response statistics, needs the columns 'Alpha3' and 'median_val'
    country_code: ISO Alpha3 code used to select the row in country_


    Return
    -----------------
    split_strategy:List[Dict], Dictionary contains split length, overlap, split_by etc

    """
    default_strategy = [{'split_length': length, "split_by": 'word',
                         'split_overlap': 20,
                         "split_respect_sentence_boundary": True}
                        for length in (60, 85, 150)]
    try:
        country_row = country_[country_.Alpha3 == country_code]
        country_median = country_row.median_val.values[0]
    except (AttributeError, IndexError, KeyError, TypeError):
        # best effort: statistics for this country are not available
        return default_strategy

    # get_split_strategy takes the median only. The call is kept outside the
    # try block so that a wrong call can no longer be hidden by the fallback.
    return get_split_strategy(country_median)

def create_paragraphs_text(text_data, split_strategy, language='en', filename= None):
    """
    Splits raw text (extracted from html files, pdf files etc) into small
    chunks, once per entry of the split strategy.

    Params
    ----------------
    text_data: raw text data
    split_strategy: List[Dict], each dictionary provides 'split_length' and
                    'split_overlap' for one round of paragraph creation.
                    Splitting is always done by word and with respect to
                    sentence boundaries.
    language: language of text, passed on to the preprocessor run
    filename: Optional, filename to which the text data belongs to.

    Return
    ---------------
    Dict, keyed by str(strategy); each value is a list of tuples
    (paragraph text, name, page, passage id). Strategies which produced no
    paragraph are left out; a falsy split_strategy gives an empty dict.

    """
    # wrap the raw text in a haystack document carrying the filename
    source_doc = Document(content=text_data,
                          meta={"name": filename},
                          id_hash_keys=None)
    splitter = UdfPreProcessor()

    chunks_by_strategy = {}
    for strategy in split_strategy or []:
        result = splitter.run([source_doc],
                              split_by = "word",
                              remove_punc = False,
                              split_respect_sentence_boundary = True,
                              split_length = strategy['split_length'],
                              split_overlap = strategy['split_overlap'],
                              language = language)
        rows = [(chunk.content, chunk.meta['name'], chunk.meta['page'], chunk.id)
                for chunk in result[0]['documents']]
        if rows:
            chunks_by_strategy[str(strategy)] = rows
    return chunks_by_strategy



def paraLengthCheck(paraList, max_len = 180, keep_hash = True, overlap = 20):
    """
    There are cases where preprocessor cannot respect word limit, when using
    respect sentence boundary flag due to missing sentence boundaries.
    Therefore we run one more round of split here for those paragraphs.

    Params
    ---------------
    paraList : Dict, split name -> list of passages. A passage is a sequence
               (text, name, page[, hash]).
    max_len : max number of words to be respected by paragraphs which bypassed
              the preprocessor strategy
    keep_hash : if True the fourth passage element is carried over to every
                output row; all pieces of a split passage share the hash of
                the passage they come from
    overlap : number of words repeated from the previous piece when a passage
              has to be split

    Return
    ----------
    paragraph_list: Dict with the same keys as paraList; each value is a list
                    of rows [text, name, page] (plus hash if keep_hash)

    """
    paragraph_list = {}

    for split, paragraphs in paraList.items():
        new_para_list = []
        for passage in paragraphs:
            # every row gets the same shape, with or without the hash
            tail = [passage[1], passage[2]]
            if keep_hash:
                tail.append(passage[3])

            words = passage[0].split()
            if len(words) <= max_len:
                # paragraphs which dont need any splitting keep their text as is
                new_para_list.append([passage[0]] + tail)
                continue

            # full windows of max_len words; each one except the first starts
            # `overlap` words earlier
            full_chunks = len(words) // max_len
            for i in range(full_chunks):
                start = max(max_len * i - overlap, 0)
                new_para_list.append([" ".join(words[start:max_len * (i + 1)])] + tail)

            # words left after the last full window; skipped when there are
            # none, otherwise the piece would only repeat the overlap
            if len(words) > max_len * full_chunks:
                start = max(max_len * full_chunks - overlap, 0)
                new_para_list.append([" ".join(words[start:])] + tail)
        paragraph_list[split] = new_para_list
    return paragraph_list

In [None]:

# https://stackoverflow.com/questions/328356/extracting-text-from-html-file-using-python
def htmlparser(filepath):
    """
    Reads the html file and returns the text/string

    Params
    ---------
    filepath: path to the html file

    Return
    ------------
    full_text: partially cleaned text from html file

    """
    # the context manager closes the file handle once the html is parsed
    with open(filepath, "r") as html_file:
        soup = BeautifulSoup(html_file, features="html.parser")

    # kill all script and style elements
    for script in soup(["script", "style"]):
        script.extract()    # rip it out
    # get text
    text = soup.get_text()

    # avoid text following a colon being split across lines
    text = text.replace(":\n",":")
    list_lines = [line.strip() for line in text.splitlines()]

    # break multi-headlines (separated by a double space) into a line each
    list_chunks = [phrase.strip() for line in list_lines for phrase in line.split("  ")]

    # drop the blanks
    new_list = [chunk for chunk in list_chunks if chunk]

    # creating text from list of lines
    # haystack split for sentences doesnt work if there is no space
    # between sentences therefore adding full-stop with space to join lines.
    full_text = '. '.join(new_list)

    # removing ".." as the previous strategy might cause this at some places.
    full_text = full_text.replace("..",".")
    return full_text


def extract_cwinfo(filename_):
    """
    Extracts the relevant information from the filename of a CW provided NDC
    html file. The usual structure is like "AND-revised_first_ndc-EN_TR.html",
    so splitting by "-" gives country code, document type and language.

    Params
    ---------------
    filename_: filename of html files from CW github repo


    Return
    ------------------
    country_code: first "-" separated token of the filename
    doc_type: second token in upper case with "_" replaced by blanks,
              like 'FIRST NDC', 'REVISED FIRST NDC' etc....
    lang: lower case language code for the document like 'en', 'es' etc

    """
    parts = filename_.split("-")
    country_code = parts[0]
    doc_type = parts[1].upper().replace("_", " ")

    if len(parts) == 4:
        # with four tokens the language is the second to last one
        lang = parts[-2]
    else:
        # otherwise it is the last token up to the first "." or "_"
        lang = parts[-1].partition(".")[0].partition("_")[0]

    return country_code, doc_type, lang.strip().lower()

# CW

In [None]:
# getting ndc html files from climate watch git repo
!git clone https://github.com/WRI-ClimateWatch/ndc /content/drive/MyDrive/Colab\ Notebooks/CPU/Step2/input/cw_ndc/

Cloning into '/content/policyData/ndc'...
remote: Enumerating objects: 4819, done.[K
remote: Counting objects: 100% (759/759), done.[K
remote: Compressing objects: 100% (702/702), done.[K
^C


In [None]:
# folder into which the climate watch ndc repo was cloned
path_to_ndc = "/content/drive/MyDrive/Colab Notebooks/CPU/Step2/input/cw_ndc/"

# collect all html files of that folder
cw_ndc_files = glob.glob(path_to_ndc+'*.html')

# one row per html file
cw_ndc = pd.DataFrame(cw_ndc_files, columns=['filepath'])

# filename without the folder part
cw_ndc['file_name'] = cw_ndc['filepath'].apply(os.path.basename)

# country code, document type and language are encoded in the filename
parsed_names = cw_ndc.file_name.apply(extract_cwinfo)
cw_ndc['country_code'], cw_ndc['type_of_document'], cw_ndc['language'] = \
                  zip(*parsed_names)

In [None]:
# Some filenames do not follow the generic structure and we need to edit those
# entries. For them the file extension ends up in the document type, so we
# look for 'HTML' in that column.
cw_ndc[cw_ndc.type_of_document.str.contains('HTML')]

Unnamed: 0,filepath,file_name,country_code,type_of_document,language
158,/content/drive/MyDrive/Colab Notebooks/CPU/Step2/input/cw_ndc/CRI_revised_fi...,CRI_revised_first_ndc-EN_TR.html,CRI_revised_first_ndc,EN TR.HTML,en
159,/content/drive/MyDrive/Colab Notebooks/CPU/Step2/input/cw_ndc/CRI_revised_fi...,CRI_revised_first_ndc-ES.html,CRI_revised_first_ndc,ES.HTML,es


In [None]:
# few manual corrections
# The Costa Rica files 'CRI_revised_first_ndc-*.html' use "_" instead of "-"
# after the country code, so the filename parsing puts everything before the
# "-" into country_code. The rows are selected by filename instead of by row
# label, because the row label depends on the order returned by glob.
cri_mask = cw_ndc.file_name.str.startswith('CRI_revised_first_ndc-')
cw_ndc.loc[cri_mask, 'type_of_document'] = 'REVISED FIRST NDC'
cw_ndc.loc[cri_mask, 'country_code'] = 'CRI'

print(cw_ndc.type_of_document.unique())

['FIRST NDC' 'INDC' 'REVISED FIRST NDC' 'SECOND NDC' 'ARCHIVED SECOND NDC'
 'ARCHIVED REVISED FIRST NDC' 'REVISED SECOND NDC']


In [None]:
# makes progress_apply available on pandas objects
tqdm.pandas()

# parse every html file into one cleaned text string
print("Extracting raw text from html files")
cw_ndc['original_text'] = cw_ndc['filepath'].progress_apply(htmlparser)

Extracting raw text from html files


100%|██████████| 634/634 [00:49<00:00, 12.80it/s]


In [None]:
def _cw_row_to_paragraphs(row):
    # split strategy depends on the response statistics of the row's country
    strategy = get_country_split_strategy(responsestats, row['country_code'])
    return create_paragraphs_text(row['original_text'], strategy,
                                  row['language'], row['file_name'])

# create paragraphs from extracted raw text, row by row
print("Creating Paragraphs")
cw_ndc['paragraphs'] = cw_ndc.progress_apply(_cw_row_to_paragraphs, axis=1)

In [None]:
# Do the paragraph length check, using the default value of 180 words for each
# split type. We do this so that during training of models we dont get errors
# due to tensor size mismatch. Alternatively you can also set truncation=True
# for the model tokenizers.
cw_ndc['paragraphs'] = cw_ndc.paragraphs.apply(lambda x: paraLengthCheck(x, keep_hash=False))

# Filetype renaming
# The result is assigned back to the column: an inplace replace on the column
# accessor is deprecated and does not update the frame under Copy-on-Write.
cw_ndc['type_of_document'] = cw_ndc['type_of_document'].replace(
                                {'FIRST NDC': 'First NDC',
                                 'REVISED FIRST NDC': 'Revised First NDC',
                                 'ARCHIVED REVISED FIRST NDC': 'Revised First NDC (archived)',
                                 'SECOND NDC':'Second NDC',
                                 'REVISED SECOND NDC':'Revised Second NDC',
                                 'ARCHIVED SECOND NDC':'Archived Second NDC'})

In [None]:
# data validations: print the distinct values of the columns derived from the
# filenames so that irregular entries can be spotted by eye
print('Document types:',cw_ndc.type_of_document.unique(),'\n')
print('country codes:',cw_ndc.country_code.unique(),'\n')
print('languages types:',cw_ndc.language.unique(),'\n')

Document types: ['First NDC' 'INDC' 'Revised First NDC' 'Second NDC' 'Archived Second NDC'
 'Revised First NDC (archived)' 'Revised Second NDC'] 

country codes: ['AFG' 'AGO' 'ALB' 'AND' 'ARE' 'ARG' 'ARM' 'ATG' 'AUS' 'AUT' 'AZE' 'BDI'
 'BEL' 'BEN' 'BFA' 'BGD' 'BGR' 'BHR' 'BHS' 'BIH' 'BLR' 'BLZ' 'BOL' 'BRA'
 'BRB' 'BRN' 'BTN' 'BWA' 'CAF' 'CAN' 'CHE' 'CHL' 'CHN' 'CIV' 'CMR' 'COD'
 'COG' 'COK' 'COL' 'COM' 'CPV' 'CRI' 'CUB' 'CYP' 'CZE' 'DEU' 'DJI' 'DMA'
 'DNK' 'DOM' 'DZA' 'ECU' 'EGY' 'ERI' 'ESP' 'EST' 'ETH' 'EUU' 'FIN' 'FJI'
 'FRA' 'FSM' 'GAB' 'GBR' 'GEO' 'GHA' 'GIN' 'GMB' 'GNB' 'GNQ' 'GRC' 'GRD'
 'GTM' 'GUY' 'HND' 'HRV' 'HTI' 'HUN' 'IDN' 'IND' 'IRL' 'IRN' 'IRQ' 'ISL'
 'ISR' 'ITA' 'JAM' 'JOR' 'JPN' 'KAZ' 'KEN' 'KGZ' 'KHM' 'KIR' 'KNA' 'KOR'
 'KWT' 'LAO' 'LBN' 'LBR' 'LCA' 'LIE' 'LKA' 'LSO' 'LTU' 'LUX' 'LVA' 'MAR'
 'MCO' 'MDA' 'MDG' 'MDV' 'MEX' 'MHL' 'MKD' 'MLI' 'MLT' 'MMR' 'MNE' 'MNG'
 'MOZ' 'MRT' 'MUS' 'MWI' 'MYS' 'NAM' 'NER' 'NGA' 'NIC' 'NIU' 'NLD' 'NOR'
 'NPL' 'NRU' 'NZL' 'OMN' 'PAK' 'PAN

In [65]:
# Step 2 working folder; the paragraph data goes to its output sub-folder
path_to_step2 = "/content/drive/MyDrive/Colab Notebooks/CPU/Step2/"

# serialise through pandas first, then re-dump with indentation
html_records = json.loads(cw_ndc.to_json(orient="records"))
with open(path_to_step2 +'output/html_para.json', 'w') as out_file:
    json.dump(html_records, out_file, indent=4)

In [74]:
# summary of columns, non-null counts and dtypes of the CW frame
cw_ndc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 634 entries, 0 to 633
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   filepath          634 non-null    object
 1   file_name         634 non-null    object
 2   country_code      634 non-null    object
 3   type_of_document  634 non-null    object
 4   language          634 non-null    object
 5   original_text     634 non-null    object
 6   paragraphs        634 non-null    object
dtypes: object(7)
memory usage: 34.8+ KB


# IKI

## Doc Collection

This subsection deals with fetching the documents from their URL links; if a relevant document is not found at its link, the website is explored for it. You can also add some document links manually (and place the files in the relevant folder), then update the file list before processing the next section.

In [None]:
# list of documents (with urls) produced in Step 1
path_to_step1 = "/content/drive/MyDrive/Colab Notebooks/CPU/Step1/"
df_documents = pd.read_csv(path_to_step1 + "output/documents_list.csv")

print(f"Number of urls: {len(df_documents)}")

# keep only rows with url, country code and document type; everything is
# handled as string from here on
df_documents = (df_documents
                .dropna(how='any', subset=['url', 'country_code', 'type_of_document'])
                .astype(str)
                .reset_index(drop = True))
# filled by the url check below
df_documents['valid'] = False

print(f"Number of  valid url after cleaning: {len(df_documents)}")

# Check accessible documents
# Using a user agent header to not get banned by the CDN of UNFCCC
# response can vary based on User agent so be cautious
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36"
}


def valid_doc(url, timeout=30):
    """
    Checks with a HEAD request whether a document url is accessible.

    Params
    ---------
    url: url of the document
    timeout: seconds to wait for the server before giving up; without it a
             server which never answers would block the whole loop

    Return
    ------------
    True if the request answers with status code 200, False for any other
    status code or if the request fails.
    """
    try:
        res = requests.head(url, headers=headers, timeout=timeout)
    except Exception:
        # best effort check: any failure (malformed url, connection error,
        # timeout) marks the document as not accessible
        return False
    return res.status_code == 200


# select valid documents: one HEAD request per url, in row order
df_documents['valid'] = [valid_doc(doc_url) for doc_url in tqdm(df_documents['url'])]

is_valid = df_documents['valid'] == True
print('Number of valid documents: ', len(df_documents[is_valid]))
print('Number of invalid documents: ', len(df_documents[~is_valid]))


We try to find some more valid documents by looping through a series of date sub-directories (YYYY-MM) of the website.

In [None]:
# Review missing URLs
# Using a user agent header to not get banned by the CDN of UNFCCC
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36"
}

# Get the last part of the url and loop through a series of date subdomains (YYYY-MM),
# testing for valid documents. If valid, get updated url.
def valid_doc_find(url, date_dir, timeout=30):
    """
    Looks for the document of a broken url in other date folders of the
    UNFCCC NDC file tree.

    Params
    ---------
    url: original url; only its last path segment (the file name) is used
    date_dir: list of date folder names (YYYY-MM) to try, in order
    timeout: seconds to wait for each HEAD request

    Return
    ------------
    The first candidate url which answers with status code 200, or None if
    no candidate does.
    """
    document_name = url.split('/')[-1]
    for date in date_dir:
        url_new = 'https://unfccc.int/sites/default/files/NDC/' + date \
                                                  + '/' + document_name
        try:
            res = requests.head(url_new, headers=headers, timeout=timeout)
        except Exception:
            # best effort: a failing request only means this folder is skipped;
            # unlike a bare except this still lets Ctrl-C stop the loop
            continue
        if res.status_code == 200:
            return url_new
    return None


# Set dates vector: the date folders (YYYY-MM) which are tried for documents
# whose original url is not accessible
dates = ['2022-06', '2022-07', '2022-08', '2022-09', '2022-10', '2022-11', '2022-12','2023-01','2023-02','2023-03','2023-04','2023-05']

# valid_doc_find output to new column
# only rows flagged as not valid are looked up again, the others keep None
df_documents['url_found_prg'] = None
for i in tqdm(range(len(df_documents))):
    if df_documents.loc[i,'valid'] == False:
        df_documents.loc[i,'url_found_prg'] = valid_doc_find(df_documents.loc[i,'url'], dates)


# update the url
# valid rows keep their url, invalid rows take the recovered url (or None if
# nothing was found)
df_documents['url'] = df_documents.apply(lambda x: x['url'] if x['valid'] == True
                          else ( x['url_found_prg'] if x['url_found_prg'] != None
                                else None), axis=1)
# making final validity check
# a row is valid if it still has a (non-empty) url after the update
df_documents['valid_final'] = df_documents.url.apply(lambda x: True if x else False)
print('Number of valid documents: ', len(df_documents[df_documents['valid_final'] == True]))
print('Number of invalid documents: ', len(df_documents[df_documents['valid_final'] == False]))
# drop the invalidated rows
# 'Unnamed: 0' is presumably the index column written with the Step 1 csv
# - TODO confirm
df_documents = df_documents[df_documents['valid_final'] == True].drop(columns=['url_found_prg',
                                                                  'Unnamed: 0'])
df_documents = df_documents.reset_index(drop = True)
# file type from the url ending: '.docx' if the url ends with 'docx', '.pdf'
# for everything else
df_documents['file_type'] = df_documents.url.apply(lambda x: '.docx' if x.split('.')[-1] == 'docx' else '.pdf')

In [None]:
# download documents
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36"
}

def download_doc(url, timeout=60):
    """
    Downloads a document to '/content/data/downloaded_documents'. The file
    name is the sha256 hash of the unquoted url, so a document which is
    already on disk is not downloaded again.

    Params
    ---------
    url: url of the document
    timeout: seconds to wait for the server before giving up

    Return
    ------------
    path of the downloaded (or already existing) file as string, '' if the
    download or the write failed.
    """
    # NOTE(review): the '.pdf' suffix is used for every document, including
    # urls flagged as '.docx' in the file_type column - confirm that the text
    # extraction does not depend on the suffix.
    filename = hashlib.sha256(urllib.parse.unquote(url).encode('utf-8')).hexdigest()
    filename = Path(os.path.join('/content', 'data', 'downloaded_documents', filename + '.pdf'))

    if filename.exists():
        return str(filename)
    try:
        res = requests.get(url, headers=headers, timeout=timeout)
    except Exception:
        # best effort: report the url and carry on with the next document
        print('Error downloading: ', url)
        return ''

    if not res.ok:
        print('Error downloading: ', url)
        return ''

    try:
        # the download folder is not created anywhere else
        filename.parent.mkdir(parents=True, exist_ok=True)
        # write to a temporary name first, so that an interrupted write does
        # not leave a broken file which the exists() check above would accept
        partial = filename.with_suffix('.part')
        partial.write_bytes(res.content)
        partial.replace(filename)
        return str(filename)
    except OSError as e:
        print(e)
        print('Error downloading: ', url, res)
        return ''

# download every document; the column holds the local file path, or '' where
# the download failed
df_documents['document_path'] = df_documents.apply(lambda x: download_doc(x['url']),axis=1)

We save the file as CSV; any other entries can be added to it, along with the actual files in the folder, for the next part. The PDF files are downloaded to the folder 'downloaded_documents' in the root; move them to the Step 2 input folder.

In [None]:
# output folder of Step 2
path_to_step2 = "/content/drive/MyDrive/Colab Notebooks/CPU/Step2/output/"
# The export is kept commented out; the next section reads
# 'document_list_updated.csv' from this folder, so the file has to be there
# (written once by this line or placed manually).
# df_documents.to_csv(path_to_step2+'document_list_updated.csv')

## Create Paragraphs

In [None]:
# Step 2 input folder; used by extract_full_text to replace the "../data/"
# prefix of the stored file paths
path_to_downloaded_docs = "/content/drive/MyDrive/Colab Notebooks/CPU/Step2/input/"
# reload the (possibly manually extended) documents list; path_to_step2 points
# to the Step 2 output folder at this point
df_documents= pd.read_csv(path_to_step2 +'document_list_updated.csv')

import utils
from utils.preprocessing import UdfPreProcessor, FileConverter
from haystack.schema import Document
def extract_full_text(doc_path, file_name=None):
    """
    extracts the raw text from pdf/docx file using haystack utils

    Params
    --------------
    doc_path:  file path which needs text data extraction. Missing paths
               ('', 'nan', '..\\nan' or anything which is not a string, like
               the float NaN pandas uses for empty csv cells) are skipped.
    file_name: Filename passed to the converter, defaults to the last part
               of doc_path


    Return
    ------------
    output: first document returned by the file converter, None if the path
            is missing, the conversion fails or nothing is returned
    """
    # nothing to extract for missing paths
    if not isinstance(doc_path, str) or doc_path in ('', 'nan', '..\\nan'):
        return None

    # will use Fileconvertor to extract text
    convertor = FileConverter()
    # stored paths are relative to "../data/", the files live in the input folder
    doc_path = doc_path.replace("../data/",path_to_downloaded_docs)
    if file_name is None:
        file_name = doc_path.split("/")[-1]
    try:
        output = convertor.run(file_path = doc_path, file_name = file_name)
    except Exception as e:
        # best effort: report the problem and treat the file as without text
        print(e)
        return None
    return output[0]['documents'][0] if output else None

In [None]:
# harmonizing data with already created climate Watch html paragraph data
# file name = last part of the url, with '%' replaced by '_'
df_documents['file_name'] = df_documents.url.apply(lambda x: str(x).split('/')[-1].replace('%','_'))
# drop the columns which are not part of the html paragraph data
df_documents.drop(columns = ['Unnamed: 0', 'backend_status', 'current', 'date','url',
                             'version_number', 'comment','annex_type', 'income_group',
                             'region', 'eu27', 'ghg_total','ghg_transport','mena'], inplace=True)
df_documents.rename(columns = {'document_path':'filepath'}, inplace=True)


# extract text from file
# the column first holds whatever extract_full_text returns for the file path
df_documents['original_text'] = None
for i in tqdm(range(len(df_documents))):
    df_documents.loc[i,'original_text'] = extract_full_text(df_documents.loc[i]['filepath'])

# detect the langauge of text
import langdetect
# replace the extracted object by its text content (None where nothing was extracted)
df_documents['original_text'] = df_documents.original_text.apply(lambda x: x.content if x else None)
# language is detected only for rows with text
df_documents['language'] = df_documents.original_text.apply(lambda x: langdetect.detect(x) if x else None)


In [None]:
# detect the language of text
import langdetect

# langdetect is not deterministic by default; a fixed seed makes the detected
# languages reproducible between runs
langdetect.DetectorFactory.seed = 0

# Reduce the extracted objects to their text content. Values which are already
# plain text (this step was run before) are kept as they are, so the cell can
# be run more than once.
df_documents['original_text'] = df_documents.original_text.apply(
    lambda x: getattr(x, 'content', x) if x else None)
# language is detected only for rows with text
df_documents['language'] = df_documents.original_text.apply(lambda x: langdetect.detect(x) if x else None)
df_documents.language.unique()

Verify the detected languages and update them if you see some weird results. Langdetect is not deterministic, so the results can sometimes vary between runs.

In [None]:
# updating some entries
# df_documents.loc[68,'language'] = 'en'
# rows without extracted text cannot be split into paragraphs, so they are
# removed and the index is rebuilt
df_documents = df_documents.dropna(subset = ['original_text']).reset_index(drop=True)

In [None]:
# Paragraph creation. The language was detected beforehand so that the
# matching Punkt tokenizer can be used for sentence splitting and word
# boundaries.

df_documents['paragraphs'] = None
placeholder = []
for i in tqdm(range(len(df_documents))):
    try:
        row = df_documents.loc[i]
        strategy = get_country_split_strategy(responsestats, row['country_code'])
        paragraphs = create_paragraphs_text(row['original_text'], strategy,
                                            row['language'], row['file_name'])
    except Exception as e:
        # report the row and keep an empty result so that the list stays
        # aligned with the frame
        print('index:',i, '\n',e)
        paragraphs = {}
    placeholder.append(paragraphs)
df_documents['paragraphs'] = placeholder
df_documents.info()
# True where at least one strategy produced paragraphs
df_documents['paragraph_check'] = df_documents.paragraphs.apply(bool)

In [70]:
# Do the paragraph length check, using the default value of 180 words for each
# split type. We do this so that during training of models we dont get errors
# due to tensor size mismatch. Alternatively you can also set truncation=True
# for the model tokenizers.
# (optional filter to keep only the rows which have paragraphs)
# df_documents = df_documents[df_documents.paragraph_check == True].reset_index(drop=True)
df_documents['paragraphs'] = df_documents.paragraphs.progress_apply(lambda x: paraLengthCheck(x, keep_hash=False))

100%|██████████| 370/370 [00:03<00:00, 109.72it/s]


In [76]:
# Remove the helper columns which are not part of the final dataset.
# 'paragraphs1' is not created anywhere in this notebook, so missing columns
# are ignored instead of raising a KeyError on a clean run.
df_documents = df_documents.drop(columns=['country', 'paragraphs1', 'paragraph_check'],
                                 errors='ignore')
df_documents.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 370 entries, 0 to 369
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   country_code      370 non-null    object
 1   type_of_document  370 non-null    object
 2   filepath          370 non-null    object
 3   file_name         370 non-null    object
 4   original_text     370 non-null    object
 5   language          370 non-null    object
 6   paragraphs        370 non-null    object
dtypes: object(7)
memory usage: 20.4+ KB


In [77]:
# if all is okay then we go ahead and save the dataframe as json
import json
path_to_step2 = "/content/drive/MyDrive/Colab Notebooks/CPU/Step2/"

# serialise through pandas first, then re-dump with indentation
pdf_records = json.loads(df_documents.to_json(orient="records"))
with open(path_to_step2 +'output/pdf_para.json', 'w') as out_file:
    json.dump(pdf_records, out_file, indent=4)

# Harmonize Both Dataset

In [84]:
# document type labels used in the Climate Watch (html) data
cw_ndc.type_of_document.unique()

array(['First NDC', 'INDC', 'Revised First NDC', 'Second NDC',
       'Archived Second NDC', 'Archived Revised First NDC',
       'Revised Second NDC'], dtype=object)

In [85]:
# document type labels used in the downloaded (pdf/docx) data
df_documents.type_of_document.unique()

array(['First NDC', 'LTS', 'Revised First NDC', 'Second NDC',
       'Archived LTS', 'Archived Revised First NDC',
       'NDC reference document'], dtype=object)

In [83]:
# updating the file type so that both datasets use the same label
# The result is assigned back to the column: an inplace replace on the column
# accessor is deprecated and does not update the frame under Copy-on-Write.
cw_ndc['type_of_document'] = cw_ndc['type_of_document'].replace(
    {'Revised First NDC (archived)':'Archived Revised First NDC'})

In [86]:
import json
path_to_step2 = "/content/drive/MyDrive/Colab Notebooks/CPU/Step2/"

# write both harmonized datasets: first the pdf based one, then the html one
for frame, target in ((df_documents, 'output/pdf_para.json'),
                      (cw_ndc, 'output/html_para.json')):
    # serialise through pandas first, then re-dump with indentation
    records = json.loads(frame.to_json(orient="records"))
    with open(path_to_step2 +target, 'w') as out_file:
        json.dump(records, out_file, indent=4)