This notebook finds the relevant paragraphs for each response from the paragraphs already created from the html/pdf files. Because there are 3 categories of paragraphs (depending on word count ~ [60, 85, 150]), we do the paragraph extraction for each split type. In addition, we translate a document when there are more than 20 responses from that document but the language of the document and of the responses don't match.


NOTE: Some subsections, such as the translation of documents, work best with a GPU.

# Packages Installation

In [11]:
!pip install rank_bm25
!pip install transformers sentencepiece

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2
Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


# Read data

In [12]:
import pandas as pd
# Raise the pandas display limits so printed frames are not truncated
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

# Inputs: the grouped responses (Step3 output) and the paragraphs created from
# the html and pdf documents (Step2 output).
# NOTE(review): these are absolute paths under /content/drive/MyDrive --
# presumably a Google Drive mounted in Colab; adjust when running elsewhere.
path_to_step2 = "/content/drive/MyDrive/Colab Notebooks/CPU/Step2/"
path_to_step3 = "/content/drive/MyDrive/Colab Notebooks/CPU/Step3/"
cwiki_responses = pd.read_json(path_to_step3+"output/responses_grouped.json")
html_para = pd.read_json(path_to_step2+"output/html_para.json")
pdf_para = pd.read_json(path_to_step2+"output/pdf_para.json")

In [13]:
# Split the grouped responses by their 'Source' so that each origin can be
# processed on its own for a while. Per the original note, CW responses are
# all English while the IKI data also contains other languages.
print("total rows", len(cwiki_responses))

cw_responses = cwiki_responses.loc[cwiki_responses['Source'].eq('CW')].reset_index(drop=True)
print("CW length", len(cw_responses))

iki_responses = cwiki_responses.loc[cwiki_responses['Source'].eq('IKITracs')].reset_index(drop=True)
print("IKI length", len(iki_responses))

total rows 30404
CW length 26861
IKI length 3543


In [14]:
# Compare the document-type labels of the paragraph files with the 'Document'
# labels of the responses: process_country() matches the two by equality, so
# labels that exist only on one side can never be matched.
print(html_para.type_of_document.unique())
print(pdf_para.type_of_document.unique())
print(cwiki_responses.Document.unique())

['First NDC' 'INDC' 'Revised First NDC' 'Second NDC' 'Archived Second NDC'
 'Archived Revised First NDC' 'Revised Second NDC']
['First NDC' 'LTS' 'Revised First NDC' 'Second NDC' 'Archived LTS'
 'Archived Revised First NDC' 'NDC reference document']
['Second NDC' 'Revised First NDC' 'Archived LTS' 'LTS' 'First NDC' 'INDC'
 'NDC reference document' 'Archived Revised First NDC'
 'Archived Second NDC' 'WRI' 'Climate Act']


# Helper Functions

In [15]:
import logging
from rank_bm25 import BM25Okapi
import string
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
import spacy
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction import _stop_words

# Maps two-letter ISO 639 language codes to the language names used by NLTK.
# nltk_tokenize() passes the mapped name both to word_tokenize() and, when
# stop-word removal is requested, to stopwords.words().
# NOTE(review): confirm that every language listed here has a stop-word list
# in the installed NLTK data before enabling removestopwords for it.
iso639_to_nltk = {
    "ru": "russian",
    "sl": "slovene",
    "es": "spanish",
    "sv": "swedish",
    "tr": "turkish",
    "cs": "czech",
    "da": "danish",
    "nl": "dutch",
    "en": "english",
    "et": "estonian",
    "fi": "finnish",
    "fr": "french",
    "de": "german",
    "el": "greek",
    "it": "italian",
    "no": "norwegian",
    "pl": "polish",
    "pt": "portuguese",
    "ml": "malayalam",
}


def sklearn_tokenize(text:str, removestopwords:bool = False):
    """
    Whitespace tokenizer: lower-cases the text, splits it with str.split() and
    strips leading/trailing punctuation from every piece. Pieces that end up
    empty are dropped.

    Params
    -------------------
    text: text to be tokenized
    removestopwords: when True, tokens found in sklearn's ENGLISH_STOP_WORDS
                     are dropped as well

    Returns
    -----------
    list of cleaned tokens, in their original order
    """
    # lazily cleaned pieces; punctuation is only removed at the token edges
    cleaned = (piece.strip(string.punctuation) for piece in text.lower().split())

    if removestopwords == True:
        return [word for word in cleaned
                if len(word) > 0 and word not in _stop_words.ENGLISH_STOP_WORDS]

    return [word for word in cleaned if len(word) > 0]



def nltk_tokenize(text, language = 'en', removestopwords=False):
    """
    Tokenize text with NLTK's word_tokenize for the given language, lower-case
    every token and strip leading/trailing punctuation. Tokens that end up
    empty are dropped.

    Params
    ------------------
    text: text to be tokenized
    language: two-letter language code of the text; it is looked up in
              iso639_to_nltk to select the tokenizer (and stop-word list).
              Unknown codes fall back to English with a logged warning.
    removestopwords: when True, tokens contained in the NLTK stop-word list of
                     the selected language are removed


    Returns
    ---------------
    tokenized_doc: list of cleaned tokens, in their original order

    """

    lang = iso639_to_nltk.get(language)
    if lang is None:
        logging.warning('langauge not found using English tokenizer and stopwords')
        lang = 'english'

    tokenslist = word_tokenize(text,language = lang)

    # Stop words are fetched for the selected language only when requested.
    # A set gives constant-time membership tests; the NLTK word list would
    # otherwise be scanned once for every token.
    if removestopwords == True:
        stop_words = frozenset(stopwords.words(lang))
    else:
        stop_words = frozenset()

    tokenized_doc = []
    for token in tokenslist:
        # remove the punctuation around the word
        token = token.lower().strip(string.punctuation)
        if len(token) > 0 and token not in stop_words:
            tokenized_doc.append(token)

    return tokenized_doc



def bm25_retriever(paraList, queries, language = 'en', top_k = 1):
    """
    Okapi BM25 retriever: finds the best matching paragraphs for each query
    based on token overlap.

    Params
    ----------
    paraList: list of paragraphs/text chunks to search in. All of them are
              tokenized with the same 'language', so paragraphs of different
              languages have to be passed in separate calls.
    queries: list of text chunks to search for. The best paragraph matches
             from paraList are looked up for each query.
    language: language code of paragraphs and queries, used to pick the
              tokenizer. Pass it explicitly for non-English text, otherwise
              the English tokenizer is used.
    top_k: how many best matching paragraphs to fetch per query.


    Returns
    ----------------
    results: list[list of paragraphs]. One list per query, holding its top_k
             best matches. When paraList is empty every query gets an empty
             list.

    """
    # BM25Okapi cannot be built from an empty corpus (its average document
    # length divides by the corpus size), so report "no matches" instead of
    # crashing.
    if len(paraList) == 0:
        return [[] for _ in queries]

    tokenized_corpus = [nltk_tokenize(passage, language=language)
                        for passage in paraList]
    bm25 = BM25Okapi(tokenized_corpus)

    return [bm25.get_top_n(nltk_tokenize(query, language=language),
                           paraList, n=top_k)
            for query in queries]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


We have around 30k responses which need to be processed to find the relevant paragraphs. However, the list of paragraphs to be searched can be very big, so we limit the
search space by filtering on the country and document type. For this we wrap the whole search into one function.

Note: Document type is not unique for each country, as there can be multiple documents referring to, let's say, the 'Revised First NDC' of a country, because all revisions are recorded as 'Revised First NDC'.

In [23]:
def process_country(countryCode,query_data,para_data, top_k = 3):
    """
    Retrieve, for every response of one country, the best matching paragraphs
    from para_data using the Okapi BM25 retriever (bm25_retriever).

    The responses of the country are processed one document type at a time.
    For each document type the most frequent response language is used both to
    select the paragraph collections and to pick the tokenizer.

    Params
    ---------------
    countryCode: Alpha3 ISO code of the country, used to filter query_data and
                 para_data
    query_data: responses/query dataframe, must contain the columns
                [ResponseText, Alpha3, Document, language]
    para_data: dataframe with the paragraphs of the country documents, must
               contain the columns [country_code, type_of_document, language,
               paragraphs]. Each 'paragraphs' entry maps a split-strategy key
               to a list of entries whose first element is the paragraph text.
    top_k: number of best matching paragraphs to keep per response


    Returns
    -------------
    data_df: dataframe of the explored responses with these extra columns:
             'index' (index label of the row in query_data), 'context_lang'
             and one 'context{XX}' column per split strategy. XX are the first
             two characters following "'split_length': " in the split key, so
             the notebook's splits yield ['context60', 'context85',
             'context15'] (150 is truncated to 15). Each context cell holds
             the list of paragraphs returned for that response. An empty
             dataframe is returned when nothing could be explored.
    successfull_index: index labels of the responses which were explored, i.e.
                       a paragraph collection exists for the country, document
                       type and majority response language.
    notsuccessfull_index: index labels of the responses which were not explored.

    """

    # to collect index from original responses for which paragraph search is
    # successful/unsuccessful in this stage
    successfull_index = []
    notsuccessfull_index = []
    # NOTE(review): self-assignment, has no effect
    top_k = top_k

    # filter out query data for countryCode, needs the Alpha3 column for same.
    df = query_data[query_data.Alpha3 == countryCode]

    # to store the results from loop below, keyed by document type
    df_placeholder = {}

    # iterate through each document category
    for doc_cat in list(df.Document.unique()):

          # getting the filtered view of dataframe for document type
          df1 = df[df.Document == doc_cat]

          # instead of iterating for each language we assume that the majority
          # class represents the main trend and hence use same to get the
          # relevant paragraphs from para_data. This will also be used to define
          # which language tokenizer should be used for bm25 retriever to work with
          response_lang = df1.language.value_counts().index.values[0]

          # deprecated part but good to keep for future reminder
          # retrieving passage by combining Response and  Sector
          # df1['Sector_Response'] = df1['Sector'] + ": " + df1['ResponseText']


          queries = list(df1.ResponseText)
          # filtering the para_data for countryCode, Document type and language
          paragraphs  = para_data[(para_data.country_code == countryCode) &
                                  (para_data.type_of_document == doc_cat) &
                                  (para_data.language == response_lang)]['paragraphs'].values

          # there can be more than 1 collection of paragraphs (i.e. more than 1
          # document for a country and a Document type, e.g. every revision is
          # recorded as 'Revised First NDC'). In that case we bring the
          # paragraphs into the same structure we have when there is exactly
          # one such document.
          if (len(paragraphs)==1):
              paragraphs = paragraphs[0]
          elif len(paragraphs) >=2:
              # keys are the split strategies which tell how the paragraphs had
              # been created; they are taken from the first collection
              keys = list(paragraphs[0].keys())
              # we merge the paragraphs from different collections/documents
              placeholder = {key:[] for key in keys}

              for i in range(len(paragraphs)):
                  for key in keys:
                      placeholder[key] = placeholder[key] + paragraphs[i][key]
              paragraphs = placeholder

          # there could be no relevant paragraphs collection (paragraphs is then
          # still the empty selection), we process further only if some relevant
          # document was found for the country in para_data
          if len(paragraphs)>0:
              paragraph_list = {}
              split_type = paragraphs.keys()
              # we will collect the response for each split type
              for split in split_type:
                  # keep only the text of each paragraph entry. Per the original
                  # author each entry is a tuple of 3 elements
                  # (text, name of file, page number)
                  paragraph_list[split]  = [x[0] for x in paragraphs[split]]

              # queries is always built with list() above, so this check holds
              if type(queries) == list:
                  # remember the original index labels before reset_index moves
                  # them into the 'index' column
                  successfull_index += list(df1.index.values)
                  df1 = df1.reset_index(drop=False)
                  for i,split in enumerate(split_type):
                      results = bm25_retriever(queries=queries, top_k = top_k,
                              paraList= paragraph_list[split], language = response_lang)
                      # print("Queries:",len(queries), " Results:",len(results))
                      # add retrieved results and language param to dataframe.
                      # Only the first two characters of the split length are
                      # kept, so a split length of 150 becomes 'context15'.
                      split_val = split.split("'split_length': ")[1][:2]
                      df1['context{}'.format(split_val)] = results
                      df1['context_lang'] = response_lang

              # add results to storage
              df_placeholder[doc_cat] = df1

          else:
              # no paragraph collection for this country/document type/language
              notsuccessfull_index += list(df1.index.values)
              logging.info("Please check the input type for the queries")
              print("ERROR: Either no paragraphs or issue with queries type:",
                                                      countryCode, doc_cat)
    # stack the per-document-type frames, or hand back an empty frame
    if (list(df_placeholder.values())):
        data_df = pd.concat(list(df_placeholder.values()), ignore_index=True)
        return data_df, successfull_index, notsuccessfull_index
    else:
      return pd.DataFrame(), successfull_index, notsuccessfull_index

# This is a copy of the function above, tweaked to handle the structure of the
# translated documents
def process_country_translated(countryCode,query_data,para_data, top_k = 3):
    """
    Variant of process_country() for translated documents: retrieves, for
    every response of one country, the best matching translated paragraphs
    using the Okapi BM25 retriever (bm25_retriever).

    Params
    ---------------
    countryCode: Alpha3 ISO code of the country, used to filter query_data and
                 para_data
    query_data: responses/query dataframe, must contain the columns
                [ResponseText, Alpha3, Document, language]
    para_data: dataframe with the translated paragraphs, must contain the
               columns [country_code, type_of_document, translated_language,
               para_translated]. Each 'para_translated' entry maps a
               split-strategy key to a list of paragraph texts.
    top_k: number of best matching paragraphs to keep per response

    Returns
    -------------
    data_df: dataframe of the explored responses with the extra columns
             'index' (index label of the row in query_data), 'context_lang'
             and 'context{i}', where i is the position of the split strategy
             in the paragraph mapping. Each context cell holds the list of
             paragraphs returned for that response. An empty dataframe is
             returned when nothing could be explored.
    successfull_index: index labels of the responses which were explored.
    notsuccessfull_index: index labels of the responses for which no
                          translated paragraphs were found.
    """
    # to collect index from original responses for which paragraph search is
    # successful/unsuccessful in this stage
    successfull_index = []
    notsuccessfull_index = []

    df = query_data[query_data.Alpha3 == countryCode]

    # to store the results from loop below, keyed by document type
    df_placeholder = {}

    # iterate through each document category
    for doc_cat in list(df.Document.unique()):
        # filtered view of the responses for this document type
        df1 = df[df.Document == doc_cat]

        # instead of iterating for each language we assume that the majority
        # class represents the main trend; it selects the translated
        # paragraphs and the tokenizer used by the bm25 retriever
        response_lang = df1.language.value_counts().index.values[0]

        queries = list(df1.ResponseText)
        paragraphs = para_data[(para_data.country_code == countryCode) &
                               (para_data.type_of_document == doc_cat) &
                               (para_data.translated_language == response_lang)
                               ]['para_translated'].values

        # There can be more than 1 collection of paragraphs, e.g. any revision
        # of an NDC is recorded under the same document type. A single
        # collection is used as is; several collections are merged per split
        # key (as process_country() does), because the selection itself is an
        # array and has no .keys().
        if len(paragraphs) == 1:
            paragraphs = paragraphs[0]
        elif len(paragraphs) >= 2:
            merged = {key: [] for key in paragraphs[0].keys()}
            for collection in paragraphs:
                for key in merged:
                    merged[key].extend(collection.get(key, []))
            paragraphs = merged

        if len(paragraphs) > 0:
            # remember the original index labels before reset_index moves
            # them into the 'index' column
            successfull_index += list(df1.index.values)
            df1 = df1.reset_index(drop=False)
            for i, split in enumerate(paragraphs.keys()):
                results = bm25_retriever(queries=queries, top_k=top_k,
                                         paraList=paragraphs[split],
                                         language=response_lang)
                # add retrieved results and language param to dataframe
                df1['context{}'.format(i)] = results
                df1['context_lang'] = response_lang

            # add results to storage
            df_placeholder[doc_cat] = df1
        else:
            # no translated paragraphs for this country/document type/language
            notsuccessfull_index += list(df1.index.values)
            logging.info("Please check the input type for the queries")
            print("ERROR: Either no paragraphs or issue with queries type:",
                                                    countryCode, doc_cat)

    # stack the per-document-type frames, or hand back an empty frame
    if df_placeholder:
        data_df = pd.concat(list(df_placeholder.values()), ignore_index=True)
        return data_df, successfull_index, notsuccessfull_index
    return pd.DataFrame(), successfull_index, notsuccessfull_index

# CW

Explore html_para

In [24]:
import pandas as pd
from tqdm.autonotebook import tqdm
# storage to collect the per-country result frames from the loop below
placeholder = []

# Index labels of cw_responses for which process_country() found / did not
# find a paragraph collection in html_para. They are kept so that further
# steps can be taken for the unexplored ones before discarding them.
cw_index_explored = []
cw_index_not_explored = []

# we iterate through one country at a time; the split by document type is
# taken care of inside process_country()
country_list = list(cw_responses.Alpha3.unique())
for country in tqdm(country_list):

    df,index_success, index_not = process_country(country,cw_responses,html_para)
    placeholder.append(df)
    cw_index_explored += index_success
    cw_index_not_explored += index_not

# stack the per-country frames and print the row counts as a quick sanity check
cw_responses_context = pd.concat(placeholder, ignore_index =True)
print(len(cw_responses), len(cw_responses_context))
print(len(cw_index_explored), len(cw_index_not_explored))

  0%|          | 0/166 [00:00<?, ?it/s]

ERROR: Either no paragraphs or issue with queries type: ETH Revised First NDC
ERROR: Either no paragraphs or issue with queries type: CHL Second NDC
ERROR: Either no paragraphs or issue with queries type: VNM Archived Revised First NDC
ERROR: Either no paragraphs or issue with queries type: IDN Archived Revised First NDC
ERROR: Either no paragraphs or issue with queries type: PRK Revised First NDC
ERROR: Either no paragraphs or issue with queries type: BEN Revised First NDC
ERROR: Either no paragraphs or issue with queries type: MRT Revised First NDC
ERROR: Either no paragraphs or issue with queries type: MRT INDC
ERROR: Either no paragraphs or issue with queries type: SLV INDC
ERROR: Either no paragraphs or issue with queries type: PAK Revised First NDC
ERROR: Either no paragraphs or issue with queries type: COD Revised First NDC
ERROR: Either no paragraphs or issue with queries type: BDI Revised First NDC
ERROR: Either no paragraphs or issue with queries type: MLI Revised First NDC
E

Explore pdf_para

In [26]:
import pandas as pd
from tqdm.autonotebook import tqdm
# storage to collect the per-country result frames from the loop below
pdf_placeholder = []

# Index labels of cw_responses for which process_country() found / did not
# find a paragraph collection in pdf_para. They are kept so that further
# steps can be taken for the unexplored ones before discarding them.
cw_index_explored_pdf= []
cw_index_not_explored_pdf = []

# we iterate through one country at a time; the split by document type is
# taken care of inside process_country()
country_list = list(cw_responses.Alpha3.unique())
for country in tqdm(country_list):
    df,index_success, index_not = process_country(country,cw_responses,pdf_para)
    pdf_placeholder.append(df)
    cw_index_explored_pdf += index_success
    cw_index_not_explored_pdf += index_not

# stack the per-country frames and print the row counts as a quick sanity check
cw_responses_context_pdf = pd.concat(pdf_placeholder, ignore_index =True)
print(len(cw_responses), len(cw_responses_context_pdf))
print(len(cw_index_explored_pdf), len(cw_index_not_explored_pdf))

  0%|          | 0/166 [00:00<?, ?it/s]

ERROR: Either no paragraphs or issue with queries type: SEN First NDC
ERROR: Either no paragraphs or issue with queries type: SEN INDC
ERROR: Either no paragraphs or issue with queries type: ETH INDC
ERROR: Either no paragraphs or issue with queries type: ARE INDC
ERROR: Either no paragraphs or issue with queries type: KHM INDC
ERROR: Either no paragraphs or issue with queries type: CHL Second NDC
ERROR: Either no paragraphs or issue with queries type: CHL INDC
ERROR: Either no paragraphs or issue with queries type: VNM INDC
ERROR: Either no paragraphs or issue with queries type: IDN Archived Revised First NDC
ERROR: Either no paragraphs or issue with queries type: IDN INDC
ERROR: Either no paragraphs or issue with queries type: PRK INDC
ERROR: Either no paragraphs or issue with queries type: BEN Revised First NDC
ERROR: Either no paragraphs or issue with queries type: BEN INDC
ERROR: Either no paragraphs or issue with queries type: BEN First NDC
ERROR: Either no paragraphs or issue wi

Merge all the things together

In [28]:
# Collect the three paragraph lists of each response into one dict, keyed by
# the average word count strategy: 60 = small, 85 = medium, 150 = large.
# The large column is called 'context15' because process_country() keeps only
# the first two characters of the split length when naming the columns.
cw_responses_context['context_html'] = cw_responses_context.apply(lambda x:
                                            {'small':x['context60'],
                                            'medium':x['context85'],
                                            'large':x['context15']}, axis=1)
cw_responses_context.drop(columns = ['context60', 'context85', 'context15'], inplace=True)
# Restore the index labels of cw_responses, which process_country() stored in
# the column 'index'; verify_integrity raises if a label occurs twice.
cw_responses_context.set_index(['index'], inplace=True, verify_integrity=True)
# applying the same operations to the paragraphs collected from the pdf files
cw_responses_context_pdf['context_pdf'] = cw_responses_context_pdf.apply(lambda x:
                                              {'small':x['context60'],
                                               'medium':x['context85'],
                                               'large':x['context15']}, axis=1)
cw_responses_context_pdf.drop(columns = ['context60', 'context85', 'context15'], inplace=True)
# Restore the index labels of cw_responses here as well (column 'index')
cw_responses_context_pdf.set_index(['index'], inplace=True, verify_integrity=True)

In [29]:
# keeping only relevant columns
cw_responses_context = cw_responses_context[['ResponseText','context_html']]
cw_responses_context_pdf = cw_responses_context_pdf[['ResponseText','context_pdf']]
# First merge the two dataframes on their shared 'index' labels to get the
# complete list of ResponseText which were explored using either pdf_para or
# html_para. Both frames carry 'ResponseText', so the merged frame holds
# 'ResponseText_x' and 'ResponseText_y'.
temp = cw_responses_context.merge(cw_responses_context_pdf, how = 'outer', on= 'index')

# Then merge the above dataframe with the original cw_responses dataframe
# created in the subsection "Read data" to get the full picture.
# NOTE: this cell rebinds cw_responses (reset_index adds the 'index' column
# used as merge key), so it is meant to be run only once per kernel session.
cw_responses = cw_responses.reset_index(drop=False)
cw_responses = cw_responses.merge(temp, how='outer', on='index')

In [30]:
# Data validation: a response counts as explored when it was explored in the
# html OR the pdf paragraphs, and as unexplored when it was explored in neither.
cw_explored = set(cw_index_explored).union(set(cw_index_explored_pdf))
cw_unexplored = set(cw_index_not_explored).intersection(set(cw_index_not_explored_pdf))
print(len(cw_responses), len(cw_explored), len(cw_unexplored))
# explored and unexplored together must account for every response (prints True)
print(len(cw_responses) - len(cw_explored) == len(cw_unexplored))

26861 24517 2344
True


In [31]:
# Summary of the CW paragraph search
print("Responses explored in HTML:", len(cw_index_explored))
print("Responses explored in PDF:", len(cw_index_explored_pdf))
# Explored in at least one of pdf_para / html_para
print("Responses Explored", len(cw_explored))
# Responses which are unexplored in both pdf_para and html_para
print("Responses Unexplored:", len(cw_unexplored))

Responses explored in HTML: 23659
Responses explored in PDF: 13615
Responses Explored 24517
Responses Unexplored: 2344


In [32]:
# Label the language of the found contexts as English for CW. Responses without
# a context hold NaN after the outer merge (its string form is 'nan') and get
# None instead.
cw_responses['context_lang_html'] = cw_responses.apply(lambda x: 'en'
                                    if str(x['context_html']) != 'nan' else None ,axis=1)
cw_responses['context_lang_pdf'] = cw_responses.apply(lambda x: 'en'
                                    if str(x['context_pdf']) != 'nan' else None ,axis=1)
# drop the helper columns left over from the merges
cw_responses.drop(columns  = ['ResponseText_x', 'ResponseText_y', 'index'], inplace=True)

In [33]:
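# # Save the results. NOTE: the CWIKI section reads output/cw_responses_context.json
# # back from path_to_step4, so uncomment and run this cell to (re)generate that file.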
# import json
# path_to_step4 = "/content/drive/MyDrive/Colab Notebooks/CPU/Step4/"
# jsonfile = cw_responses.to_json(orient="records")
# parsed = json.loads(jsonfile)
# with open(path_to_step4 +'output/cw_responses_context.json', 'w') as file:
#     json.dump(parsed, file, indent=4)

# IKI

In [34]:
import pandas as pd
from tqdm.autonotebook import tqdm
# storage to collect the per-country result frames from the loop below
placeholder = []

# Index labels of iki_responses for which process_country() found / did not
# find a paragraph collection in html_para. They are kept so that further
# steps can be taken for the unexplored ones before discarding them.
iki_index_explored = []
iki_index_not_explored = []

# we iterate through one country at a time; the split by document type is
# taken care of inside process_country()
country_list = list(iki_responses.Alpha3.unique())
for country in tqdm(country_list):

    df,index_success, index_not = process_country(country,iki_responses,html_para)
    placeholder.append(df)
    iki_index_explored += index_success
    iki_index_not_explored += index_not

# stack the per-country frames and print the row counts as a quick sanity check
iki_responses_context = pd.concat(placeholder, ignore_index =True)
print(len(iki_responses), len(iki_responses_context))
print(len(iki_index_explored), len(iki_index_not_explored))

  0%|          | 0/192 [00:00<?, ?it/s]

ERROR: Either no paragraphs or issue with queries type: ARG NDC reference document
ERROR: Either no paragraphs or issue with queries type: CAN LTS
ERROR: Either no paragraphs or issue with queries type: JPN Archived LTS
ERROR: Either no paragraphs or issue with queries type: JPN LTS
ERROR: Either no paragraphs or issue with queries type: JPN Archived Revised First NDC
ERROR: Either no paragraphs or issue with queries type: CHN LTS
ERROR: Either no paragraphs or issue with queries type: COL LTS
ERROR: Either no paragraphs or issue with queries type: LUX LTS
ERROR: Either no paragraphs or issue with queries type: GBR LTS
ERROR: Either no paragraphs or issue with queries type: GBR Archived LTS
ERROR: Either no paragraphs or issue with queries type: GBR Archived Revised First NDC
ERROR: Either no paragraphs or issue with queries type: ETH Revised First NDC
ERROR: Either no paragraphs or issue with queries type: LVA LTS
ERROR: Either no paragraphs or issue with queries type: YEM First NDC
E

In [35]:
import pandas as pd
from tqdm.autonotebook import tqdm
# storage to collect the per-country result frames from the loop below
pdf_placeholder = []

# Index labels of iki_responses for which process_country() found / did not
# find a paragraph collection in pdf_para. They are kept so that further
# steps can be taken for the unexplored ones before discarding them.
iki_index_explored_pdf= []
iki_index_not_explored_pdf = []

# we iterate through one country at a time; the split by document type is
# taken care of inside process_country()
country_list = list(iki_responses.Alpha3.unique())
for country in tqdm(country_list):

    df,index_success, index_not = process_country(country,iki_responses,pdf_para)
    pdf_placeholder.append(df)
    iki_index_explored_pdf += index_success
    iki_index_not_explored_pdf += index_not

# stack the per-country frames and print the row counts as a quick sanity check
iki_responses_context_pdf = pd.concat(pdf_placeholder, ignore_index =True)
print(len(iki_responses), len(iki_responses_context_pdf))
print(len(iki_index_explored_pdf), len(iki_index_not_explored_pdf))

  0%|          | 0/192 [00:00<?, ?it/s]

ERROR: Either no paragraphs or issue with queries type: ARG Second NDC
ERROR: Either no paragraphs or issue with queries type: ARG First NDC
ERROR: Either no paragraphs or issue with queries type: ARG NDC reference document
ERROR: Either no paragraphs or issue with queries type: CAN LTS
ERROR: Either no paragraphs or issue with queries type: JPN Archived Revised First NDC
ERROR: Either no paragraphs or issue with queries type: CHN First NDC
ERROR: Either no paragraphs or issue with queries type: GBR Archived Revised First NDC
ERROR: Either no paragraphs or issue with queries type: BFA First NDC
ERROR: Either no paragraphs or issue with queries type: YEM INDC
ERROR: Either no paragraphs or issue with queries type: IRQ INDC
ERROR: Either no paragraphs or issue with queries type: IRQ Revised First NDC
ERROR: Either no paragraphs or issue with queries type: JOR Revised First NDC
ERROR: Either no paragraphs or issue with queries type: PRY First NDC
ERROR: Either no paragraphs or issue with 

In [37]:
# Collect the three paragraph lists of each response into one dict, keyed by
# the average word count strategy: 60 = small, 85 = medium, 150 = large.
# The large column is called 'context15' because process_country() keeps only
# the first two characters of the split length when naming the columns.
iki_responses_context['context_html'] = iki_responses_context.apply(lambda x:
                                                {'small':x['context60'],
                                                'medium':x['context85'],
                                                'large':x['context15']}, axis=1)
iki_responses_context.drop(columns = ['context60', 'context85', 'context15'], inplace=True)
# Restore the index labels of iki_responses, which process_country() stored in
# the column 'index'; verify_integrity raises if a label occurs twice.
iki_responses_context.set_index(['index'], inplace=True, verify_integrity=True)
# applying the same operations to the paragraphs collected from the pdf files
iki_responses_context_pdf['context_pdf'] = iki_responses_context_pdf.apply(lambda x:
                                                  {'small':x['context60'],
                                                  'medium':x['context85'],
                                                  'large':x['context15']}, axis=1)
iki_responses_context_pdf.drop(columns = ['context60', 'context85', 'context15'], inplace=True)
# Restore the index labels of iki_responses here as well (column 'index')
iki_responses_context_pdf.set_index(['index'], inplace=True, verify_integrity=True)

In [38]:
# Unlike CW, the IKI contexts can be in different languages, so the language
# reported by process_country() is kept separately per source format
iki_responses_context.rename(columns = {'context_lang':'context_lang_html'}, inplace=True)
iki_responses_context_pdf.rename(columns = {'context_lang':'context_lang_pdf'}, inplace=True)
# keeping only relevant columns
iki_responses_context = iki_responses_context[['ResponseText','context_html',
                                               'context_lang_html']]
iki_responses_context_pdf = iki_responses_context_pdf[['ResponseText','context_pdf',
                                                       'context_lang_pdf']]
# First merge the two dataframes on their shared 'index' labels to get the
# complete list of ResponseText which were explored using either pdf_para or
# html_para. Both frames carry 'ResponseText', so the merged frame holds
# 'ResponseText_x' and 'ResponseText_y'.
temp = iki_responses_context_pdf.merge(iki_responses_context, how = 'outer', on= 'index')
# NOTE: this rebinds iki_responses (reset_index adds the 'index' column used
# as merge key), so the cell is meant to be run only once per kernel session.
iki_responses = iki_responses.reset_index(drop=False)
# Then merge the above dataframe with the original iki_responses dataframe
# created in the subsection "Read data" to get the full picture.
iki_responses = iki_responses.merge(temp, how='outer', on='index')

In [40]:
# Data validation: a response counts as explored when it was explored in the
# html OR the pdf paragraphs, and as unexplored when it was explored in neither.
iki_explored = set(iki_index_explored).union(set(iki_index_explored_pdf))
iki_unexplored = set(iki_index_not_explored).intersection(set(iki_index_not_explored_pdf))
print(len(iki_responses), len(iki_explored), len(iki_unexplored))
# explored and unexplored together must account for every response (prints True)
print(len(iki_responses) - len(iki_explored) == len(iki_unexplored))

3543 3289 254
True


In [41]:
# Summary of the IKI paragraph search
print("Responses explored in HTML:", len(iki_index_explored))
print("Responses explored in PDF:", len(iki_index_explored_pdf))
# Explored in at least one of pdf_para / html_para
print("Responses Explored", len(iki_explored))
# Responses which are unexplored in both pdf_para and html_para
print("Responses Unexplored:", len(iki_unexplored))

Responses explored in HTML: 2062
Responses explored in PDF: 3031
Responses Explored 3289
Responses Unexplored: 254


In [42]:
# drop the helper columns left over from the merges
iki_responses.drop(columns  = ['ResponseText_x', 'ResponseText_y', 'index'], inplace=True)

In [43]:
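# # Save the results. NOTE: the CWIKI section reads output/iki_responses_context.json
# # back from path_to_step4, so uncomment and run this cell to (re)generate that file.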
# import json
# path_to_step4 = "/content/drive/MyDrive/Colab Notebooks/CPU/Step4/"
# jsonfile = iki_responses.to_json(orient="records")
# parsed = json.loads(jsonfile)
# with open(path_to_step4 +'output/iki_responses_context.json', 'w') as file:
#     json.dump(parsed, file, indent=4)

# CWIKI

In this section we merge the two datasets. We will then look at the unexplored responses by translating the downloaded documents we have collected in html and pdf format.

In [44]:
import pandas as pd

# Reload the per-source results saved by the CW and IKI subsections.
path_to_step4 = "/content/drive/MyDrive/Colab Notebooks/CPU/Step4/"
cw_context_file = path_to_step4 + "output/cw_responses_context.json"
iki_context_file = path_to_step4 + "output/iki_responses_context.json"
cw_responses = pd.read_json(cw_context_file)
iki_responses = pd.read_json(iki_context_file)

The CW team created the HTML files, so it is reasonable to assume that their ResponseText was extracted from these files. Therefore, for CW we prefer paragraph extraction from the HTML files. The case for IKI is the opposite, as that team relies on
the PDF documents provided on the UNFCCC website. Therefore, for IKI we prefer the paragraphs extracted from the PDF files over those from HTML.

The content of both is ideally the same for a given country document; however, due to the parsing of the document (for example, tables), the extracted text can show some dissimilarities.

In [45]:
# Build the single 'context' column for CW responses.
# HTML-based paragraphs are preferred; when there are none we fall back to the
# PDF-based paragraphs. If both are None the value stays None.
def _cw_pick_context(row):
    """Return the HTML context of a row if it is not None, else its PDF context."""
    if row['context_html'] is not None:
        return row['context_html']
    # may itself be None, in which case the response has no context at all
    return row['context_pdf']


def _cw_context_language(row):
    """Return 'en' for rows that have a context, None for rows that have none."""
    return None if row['context'] is None else 'en'


cw_responses['context'] = cw_responses.apply(_cw_pick_context, axis=1)
cw_responses['context_lang'] = cw_responses.apply(_cw_context_language, axis=1)

# the source-specific columns are no longer needed once 'context' is filled
cw_source_columns = ['context_pdf', 'context_html', 'context_lang_html',
                     'context_lang_pdf']
cw_responses.drop(columns=cw_source_columns, inplace=True)

In [46]:
# Build the single 'context' / 'context_lang' columns for IKI responses.
# For IKI the PDF-based values are preferred; when a PDF value is None we fall
# back to the HTML-based value. If both are None the value stays None.
def _iki_prefer_pdf(row, pdf_column, html_column):
    """Return row[pdf_column] if it is not None, else row[html_column]."""
    pdf_value = row[pdf_column]
    if pdf_value is not None:
        return pdf_value
    # may itself be None when neither source produced a value
    return row[html_column]


iki_responses['context'] = iki_responses.apply(
    _iki_prefer_pdf, axis=1, args=('context_pdf', 'context_html'))
iki_responses['context_lang'] = iki_responses.apply(
    _iki_prefer_pdf, axis=1, args=('context_lang_pdf', 'context_lang_html'))

# the source-specific columns are no longer needed once the merged ones exist
iki_source_columns = ['context_pdf', 'context_html', 'context_lang_html',
                      'context_lang_pdf']
iki_responses.drop(columns=iki_source_columns, inplace=True)

In [47]:
# The response language is an aggregated value, so for some responses it can
# differ from the language of the extracted context. Those rows are separated
# out as 'mismatch'.
iki_has_context = iki_responses['context'].notna()
iki_context_lang_known = iki_responses['context_lang'].notna()
iki_lang_differs = iki_responses['language'] != iki_responses['context_lang']
iki_lang_matches = iki_responses['language'] == iki_responses['context_lang']

mismatch = iki_responses[iki_lang_differs & iki_context_lang_known]
# Responses for which no paragraph extraction could be performed, primarily
# because the document was not available.
iki_unexplored = iki_responses[~iki_has_context]
iki_responses = iki_responses[iki_has_context & iki_lang_matches]

# CW responses are split only on whether a context was found.
cw_has_context = cw_responses['context'].notna()
cw_unexplored = cw_responses[~cw_has_context]
cw_responses = cw_responses[cw_has_context]

print(len(cw_responses), len(cw_unexplored))
print(len(iki_responses), len(iki_unexplored))
print(len(mismatch))

24517 2344
3266 254
23


In [48]:
# Combine the per-source results into one 'explored' and one 'unexplored' dataset.
# Explored   = paragraph extraction was performed.
# Unexplored = paragraph extraction was not possible (document missing or
#              language mismatch, see above).
explored_parts = [cw_responses, iki_responses]
unexplored_parts = [mismatch, cw_unexplored, iki_unexplored]

explored_df = pd.concat(explored_parts, ignore_index=True)
# the context columns are removed from the unexplored rows
unexplored_df = (
    pd.concat(unexplored_parts, ignore_index=True)
    .drop(columns=['context', 'context_lang'])
)
print(len(explored_df), len(unexplored_df))

27783 2621


## Translation of documents

Reasons for Unexplored: Document not found

>  - Document in the required language not present
>  - Document not present (in any language) in the collection we have

While we cannot work on the second case, we can try to address the first one by translating the document into the desired language where possible. As this step is compute-intensive, we focus our effort on documents of high priority (i.e. those with many row entries in the Unexplored data).


In [None]:
# Build the list of documents which need translation.

# Number of unexplored ResponseText rows per (country, document type, language).
abc = unexplored_df.groupby(['Alpha3','Document','language']).size().reset_index(name='count')
abc['pdf'] = None
abc['html'] = None


def _matching_documents(row, para_df):
    """
    Look up the paragraph records for the country/document type of `row`.

    The lookup is done in the paragraph files (html_para / pdf_para), not in
    the raw files, and deliberately ignores the language.

    Returns an array with one [file_name, language, paragraphs] entry per match.
    """
    same_country = para_df.country_code == row['Alpha3']
    same_document = para_df.type_of_document == row['Document']
    return para_df[same_country & same_document][['file_name','language','paragraphs']].values


def _worth_translating(row):
    """True when a document was found and it has more than 20 unexplored responses."""
    document_found = (len(row['pdf']) > 0) | (len(row['html']) > 0)
    return document_found & (row['count'] > 20)


def _first_document_field(row, position):
    """Take one field of the first matching document, preferring html over pdf."""
    candidates = row['html'] if len(row['html']) > 0 else row['pdf']
    return candidates[0][position]


# first the pdf paragraphs, then the html paragraphs
abc['pdf'] = abc.apply(_matching_documents, axis=1, args=(pdf_para,))
abc['html'] = abc.apply(_matching_documents, axis=1, args=(html_para,))

# keep only the documents which pass the priority check
abc['check'] = abc.apply(_worth_translating, axis=1)
to_be_translated = abc[abc['check'] == True].reset_index(drop=True)

# information needed for the translation: position 1 = language of the
# document, position 2 = its paragraphs
to_be_translated['context_lang'] = to_be_translated.apply(_first_document_field, axis=1, args=(1,))
to_be_translated['paragraphs'] = to_be_translated.apply(_first_document_field, axis=1, args=(2,))

to_be_translated.drop(columns=['pdf', 'html', 'check'], inplace=True)

Starting the Document Translation

In [None]:
from torch import cuda
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline

# use the GPU when one is available, otherwise fall back to the CPU
device = 'cuda' if cuda.is_available() else 'cpu'

# model and tokenizer are loaded from the same checkpoint
mbart_checkpoint = "facebook/mbart-large-50-many-to-one-mmt"
model = MBartForConditionalGeneration.from_pretrained(mbart_checkpoint).to(device)
tokenizer = MBart50TokenizerFast.from_pretrained(mbart_checkpoint)

# Language mapping: language of our documents -> language code expected by the
# tokenizer. Check the languages of the documents we have and extend the mapping
# as defined at https://huggingface.co/facebook/mbart-large-50-many-to-one-mmt
language_dict = {
    'fr': 'fr_XX',
    'es': 'es_XX',
    'zh-cn': 'zh_CN',
    'ar': 'ar_AR',
}

def translate(lang, paragraph):
    """
    Translate a text chunk to English with the module-level model/tokenizer.

    Params
    -----------------
    lang: language of the text/paragraph; must be a key of `language_dict`
    paragraph: text chunk to be translated


    Returns
    ----------
    Text: Translated text ('english'). An empty or whitespace-only paragraph
          is returned as '' without calling the model.

    Raises
    ----------
    KeyError: if `lang` has no entry in `language_dict`
    """
    # Nothing to translate: skip the model call instead of generating text
    # from an empty input.
    if isinstance(paragraph, str) and not paragraph.strip():
        return ''

    try:
        tokenizer.src_lang = language_dict[lang]
    except KeyError:
        # same exception type as before, but with an actionable message
        raise KeyError(
            f"No language code configured for {lang!r}; "
            f"add it to language_dict (known: {sorted(language_dict)})"
        ) from None

    # truncation=True caps the input at the tokenizer's maximum length, so an
    # unusually long paragraph is shortened instead of failing inside the model.
    encoded = tokenizer(paragraph, return_tensors="pt", truncation=True).to(device)
    generated_tokens = model.generate(**encoded)
    return ''.join(tokenizer.batch_decode(generated_tokens, skip_special_tokens=True))

In [None]:
from tqdm import tqdm
import json

# storage to collect the translated paragraphs of every document
placeholder = []

# output key for each split, by position of the split in the 'paragraphs' dict
split_names = ('small', 'medium', 'large')

# iterate through the documents to be translated
df = to_be_translated
for i in range(len(df)):
    print(i)
    source_lang = df.loc[i, 'context_lang']
    # The column 'paragraphs' holds a dictionary:
    # paragraphs: Dict(Key: Value), Key = split_strategy, Value = List[List],
    # where the inner list is [context, filename, pagenumber]
    paragraphs_list = list(df.loc[i, 'paragraphs'].values())
    temp = {name: [] for name in split_names}
    # iterate through the splits
    for j, list_ in enumerate(paragraphs_list):
        # iterate through the paragraphs of this split
        for para in tqdm(list_):
            # only the first three splits are collected
            if j < len(split_names):
                temp[split_names[j]].append(translate(source_lang, para[0]))

    # append the output of this document to the storage variable
    placeholder.append({
        'country_code': df.loc[i, 'Alpha3'],
        'type_of_document': df.loc[i, 'Document'],
        'language': source_lang,
        'para_translated': temp,
    })

# save the output
with open(path_to_step4 + "output/translated.json", 'w') as file:
    json.dump(placeholder, file, indent=4)

In [49]:
# Load the translated documents. 'language' is the original language of the
# document, so it is renamed to 'doc_language'; the translated text is marked
# as English.
translated_docs = (
    pd.read_json(path_to_step4 + 'output/translated.json')
    .rename(columns={'language': 'doc_language'})
    .assign(translated_language='en')
)
translated_docs.head()

Unnamed: 0,country_code,type_of_document,doc_language,para_translated,translated_language
0,BDI,Revised First NDC,fr,{'small': ['i REPUBLIC OF BURUNDI DETERMINED C...,en
1,BEN,Revised First NDC,fr,{'small': ['DETERMINED CONTRIBUTION TO THE UPD...,en
2,CIV,INDC,fr,{'small': ['PLANNED CONTRIBUTIONS DETERMINED A...,en
3,CMR,INDC,fr,{'small': ['REPUBLIC OF CAMEROON. INTENDED NAT...,en
4,COD,Revised First NDC,fr,{'small': ['DEMOCRATIC REPUBLIC OF CONGO S NAT...,en


In [50]:
# original languages of the documents which were translated
translated_docs['doc_language'].unique()

array(['fr', 'es'], dtype=object)

## Paragraphs from Translated Docs

In [51]:
import pandas as pd
from tqdm.autonotebook import tqdm

# per-country result frames returned by the loop below
placeholder = []

# indexes for which extraction succeeded / did not succeed
updated_index_explored = []
updated_index_not_explored = []

# process unexplored_df one country at a time
country_list = list(unexplored_df['Alpha3'].unique())
for country in tqdm(country_list):
    df, index_success, index_not = process_country_translated(
        country, unexplored_df, translated_docs)
    placeholder.append(df)
    updated_index_explored.extend(index_success)
    updated_index_not_explored.extend(index_not)

# combine the per-country results into a single frame
translated_responses_context = pd.concat(placeholder, ignore_index=True)
print(len(unexplored_df), len(translated_responses_context))
print(len(updated_index_explored), len(updated_index_not_explored))

  0%|          | 0/53 [00:00<?, ?it/s]

ERROR: Either no paragraphs or issue with queries type: BFA Revised First NDC
ERROR: Either no paragraphs or issue with queries type: DOM Revised First NDC
ERROR: Either no paragraphs or issue with queries type: NER Revised First NDC
ERROR: Either no paragraphs or issue with queries type: MCO Revised First NDC
ERROR: Either no paragraphs or issue with queries type: HTI First NDC
ERROR: Either no paragraphs or issue with queries type: HTI Revised First NDC
ERROR: Either no paragraphs or issue with queries type: COG First NDC
ERROR: Either no paragraphs or issue with queries type: COG Revised First NDC
ERROR: Either no paragraphs or issue with queries type: GIN Revised First NDC
ERROR: Either no paragraphs or issue with queries type: TCD Revised First NDC
ERROR: Either no paragraphs or issue with queries type: GNQ First NDC
ERROR: Either no paragraphs or issue with queries type: AGO Revised First NDC
ERROR: Either no paragraphs or issue with queries type: CRI Revised First NDC
ERROR: Eit

In [52]:
def _collect_splits(row):
    """
    Gather the three context columns of a row into one dictionary.

    The keys reflect the average word count strategy:
    60 = small, 85 = medium, 150 = large.
    """
    return {
        'small': row['context0'],
        'medium': row['context1'],
        'large': row['context2'],
    }


translated_responses_context['context'] = translated_responses_context.apply(
    _collect_splits, axis=1)
# the per-split columns and the helper 'index' column are no longer needed
translated_responses_context.drop(
    columns=['context0', 'context1', 'context2', 'index'], inplace=True)

In [54]:
import json

# final dataset: responses explored directly plus those explored through the
# translated documents
responses = pd.concat([explored_df, translated_responses_context], ignore_index=True)

# save the results: serialise with pandas first, then re-dump for indentation
path_to_step4 = "/content/drive/MyDrive/Colab Notebooks/CPU/Step4/"
jsonfile = responses.to_json(orient="records")
parsed = json.loads(jsonfile)
responses_context_file = path_to_step4 +'output/responses_context.json'
with open(responses_context_file, 'w') as file:
    json.dump(parsed, file, indent=4)

In [55]:
# Overview of the final dataset: columns, non-null counts and dtypes.
responses.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29762 entries, 0 to 29761
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   ResponseText       29762 non-null  object 
 1   Alpha3             29762 non-null  object 
 2   Country            29762 non-null  object 
 3   Document           29762 non-null  object 
 4   IkiInfo            3266 non-null   object 
 5   Source             29762 non-null  object 
 6   ResponseWordcount  29762 non-null  int64  
 7   Target             29762 non-null  bool   
 8   Netzero            3266 non-null   float64
 9   Adaptation         29762 non-null  bool   
 10  Mitigation         29762 non-null  bool   
 11  GHG                270 non-null    float64
 12  Conditional        29762 non-null  bool   
 13  Unconditional      29762 non-null  bool   
 14  language           29762 non-null  object 
 15  CWInfo             26496 non-null  object 
 16  Action             264