### Imports

In [11]:
import re
from gensim.parsing.preprocessing import remove_stopwords
import numpy as np
import pandas as pd
from typing import List, Tuple, Dict, Set

!pip install PyMuPDF
import fitz #PyMuPDF Library for reading pdf files

!pip install transformers
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch.set_default_tensor_type(torch.cuda.FloatTensor)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
TEXT_FILE_PATH = None
BASE_PATH = '/content/drive/MyDrive/fyp_2_v2/'
PDF_FILE_PATHS = ['UN.pdf',
                  'TAMAS.pdf',
                  'JICA.pdf']
PDF_FILE_PATHS = [BASE_PATH + file_path for file_path in PDF_FILE_PATHS]

### Read Text Corpus

In [5]:
def read_text_file(file_path: str) -> str:
    """
    Reads a text file in utf-8 format. 
    If unable to read file, empty string is returned.
    
    Input:
        file_path: Path of file to be read
        
    Return: Text in file.
    """
    try:
        with open(file_path, "rb") as f:
            file_data =  f.read().decode('utf-8') 
        return file_data
    except:
        print("Failed to read text file!!!")
        return ""

text_corpus = read_text_file(TEXT_FILE_PATH)

def clean_text_corpus(corpus: str) -> str:
    """
    Performs Data Cleaning on corpus
    
    Input:
        corpus: Corpus to clean
        
    Return: Cleaned corpus
    """
    corpus = corpus.strip() #Remove spaces from beginning and end of string
    corpus = re.sub(r'\r', '', corpus) #Remove Carriage Return character
    corpus = re.sub(r'[\n]+', '\n', corpus) #Remove multiple newline characters with single newline
    corpus = re.sub(r'---', '', corpus) #Remove ---
    return corpus

def extract_sentences_from_corpus(corpus: str) -> List[str]:
    """
    Returns list of sentences from corpus
    
    Input:
        corpus: Corpus to extract sentences from
            
    Return: List of sentences
    """
    corpus_sentences = []
    for segment in corpus.split('\n'):
        corpus_sentences.append(segment)
    return corpus_sentences

text_corpus = clean_text_corpus(text_corpus)
text_corpus_sentences = extract_sentences_from_corpus(text_corpus)

for i in range(min(len(text_corpus_sentences),20)):
    print(text_corpus_sentences[i])
    print('---------------')

Failed to read text file!!!

---------------


### Read PDF Corpus

In [6]:
def read_pdf_file(file_path: str) -> List[str]:
    """
    Reads a pdf file, and returns list of page text content. 
    If unable to read file, empty list is returned.
    
    Input:
        file_path: Path of file to be read
        
    Return: List where each element is text content of a page.
    """
    try:
        text_content_list = []
        with fitz.open(file_path) as f:
            for page in f:
                page_content = ''
                for block in page.get_text("dict")['blocks']:
                    if "lines" in block.keys():
                        txt = [lines['text'] for span in block['lines'] for lines in span['spans']]
                        if len(txt)>0:
                            for t in txt:
                                if t[-1]=='.':
                                    page_content += t + ' '
                                else:
                                    page_content += t
                if len(page_content) > 0:
                    text_content_list.append(page_content)
        return '\n'.join(text_content_list)
    except:
        print("Failed to read pdf file!!!")
        return []

def clean_pdf_corpus(corpus: str) -> List[str]:
    """
    Performs Data Cleaning on pdf corpus
    
    Input:
        corpus: Pdf corpus to clean
    
    Return: Cleaned pdf corpus
    """
    clean_corpus = ''.join([char for char in corpus if char.isascii()])
    clean_corpus = re.sub(r'[\r\t]', ' ', clean_corpus) #Replace newline,carriage return, and tab with space
    clean_corpus = clean_corpus.strip() #Remove spaces from beginning and end of string
    clean_corpus = re.sub(r'[ ]{2,}', ' ', clean_corpus) #Replace multiple spaces with single space  

    return clean_corpus
    
pdf_corpus = ''
for pdf_file_path in PDF_FILE_PATHS:
    pdf_corpus += '\n' + read_pdf_file(pdf_file_path)
pdf_corpus = clean_pdf_corpus(pdf_corpus)
pdf_corpus_sentences = extract_sentences_from_corpus(pdf_corpus)

for i in range(len(pdf_corpus_sentences)):
    print(pdf_corpus_sentences[i])
    print('---------------')

ii Crop production manual A guide to fruit and vegetable production in the Federated States of Micronesia 
---------------
iii 
---------------
iv Crop production manual A guide to fruit and vegetable production in the Federated States of Micronesia Compiled by: Sayed Mohammad Naim Khalid This manual was produced under TCP/MIC/3601, Strengthening the capacity of Farmers Associations to increase production and marketing of root crops, fruits and vegetables in FSM project. FAO Subregional Office for the Pacific Food and Agriculture Organization of the United Nations Apia, 2020 
---------------
v Required citation: FAO. 2020. Crop production manual. Appia. The designations employed and the presentation of material in this information product do not imply the expression of any opinion whatsoever on the part of the Food and Agriculture Organization of the United Nations (FAO) concerning the legal or development status of any country, territory, city or area or of its authorities, or concern

### Combine Text and PDF Corpus

In [7]:
def clean_sentence(sentence: str, stopword_removal: bool = True) -> str:
    """
    Performs Data Cleaning on sentence
    
    Input:
        sentence: Sentence to clean
        stopword_removal: If True, stopwords are removed from sentence
    
    Return: Cleaned sentence
    """
    sentence = sentence.lower() #Convert to lower case
    sentence = sentence.strip() #Remove spaces from beginning and end of string
    sentence = re.sub(r'[^\w\d\s]', ' ', sentence) #Remove punctuation
    sentence = re.sub(r'(\b\w\b)', '', sentence) #Remove dangling characters
    sentence = re.sub(r'[ ]{2,}', ' ', sentence) #Replace multiple contiguous spaces with single space
    if stopword_removal:
         sentence = remove_stopwords(sentence) #Remove stopwords
    return sentence

def remove_duplicate_sentences(corpus_sentences: List[str]) -> List[str]:
    """
    Removes duplicate sentences from list.
    Sentences are cleaned and stemming is performed before comparison
    Only alphanumeric portion of strings are used for comparison
    
    Input:
        corpus_sentences: List of sentences from corpus
            
    Return: List of sentences from corpus with duplicates removed.
    """
    unique_corpus_sentences = []
    unique_values = set()
    for sent in corpus_sentences:
        sent_clean = clean_sentence(sent)
        sent_alphanum = re.sub(r'[^\w\d]','',sent_clean)
        sent_alphanum_lower = sent_alphanum.lower()
        if sent_alphanum_lower in unique_values:
            continue
        else:
            unique_corpus_sentences.append(sent)
            unique_values.add(sent_alphanum_lower)
    
    return unique_corpus_sentences

corpus_sentences = text_corpus_sentences + pdf_corpus_sentences
print("Number of sentences:",len(corpus_sentences))
corpus_sentences = remove_duplicate_sentences(corpus_sentences)
print("Number of unique sentences:",len(corpus_sentences))

Number of sentences: 367
Number of unique sentences: 366


### Preprocess Corpus Sentences

In [8]:
def get_preprocessed_sentences(sentences: List[str],stopword_removal: bool = True) -> List[str]:
    """
    Perform cleaning and stemming on dataset
    
    Input:
        sentences: Dataset that will be preprocessed
        stopword_removal: If True, stopwords are removed from sentence
        
    Return: List of preprocessed sentences
    """
    preprocessed_sentences=[]
    for s in sentences:
        clean_s = clean_sentence(s,stopword_removal)
        preprocessed_sentences.append(clean_s)
    return preprocessed_sentences

preprocessed_corpus_sentences = get_preprocessed_sentences(corpus_sentences,stopword_removal=True)

for i in range(10):
    print(preprocessed_corpus_sentences[i])
    print('---------------')


---------------
ii crop production manual guide fruit vegetable production federated states micronesia
---------------
iii
---------------
iv crop production manual guide fruit vegetable production federated states micronesia compiled sayed mohammad naim khalid manual produced tcp mic 3601 strengthening capacity farmers associations increase production marketing root crops fruits vegetables fsm project fao subregional office pacific food agriculture organization united nations apia 2020
---------------
required citation fao 2020 crop production manual appia designations employed presentation material information product imply expression opinion whatsoever food agriculture organization united nations fao concerning legal development status country territory city area authorities concerning delimitation frontiers boundaries mention specific companies products manufacturers patented imply endorsed recommended fao preference similar nature mentioned views expressed information product aut

In [10]:
MODEL_NAME = "facebook/nllb-200-distilled-1.3B"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

Downloading:   0%|          | 0.00/564 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.55k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/808 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.48G [00:00<?, ?B/s]

In [14]:
TARGET_LANGUAGE_ID = 'urd_Arab'
ur_corpus_sentences = []

for sent in corpus_sentences:
    if len(sent)==0:
        continue
    inp = tokenizer(sent, return_tensors="pt").to(device)
    pred = model.generate(**inp, forced_bos_token_id=tokenizer.lang_code_to_id[TARGET_LANGUAGE_ID], max_length=200)
    ur_sent = tokenizer.decode(pred[0], skip_special_tokens=True)
    ur_corpus_sentences.append(ur_sent)


In [15]:
ur_corpus = '\n'.join(ur_corpus_sentences)
with open('ur_kb.txt', 'w') as f:
    f.write(ur_corpus)