<a href="https://colab.research.google.com/github/fghadami/DS_personal_projects/blob/main/omdena_Fateme_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<font size="5">**Installing Packages**</font>

In [12]:
!pip install PyPDF2
!pip install spacy
!python -m spacy download en_core_web_sm
!pip install langdetect
!pip install textblob

from IPython.display import clear_output
clear_output()

In [13]:
import zipfile
from pathlib import Path
import os

import spacy
import pandas as pd

import PyPDF2

import re
from langdetect import detect
from textblob import TextBlob

<font size="5">**Helper Functions**</font>

In [44]:
def unzip_file(zip_file_path, extraction_folder):
    """Extract every member of a zip archive into a folder.

    Args:
        zip_file_path: Path of the zip archive to read.
        extraction_folder: Directory the archive members are extracted into.

    Prints "Unzipping complete." once extraction has finished.
    """
    # ZipFile opens read-only by default; the context manager closes the archive.
    with zipfile.ZipFile(zip_file_path) as archive:
        archive.extractall(path=extraction_folder)

    print("Unzipping complete.")

In [45]:
def extract_text_from_each_pdf(pdf_file_path):
    """Return the text of every page of one PDF, concatenated in page order.

    Args:
        pdf_file_path: Path of the PDF file to read.

    Returns:
        One string made of each page's extracted text, with no separator
        inserted between pages; empty when the document has no pages.
    """
    with open(pdf_file_path, 'rb') as pdf_stream:
        pdf_reader = PyPDF2.PdfReader(pdf_stream)
        # Extract while the file is still open, since the reader was built on this handle.
        page_texts = [page.extract_text() for page in pdf_reader.pages]
    return ''.join(page_texts)

In [46]:
def prepare_data_for_pipeline(working_directory):
    """Extract the text of every PDF file directly inside ``working_directory``.

    Args:
        working_directory: A ``pathlib.Path`` for the folder to scan. Only
            ``iterdir`` is used, so sub-folders are not searched.

    Returns:
        A list with one extracted-text string per PDF, in sorted path order,
        excluding PDFs for which extraction produced an empty string.
    """
    # Only hand files with a .pdf suffix to the extractor: an unpacked archive
    # can contain stray non-PDF files that are not valid input for it.
    # Sorting makes the result order reproducible, because iterdir() yields
    # entries in arbitrary order.
    pdf_files = sorted(
        file for file in working_directory.iterdir()
        if file.is_file() and file.suffix.lower() == '.pdf'
    )
    text_list = [extract_text_from_each_pdf(file) for file in pdf_files]
    # Drop PDFs that yielded no text at all.
    return [text for text in text_list if text != '']

In [58]:
def ensure_english(text):
    """Return ``text`` unchanged if it is detected as English, else ``''``.

    Args:
        text: The string to run language detection on.

    Returns:
        ``text`` when ``detect`` reports ``'en'``; an empty string when another
        language is reported or when detection itself fails.
    """
    try:
        language = detect(text)
    except Exception:
        # Best-effort: input the detector cannot handle is treated as
        # "not English". Catching Exception instead of using a bare except
        # lets KeyboardInterrupt and SystemExit propagate.
        return ''
    return text if language == 'en' else ''

def remove_excessive_spaces(text):
    """Collapse each run of whitespace into one space and trim both ends."""
    single_spaced = re.sub(r'\s+', ' ', text)
    return single_spaced.strip()

def remove_special_characters(text):
    """Delete every character that is neither a word character nor whitespace.

    Letters, digits, underscores and whitespace are kept; everything else is
    removed without a replacement character.
    """
    not_word_or_space = re.compile(r'[^\w\s]')
    return not_word_or_space.sub('', text)

def correct_ocr_errors(text):
    """Run TextBlob's spelling correction over ``text`` and return a plain string.

    NOTE(review): the sample output in this notebook contains "Age 1 of 3" and
    "Ri Vanka", which look like over-corrections of "Page" and "Sri Lanka".
    Confirm whether this correction step is the cause.
    """
    # Basic correction using TextBlob; str() converts the result back to text.
    return str(TextBlob(text).correct())

def extract_structure(text):
    """Split ``text`` into newline-separated paragraphs and pick out headings.

    A paragraph counts as a heading when ``str.isupper`` is true for it, i.e.
    it has at least one cased character and all of them are upper case.

    Returns:
        A dict with three keys: ``'raw_text'`` (the input as given),
        ``'paragraphs'`` (the result of splitting on ``'\\n'``, including any
        empty strings) and ``'headings'`` (the all-caps paragraphs, in order).
    """
    # For simplicity, every newline is treated as a paragraph boundary.
    lines = text.split('\n')
    upper_case_lines = list(filter(str.isupper, lines))
    return {
        'raw_text': text,
        'paragraphs': lines,
        'headings': upper_case_lines,
    }

# Load the "en_core_web_sm" spaCy pipeline once; get_named_entities reuses this object.
nlp = spacy.load("en_core_web_sm")

def get_named_entities(text):
    """Return the named entities that the loaded spaCy pipeline finds in ``text``.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples, in the order the
        entities appear in ``doc.ents``.
    """
    entities = []
    for ent in nlp(text).ents:
        entities.append((ent.text, ent.label_))
    return entities

In [48]:
def clean_text_pipeline(text):
    """Apply the cleaning helpers to ``text`` in a fixed order and return the result.

    Order: language check, whitespace collapsing, special-character removal,
    then spelling correction. Each step receives the previous step's output.
    """
    cleaning_steps = (
        ensure_english,
        remove_excessive_spaces,
        remove_special_characters,
        correct_ocr_errors,
    )
    for step in cleaning_steps:
        text = step(text)
    return text

<font size="5">**Main Part**</font>

In [49]:
# Location of the uploaded archive and the folder it is unpacked into.
zip_file_path = '/content/sample_pdfs.zip'
extraction_folder = '/content'
# Derive the PDF folder from extraction_folder rather than from the current
# working directory, so the files are still found when the kernel was started
# somewhere other than /content.
working_directory = Path(extraction_folder) / 'sample_pdfs'

In [50]:
# Unpack the archive at zip_file_path into extraction_folder.
unzip_file(zip_file_path, extraction_folder)

Unzipping complete.


In [51]:
# Extract the raw text of each file in working_directory; empty extractions are dropped.
text_list = prepare_data_for_pipeline(working_directory)



In [54]:
# Clean every extracted text; texts that fail the English check come back as ''.
clean_text_list = [clean_text_pipeline(text) for text in text_list]

In [55]:
# One row per cleaned text, stored in a single 'clean_text' column.
df = pd.DataFrame(clean_text_list, columns=['clean_text'])

In [56]:
# Display the cleaned texts; blank rows are texts the pipeline reduced to ''.
df['clean_text']

Unnamed: 0,clean_text
0,
1,Age 1 of 3 TEA RESEARCH INSTITUTE of SRI LANKA...
2,Cap 2531 SRI LANKA TEA BOARD CHAPTER 253 SRI L...
3,Age 1 of 3 TEA RESEARCH INSTITUTE of SRI LANKA...
4,
5,Age 1 of 2 TEA RESEARCH INSTITUTE of SRI LANKA...
6,1 TEA RESEARCH INSTITUTE of SRI LANKA Issued I...
7,Age 1 of 3 TEA RESEARCH INSTITUTE of SRI LANKA...
8,Ri Vanka Tea Board Amendment Act To 17 of 1985...
9,


In [59]:
# Add a column holding the (text, label) entity tuples found in each cleaned text.
df['named_entities'] = df['clean_text'].apply(get_named_entities)

In [60]:
# Display the extracted entities for each row.
df['named_entities']

Unnamed: 0,named_entities
0,[]
1,"[(Age 1, DATE), (3, CARDINAL), (TEA RESEARCH I..."
2,"[(Cap, PERSON), (2531, DATE), (SRI LANKA TEA B..."
3,"[(Age 1, DATE), (3, CARDINAL), (TEA RESEARCH I..."
4,[]
5,"[(Age 1, DATE), (December 2019, DATE), (Guidel..."
6,[(1 TEA RESEARCH INSTITUTE of SRI LANKA Issued...
7,"[(Age 1, DATE), (3, CARDINAL), (TEA RESEARCH I..."
8,"[(Ri Vanka Tea Board Amendment Act, ORG), (17,..."
9,[]
