# MiVivienda_OCR

**Tema:** Extracción de Opiniones Vinculantes del Ministerio de Vivienda, Construcción y Saneamiento (MIVIVIENDA)

**Objetivo:** Extraer e interpretar la información de las opiniones vinculantes por año publicadas en la página web de MIVIVIENDA. Sería interesante realizar un resumen de la opinión principal emitida en cada opinión vinculante y organizarla en carpetas.

**Link:** https://ww3.vivienda.gob.pe/dv_/consultas-frecuentes.html

### Author: Esteban Cabrera (esteban.cabrera@pucp.edu.pe)
### January 2024

# Libraries

In [1]:
from PyPDF2 import PdfReader
import os
import glob
import pandas as pd
import re

In [2]:
from PIL import Image
from io import BytesIO
import pytesseract
# Specify the path where Tesseract-OCR was installed
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
import pandas as pd
import numpy as np
import cv2
import matplotlib.pyplot as plt
from pytesseract import Output
import re
import glob
import os
import PIL.Image
from PIL import Image
from pdf2image import convert_from_path

In [3]:
def convert_pdf_to_images( pdf_path, export_path, page_ranges ):
    """
    Converts specified pages from a PDF file into images and saves them in a specified directory.

    Input:
    - pdf_path (str)    : Path to the PDF file.
    - export_path (str) : Base directory path where the image folder will be created.
    - page_ranges (str) : Pages to be converted. Can be a single page (e.g., '5'),
                          a range of pages (e.g., '5-10'), or multiple ranges 
                          (e.g., '5-10, 15-18'). Page numbers are 1-based. Blank 
                          entries (e.g., a trailing comma) are ignored.

    Output:
    - Saves the extracted pages as images in a subfolder named after the PDF file 
      within the given export path. Each image is named according to its page number 
      (e.g., 'page_5.jpg'). Requested pages beyond the end of the PDF are skipped.

    Raises:
    - ValueError : If page_ranges contains no page number or a malformed entry.
    """

    # Create a directory for the output images
    base_name  = os.path.splitext( os.path.basename( pdf_path ) )[ 0 ]
    output_dir = os.path.join( export_path, base_name )
    os.makedirs( output_dir, exist_ok = True )

    # Parse the page ranges
    pages_to_convert = []
    for part in page_ranges.split( ',' ):
        part = part.strip()
        if not part:
            continue
        if '-' in part:
            start, end = part.split( '-' )
            pages_to_convert.extend( range( int( start ), int( end ) + 1 ) )
        else:
            pages_to_convert.append( int( part ) )

    if not pages_to_convert:
        raise ValueError( 'page_ranges does not specify any page' )

    # Convert the whole span that covers every requested page in a single call
    first_page = min( pages_to_convert )
    last_page  = max( pages_to_convert )
    images     = convert_from_path( pdf_path, first_page = first_page, last_page = last_page )

    # Save the images. `images` holds every page from first_page onwards, including
    # pages in the gaps between ranges, so each requested page is looked up by its
    # offset from first_page rather than by its position in pages_to_convert.
    for page in pages_to_convert:
        offset = page - first_page
        if 0 <= offset < len( images ):
            images[ offset ].save( os.path.join( output_dir, f'page_{page}.jpg' ), 'JPEG' )

In [4]:
def convert_images_to_text( input_path, export_path, lang = 'eng' ):
    """
    Converts images in a given folder to text files using pytesseract and saves them in a new subfolder 
    within the export directory. The subfolder is named after the last directory in the input path.

    Input:
    - input_path (str) :  Base folder path where the images are located. 
                          Every regular file in the folder is assumed to be a 
                          valid image file ( .jpg, .png or .jpeg ). 
                          Sub-directories are skipped.
    - export_path (str) : Base folder path where the text files will be saved.
    - lang (str)        : Language for pytesseract to use (default is English - 'eng').

    Output:
    - Creates a new subfolder in the export directory named after the last directory of the input path.
      For each image in the input folder, a corresponding text file is created in this subfolder.
    """

    # Get the last directory name from input_path
    last_dir_name = os.path.basename( os.path.normpath( input_path ) )

    # Create a new subdirectory in the export directory
    new_export_path = os.path.join( export_path, last_dir_name )
    os.makedirs( new_export_path, exist_ok = True )

    # Process each image in the input directory (sorted for a reproducible order)
    for filename in sorted( os.listdir( input_path ) ):
        img_path = os.path.join( input_path, filename )

        # Only regular files can be images; nested folders are ignored
        if not os.path.isfile( img_path ):
            continue

        # Read the image and extract text; the context manager closes the file handle
        with Image.open( img_path ) as img:
            text = pytesseract.image_to_string( img, lang = lang )

        # Save the extracted text to a .txt file in the new subdirectory
        text_file_path = os.path.join( new_export_path, os.path.splitext( filename )[ 0 ] + '.txt' )
        with open( text_file_path, 'w', encoding = 'utf-8' ) as file:
            file.write( text )

In [5]:
def find_origin_section( text, start_pattern, end_pattern ):
    """
    Extracts the origin section from the given text using start 
    and end regex patterns.

    Input:
    - text (str)          : Text to search.
    - start_pattern (str) : Regex marking the beginning of the section.
    - end_pattern (str)   : Regex marking the end of the section. Only 
                            occurrences located after the start match are 
                            considered.

    Output:
    - The text between the end of the start match and the beginning of the 
      first following end match, stripped of surrounding whitespace, or None 
      when either pattern cannot be found.
    """
    start_match = re.search( start_pattern, text )
    if start_match is None:
        return None

    # Look for the end marker only after the start marker; an earlier
    # occurrence of the end pattern must not close the section.
    section_start = start_match.end()
    end_match     = re.search( end_pattern, text[ section_start: ] )
    if end_match is None:
        return None

    return text[ section_start : section_start + end_match.start() ].strip()

def process_reports( base_path, start_pattern, end_pattern, page_name = 'page_3.txt' ):
    """
    Processes each report in the base_path to extract the origin section 
    and store in a DataFrame.

    Input:
    - base_path (str)     : Folder containing one sub-folder per report.
    - start_pattern (str) : Regex marking the beginning of the section.
    - end_pattern (str)   : Regex marking the end of the section.
    - page_name (str)     : Name of the text file read inside each report 
                            folder (default is 'page_3.txt').

    Output:
    - DataFrame with the columns 'Report Name' (the sub-folder name) and 
      'Origin Section' (the result of find_origin_section). Report folders 
      that do not contain page_name are left out. The columns are present 
      even when no report is found.
    """
    reports = []
    for folder in sorted( os.listdir( base_path ) ):
        folder_path = os.path.join( base_path, folder )
        if not os.path.isdir( folder_path ):
            continue

        page_path = os.path.join( folder_path, page_name )
        if not os.path.isfile( page_path ):
            continue

        with open( page_path, 'r', encoding = 'utf-8' ) as file:
            text = file.read()

        origin_section = find_origin_section( text, start_pattern, end_pattern )
        reports.append( { 'Report Name': folder, 'Origin Section': origin_section } )

    return pd.DataFrame( reports, columns = [ 'Report Name', 'Origin Section' ] )

# Code

In [6]:
def pdf_to_text(pdf_path):
    ''' 
    Objective:
        This function transforms a pdf into text on which we can apply text information retrieval.

    Input: 
        pdf_path (str) : The path where the pdf is located, including the pdf name.

    Output:
        It returns the text of all pages joined with newlines. If the text 
        extraction fails, the error is printed and an empty string is returned.

    Note:
        Errors raised while opening the file are not caught here, so the 
        caller can tell unreadable paths apart from PDFs without text.
    '''    
    reader   = PdfReader( pdf_path )
    n_pages  = len( reader.pages )
    print( f'Number of pages: { n_pages }' )

    try:
        # `or ''` keeps the join below safe if a page yields no text object
        extracted_text = [ page.extract_text() or '' for page in reader.pages ]
        print( 'Text successfully extracted' )

    except Exception as error:
        # Report the cause instead of silently discarding it
        extracted_text = []
        print( f'Text not found: { error }' )

    return '\n'.join( extracted_text )

In [7]:
def ocr_mivivienda( combined_text , start_pattern , end_pattern):
    '''
    Objective:
        This function takes a text and extracts the section delimited by the 
        start_pattern and end_pattern inputs. Both patterns are matched 
        case-insensitively.
    
    Input:
        combined_text (str) : The text where we can extract information.

        start_pattern (str) : The starting pattern (regex).

        end_pattern (str) : The ending pattern (regex). It is searched only 
                            after the end of the start match.

    Output:
        A list with the lines found between both patterns, or an empty list 
        when either pattern is not present in the text.
    '''      
    start_match = re.search( start_pattern, combined_text, re.IGNORECASE )
    if start_match is None:
        return []

    section_start = start_match.end()
    end_match     = re.search( end_pattern, combined_text[ section_start: ], re.IGNORECASE )
    if end_match is None:
        return []

    # end_match positions are relative to the slice, so shift them back
    end_index    = section_start + end_match.start()
    article_text = combined_text[ section_start : end_index ].strip()

    return article_text.split( '\n' )
    

We try both functions on a single document

In [32]:
# Extract the full text of one report and preview its first 1000 characters
text_mivivienda = pdf_to_text('../OCR proposal/files/ITL N° 003-2022-VIVIENDA_VMVU-DGPRVU-DV-JLHP-KCG.pdf')
text_mivivienda[:1000]

Number of pages: 15
Text successfully extracted


'"Decenio de la Igualdad de oportunidades para mujeres y hombres" \n“Año  del Fortalecimiento de la Soberanía Nacional”  \n \n \n \n \nSan Isidro, 08 de febrero de 2022 \n \nOFICIO Nº 047-2022-VIVIENDA/VMVU-DGPRVU \n \n \nArquitecta \nLOURDES GIUSTI HUNDSKOPF \nDecana Regional Lima \nColegio de Arquitectos del Perú \nAv. San Felipe N° 999 \nJesús María . - \n \nAsunto:   Opinión vinculante sobre la altura de edificación generada por colindancia, \nconsolidación, compensación y por ubicarse el lote en esquina, de acuerdo \na lo regulado en el artículo 10 de la Norma Técnica A.010, Condiciones \ngenerales de diseño del Reglamento Nacional de Edificaciones \n \nReferencia: H.T. N° 00014371 -2022 \n \nPor medio del presente me dirijo a usted con la finalidad de remitir opinión vincul ante sobre \nla aplicación de la altura de edificación generada por colindancia, consolidación, \ncompensación y por ubicarse el lote en esquina, señaladas en el artículo 10 de la N orma \nTécnica A.010, Condi

In [33]:
# Extract the lines between the 'IV. CONCLUSIONES' heading and the 'Atentamente,' closing.
# NOTE(review): the '.' in the start pattern is unescaped, so it matches any character.
ocr_mivivienda( text_mivivienda, r'IV. CONCLUSIONES', r'Atentamente,')

[':  ',
 ' ',
 '4.1. El RNE es la norma técnica de cumplimiento obligatorio por todas las entidade s ',
 'públicas, así como por las personas naturales y jurídicas de derecho público y privado ',
 'que proyecten o ejecuten habilitaciones urbanas y edificaciones en el territori o ',
 'nacional, al ser el único marco técnico normativo que establece los criterios y ',
 'requisitos mínimos para el diseño y la ejecución de proyectos de habilitación urbana ',
 'y/o de edificaciones. ',
 ' ',
 '4.2. El Certificado de Parámetros Urbanísticos y Edificatorios es el documento previo ',
 'emitido por las municipalidades, donde se especifican los parámetros urbanísticos y ',
 'edificatorios de diseño que regula el proceso de edificación sobre un predio urbano; ',
 'el cual contiene, entre otros, la altura máxima y mínima de edificación expresada en ',
 'metros. Sin embargo, en caso la altura se encuentra indicada en pisos, corresponde ',
 'la aplicación de lo establecido en el numeral 10.1 del artí

Now we try both functions on all the PDFs in the files folder

In [22]:
folder_path = '../OCR proposal/files'

# One row per successfully read PDF: its file name and the extracted lines
data_dict = {'Filename': [], 'List_PDF': []}

for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)

    # Sub-folders (e.g. the image and text output folders) are not reports
    if not os.path.isfile(file_path):
        print(f'{filename} is not a file')
        continue

    # A file that cannot be parsed as a PDF is reported and skipped
    try:
        file_pdf = pdf_to_text(file_path)
    except Exception as error:
        print(f'{filename} could not be read as a PDF: {error}')
        continue

    list_pdf = ocr_mivivienda( file_pdf, r'IV. CONCLUSIONES', r'Atentamente,')

    data_dict['Filename'].append(filename)
    data_dict['List_PDF'].append(list_pdf)

final_dataframe = pd.DataFrame(data_dict)

Number of pages: 15
Text successfully extracted
Number of pages: 22
Text successfully extracted
Number of pages: 15
Text successfully extracted
Number of pages: 20
Text successfully extracted
ITL N° 068-2019-VIVIENDA_VMVU-DGPRVU-DV-JJLL-KCG is not a file
scanned is not a file
text is not a file


In [23]:
# Display the extracted conclusions, one row per PDF
final_dataframe

Unnamed: 0,Filename,List_PDF
0,ITL N° 003-2022-VIVIENDA_VMVU-DGPRVU-DV-JLHP-K...,"[: , , 4.1. El RNE es la norma técnica de cu..."
1,ITL N° 004-2022-VIVIENDA_VMVU-DGPRVU-DV-JJLL-J...,[ De acuerdo con el principio de predictibili...
2,ITL N° 034-2022-VIVIENDA_VMVU-DGPRVU-DV-EMJ-JL...,"[: , , 4.1 El literal c) del artículo 36 de l..."
3,ITL N° 063-2023-VIVIENDA_VMVU-DGPRVU-DV-JJLL-K...,[Sin perjuicio de las aclaraciones que el Trib...


In [24]:
# Export the results to an Excel workbook (relative path, so it lands in the working directory)
final_dataframe.to_excel('MiVivienda.xlsx')

We can also try with scanned documents

In [25]:
# Settings for converting a scanned report into page images
input_path  = '../OCR proposal/files/scanned/ITL N° 068-2019-VIVIENDA_VMVU-DGPRVU-DV-JJLL-KCG.pdf'  # scanned PDF to convert
output_path = '../OCR proposal/files/'                                                             # base folder for the image sub-folder
page_ranges = '1-11'                                                                               # pages 1 to 11, inclusive

In [26]:
# Render the selected pages as JPEG files in a sub-folder named after the PDF
convert_pdf_to_images( input_path, output_path, page_ranges )

In [27]:
# Settings for running OCR on the page images
input_path  = '../OCR proposal/files/ITL N° 068-2019-VIVIENDA_VMVU-DGPRVU-DV-JJLL-KCG/'  # folder holding the page images
output_path = '../OCR proposal/files/text'                                              # base folder for the text files
lang        = 'spa'                                                                     # Tesseract language code passed to pytesseract

In [28]:
# Write one .txt file per page image into a sub-folder named after the image folder
convert_images_to_text( input_path, output_path, lang )

In [29]:
# Extract the section between both patterns from each report's page_3.txt
base_path     = '../OCR proposal/files/text'   # one sub-folder of text files per report
start_pattern = r'CONCLUSIONES:'               # regex marking where the section starts
end_pattern   = r'29090'                       # regex marking where the section ends; presumably a law number - TODO confirm
df            = process_reports( base_path, start_pattern, end_pattern )

In [18]:
# Display the extracted section for each processed report
df  

Unnamed: 0,Report Name,Origin Section
0,ITL N° 068-2019-VIVIENDA_VMVU-DGPRVU-DV-JJLL-KCG,
