# Extract textual data from HTML

In [41]:
!pip install beautifulsoup4 # Install the library Beautiful Soup



In [42]:
from bs4 import BeautifulSoup as BS
import re, os

Create a BeautifulSoup object: essentially a parsed document containing all the information passed in, which in our case is the HTML file.

In [43]:
# The folder below must be modified depending on where you saved the BOE.html file.
# Every file opened later in this notebook uses a path relative to this folder.
os.chdir('/content/drive/MyDrive/CivilCode_Spanish/')
# Decode explicitly as UTF-8 instead of relying on the platform's default
# encoding, so the accented Spanish text is read the same way everywhere.
# NOTE(review): assumes BOE.html is stored as UTF-8 -- confirm if the file is re-downloaded.
with open('BOE.html', 'r', encoding='utf-8') as file:
    soup = BS(file, 'html.parser')

After examination, we found that all the textual data we need is contained in three tags: \<h4>, \<h5>, and \<p>.\
\
So we will use the .find_all() method of the BeautifulSoup object to collect all of these tag objects into a new object.


In [44]:
# Tag-name filter for find_all(). Beautiful Soup applies a compiled pattern with
# search(), so the pattern is anchored at both ends: it accepts exactly the tag
# names h4, h5 and p, and rejects other names that merely start with "p"
# (e.g. pre, param, picture), which the unanchored '^p' used to let through.
command = re.compile(r'^(?:h[45]|p)$')

In [45]:
# Keep every tag of the parsed document whose name is accepted by `command`
needed_tags = soup.find_all(name=command)

In [46]:
# A preview of the filtered tags: show only the first few instead of dumping the whole list
needed_tags[:10]

[<p class="fuera">Puede seleccionar otro idioma:</p>,
 <p><a class="inline" href="/index.php#diarios">Diarios Oficiales</a></p>,
 <p><a href="/index.php#juridico">Información Jurídica</a></p>,
 <p><a href="/index.php#servicios-adicionales">Otros servicios</a></p>,
 <p class="siempreSeVe">Téngase en cuenta que las referencias hechas a la llamada "adopción plena" se entienden sustituidas por la adopción regulada en la Ley 21/1987, de 11 de noviembre, según establece el artículo 3 de la citada Ley. <a href="/buscar/doc.php?id=BOE-A-1987-25627" target="_blank">Ref. BOE-A-1987-25627</a>.</p>,
 <p class="parrafo">Teniendo presente lo dispuesto en la ley de 26 de Mayo último; conformándome con lo propuesto por el Ministro de Gracia y Justicia, y de acuerdo con el parecer de mi Consejo de Ministros;</p>,
 <p class="parrafo">En nombre de mi Augusto Hijo el Rey D. Alfonso XIII, y como Reina Regente del Reino,</p>,
 <p class="parrafo">Vengo en decretar que se publique e inserte en la Gaceta de Ma

The next step is to extract only the textual content from all these tags.\
\
Note that we use the .get_text() method instead of the .string attribute, which is the more native and intuitive choice in BeautifulSoup.\
\
The reason is that the latter does not handle superscript tags within the text well.\
\
For example, if a tag object in BeautifulSoup is\
`<p class="parrafo_2">1.<sup>a</sup> Será ley personal la  determinada por la vecindad civil.</p>`\
\
using .string will return\
`None` (because the tag has more than one child),\
\
while using .get_text() you get\
"1.a Será ley personal la  determinada por la vecindad civil."\
for which we only need to format the "a" later.

In [60]:
# Write the text of every selected tag to spanish_raw.txt, each followed by a newline.
# The `with` block guarantees the file is closed even if a write raises.
with open('spanish_raw.txt', 'w') as spanish_raw:
    for n in needed_tags:
        # get_text() is used rather than .string so text inside nested tags is kept
        text = n.get_text()
        spanish_raw.write(text + '\n')

# Clean the raw text

In [61]:
# Load the raw text back into memory; `with` closes the file handle
# instead of leaving it open for the rest of the session.
with open('spanish_raw.txt', 'r') as raw_file:
    raw_text = raw_file.read()

In [62]:
# Collapse every run of two or more consecutive spaces into a single space
multiple_spaces = re.compile(' {2,}')
whitespace = multiple_spaces.sub(' ', raw_text)

In [63]:
# Replace the forms of "suprimir" with the matching forms of "derogar" in order to
# maintain linguistic consistency. The capitalised and the lowercase stems are
# handled one after the other, as plain substring replacements.
Derogar = whitespace.replace('Suprimid', 'Derogad')
derogar = Derogar.replace('suprimid', 'derogad')

In [64]:
# Change "<digit>.a" to "<digit>.ª".
# The dot is escaped so that only a literal "." matches: left unescaped it matched
# any character, so ordinary text such as "1 a 5" was rewritten to "1.ª 5".
# Raw strings keep \d, \. and \g<1> from being treated as string escape sequences.
superscript = re.sub(r'(\d)\.a', r'\g<1>.ª', derogar)

In [65]:
# Save the cleaned text. The `with` block already closes the file on exit,
# so no separate close() call is needed afterwards.
with open('spanish_cleaned.txt', 'w') as Writer:
    Writer.write(superscript)

# Structure cleaned text into separate files and folders

In [66]:
# Load the cleaned text; `with` closes the file handle instead of leaving it open.
with open('spanish_cleaned.txt', 'r') as cleaned_file:
    cleaned_text = cleaned_file.read()

In [67]:
# Cut the cleaned text at every occurrence of the marker 'LIBRO'.
# Item 0 is whatever precedes the first marker (it holds the 'Título Preliminar');
# each later item is the text that follows one marker, i.e. the content of a LIBRO.
libro_marker = 'LIBRO'
libro = cleaned_text.split(libro_marker)

In [68]:
# libro[0] begins with unwanted information, so keep the piece that follows the
# first 'TÍTULO'. split() drops the marker itself, hence it is put back in front.
titulo_marker = 'TÍTULO'
dict0 = libro[0].split(titulo_marker)
titulopreliminar = titulo_marker + dict0[1]

In [69]:
# Separate libro[4] at 'DISPOSICIÓN FINAL': dict4[0] is the text before the marker,
# dict4[1] the text after it. split() drops the marker, so it is put back in front.
final_marker = 'DISPOSICIÓN FINAL'
dict4 = libro[4].split(final_marker)
disposicionfinal = final_marker + dict4[1]

In [70]:
# Save the 'Título Preliminar' as its own file. `with` closes the file even if the
# write raises, and the explicit UTF-8 encoding keeps the accented text from
# depending on the platform's default encoding.
with open('Título preliminar.txt', 'w', encoding='utf-8') as preliminar:
    preliminar.write(titulopreliminar)

In [72]:
# Save the text starting at 'DISPOSICIÓN FINAL' as its own file. `with` closes the
# file even if the write raises, and the explicit UTF-8 encoding keeps the accented
# text from depending on the platform's default encoding.
with open('Disposiciones final y adicionales.txt', 'w', encoding='utf-8') as final:
    final.write(disposicionfinal)

In [74]:
# Texts still to be split into "Títulos": items 1 to 3 of `libro`, plus the part
# of libro[4] that precedes 'DISPOSICIÓN FINAL' (dict4[0]).
Rest = [libro[i] for i in (1, 2, 3)] + [dict4[0]]

In [76]:
# Automatically split the rest of the text into different "Libros" and "Títulos"
# and save each Título as Libro_<n>/Título_<m>.txt, numbering both from 1.
# The loop variable is named `libro_text` so that it does not overwrite the
# `libro` list that the earlier cells index into.
for libro_number, libro_text in enumerate(Rest, start=1):
    namefolder = 'Libro_' + str(libro_number)
    # exist_ok lets the cell be re-run without failing on folders left by a previous run
    os.makedirs(namefolder, exist_ok=True)
    titulos = libro_text.split('TÍTULO')
    # titulos[0] is whatever precedes the first 'TÍTULO' and is not saved
    for titulo_number, titulo in enumerate(titulos[1:], start=1):
        # split() removed the marker, so put it back at the start of each Título
        text = 'TÍTULO' + titulo
        name = 'Título_' + str(titulo_number)
        # Build the path instead of chdir-ing into the folder, so an error cannot
        # leave the notebook in the wrong working directory; `with` closes the file
        # even if the write raises.
        with open(os.path.join(namefolder, '%s.txt' % name), 'w', encoding='utf-8') as file:
            file.write(text)
print('Done')

Done
