### Using regular expressions

In [1]:
from urllib.request import urlopen

url = "https://www.plenainclusion.org/publicaciones/buscador/?_sfm_descargable_lectura_facil=1&sf_paged=1"
page = urlopen(url)
html_bytes = page.read()
html = html_bytes.decode("utf-8")

In [2]:
import re

pattern = '<a href=".*?/">'
match_results = re.search(pattern, html, re.IGNORECASE)
url = match_results.group()
url = re.sub('".*?>', "", re.sub('<.*?"', "", url))

print(url)

https://www.plenainclusion.org/publicaciones/buscador/manual-para-participar-en-una-mesa-electoral-lectura-facil/


### Using BeautifulSoup

In [3]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
from tqdm import tqdm

def get_soup(url):
    page = urlopen(url)
    html = page.read().decode("utf-8")
    return BeautifulSoup(html, "html.parser")

In [4]:
%%time
# Getting the url to the content subpages

url = "https://www.plenainclusion.org/publicaciones/buscador/?_sfm_descargable_lectura_facil=1&sf_paged=1"
soup = get_soup(url)

i = 1
content_file_urls = []

while len(soup.find_all('h3', class_="card__title"))>0:
    
    # Get the html content
    url = "https://www.plenainclusion.org/publicaciones/buscador/?_sfm_descargable_lectura_facil=1&sf_paged=" + str(i)
    soup = get_soup(url)
    
    # Parse the url with beautifulsoup
    for card in soup.find_all('h3', class_="card__title"):
        content_url = card.find_all(['a'])[0]['href']
        content_soup = get_soup(content_url)
        content_file_urls.append(content_soup.find_all('a', class_="btn btn_principal btn_principal--naranja")[0]['href'])

    i += 1

Wall time: 6min 27s


In [5]:
from tqdm import tqdm

# Saving files list in memory
try:
    if content_file_urls:
        with open(r'C:\\Users\\fernando.rubio.perez\\Jupyter projects\\TFM\\files_in_lectura_facil\\plenainclusion\\content_list.txt', 'w') as fp:
            for item in tqdm(content_file_urls):
                # write each item on a new line
                fp.write("%s\n" % item)
except Exception as e:
    print(e)
        
# Reading files list from memory
try:
    with open('C:\\Users\\fernando.rubio.perez\\Jupyter projects\\TFM\\files_in_lectura_facil\\plenainclusion\\content_list.txt') as f:
        content_file_urls = f.read().splitlines()
except Exception as e:
    print(e)

100%|████████████████████████████████████████████████████████████████████████████| 679/679 [00:00<00:00, 339605.58it/s]


In [6]:
# 567 out of the 679 pages with 'lectura fácil' contains usable .pdf files

counter_pdf = 0

for url in content_file_urls:
    if '.pdf' in url:
        counter_pdf += 1

print(counter_pdf)

567


In [7]:
!pip install pip-system-certs
import requests
import os

# Saving pdf content from urls into memory
path_to_local_files = "C:\\Users\\fernando.rubio.perez\\Jupyter projects\\TFM\\files_in_lectura_facil\\plenainclusion\\pdfs"
pdfs_with_errors = []

for url in tqdm(content_file_urls):
    if '.pdf' in url:
        try:
            response = requests.get(url)
            file_local_path = os.path.join(path_to_local_files, url.split('/')[-1])
            with open(file_local_path, 'wb') as f:
                f.write(response.content)

        except Exception as e:
            print(e)
            pdfs_with_errors.append(url)

Looking in indexes: https://pypi.org/simple, https://OelQQDud:****@alm.accenture.com/nexus/repository/ISCP_Components_Release/simple/


  0%|▏                                                                                 | 2/679 [00:01<06:46,  1.67it/s]

('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


  2%|█▍                                                                               | 12/679 [00:20<13:20,  1.20s/it]

('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


  3%|██▌                                                                              | 22/679 [00:43<40:58,  3.74s/it]

('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


  5%|████                                                                             | 34/679 [01:21<26:55,  2.50s/it]

('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


  5%|████▏                                                                            | 35/679 [01:21<21:27,  2.00s/it]

('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


  7%|█████▎                                                                           | 45/679 [01:31<06:51,  1.54it/s]

('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


  7%|█████▍                                                                           | 46/679 [01:32<06:26,  1.64it/s]

('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


  7%|█████▋                                                                           | 48/679 [01:33<05:59,  1.76it/s]

('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


  7%|█████▊                                                                           | 49/679 [01:33<05:28,  1.92it/s]

('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


  9%|██████▉                                                                          | 58/679 [01:42<06:39,  1.55it/s]

('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


  9%|███████                                                                          | 59/679 [01:43<05:42,  1.81it/s]

('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


  9%|███████▏                                                                         | 60/679 [01:43<05:00,  2.06it/s]

('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


  9%|███████▎                                                                         | 61/679 [01:43<04:26,  2.32it/s]

('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


 15%|████████████▏                                                                   | 103/679 [02:40<09:08,  1.05it/s]

('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


 18%|██████████████▌                                                                 | 124/679 [02:53<03:54,  2.36it/s]

('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


 19%|███████████████                                                                 | 128/679 [02:53<02:09,  4.25it/s]

('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


 20%|███████████████▊                                                                | 134/679 [02:55<01:53,  4.82it/s]

('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


 20%|███████████████▉                                                                | 135/679 [02:55<01:53,  4.78it/s]

('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


 24%|██████████████████▊                                                             | 160/679 [03:23<13:22,  1.55s/it]

[Errno 22] Invalid argument: 'C:\\Users\\fernando.rubio.perez\\Jupyter projects\\TFM\\files_in_lectura_facil\\plenainclusion\\pdfs\\GUIA.%20La%20consulta%20de%20ginecologia.LF.pdf?fbclid=IwAR37Nd5dWzHu3mUiNUadwvRYPgDhntiOL19Vm5jp8xRfcxS2UXH8VWFIz5k'


 24%|██████████████████▉                                                             | 161/679 [03:25<13:47,  1.60s/it]

[Errno 22] Invalid argument: 'C:\\Users\\fernando.rubio.perez\\Jupyter projects\\TFM\\files_in_lectura_facil\\plenainclusion\\pdfs\\GUIA.%20Cuidar%20y%20conocer%20tu%20sexualidad.%20LF.pdf?fbclid=IwAR3tb6PBWYakuDKx2zLou5m1zUmIzvTdJY8J6UU5sO7szC7UCJZMjOrnyGU'


 24%|███████████████████                                                             | 162/679 [03:27<16:00,  1.86s/it]

[Errno 22] Invalid argument: 'C:\\Users\\fernando.rubio.perez\\Jupyter projects\\TFM\\files_in_lectura_facil\\plenainclusion\\pdfs\\GUIA.%20Menstruacion%20y%20menopausia.LF.pdf?fbclid=IwAR2v8KaA4AOK7kOb36BvjvYFOJB-i2CbudcEcsPJNppchmoJlKTI8wBfsUc'


 24%|███████████████████▍                                                            | 165/679 [03:32<12:03,  1.41s/it]

('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


 25%|███████████████████▋                                                            | 167/679 [03:32<08:18,  1.03it/s]

('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


 26%|████████████████████▉                                                           | 178/679 [03:52<11:54,  1.43s/it]

('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


 27%|█████████████████████▋                                                          | 184/679 [03:57<06:15,  1.32it/s]

('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


 28%|██████████████████████                                                          | 187/679 [04:00<06:50,  1.20it/s]

('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


 38%|██████████████████████████████▍                                                 | 258/679 [05:26<36:58,  5.27s/it]

HTTPSConnectionPool(host='www.saludcastillayleon.es', port=443): Max retries exceeded with url: /AulaPacientes/es/guias-aula/guias-lectura-facil.ficheros/1881402-Lectura%20F%C3%A1cil%20Gu%C3%ADa%20para%20personas%20con%20asma.pdf (Caused by SSLError(SSLError(1, '[SSL: TLSV1_ALERT_INTERNAL_ERROR] tlsv1 alert internal error (_ssl.c:1129)')))


 39%|███████████████████████████████                                                 | 264/679 [05:30<09:31,  1.38s/it]

('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


 39%|███████████████████████████████▎                                                | 266/679 [05:31<06:58,  1.01s/it]

('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


 40%|███████████████████████████████▉                                                | 271/679 [05:37<07:15,  1.07s/it]

('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


 40%|████████████████████████████████                                                | 272/679 [05:38<06:00,  1.13it/s]

('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


 46%|████████████████████████████████████▉                                           | 313/679 [06:33<05:53,  1.04it/s]

('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


 49%|██████████████████████████████████████▉                                         | 330/679 [06:56<07:42,  1.33s/it]

('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


 55%|███████████████████████████████████████████▋                                    | 371/679 [07:35<04:59,  1.03it/s]

('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


 63%|██████████████████████████████████████████████████▊                             | 431/679 [09:14<01:22,  2.99it/s]

('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


 64%|███████████████████████████████████████████████████▎                            | 436/679 [09:18<02:23,  1.70it/s]

('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


 66%|█████████████████████████████████████████████████████▏                          | 451/679 [09:46<05:58,  1.57s/it]

('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


 67%|█████████████████████████████████████████████████████▊                          | 457/679 [09:59<06:00,  1.62s/it]

('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


 68%|██████████████████████████████████████████████████████▎                         | 461/679 [10:08<06:30,  1.79s/it]

('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


 68%|██████████████████████████████████████████████████████▍                         | 462/679 [10:08<04:52,  1.35s/it]

('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


 69%|██████████████████████████████████████████████████████▉                         | 466/679 [10:12<03:18,  1.07it/s]

('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


 69%|███████████████████████████████████████████████████████▏                        | 468/679 [10:13<02:55,  1.20it/s]

('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


 72%|█████████████████████████████████████████████████████████▍                      | 487/679 [10:38<04:11,  1.31s/it]

('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


 72%|█████████████████████████████████████████████████████████▋                      | 490/679 [10:39<02:01,  1.56it/s]

('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


 85%|████████████████████████████████████████████████████████████████████            | 578/679 [13:18<01:30,  1.11it/s]

('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


 85%|████████████████████████████████████████████████████████████████████▏           | 579/679 [13:18<01:15,  1.32it/s]

('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


 87%|█████████████████████████████████████████████████████████████████████▎          | 588/679 [13:35<02:08,  1.41s/it]

('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


 93%|██████████████████████████████████████████████████████████████████████████▋     | 634/679 [14:27<01:28,  1.97s/it]

('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


 94%|███████████████████████████████████████████████████████████████████████████▌    | 641/679 [14:31<00:21,  1.77it/s]

('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


 97%|█████████████████████████████████████████████████████████████████████████████▌  | 658/679 [15:00<00:36,  1.75s/it]

('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


 98%|██████████████████████████████████████████████████████████████████████████████▋ | 668/679 [15:20<00:12,  1.10s/it]

('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


100%|████████████████████████████████████████████████████████████████████████████████| 679/679 [15:43<00:00,  1.39s/it]


In [8]:
# 53 out of the 567 pdf files had some kind of problem when loading them

print(len(pdfs_with_errors))

with open(r'C:\\Users\\fernando.rubio.perez\\Jupyter projects\\TFM\\files_in_lectura_facil\\plenainclusion\\pdfs_with_errors.txt', 'w') as fp:
    for item in tqdm(pdfs_with_errors):
        # write each item on a new line
        fp.write("%s\n" % item)

53


100%|██████████████████████████████████████████████████████████████████████████████████████████| 53/53 [00:00<?, ?it/s]
