In [3]:
import datetime
import re
from pathlib import Path
from timeit import default_timer as timer

import pandas as pd
import pdfplumber
import requests

### Functions

In [3]:
def download_pdf(url, timeout=60):
    """
    Download the PDF found at `url` and store it as ./tmp/temp.pdf.

    Parameters
    ----------
    url : str
        Address of the PDF to download.
    timeout : float, optional
        Seconds `requests` waits for the server before giving up.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status. Without this check the
        body of an error page would be written to disk as if it were a PDF.
    """
    # The context manager closes the session (and its connections) when done.
    with requests.Session() as session_obj:
        response = session_obj.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=timeout)
        response.raise_for_status()
        file = response.content

    # Putting the file in a temporary directory, creating it on first use.
    filename = Path('./tmp/temp.pdf')
    filename.parent.mkdir(parents=True, exist_ok=True)
    filename.write_bytes(file)

In [1]:
def pdf2text(legislature):
    """
    Extract the text of ./tmp/temp.pdf, page by page, as a single string.

    Parameters
    ----------
    legislature : int
        Legislature number; it selects the page layout used for cropping.
        * 10 and above: one crop per page.
        * 6 to 9: each page is read as a left half followed by a right half.
        * 5 and below: unsupported, a warning is printed and '' is returned.

    Returns
    -------
    str
        The text of every page, each extracted area followed by a newline.
    """
    with pdfplumber.open('./tmp/temp.pdf') as pdf:
        chunks = []  # Extracted pieces, joined once at the end.
        start = timer()

        # NOTE(review): legislature 10 was not covered by either intended range
        # (`> 10` / `> 5 and < 10`). It is handled here as single-column because
        # the notebook states it processes legislatures X to XIV — confirm.
        if legislature >= 10:
            for page in pdf.pages:
                # Crop the area of the page corresponding to the text itself.
                body = page.crop((0, 0.12 * float(page.height), 0.90 * float(page.width), 0.93 * float(page.height)))

                # extract_text() may yield None for a page without text.
                chunks.append((body.extract_text() or '') + '\n')

        # Chained comparison: the previous `> 5 & legislature < 10` evaluated
        # `5 & legislature` first, letting legislatures 2, 3 and 10 through.
        elif 5 < legislature < 10:
            for page in pdf.pages:
                # Crop the two columns of the page separately.
                left_half = page.crop((0, 0.08 * float(page.height), 0.5 * float(page.width), 0.93 * float(page.height)))
                right_half = page.crop((0.50 * float(page.width), 0.08 * float(page.height), 0.95 * float(page.width), 0.93 * float(page.height)))

                # Left column first, then right column.
                chunks.append((left_half.extract_text() or '') + '\n')
                chunks.append((right_half.extract_text() or '') + '\n')

        else:
            print('Warning: Invalid legislature. The code does not support PDF scrapping for legislatures 1 until 5.')

    end = timer()
    print(f'Time for PDF2Text extract_text(): {end - start} seconds')
    return ''.join(chunks)

In [2]:
def cleantext(text):
    """
    Normalise the raw text extracted from a PDF.

    Steps, in order:
    1. Replace EN SPACE characters (U+2002) with a regular space.
    2. Collapse every run of two or more spaces into a single space.
    3. Re-join words hyphenated across a line break ("extraor-\\ndinario"
       becomes "extraordinario").

    Parameters
    ----------
    text : str
        Text as returned by the PDF extraction.

    Returns
    -------
    str
        The cleaned text.
    """
    # Sanitize "\u2002" by changing them for " ".
    text = text.replace('\u2002', ' ')

    # Eliminate repeated spaces. A single str.replace('  ', ' ') pass would
    # leave a double space behind for every run of three or more.
    text = re.sub(r' {2,}', ' ', text)

    # Eliminate hyphens that split a word across two lines. The letter before
    # the hyphen is only looked at (lookbehind), never consumed; matching it
    # directly would delete the last letter of the first half of the word.
    regex = r"(?<=[a-zA-ZñáéíóúüàèìòùçÑÁÉÍÓÚÜÀÈÌÒÙÇ])-\n"
    text = re.sub(regex, '', text)

    return text

In [313]:
def _tidy_fragment(fragment):
    '''Replace newlines with spaces, collapse repeated spaces and trim spaces at both ends.'''
    return re.sub(' {2,}', ' ', fragment.replace('\n', ' ')).strip(' ')


def text2dict(cleaned):
    '''
    Converts the clean text obtained from the PDF into a dictionary with structure: {topic} : {{speaker} : {text}}.
    Cleans several errors such as double spaces and new lines. The dictionary omits the titles of the topics but keeps their id.

    Parameters
    ----------
    cleaned : str
        Text of a whole session.

    Returns
    -------
    dict
        Maps each topic id captured by the topic pattern (three digits, a slash, six digits)
        to a dict {speaker heading: text}. The speaker heading is the full matched heading,
        including its trailing colon. Several interventions by the same speaker within a
        topic are joined with a newline. Returns {} when no topic title is found.
    '''

    # Split by topic excluding the titles of the topics (capitalized words starting with a hyphen, ending with the file number).
    # The pattern has one capturing group (the topic id), so the result alternates: [summary, id, text, id, text, ...].
    # Both patterns are deliberately kept exactly as they were: in a raw string '\\n' would mean something else.
    splitted_by_topic = re.split(
        '[—–\-A-Z /\\n\n,.\d?¿:;!¡ÑÇÁÉÍÓÚÜÀÈÌÒÙ)(]{30,1000}\([NúÚmMeErRoOdDxXpPiInNtT \n\\n]{21,26}([\d]{3}/[\d]{6})\).[\n\\n]{0,4}', cleaned)

    # Remove the first item, which is always the summary of the session.
    splitted_by_topic.pop(0)

    # Split for each new speaker. The heading is captured, so a topic text becomes
    # [text before the first speaker, heading, text, heading, text, ...]; an id stays as [id].
    splitted_topic_speaker = [re.split(
        '([ ]{0,3}[ElLa]{2} señor[\w]{0,1} [A-ZÑÁÉÍÓÚÜÀÈÌÒÙÇ\n\-, ]{2,150}[() A-Za-zñáéíóúüàèìòùç\-\n,]{0,50}:)', line) 
        for line in splitted_by_topic]

    # Group the (heading, text) fragments by topic id. A topic can be tackled twice in the
    # same session (debate & voting), so fragments of a repeated id are appended.
    fragments_by_topic = {}
    for i in range(0, len(splitted_topic_speaker), 2):
        topic_id = splitted_topic_speaker[i][0]
        # Drop element 0: it is whatever precedes the first speaker heading.
        fragments = splitted_topic_speaker[i + 1][1:]
        fragments_by_topic.setdefault(topic_id, []).extend(fragments)

    # Clean every fragment (including the last one of each topic) and merge
    # interventions by speaker: {topic} : {{speaker} : {text}}
    dic = {}
    for topic_id, fragments in fragments_by_topic.items():
        tidy = [_tidy_fragment(fragment) for fragment in fragments]
        results = {}

        # Even positions hold speaker headings, odd positions hold what they said.
        for speaker, speech in zip(tidy[0::2], tidy[1::2]):
            if speaker in results:
                results[speaker] = results[speaker] + '\n' + speech
            else:
                results[speaker] = speech

        dic[topic_id] = results

    return dic

In [450]:
def _legislature_number(value):
    """
    Return a legislature identifier as an int.

    Accepts integers, digit strings ('12') and Roman numerals ('XII').
    Raises ValueError for anything else.
    """
    label = str(value).strip().upper()
    if label.isdigit():
        return int(label)

    numerals = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
    if not label or any(symbol not in numerals for symbol in label):
        raise ValueError(f'Cannot interpret legislature {value!r}')

    # Read right to left: a symbol smaller than the largest seen so far is subtracted (IV, IX).
    total = 0
    highest = 0
    for symbol in reversed(label):
        amount = numerals[symbol]
        if amount < highest:
            total -= amount
        else:
            total += amount
            highest = amount
    return total


def obtain_texts(data, legislature=None):
    """
    Return, for every row of `data`, the text of the speaker's intervention.

    For each new value of 'enlace_pdf' the PDF is downloaded, converted to text,
    cleaned and parsed into {topic: {speaker: text}}; consecutive rows sharing
    the same URL reuse the parsed result. The text is then looked up using the
    first 10 characters of 'numero_expediente' and the speaker's surname (the
    part of 'orador' before the first comma, matched case-insensitively).

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns 'orador', 'numero_expediente' and 'enlace_pdf'. Rows are
        read with data.loc[row] for row in range(len(data)), so the index has to be
        0..n-1 (e.g. after reset_index(drop=True)).
    legislature : int or str, optional
        Legislature passed to pdf2text(). When omitted it is read from the
        'legislatura' column of each row (integer or Roman numeral such as 'XIV').

    Returns
    -------
    list of str
        One entry per row; '' when no intervention was found.
    """
    timestamp = datetime.datetime.now()
    texts = []
    previous_url = ''
    processed = {}  # Defined up front so the lookup below never hits an unbound name.

    for row in range(len(data)):
        record = data.loc[row]

        # The speaker's surname, topic id and url of the intervention.
        surname = record['orador'].split(',')[0]
        topic = record['numero_expediente'][0:10]
        url = record['enlace_pdf']

        if url != previous_url:
            # Perform all the necessary steps.
            start = timer()
            download_pdf(url)
            end = timer()
            print(f'Time to download pdf row {row}: {end - start} seconds')

            # pdf2text() requires the legislature number to pick the page layout.
            source = legislature if legislature is not None else record['legislatura']
            start = timer()
            pdf_text = pdf2text(_legislature_number(source))
            end = timer()
            print(f'Time for pdf2text row {row}: {end - start} seconds')

            start = timer()
            cleaned = cleantext(pdf_text)
            end = timer()
            print(f'Time for cleantext row {row}: {end - start} seconds')

            start = timer()
            processed = text2dict(cleaned)
            end = timer()
            print(f'Time for text2dict row {row}: {end - start} seconds')
            previous_url = url

        count = 0
        text = ''

        if topic in processed:
            for item, speech in processed[topic].items():
                if surname.lower() in item.lower():
                    # When several headings match, the last one wins (reported below).
                    text = speech
                    count += 1

        # Making sure there are no duplicates in the interventions.
        if count > 1: print(f'A speaker appeared two times in row {row}')
        if row % 10 == 0:
            print(f'{row} time: {datetime.datetime.now() - timestamp}')
            timestamp = datetime.datetime.now()

        texts.append(text)

    return texts

### Other code

In [511]:
# Load the interventions from data/interventions/XII.csv, keep only the rows of
# 'Pleno' sessions without missing values, and renumber them from 0.
data = pd.read_csv('data/interventions/XII.csv', index_col=0)
pleno_xii = (
    data[data['nombre_sesion'].eq('Pleno')]
    .dropna()
    .reset_index(drop=True)
)

In [513]:
# Fetch the intervention text for every row of pleno_xii (one list entry per row)
# and store the result in a new 'text' column.
texts = obtain_texts(pleno_xii)
pleno_xii['text'] = texts

Time to download pdf row 0: 0.1796572399980505 seconds
Time for PDF2Text extract_text(): 0.4395731610056828 seconds
Time for pdf2text row 0: 0.5286972219982999 seconds
Time for cleantext row 0: 9.603900252841413e-05 seconds
Time for text2dict row 0: 0.0008215289999498054 seconds
0 time: 0:00:00.714166
Time to download pdf row 1: 0.27401795200421475 seconds
Time for PDF2Text extract_text(): 44.25690431900148 seconds
Time for pdf2text row 1: 44.43229746900033 seconds
Time for cleantext row 1: 0.004007583003840409 seconds
Time for text2dict row 1: 0.07898362899868516 seconds
10 time: 0:00:44.800409
Time to download pdf row 19: 0.6929426489950856 seconds
Time for PDF2Text extract_text(): 4.665154009999242 seconds
Time for pdf2text row 19: 4.705754716000229 seconds
Time for cleantext row 19: 0.0004319990039221011 seconds
Time for text2dict row 19: 0.0154170099995099 seconds
20 time: 0:00:05.421231
30 time: 0:00:00.002597
Time to download pdf row 34: 0.24726191500667483 seconds
Time for PDF2

In [516]:
# Write pleno_xii to data/interventions/plenos-xii.csv.
pleno_xii.to_csv('data/interventions/plenos-xii.csv')

In [526]:
# NOTE(review): `pleno_xiv` is not assigned in any cell visible here; presumably it was
# built like `pleno_xii` from the legislature XIV data — confirm before Restart & Run All.
pleno_xiv

Unnamed: 0,legislatura,fecha,objeto_iniciativa,numero_expediente,autores,nombre_sesion,orador,enlace_pdf,text
0,XIV,2020-01-04,Propuesta de candidato a la Presidencia del Go...,080/000001/0000,S.M. El Rey Don Felipe VI,Pleno,"Borràs Castanyer, Laura (GPlu)",https://www.congreso.es:443/public_oficiales/L...,"Presidenta, señorías, empieza el año y volvemo..."
1,XIV,2020-01-04,Propuesta de candidato a la Presidencia del Go...,080/000001/0000,S.M. El Rey Don Felipe VI,Pleno,"Baldoví Roda, Joan (GPlu)",https://www.congreso.es:443/public_oficiales/L...,"Moltes gràcies, senyora presidenta. Hay alguno..."
2,XIV,2020-01-04,Propuesta de candidato a la Presidencia del Go...,080/000001/0000,S.M. El Rey Don Felipe VI,Pleno,"Quevedo Iturbe, Pedro (GPlu)",https://www.congreso.es:443/public_oficiales/L...,"Señora presidenta, señorías, señor candidato a..."
3,XIV,2020-01-04,Propuesta de candidato a la Presidencia del Go...,080/000001/0000,S.M. El Rey Don Felipe VI,Pleno,"Oramas González-Moro, Ana María (GPlu)",https://www.congreso.es:443/public_oficiales/L...,"Gracias, señora presidenta. Señorías, como dij..."
4,XIV,2020-01-04,Propuesta de candidato a la Presidencia del Go...,080/000001/0000,S.M. El Rey Don Felipe VI,Pleno,"Sabanés Nadal, Inés (GPlu)",https://www.congreso.es:443/public_oficiales/L...,"Gracias, presidenta. Señorías, a pesar de la e..."
...,...,...,...,...,...,...,...,...,...
12528,XIV,2023-01-24,"Real Decreto-ley 1/2023, de 10 de enero, de me...",130/000093/0000,Gobierno,Pleno,"Díaz Gómez, Guillermo (GCs)",https://www.congreso.es:443/public_oficiales/L...,"Gracias, presidenta. Hoy traen aquí tarde, muy..."
12529,XIV,2023-01-24,"Real Decreto-ley 1/2023, de 10 de enero, de me...",130/000093/0000,Gobierno,Pleno,"Sagastizabal Unzetabarrenetxea, Idoia (GV (EAJ...",https://www.congreso.es:443/public_oficiales/L...,"Gracias, presidenta. Eguerdi on. Nos encontram..."
12530,XIV,2023-01-24,"Real Decreto-ley 1/2023, de 10 de enero, de me...",130/000093/0000,Gobierno,Pleno,"Aizpurua Arzallus, Mertxe (GEH Bildu)",https://www.congreso.es:443/public_oficiales/L...,"Arratsalde on, buenas tardes. La vicepresident..."
12531,XIV,2023-01-24,"Solicitud de comparecencia, a petición propia,...",210/000122/0000,Gobierno,Pleno,"Baldoví Roda, Joan (GPlu)",https://www.congreso.es:443/public_oficiales/L...,


In [521]:
# Rows of pleno_xiv whose 'text' contains 'derechos'.
# NOTE(review): `pleno_xiv` is not assigned in any cell visible here — confirm where it comes from.
pleno_xiv.loc[pleno_xiv['text'].str.contains('derechos')]

Unnamed: 0,legislatura,fecha,objeto_iniciativa,numero_expediente,autores,nombre_sesion,orador,enlace_pdf,text
0,XIV,2020-01-04,Propuesta de candidato a la Presidencia del Go...,080/000001/0000,S.M. El Rey Don Felipe VI,Pleno,"Borràs Castanyer, Laura (GPlu)",https://www.congreso.es:443/public_oficiales/L...,"Presidenta, señorías, empieza el año y volvemo..."
1,XIV,2020-01-04,Propuesta de candidato a la Presidencia del Go...,080/000001/0000,S.M. El Rey Don Felipe VI,Pleno,"Baldoví Roda, Joan (GPlu)",https://www.congreso.es:443/public_oficiales/L...,"Moltes gràcies, senyora presidenta. Hay alguno..."
2,XIV,2020-01-04,Propuesta de candidato a la Presidencia del Go...,080/000001/0000,S.M. El Rey Don Felipe VI,Pleno,"Quevedo Iturbe, Pedro (GPlu)",https://www.congreso.es:443/public_oficiales/L...,"Señora presidenta, señorías, señor candidato a..."
4,XIV,2020-01-04,Propuesta de candidato a la Presidencia del Go...,080/000001/0000,S.M. El Rey Don Felipe VI,Pleno,"Sabanés Nadal, Inés (GPlu)",https://www.congreso.es:443/public_oficiales/L...,"Gracias, presidenta. Señorías, a pesar de la e..."
7,XIV,2020-01-04,Propuesta de candidato a la Presidencia del Go...,080/000001/0000,S.M. El Rey Don Felipe VI,Pleno,"Asens Llodrà, Jaume (GCUP-EC-GC)",https://www.congreso.es:443/public_oficiales/L...,"Presidenta, señorías, hoy se ha hablado mucho ..."
...,...,...,...,...,...,...,...,...,...
12512,XIV,2023-01-24,"Real Decreto-ley 1/2023, de 10 de enero, de me...",130/000093/0000,Gobierno,Pleno,"Martínez Oblanca, Isidro Manuel (GMx)",https://www.congreso.es:443/public_oficiales/L...,"Muchas gracias, señora presidenta. Buenas tard..."
12515,XIV,2023-01-24,"Real Decreto-ley 20/2022, de 27 de diciembre, ...",130/000092/0000,Gobierno,Pleno,"Bolaños García, Félix",https://www.congreso.es:443/public_oficiales/L...,Buenos días a todos y a todas. Subo a esta tri...
12517,XIV,2023-01-24,"Real Decreto-ley 1/2023, de 10 de enero, de me...",130/000093/0000,Gobierno,Pleno,"Lamuà Estañol, Marc (GS)",https://www.congreso.es:443/public_oficiales/L...,"Moltes gràcies, presidenta. Se lo prometió en ..."
12529,XIV,2023-01-24,"Real Decreto-ley 1/2023, de 10 de enero, de me...",130/000093/0000,Gobierno,Pleno,"Sagastizabal Unzetabarrenetxea, Idoia (GV (EAJ...",https://www.congreso.es:443/public_oficiales/L...,"Gracias, presidenta. Eguerdi on. Nos encontram..."


In [524]:
# Inspect the single row of pleno_xiv labelled 12530.
# NOTE(review): `pleno_xiv` is not assigned in any cell visible here — confirm where it comes from.
pleno_xiv.loc[12530]

legislatura                                                        XIV
fecha                                                       2023-01-24
objeto_iniciativa    Real Decreto-ley 1/2023, de 10 de enero, de me...
numero_expediente                                      130/000093/0000
autores                                                       Gobierno
nombre_sesion                                                    Pleno
orador                           Aizpurua Arzallus, Mertxe (GEH Bildu)
enlace_pdf           https://www.congreso.es:443/public_oficiales/L...
text                 Arratsalde on, buenas tardes. La vicepresident...
Name: 12530, dtype: object