In [1]:
'''
This module provides the necessary functions for translating
data from original .txt files into python objects, namely dictionaries
and lists.
'''

from collections import defaultdict

import locale
import datetime


# process_time() parses month names with strptime('%B'), which matches the
# month names of the active locale, so the Portuguese locale is selected here
# at import time.
# NOTE(review): setlocale raises locale.Error when the requested locale is not
# available on the host; the usual spelling is 'pt_PT.UTF-8' - confirm that
# 'pt_pt.UTF-8' is accepted on every platform this runs on.
locale.setlocale(locale.LC_ALL, 'pt_pt.UTF-8')


import re

In [2]:
def get_data(file):
    """Read a transcript file into a list of one-item lists.

    Every line of ``file`` is stripped of leading and trailing whitespace
    and wrapped in its own list, so the result has the shape
    ``[[line_1], [line_2], ...]``. Blank lines are kept as ``['']``.

    Args:
        file: Path of the UTF-8 encoded text file to read.

    Returns:
        A list with one ``[stripped_line]`` entry per line of the file.
    """
    # The context manager guarantees the handle is closed, even when
    # decoding fails part-way through the file.
    with open(file, 'r', encoding="utf8") as handle:
        # A stripped line never contains '\n', so each line maps to exactly
        # one string; it is wrapped in a list because callers read entry[0].
        return [[line.strip()] for line in handle]

In [3]:
def read_text(lst):
    """Group transcript lines into interventions.

    ``lst`` holds one single-item list per line of the file. A line that
    contains ``": —"`` or ``": -"`` opens a new intervention; every other
    non-empty line is appended to the intervention that is currently open.
    Lines found before the first opening marker are dropped.

    Returns a list of single-item lists, one per intervention, preceded by
    the placeholder ``['']``.
    """
    grouped = [['']]
    started = False

    for row in lst:
        text = row[0]
        if text == '':
            continue

        if ": —" in text or ": -" in text:
            # A speaker marker starts a fresh intervention.
            grouped.append([text + ' '])
            started = True
        elif started:
            # Continuation lines are glued to the most recent intervention;
            # anything before the first marker is document preamble.
            grouped[-1][0] += text + '  '

    return grouped

In [4]:
def correct_interventions(lst):
    """Split naive interventions that hold more than one speaker marker.

    ``lst`` is a list of single-item lists, one string per naive
    intervention. For each string, the text in front of the first speaker
    marker (``': —'`` or ``': -'``) is searched for its last sentence-ending
    punctuation; when one is found, the text up to it is appended to the
    most recently emitted intervention. The remainder is then emitted as one
    intervention per marker counted in the string.

    Returns a new list of single-item lists whose first element is the
    placeholder ``['']``.
    """

    # Output list; index count_interv always points at its last element
    # because the counter is incremented on every append below.
    interventions_corrected = [['']]
    count_interv = 0
    
    # I look at each naive intervention, and count how many interventions I should have.
    # NOTE(review): indx is never used inside the loop.
    for indx, interv in enumerate(lst):
        nr_new_interv = interv[0].count(': —') + interv[0].count(': -')
        new_interv = interv
        
        
        # For each of these interventions, I look at whether there is content
        # that should belong to a previous intervention.
        
        # Offset of the first speaker marker (a missing marker counts as 0).
        # NOTE(review): the two find() results are added, so when both ': —'
        # and ': -' occur in the string this is the sum of their offsets, not
        # the smaller one - confirm this is intended.
        beg_new_interv = max(new_interv[0].find(': —'), 0) + max(new_interv[0].find(': -'), 0)

        # I check to see the last punctuation mark.
        # NOTE(review): when beg_new_interv < 5 the slice end is negative and
        # therefore counts from the end of the string.
        excl = new_interv[0][0 : beg_new_interv - 5].rfind('!')
        intr = new_interv[0][0 : beg_new_interv - 5].rfind('?')
        retic = new_interv[0][0 : beg_new_interv - 5].rfind('…')
        
        # For a full stop, I need to make sure I do not incorrectly see "Sr.".
        # formal_address is the right-most offset of 'Sr.', 'S.', 'Sra.' (or of
        # 'Orador', searched up to 3 characters past the marker offset), and
        # the full stop is only searched in the text before that offset. When
        # none of them is found formal_address is 0, the searched slice is
        # empty and stop is -1.
        
        formal_address = max(new_interv[0][0 : beg_new_interv - 5].rfind('Sr.'), \
                             new_interv[0][0 : beg_new_interv - 5].rfind('S.'), \
                             new_interv[0][0 : beg_new_interv - 5].rfind('Sra.'), \
                             new_interv[0][0 : beg_new_interv + 3].rfind('Orador'), 0)
        stop = new_interv[0][0 : formal_address].rfind('.')

        # Right-most punctuation found; 0 means "nothing to hand back".
        end_previous_interv = max(excl, intr, retic, stop, 0)

        # NOTE(review): the character at end_previous_interv ends up in both
        # slices (end + 1 on the left, end on the right).
        previous_interv = [new_interv[0][0 : end_previous_interv + 1]]
        new_interv = [new_interv[0][end_previous_interv : ]]
    
        # Leading text that ends a sentence is attached to the last
        # intervention already emitted.
        if end_previous_interv > 0:
            interventions_corrected[count_interv][0] += (previous_interv[0] + '  ')        


        # Now, I go through the intervention to separate potential
        # multiple new interventions.
        local_interv = new_interv
        for i in range(nr_new_interv):
            
            # Last marker: whatever is left is emitted as one intervention.
            if i == nr_new_interv - 1:
                interventions_corrected.append(local_interv)
                count_interv += 1

            # Earlier markers: cut off the current intervention at the last
            # punctuation found before the following marker.
            elif i < nr_new_interv - 1:

                # Same summed-offset computation as beg_new_interv above.
                beg_current_interv = max(local_interv[0].find(': —'), 0) + max(local_interv[0].find(': -'), 0)
                # I sum the beg_current_interv because my search starts only after beg_current_interv.
                # NOTE(review): the search slice starts at beg_current_interv + 3
                # but only beg_current_interv is added back - confirm the
                # 3-character difference is intended.
                beg_following_intervs = max(local_interv[0][beg_current_interv + 3 : ].find(': —'), 0) \
                    + max(local_interv[0][beg_current_interv + 3 : ].find(': -'), 0) + beg_current_interv

                # I check to see the last punctuation mark.
                excl = local_interv[0][0 : beg_following_intervs - 5].rfind('!')
                intr = local_interv[0][0 : beg_following_intervs - 5].rfind('?')
                retic = local_interv[0][0 : beg_following_intervs - 5].rfind('…')
                
                # For a full stop, I need to make sure I do not incorrectly see "Sr.".
        
                formal_address = max(local_interv[0][0 : beg_following_intervs - 5].rfind('Sr.'), \
                    local_interv[0][0 : beg_following_intervs - 5].rfind('S.'), \
                    local_interv[0][0 : beg_following_intervs - 5].rfind('Sra.'), \
                    local_interv[0][0 : beg_following_intervs + 3].rfind('Orador'), 0)
                
                stop = local_interv[0][0 : formal_address].rfind('.')                
                
                end_current_interv = max(excl, intr, retic, stop, 0)

                # Unlike previous_interv above, this slice excludes the
                # punctuation character itself; it starts the remainder.
                current_interv = [local_interv[0][0 : end_current_interv]]

                interventions_corrected.append(current_interv)
                count_interv += 1

                # Continue with the text that follows the emitted intervention.
                local_interv = [local_interv[0][end_current_interv : ]]

    return interventions_corrected

In [5]:
# Here I try to fix classify_text so that it is
# not so strict when identifying the speakers
# (replacing interv[0][0:5] with "autor" in interv[0][:beg]).


# Party labels as they appear in the transcripts (including recurring typos)
# mapped to the canonical label. Labels that are not listed are kept as-is.
_PARTY_ALIASES = {
    'PEV': 'Os Verdes',
    'Os Vedes': 'Os Verdes',
    'OS Verdes': 'Os Verdes',
    'Os verdes': 'Os Verdes',
    's Verdes': 'Os Verdes',
    'CDS-P': 'CDS-PP',
    'CSD-PP': 'CDS-PP',
    'CDS': 'CDS-PP',
    'B E': 'BE',
    'Bloco de Esquerda': 'BE',
    'PD': 'PSD',
    'N insc': 'N insc.',
    'CP': 'PCP',
    'CDS-': 'CDS-PP',
    ' PS': 'PS',
    'CDS.-PP': 'CDS-PP',
    'CD-PP': 'CDS-PP',
    'PDS': 'PSD',
    'os Verdes': 'Os Verdes',
    'PCPP': 'PCP',
    'N. insc.': 'N insc.',
    'N Insc': 'N insc.',
    'CDSPP': 'CDS-PP',
    'O Verdes': 'Os Verdes',
    'OPCP': 'PCP',
    'PCO': 'PCP',
    'CDP-PP': 'CDS-PP',
    'CDS-PPP': 'CDS-PP',
    'PCPC': 'PCP',
    'PSDF': 'PSD',
    'CDD-PP': 'CDS-PP',
    'CSDS-PP': 'CDS-PP',
    'Os Verde': 'Os Verdes',
    'OPS Verdes': 'Os Verdes',
    'Ninsc.': 'N insc.',
    'N Insc.': 'N insc.',
    'Livre': 'L',
    ' PCP': 'PCP',
    'CEDS-PP': 'CDS-PP',
    'PCVP': 'PCP',
    '. insc.': 'N insc.',
    'PC P': 'PCP',
    'CDS-PS': 'CDS-PP',
}


def _intervention_text(interventions, position):
    """Return the text stored at ``position``, or '' when it is out of range."""
    if 0 <= position < len(interventions) and interventions[position]:
        return interventions[position][0]
    return ''


def _is_crowd_reaction(text):
    """Tell whether ``text`` mentions a collective reaction (Vozes/Protestos)."""
    return "Vozes" in text or "Protestos" in text


def classify_text(lst):
    """Group the speech of every intervention by who delivered it.

    Args:
        lst: List of single-item lists. Each string is one intervention: a
            speaker header, a ':' and the speech itself.

    Returns:
        A ``defaultdict(list)`` that maps a label to the list of speeches
        recorded under it. The label is ``"Government"`` for ministers and
        secretaries of state, the canonical party name taken from the
        ``(PARTY):`` part of the header, or ``''`` when the party of an
        "Orador" could not be resolved. Interventions by the chair
        ("Presidente" / "Secretári...") and interventions without a
        recognised header are not stored.
    """
    classified_speech = defaultdict(list)

    # Index of the last chair intervention that gave someone the floor.
    floor_index = 0
    # Intervention index -> label of the speaker identified at that index.
    party_at = {}

    for index, interv in enumerate(lst):
        text = interv[0]
        # The speech starts 4 characters after the first ':' (skipping
        # ": — "); everything before that point is treated as the header.
        intervention_start = text.find(":") + 4
        header = text[:intervention_start]
        speech = text[intervention_start:]

        # "Ministr" covers ministro/a, primeiro-ministro and
        # vice-primeiro-ministro; secretaries of state are government too.
        if "Ministr" in header or ("ecretári" in header and "Estado" in header):
            classified_speech["Government"].append(speech)
            party_at[index] = "Government"

        elif "Presidente" in header or "Secretári" in header:
            # Move the tracker only when the chair gives the floor and the
            # next intervention is not an "Orador" who is merely resuming.
            # A missing next intervention counts as "not an Orador".
            if " a palavra" in text \
                    and "Orador" not in _intervention_text(lst, index + 1):
                floor_index = index

        # For "O Orador"/"A Oradora" the party is the one recorded where the
        # speaker was first introduced, right after the chair gave the floor.
        elif "Orador" in header:
            political_party = party_at.get(floor_index + 1, '')
            after_chair = _intervention_text(lst, floor_index + 1)
            before_chair = _intervention_text(lst, floor_index - 1)

            # Sometimes crowd reactions sit between the chair and the speaker.
            if political_party == '' and _is_crowd_reaction(after_chair):
                second_after = _intervention_text(lst, floor_index + 2)
                if "Orador" in second_after and "Presidente" in before_chair:
                    # The speaker already had the floor before the chair spoke.
                    political_party = party_at.get(floor_index - 1, '')
                elif _is_crowd_reaction(second_after):
                    # Two reactions in a row: the speaker comes one later.
                    political_party = party_at.get(floor_index + 3, '')
                else:
                    political_party = party_at.get(floor_index + 2, '')

            # Still empty: the speaker was introduced before the chair spoke.
            if political_party == '' and "Orador" in after_chair \
                    and "Presidente" in before_chair:
                political_party = party_at.get(floor_index - 1, '')

            corrected_political_party = _PARTY_ALIASES.get(political_party, political_party)
            classified_speech[corrected_political_party].append(speech)

        # Header of the form "Name (PARTY):" introduces a new speaker.
        elif "):" in header:
            political_party = text[text.find('(') + 1 : text.find(')')]
            corrected_political_party = _PARTY_ALIASES.get(political_party, political_party)
            party_at[index] = corrected_political_party
            classified_speech[corrected_political_party].append(speech)

    return classified_speech

In [6]:
def get_interventions(file):
    """Parse one transcript file into speeches grouped by label.

    Chains the module's four steps on ``file``: read the lines, group them
    into interventions, split interventions that were merged, and classify
    each one. Returns the mapping produced by ``classify_text`` (label ->
    list of speech strings).
    """
    lines = get_data(file)
    naive_interventions = read_text(lines)
    split_interventions = correct_interventions(naive_interventions)
    return classify_text(split_interventions)

In [7]:
def process_time(lst):
    """Extract the session date from the first non-empty line.

    Args:
        lst: List of single-item lists, one per line of the transcript.

    Returns:
        The date formatted as ``'YYYY-MM-DD'``, or ``''`` when ``lst`` has
        no non-empty line.

    Raises:
        ValueError: If the first non-empty line holds no
            ``<day> de <month> de <year>`` date, or if the month name is not
            a month name of the active locale.
    """
    # Day, month name and 4-digit year joined by "de". The whitespace is
    # optional because some transcripts lost it ("16dejunhode2016"); the
    # look-behind keeps the day from starting inside a longer number.
    date_pattern = re.compile(
        r"(?<!\d)(\d{1,2})\s*de\s*([^\W\d_]+?)\s*de\s*(\d{4})")

    for line in lst:
        text = line[0]
        if text == "":
            continue

        # Only the first non-empty line is inspected.
        found = date_pattern.search(text)
        if found is None:
            raise ValueError("no session date found in line: %r" % text)

        day, month, year = found.groups()
        # %B is matched against the month names of the active locale.
        parsed = datetime.datetime.strptime(
            '{} de {} de {}'.format(day, month, year), '%d de %B de %Y')
        return parsed.strftime('%Y-%m-%d')

    return ""

In [8]:
def get_time(file):
    """Return the date of the Parliament session stored in ``file``.

    Reads the file with ``get_data`` and hands its lines to
    ``process_time``; the result is whatever string ``process_time``
    returns for them.
    """
    return process_time(get_data(file))