In [1]:
import urllib.request
from lxml import etree
from bs4 import BeautifulSoup
import requests
import time
import re
import pickle

import os

import pandas as pd
import numpy as np

In [2]:
# Two-letter codes of the language folders under ./html_raw/, in the order in
# which they are processed.
language_codes_list = [
    'BG', 'ES', 'CS', 'DA', 'DE', 'ET',
    'EL', 'EN', 'FR', 'GA', 'HR', 'IT',
    'LV', 'LT', 'HU', 'MT', 'NL', 'PL',
    'PT', 'RO', 'SK', 'SL', 'FI', 'SV',
]

# Maps every two-letter code above to the three-letter code stored with each
# paragraph of a speech.
# NOTE(review): most values are ISO 639-2/B forms ('ger', 'dut', 'cze', ...),
# but 'fra' is the /T form (the /B form is 'fre'). It is left unchanged here
# because already-produced data may rely on it -- confirm before normalising.
language_codes_dict = {
    'SV': 'swe',
    'FI': 'fin',
    'SL': 'slv',
    'SK': 'slo',
    'RO': 'rum',
    'PT': 'por',
    'PL': 'pol',
    'NL': 'dut',
    'MT': 'mlt',
    'HU': 'hun',
    'LT': 'lit',
    'LV': 'lav',
    'IT': 'ita',
    'HR': 'hrv',
    # GA is Irish, whose ISO 639-2 code is 'gle'; 'gla' is Scottish Gaelic.
    'GA': 'gle',
    'FR': 'fra',
    'EN': 'eng',
    'EL': 'gre',
    'ET': 'est',
    'DE': 'ger',
    'DA': 'dan',
    'CS': 'cze',
    'ES': 'spa',
    'BG': 'bul',
}

In [3]:
# For each language folder under ./html_raw/, this cell parses every HTML file,
# walks each agenda item (<table class="doc_box_header">) and turns every table
# row (<td align="left" valign="top">) that holds <p class="contents"> elements
# into one speech record:
#   - the speaker is read from the first bold span of the row's first <p>
#   - the language of each <p> is read from an italic "(XX)" marker, falling back
#     to the previous paragraph's language and finally to the file language
#   - the chair is not extracted yet and is always stored as 'not_available'
# Result per language:
#   {session: [[agenda_title, full_speech, speaker_name, p_count, word_count,
#               chair, [speech_language], [<p> texts]], ...]}
# The dict is pickled to '<LANGUAGE>_unrefined' in the working directory.

# One pre-compiled "(XX)" pattern per known language code, in list order, paired
# with the three-letter code it stands for. Raw strings avoid the invalid
# escape-sequence warning that '\(' triggers in a normal string literal.
language_marker_patterns = [(re.compile(r'\({}\)'.format(code)), language_codes_dict[code])
                            for code in language_codes_list]


def detect_paragraph_language(paragraph, previous_languages, file_language):
    """Return the three-letter language code for one <p> element.

    The italic spans of ``paragraph`` are scanned in document order; the first
    span that contains a "(XX)" marker decides the language (within a span the
    codes are tried in ``language_codes_list`` order). If no span carries a
    marker, the language of the previous paragraph of the same speech is
    reused, and if there is none, the language of the file is assumed.

    Parameters
    ----------
    paragraph : parsed <p> element (BeautifulSoup tag)
    previous_languages : list of str
        Codes already assigned to earlier paragraphs of the same speech.
    file_language : str
        Two-letter code of the folder the file came from.

    Returns
    -------
    str
        A value of ``language_codes_dict``.
    """
    for italic_span in paragraph.find_all('span', attrs={'class': 'italic'}):
        span_text = italic_span.text
        for pattern, language in language_marker_patterns:
            if pattern.search(span_text):
                return language
    if previous_languages:
        return previous_languages[-1]
    return language_codes_dict[file_language]


all_speeches_with_language_change = 0
output_data = {}

for file_language in language_codes_list:
    htmlfiles = os.listdir('./html_raw/{}'.format(file_language))
    print(len(htmlfiles))
    speeches_with_language_change = []
    count_files_processed = 0
    # Start from an empty dict for every language so that the per-language
    # pickle cannot contain sessions left over from a previously processed
    # language. After the loop, output_data holds the last language only.
    output_data = {}

    # for every html file
    for htmlfile in htmlfiles:
        count_files_processed += 1
        # give notice while processing
        if count_files_processed % 10 == 0:
            print('{} Files processed. Continuing...'.format(count_files_processed))

        # Best effort: a file that cannot be read or parsed is reported and
        # skipped, so no stale data from the previous file is stored for it.
        try:
            with open('./html_raw/{}/{}'.format(file_language, htmlfile), 'r', encoding='utf8') as f:
                soup = BeautifulSoup(f, 'html.parser')
        except Exception as error:
            print('Error opening html file!', htmlfile, repr(error))
            continue

        # The session key is the file name without its first 2 and last 8 characters.
        session = htmlfile[2:-8]
        speech_items = []

        #####################################################################
        # ... because no speech goes over an agenda_item border, start here #
        #####################################################################
        for agenda_item in soup.find_all('table', attrs={'class': 'doc_box_header'}):

            # No spill over: the speaker is reset for every agenda item
            speaker_name = ''

            # if agenda_item does not contain any <p>, there are no speeches
            if agenda_item.find('p') is None:
                continue

            ##########################################
            # grab agenda_title or label as untitled #
            ##########################################
            title_cell = agenda_item.find('td', attrs={'class': 'doc_title'})
            # A missing title cell is treated like an empty title.
            agenda_title = title_cell.text if title_cell is not None else ''
            if agenda_title == '':
                agenda_title = 'untitled_agenda_block'

            ################################################################
            # Get all [<p>s] in all <td>s                                  #
            # <td>s are mostly only one speaker and mostly only one speech #
            # <p>s contain the spoken text AND background AND language     #
            ################################################################
            table_rows = [cell.find_all('p', attrs={'class': 'contents'})
                          for cell in agenda_item.find_all('td', attrs={'align': 'left', 'valign': 'top'})]

            ####################################################
            # big loop through all table rows (= grouped <p>s) #
            ####################################################
            for p_group in table_rows:

                # if there is not a single <p> in the table row then skip
                if len(p_group) == 0:
                    continue

                ###########
                # Speaker #
                ###########
                # The name is expected in the first <p>, wrapped in span.bold.
                first_bold = p_group[0].find('span', attrs={'class': 'bold'})

                if first_bold is not None:
                    speaker_name = first_bold.text.strip().replace('\xa0', ' ')
                elif speaker_name != '':
                    # NOTE(review): a row without a bold name is skipped entirely
                    # once a speaker is known for this agenda item, so its text is
                    # not stored. If such rows are continuations of the previous
                    # speech, confirm whether the text should be kept instead.
                    continue
                else:
                    speaker_name = 'unknown'

                ############
                # Language #
                ############
                # One entry per <p>, in paragraph order.
                speech_language = []
                for p in p_group:
                    speech_language.append(
                        detect_paragraph_language(p, speech_language, file_language))

                # remember all speeches where the language changes mid-speech.
                if len(set(speech_language)) != 1:
                    speeches_with_language_change.append([speech_language, session, p_group[0].text])
                    # 2008-09-22:
                    # (NL) I shall continue in Dutch
                    # 2008-09-23
                    # (NL) Mr President, terrorism was not invented on

                ########
                # Text #
                ########
                text = [p.text.replace('\xa0', ' ') for p in p_group]

                ##########
                # Output #
                ##########
                # [agenda_title, full_speech, speaker_name, p_count, word_count, chair, [speech_language], [<p>s]]
                full_speech = ''.join(text)
                word_count = len(re.findall(r'\w+', full_speech))
                chair = 'not_available'  # chair extraction is not implemented
                speech_items.append([agenda_title, full_speech, speaker_name, len(text), word_count, chair, speech_language, text])

        output_data[session] = speech_items

    print('Number of speeches with changing language:')
    print(len(speeches_with_language_change))
    all_speeches_with_language_change += len(speeches_with_language_change)

    with open('{}_unrefined'.format(file_language), 'wb') as f:
        pickle.dump(output_data, f, protocol=5)

print(all_speeches_with_language_change)

97
10 Files processed. Continuing...
20 Files processed. Continuing...
30 Files processed. Continuing...
40 Files processed. Continuing...
50 Files processed. Continuing...
60 Files processed. Continuing...
70 Files processed. Continuing...
80 Files processed. Continuing...
90 Files processed. Continuing...
Number of speeches with changing language:
139
97
10 Files processed. Continuing...
20 Files processed. Continuing...
30 Files processed. Continuing...
40 Files processed. Continuing...
50 Files processed. Continuing...
60 Files processed. Continuing...
70 Files processed. Continuing...
80 Files processed. Continuing...
90 Files processed. Continuing...
Number of speeches with changing language:
79
97
10 Files processed. Continuing...
20 Files processed. Continuing...
30 Files processed. Continuing...
40 Files processed. Continuing...
50 Files processed. Continuing...
60 Files processed. Continuing...
70 Files processed. Continuing...
80 Files processed. Continuing...
90 Files proce

In [None]:
# Print every session key currently held in output_data, one per line.
for session_key in output_data.keys():
    print(session_key)

In [None]:
# Number of sessions currently held in output_data.
len(output_data)

In [None]:
# Show the list of speech records stored under the session key '2008-09-01'.
output_data['2008-09-01']

In [None]:
import pandas as pd

In [None]:
# Build a DataFrame with one row per speech record of session '2008-09-01'
# and display it.
session_records = output_data['2008-09-01']
df = pd.DataFrame(session_records)
df

In [None]:
# Name the eight columns in the order in which the fields of a speech record
# are stored.
speech_columns = [
    'agenda_item',
    'text',
    'speaker',
    'number_p',
    'number_w',
    'chair',
    'language_ps',
    'ps',
]
df.columns = speech_columns

In [None]:
# Display the DataFrame again, now with the column names assigned above it in the notebook.
df