In [92]:
from bs4 import BeautifulSoup as bs
import re 
import os
import lxml
import xml.etree.ElementTree as ET
from lxml import objectify
from datetime import datetime
import numpy as np 
import pandas as pd 

### Prepare to load data

In [3]:
'''
with open(file) as f:
    parsed = objectify.parse(f)

root = parsed.getroot()
'''

'\nwith open(file) as f:\n    parsed = objectify.parse(f)\n\nroot = parsed.getroot()\n'

##### Read xml file 

In [4]:
file = "20091_M100_helemoedet.xml"
tree = ET.parse(file)
root = tree.getroot()

In [5]:
#root[0].tag
#root[1].tag

##### Subelements / children

In [6]:
# children of the root only 
root_children = []

for child in root:
    root_children.append((child.tag, child.attrib))

In [7]:
# All children in file
all_children = [elem.tag for elem in root.iter()]

In [8]:
unique_children = set(all_children)
print(unique_children)

{'MetaFTAgendaItem', 'EdixiDocLocation', 'Taler', 'ParliamentaryGroup', 'MetaSpeechSegment', 'LastModified', 'OratorLastName', 'Rubrica', 'ShortTitle', 'TaleType', 'ItemNo', 'DagsordenPunkt', 'TekstGruppe', 'GroupNameShort', 'TitelGruppe', 'EndDateTime', 'TaleSegment', 'OratorFirstName', 'TalerTitel', 'Exitus', 'UnderTitel', 'Titel', 'FTCaseStage', 'AudioFileFolder', 'ParliamentarySession', 'MetaMeeting', 'Location', 'Linea', 'DateOfSitting', 'FTCaseType', 'EdixiStatus', 'FTCaseNumber', 'StartDateTime', 'PunktTekst', 'Dokument', 'Char', 'MeetingNumber', 'FTCase', 'OratorRole', 'Aktivitet', 'MetaSpeakerMP', 'DagsordenPlan', 'Tale', 'PreTekst'}


In [9]:
# entire xml file as string
xml = ET.tostring(root, encoding='utf8').decode('utf8')

##### Soup

In [10]:
soup = bs(xml, 'lxml')

In [11]:
dagsordenpunkter = soup.find_all("dagsordenpunkt") # punkterne med tags 
len(dagsordenpunkter)

8

In [38]:
# dagsordenpunkter[1]

### Remove datetime tags

In [13]:
'''
for tag in soup.find_all('startdatetime'):
    tag.replace('')
    
for tag in soup.find_all('enddatetime'):
    tag.replace('')

for tag in soup.find_all('lastmodified'):
    tag.replace('')
    
for tag in soup.find_all('edixistatus'):
    tag.replace('')
'''

"\nfor tag in soup.find_all('startdatetime'):\n    tag.replace('')\n    \nfor tag in soup.find_all('enddatetime'):\n    tag.replace('')\n\nfor tag in soup.find_all('lastmodified'):\n    tag.replace('')\n    \nfor tag in soup.find_all('edixistatus'):\n    tag.replace('')\n"

In [14]:
'''
soup.find_all('startdatetime').decompose()
soup.find_all('enddatetime').decompose()
soup.find_all('lastmodified').decompose()
'''

"\nsoup.find_all('startdatetime').decompose()\nsoup.find_all('enddatetime').decompose()\nsoup.find_all('lastmodified').decompose()\n"

In [15]:
soup.startdatetime.clear()
soup.enddatetime.clear()
soup.lastmodified.clear()

### Dagsordenpunkter uden tags

In [16]:
punkter = []  # punkterne uden tags 

for dagsordenpunkt in range(0, len(dagsordenpunkter)):
    punkter.append(dagsordenpunkter[dagsordenpunkt].get_text(separator = ' '))

In [39]:
# punkter[1]

### Meta data

In [35]:
dato = bs(xml).find("dateofsitting").text[:10]
moedenr = bs(xml).find("meetingnumber").text
samling = bs(xml).find("parliamentarysession").text
hvem = bs(xml).find("parliamentarygroup").text
lokation = bs(xml).find("location").text

In [37]:
lokation

'Folketingssalen'

### List of filepaths

In [72]:
def getListOfFiles(dirName):
    listOfFile = os.listdir(dirName)
    allFiles = list()
    for entry in listOfFile:
        fullPath = os.path.join(dirName, entry)
        if os.path.isdir(fullPath):
            allFiles = allFiles + getListOfFiles(fullPath)
        else:
            allFiles.append(fullPath)
                
    return allFiles 

#getListOfFiles('all')

In [78]:
path_list = getListOfFiles('sample')

In [83]:
path_list

['sample/20171_M11_helemoedet.xml',
 'sample/20211_M60_helemoedet.xml',
 'sample/20171_M46_helemoedet.xml']

# Extract metadata function

In [95]:
def extract_metadata(file): 
    tree = ET.parse(file)
    root = tree.getroot()
    xml = ET.tostring(root, encoding='utf8').decode('utf8')
    
    soup = bs(xml, 'lxml')
    
    d = soup.find("dateofsitting").text[:10]
    m = soup.find("meetingnumber").text
    s = soup.find("parliamentarysession").text
    g = soup.find("parliamentarygroup").text
    l = soup.find("location").text
    
    date.append(d)
    meetingnumber.append(m)
    session.append(s)
    group.append(g)
    location.append(l)

In [96]:
# empty lists (columns)
date = []
content = []
meetingnumber = []
session = []
group = []
location = []

# Content function 

In [None]:
dagsordenpunkter = soup.find_all("dagsordenpunkt") # punkterne med tags 
len(dagsordenpunkter)

# Apply metadata function to files

In [97]:
for path in path_list: 
    extract_metadata(path)

In [99]:
session

['20171', '20211', '20171']

# Make DataFrame

In [94]:
df = pd.DataFrame([date, content, meetingnumber, session, group, location]).transpose()
df.columns = ['date', 'content', 'meetingnumber', 'session', 'group', 'location']
df

Unnamed: 0,date,content,meetingnumber,session,group,location
0,2017-10-27,,11,20171,Folketinget,Folketingssalen
1,2022-02-10,,60,20211,Folketinget,Folketingssalen
2,2018-01-18,,46,20171,Folketinget,Folketingssalen
