# Scraping from FT

#### Import of needed packages

In [1]:
from bs4 import BeautifulSoup as bs
import re 
import os
import lxml
import xml.etree.ElementTree as ET
from lxml import objectify
from datetime import datetime
import numpy as np 
import pandas as pd 
import requests
import ftplib
from io import BytesIO
import tqdm

##### Creation of empty lists

In [2]:
# Working lists filled while walking the FTP server and downloading files.
dirs, sub_dir, files = [], [], []
file_name, paths, raw_xml_list = [], [], []

# Column lists for the final DataFrame; the extract_* functions append to these.
session, meetingnumber, date = [], [], []
location, group, content, path = [], [], [], []

#### Log on to the FTP server and get a list of directories containing transcribed meetings in Folketinget

In [3]:
# Host name of the FTP server.
ftp_url = "oda.ft.dk"
# Remote base directory whose listing is read into `dirs`; sub-directory and
# file paths are built relative to it.
ftp_dir = "/ODAXML/Referat/samling/"

In [4]:
# Open an anonymous FTP session in passive mode and move to the base directory.
ftp = ftplib.FTP(ftp_url)
ftp.login("anonymous", "wpg345@alumni.ku.dk")
ftp.set_pasv(True)
ftp.cwd(ftp_dir)

# Empty the list first so that re-running this cell does not append a second
# copy of the listing (which would duplicate every sub-directory downstream).
dirs.clear()
ftp.dir(dirs.append)  # one raw listing line per entry in ftp_dir

#### Create a list of sub-directories

In [5]:
# All four lists are rebuilt in place (same list objects, fresh contents) so
# that re-running this cell does not duplicate entries.

# Sub-directories: the directory name is taken as the last five characters of
# each raw listing line in `dirs`.
sub_dir[:] = [ftp_dir + line[-5:] + '/' for line in dirs]

# Raw listing lines of every sub-directory, collected in order.
files.clear()
for directory in sub_dir:
    ftp.cwd(directory)
    ftp.dir(files.append)

# File name: everything from column 39 of each listing line onwards.
file_name[:] = [line[39:] for line in files]

# Full remote path: base directory + first five characters of the file name
# (used as the sub-directory name) + the file name itself.
paths[:] = [ftp_dir + name[0:5] + '/' + name for name in file_name]

#### All transcribed meetings are transferred from ftp://oda.ft.dk and the content is written to raw_xml_list
* we use tqdm for monitoring the progress

# The following code reads all 1489 meeting resumes from ft.dk - don't run!

In [6]:
#for i in tqdm.tqdm(range(0, len(paths))):
#    r = BytesIO()
#    ftp.retrbinary(f"RETR {paths[i]}", r.write)
#    raw_xml_list.append(r.getvalue())
#    r.close()

# The following code reads 10 meeting resumes from ft.dk

In [7]:
# Number of transcripts to download in this sample run.
n_sample = 10

# Start from an empty list: extract_path pairs raw_xml_list[i] with paths[i],
# so a re-run must not append a second batch behind the first one.
raw_xml_list.clear()

# Slicing (rather than indexing 0..9) avoids an IndexError when fewer than
# n_sample files were found on the server.
for remote_path in tqdm.tqdm(paths[:n_sample]):
    # The context manager closes the buffer even if the transfer raises.
    with BytesIO() as buffer:
        ftp.retrbinary(f"RETR {remote_path}", buffer.write)
        raw_xml_list.append(buffer.getvalue())

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:02<00:00,  4.37it/s]


# Define metadata function

In [8]:
def extract_metadata(xml):
    """Record the meeting-level metadata of one transcript.

    Parses ``xml`` with BeautifulSoup's ``lxml`` parser and reads the text of
    the first ``dateofsitting`` (first 10 characters only), ``meetingnumber``,
    ``parliamentarysession``, ``parliamentarygroup`` and ``location`` elements.
    Each value is then appended to the matching module-level list (``date``,
    ``meetingnumber``, ``session``, ``group``, ``location``) once per
    ``dagsordenpunkt`` element found in the document.

    Returns None; the result is the mutation of the module-level lists.
    """
    doc = bs(xml, 'lxml')

    def first_text(tag):
        # Text of the first element with this tag name.
        return doc.find(tag).text

    sitting_date = first_text("dateofsitting")[:10]
    meeting_no = first_text("meetingnumber")
    session_id = first_text("parliamentarysession")
    group_name = first_text("parliamentarygroup")
    place = first_text("location")

    # The same metadata is repeated once for every agenda item in the meeting.
    n_items = len(doc.find_all("dagsordenpunkt"))

    targets = (
        (date, sitting_date),
        (meetingnumber, meeting_no),
        (session, session_id),
        (group, group_name),
        (location, place),
    )
    for column, value in targets:
        column.extend([value] * n_items)

    return None

# Define content function 

In [9]:
def extract_content(xml):
    """Append the text of every agenda item in one transcript to ``content``.

    Parses ``xml`` with BeautifulSoup's ``lxml`` parser. For each
    ``dagsordenpunkt`` element, the text of all its ``linea`` descendants is
    extracted (inner text pieces separated by a space), joined with single
    spaces, stripped of leading/trailing whitespace, and appended to the
    module-level ``content`` list as one string.

    Returns None.
    """
    doc = bs(xml, 'lxml')

    for agenda_item in doc.find_all("dagsordenpunkt"):
        # One cleaned text fragment per raw <linea> element.
        fragments = [
            line.get_text(separator=' ')
            for line in agenda_item.find_all("linea")
        ]
        content.append(' '.join(fragments).strip())

    return None

# Define path function

In [10]:
# Agenda-item counts recorded by extract_path, one entry per processed
# document (kept at module level so they can be inspected afterwards).
repeats = []

def extract_path(xml_list):
    """Append the source path of each transcript once per agenda item.

    For the i-th document in ``xml_list`` the number of ``dagsordenpunkt``
    elements is counted and ``paths[i]`` is appended that many times to the
    module-level ``path`` list, so ``xml_list`` must be in the same order as
    ``paths``.

    The counts used for pairing are local to each call. Previously they were
    read back from the ever-growing module-level ``repeats`` list starting at
    index 0, so a second call reused the counts of the first call. The counts
    are still appended to ``repeats`` for inspection.

    ``xml_list`` only needs to be iterable (a list, or a tqdm wrapper around one).

    Returns None.
    """
    counts = []
    for xml in xml_list:
        soup = bs(xml, 'lxml')
        counts.append(len(soup.find_all("dagsordenpunkt")))

    repeats.extend(counts)

    for i, n_items in enumerate(counts):
        # list * n repeats the single path n times before extending
        path.extend([paths[i]] * n_items)

    return None

Bemærk vi er interesseret i len(xml_list) modsat de næste funktioner, hvor det ikke behøver spille nogen rolle. Derfor vælger vi ikke at integrere extract_path ind i extract_metadata. 

# Full run

#### Extraction of meeting metadata

In [11]:
# Fill the metadata column lists one transcript at a time; tqdm shows progress.
# Re-running this cell appends the same rows again.
for xml in tqdm.tqdm(raw_xml_list): 
    extract_metadata(xml)

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:07<00:00,  1.36it/s]


#### Extraction of meeting content

In [12]:
# Fill the `content` list one transcript at a time; tqdm shows progress.
# Re-running this cell appends the same rows again.
for xml in tqdm.tqdm(raw_xml_list): 
    extract_content(xml)

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:07<00:00,  1.30it/s]


#### Extraction of path to original meeting resume 

In [13]:
# extract_path loops over the documents itself, so the whole (tqdm-wrapped) list is passed in.
extract_path(tqdm.tqdm(raw_xml_list))

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:07<00:00,  1.37it/s]


# Make DataFrame containing needed data for analysis

In [14]:
# Build the frame from a column-name -> list mapping. Unlike stacking the
# lists as rows and transposing, this raises a ValueError when the column
# lists have different lengths (e.g. after re-running only one extraction
# cell) instead of silently padding the shorter ones with missing values.
ft = pd.DataFrame({
    'session': session,
    'meetingnumber': meetingnumber,
    'date': date,
    'location': location,
    'group': group,
    'content': content,
    'path': path,
})

# Changing the date column from text to datetime format

In [None]:
# Parse the date strings (first 10 characters of <dateofsitting>) into datetimes,
# which the `.dt` accessor in the year filter requires.
ft['date'] = pd.to_datetime(ft['date'])

# Only keeping data where date >= 2012

In [None]:
# Keep only rows dated 2012 or later. Rebinds `ft`, so the unfiltered frame is gone afterwards.
# NOTE(review): the sample rows in the saved output are dated 2009-2010, so this
# filter may leave the 10-file sample empty - confirm before relying on it.
ft = ft[ft['date'].dt.year >= 2012]

# Save DataFrame to `ft_sample.csv` in your local working directory

In [15]:
# Don't run on GitHub
# Writes the frame to ft_sample.csv in the current working directory, without the index column.
ft.to_csv('ft_sample.csv', index = False)

In [16]:
#ft

Unnamed: 0,session,meetingnumber,date,location,group,content,path
0,20091,100,2010-05-31,Folketingssalen,Folketinget,Meddelelser fra formanden Tredje næstformand ...,/ODAXML/Referat/samling/20091/20091_M100_helem...
1,20091,100,2010-05-31,Folketingssalen,Folketinget,Det første punkt på dagsordenen er: 1) 1. beha...,/ODAXML/Referat/samling/20091/20091_M100_helem...
2,20091,100,2010-05-31,Folketingssalen,Folketinget,Det næste punkt på dagsordenen er: 2) 1. behan...,/ODAXML/Referat/samling/20091/20091_M100_helem...
3,20091,100,2010-05-31,Folketingssalen,Folketinget,Det næste punkt på dagsordenen er: 3) 1. behan...,/ODAXML/Referat/samling/20091/20091_M100_helem...
4,20091,100,2010-05-31,Folketingssalen,Folketinget,Det næste punkt på dagsordenen er: 4) 1. behan...,/ODAXML/Referat/samling/20091/20091_M100_helem...
...,...,...,...,...,...,...,...
113,20091,11,2009-11-04,Folketingssalen,Folketinget,Meddelelser fra formanden Formanden Mødet er å...,/ODAXML/Referat/samling/20091/20091_M11_helemo...
114,20091,11,2009-11-04,Folketingssalen,Folketinget,Det første punkt på dagsordenen er: 1) Spørgsm...,/ODAXML/Referat/samling/20091/20091_M11_helemo...
115,20091,11,2009-11-04,Folketingssalen,Folketinget,Det næste punkt på dagsordenen er: \n \n 2) Be...,/ODAXML/Referat/samling/20091/20091_M11_helemo...
116,20091,11,2009-11-04,Folketingssalen,Folketinget,Det sidste punkt på dagsordenen er: 3) 1. beha...,/ODAXML/Referat/samling/20091/20091_M11_helemo...
