# Scraping of votation outcomes

In [1]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import os
import scipy.stats as stats
import matplotlib.pyplot as plt
import json
import re
import pickle
%matplotlib inline

store = False # when True, each scraped votation table is also written to its own Excel (.xlsx) file

In [2]:
# GET GENERAL VOTATION INFO
# Download the index page that links to every votation session and parse it.

r = requests.get('https://www.admin.ch/ch/f/pore/va/vab_2_2_4_1_2011_2020.html', timeout=30)
r.raise_for_status() # stop here on an HTTP error instead of silently parsing an error page
soup = BeautifulSoup(r.content, "lxml") # parse the page into a BeautifulSoup tree

In [3]:
# GET ALL VOTATION SESSIONS
# Build votation_sessions: the text of each link found in the first table of
# the index page is the key, and the value is a one-element list holding the
# absolute URL of that link (site prefix + the link's relative href).

votation_dates = soup.find_all('table')[0]
votation_sessions = {
    link.text: ['https://www.admin.ch/ch/f/pore/va/' + link['href']]
    for link in votation_dates.find_all('a', href=True)
}

In [4]:
# GET INDIVIDUAL VOTATIONS FOR EACH SESSION
# For every session page, collect the links to its individual votation pages.
# After this cell: votation_sessions[date] == [session_url, [votation_url, ...]]
# and votations_code holds one code string per votation found.

votations_code = [] # list with codes of all votations

for date in votation_sessions: # loop over all votation sessions
    session_url = votation_sessions[date][0]
    # Strip the last 11 characters of the session URL to get the session base
    # (presumably the trailing '/index.html' -- confirm against the site).
    url_session = session_url[:-11]

    # Get its html content
    r = requests.get(session_url, timeout=30)
    r.raise_for_status() # do not scrape links out of an HTTP error page
    soup_date = BeautifulSoup(r.content, "lxml")

    # Get individual votation urls for the given session
    votation_code = []
    for paragraph in soup_date.find_all('p'):
        for link in paragraph.find_all('a', href=True):
            my_url = link['href']
            # Keep only hrefs whose third character is 'c'; the length guard
            # avoids an IndexError on very short hrefs such as '#' or '/'.
            if len(my_url) > 2 and my_url[2] == 'c':
                votation_code.append(url_session + my_url[1:])
                votations_code.append(my_url[5:-5])

    # Store [session url, list of votation urls]. Assigning (rather than
    # appending) keeps the entry in the same shape if this cell is re-run.
    votation_sessions[date] = [session_url, votation_code]

num_votations = len(votations_code) # total number of votations found

In [5]:
def scrapeTable(info):
    """Create a pandas DataFrame from an html table.

    info: parsed table node. It must provide ``find("thead")`` and
        ``find("tbody")``, each returning an object whose ``find_all('td')``
        yields cells with a ``.text`` attribute. The ``thead`` cells are the
        column names; the ``tbody`` cells are the data, read row by row.
    table: pandas DataFrame with one column per header cell and one row per
        group of ``len(columns)`` body cells; every value is the cell text.

    Raises ValueError when the number of body cells is not a multiple of the
    number of columns.
    """

    # Define the names of the data frame fields
    field_names = [field.text for field in info.find("thead").find_all('td')]
    if not field_names:
        # No header cells: nothing to split the body on.
        return pd.DataFrame(columns = field_names)

    # Read the body cells once (not once per column) ...
    cells = [cell.text for cell in info.find("tbody").find_all('td')]
    num_columns = len(field_names)
    if len(cells) % num_columns != 0:
        raise ValueError(
            "table body has %d cells, which is not a multiple of its %d columns"
            % (len(cells), num_columns))

    # ... and cut the flat cell list into rows of num_columns cells each.
    rows = [cells[start:start + num_columns]
            for start in range(0, len(cells), num_columns)]
    table = pd.DataFrame(rows, columns = field_names)

    return table

In [6]:
def storeCSVfile(date,all_tables):
    """Scrape every votation table of one session and optionally save it as Excel.

    Despite the name, tables are written as .xlsx workbooks, not CSV.

    date: key of the global ``votation_sessions`` dictionary; its entry must
        have the list of votation urls at index 1.
    all_tables: dictionary that receives one scraped DataFrame per votation,
        keyed by the integer parsed from the votation url.

    Returns the same ``all_tables`` dictionary, updated in place.

    Side effects:
    - writes the date and every votation title, one per line, to the global
      open text file ``votation_names_text``;
    - creates the folder ``Votation/<date without dots>`` if it is missing;
    - when the global ``store`` flag is true, writes one workbook per votation
      into that folder (table in 'Sheet1', votation title in 'Sheet2').
    """
    votation_names_text.write(date)
    votation_names_text.write('\n')

    # Create a folder for the given date (the date with its dots removed).
    # os.path.join keeps the path valid on every platform, not only Windows.
    date_short = date.replace('.', '')
    newpath = os.path.join("Votation", date_short)
    if not os.path.exists(newpath):
        os.makedirs(newpath)

    session_data = votation_sessions[date]

    for votation in session_data[1]: # For each votation in the session
        # Six characters that precede the last five of the url
        # (presumably 'canNNN' in front of '.html' -- confirm against the site).
        session_code = votation[-11:-5]

        # Scrape the data from url
        session_r = requests.get(votation, timeout=30)
        session_r.raise_for_status() # do not parse an HTTP error page as a result table
        session_soup = BeautifulSoup(session_r.content, "lxml")
        session_info = session_soup.find_all('table')[0]
        session_table = scrapeTable(session_info)
        votation_name = session_soup.find_all('h3')[0].text
        votation_name_df = pd.DataFrame(data = {'Votation Title': [votation_name]})

        votation_names_text.write(votation_name)
        votation_names_text.write('\n')

        # Define path for the given table
        votation_path = os.path.join(newpath, session_code + '.xlsx')

        # Key the table by the numeric part of the code (first three characters dropped)
        all_tables[int(session_code[3:])] = session_table

        # Store the data frame as an Excel workbook in newpath
        if store:
            # The context manager saves and closes the workbook, also on error.
            with pd.ExcelWriter(votation_path) as writer:
                session_table.to_excel(writer,sheet_name = 'Sheet1') # table is stored in Sheet 1
                votation_name_df.to_excel(writer,sheet_name = 'Sheet2') # votation title is stored in Sheet 2

    return all_tables

Build large dataframe with all votation tables.

In [7]:
# Scrape the tables of every session into all_tables.
# Note: we write the names of the votations in a text file for manual inspection and votation type definition.
# storeCSVfile writes to the global name votation_names_text, so the file is
# bound to that name; the with-block closes it even if a session fails midway.
all_tables = {}
with open('votation_names.txt', 'w') as votation_names_text:
    for date in votation_sessions:
        all_tables = storeCSVfile(date,all_tables)

In [8]:
# Column labels applied to every votation table (second level of the column index).
sub_columns = ['Canton', 'Electeurs', 'Votants', '% Particip.', 'Oui', 'Non', '% Oui', '% Non']

# Re-label each votation table with two-level columns (votation code, field)
# so that all tables can be placed side by side in one wide frame.
pds = []
for votation_code in all_tables:
    table = all_tables[votation_code]
    # from_product needs an iterable for every level, so the single votation
    # code is wrapped in a list.
    cols = pd.MultiIndex.from_product([[votation_code], sub_columns])
    # .values replaces DataFrame.as_matrix(), which newer pandas no longer provides.
    pds.append(pd.DataFrame(table.values, columns=cols))

result = pd.concat(pds, axis=1)

# Write the combined table to Votation/all_votations.xlsx (Sheet1).
# The folder is created if missing and the path is built portably.
output_dir = 'Votation'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
result.to_excel(os.path.join(output_dir, 'all_votations.xlsx'), sheet_name = 'Sheet1')

# Last expression of the cell, so the preview is actually displayed.
result.head()

### Assign votation categories

In [9]:
# Manual assignment of every votation number to one major topic.
# Topic key (French) -> list of votation numbers.
votation_subject = {
    'travail': [557, 568, 574, 575, 583, 601],
    'environnement': [555, 556, 566, 569, 577, 578, 588, 591, 595, 599, 602],
    'economie': [585, 587, 589, 594, 598],
    'immigration': [561, 571, 580, 597, 604],
    'education': [559, 563, 593],
    'securite': [554, 572, 582, 584],
    'sante': [562, 565, 573, 579, 581, 586, 592, 603],
    'social': [558, 560, 564, 567, 570, 576, 590, 596, 600],
}

# Meaning of the topic keys:
#       travail:  work, holidays
# environnement:  environment, agriculture, transport, energy
#      economie:  economy
#   immigration:  foreigners affairs
#     education:  education
#      securite:  police, military, justice
#         sante:  health
#        social:  family, retirement, elderly

In [10]:
votation_subject

{'economie': [585, 587, 589, 594, 598],
 'education': [559, 563, 593],
 'environnement': [555, 556, 566, 569, 577, 578, 588, 591, 595, 599, 602],
 'immigration': [561, 571, 580, 597, 604],
 'sante': [562, 565, 573, 579, 581, 586, 592, 603],
 'securite': [554, 572, 582, 584],
 'social': [558, 560, 564, 567, 570, 576, 590, 596, 600],
 'travail': [557, 568, 574, 575, 583, 601]}

In [11]:
# Save the topic assignment to a pickle file.
# The with-block closes the file so the data is flushed to disk right away.
with open("votations_sorted.p", "wb") as pickle_file:
    pickle.dump(votation_subject, pickle_file)

# To load it back later (only unpickle files you created yourself):
# with open("votations_sorted.p", "rb") as pickle_file:
#     votations_sorted = pickle.load(pickle_file)