In [None]:
from math import prod
import requests
import urllib.request
from bs4 import BeautifulSoup

import json
import re

import pandas as pd
import numpy as np

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

from time import sleep
import datetime
import pathlib
import glob
import sys, os, traceback
import logging

from IPython.display import clear_output

In [None]:
def update_history_file(df, filename = 'temp.csv'):
    """Return the rows stored in ``filename`` followed by the rows of ``df``.

    Despite the name, nothing is written here: the CSV is only read, and the
    combined frame is handed back with a fresh 0..n-1 index.
    """
    frames = [pd.read_csv(filename), df]
    return pd.concat(frames, ignore_index=True)

In [None]:
def save_history_file(df, filename='temp'):
    """Write ``df`` to ``<filename>.csv`` and ``<filename>.xlsx``.

    The CSV is written without the index; the workbook is written with
    ``to_excel``'s default settings.
    """
    csv_target = f'{filename}.csv'
    xlsx_target = f'{filename}.xlsx'
    df.to_csv(csv_target, index=False)
    df.to_excel(xlsx_target)

In [None]:
class vote:
    """Plain container for the data collected about a single vote.

    Every attribute exists from construction on, so ``getJson()`` and
    ``printInfo()`` work on a partly filled object instead of raising
    ``AttributeError``. The counters default to 0, ``deputiesVotes`` to an
    empty list and the remaining fields to ``None``.
    """

    def __init__(self):
        self.date = None
        self.sitting = None
        self.title = None
        self.voteNumber = None
        self.voted = 0
        self.agreed = 0
        self.against = 0
        # NOTE(review): "stopped" presumably counts abstentions - confirm.
        self.stopped = 0
        self.notVoted = 0
        # A new list per instance; never share one mutable default.
        self.deputiesVotes = []

    def setValues(self, date, sitting, title, voteNumber, voted, agreed, against, stopped, notVoted, deputiesVotes):
        """Set all fields in one call."""
        self.date = date
        self.sitting = sitting
        self.title = title
        self.voteNumber = voteNumber
        self.voted = voted
        self.agreed = agreed
        self.against = against
        self.stopped = stopped
        self.notVoted = notVoted
        self.deputiesVotes = deputiesVotes

    def getJson(self):
        """Return the fields as a dict keyed by attribute name."""
        return {
            'date' : self.date,
            'sitting' : self.sitting,
            'title' : self.title,
            'voteNumber': self.voteNumber,
            'voted' : self.voted,
            'agreed' : self.agreed,
            'against' : self.against,
            'stopped' : self.stopped,
            'notVoted' : self.notVoted,
            'deputiesVotes' : self.deputiesVotes
        }

    def setDate(self, date):
        self.date = date

    def setSitting(self, sitting):
        self.sitting = sitting

    def setTitle(self, title):
        self.title = title

    def setVoteNumber(self, voteNumber):
        self.voteNumber = voteNumber

    def setVoted(self, voted):
        self.voted = voted

    def setAgreed(self, agreed):
        self.agreed = agreed

    def setAgainst(self, against):
        self.against = against

    def setStopped(self, stopped):
        self.stopped = stopped

    def setNotVoted(self, notVoted):
        self.notVoted = notVoted

    def setDeputiesVotes(self, deputiesVotes):
        self.deputiesVotes = deputiesVotes

    def getUniqueRow(self):
        """Return ``(date as 'YYYY-MM-DD', sitting, title)``.

        ``date`` must already be set to an object with ``strftime``.
        """
        return (self.date.strftime('%Y-%m-%d'), self.sitting, self.title)

    def printInfo(self):
        """Print a one-line summary of the vote and its turnout numbers."""
        print(self.date, self.sitting, "Nr: ",self.voteNumber, "Voted: {}, not voted: {}, total: {}".format(self.voted, self.notVoted, self.voted + self.notVoted))

In [None]:
class MainPage:
    """Page object for the overview page that links to each day's votes.

    ``driver`` is a Selenium WebDriver (or compatible object) and ``url`` is
    the address loaded by ``open()``.
    """

    def __init__(self, driver, url):
        self.driver = driver
        self.url = url
        self.main_table = None

    def open(self):
        """Load ``url`` in the driver and return ``self`` for chaining."""
        self.driver.get(self.url)
        return self

    def set_main_table(self):
        """Remember the first ``<tbody>`` of the page as the main table.

        Raises ``IndexError`` when the page contains no ``<tbody>``.
        """
        self.main_table = self.driver.find_elements(By.TAG_NAME, 'tbody')[0]

    def _table(self):
        """Return the main table, locating it first if that has not happened yet."""
        if self.main_table is None:
            self.set_main_table()
        return self.main_table

    def get_all_votings_links(self):
        """Return every ``<a>`` element found inside the main table."""
        return self._table().find_elements(By.TAG_NAME, 'a')

    def get_all_votings(self):
        """Return a dict mapping each link's text to its ``href``.

        Links that share the same text collapse into one entry (last wins).
        """
        return {
            link.text: link.get_attribute('href')
            for link in self.get_all_votings_links()
        }

    def get_votings_count(self):
        """Return the text of every element with class ``right`` in the main table.

        NOTE(review): callers pair this list positionally with the links from
        ``get_all_votings()`` - confirm both always have the same length.
        """
        return [cell.text for cell in self._table().find_elements(By.CLASS_NAME, 'right')]

    def quit(self):
        """End the WebDriver session.

        ``driver.quit()`` is used rather than ``driver.close()``: ``close()``
        only closes the current window and leaves the session alive.
        """
        self.driver.quit()

In [None]:
# Root of the site section being scraped, and the overview page loaded first.
base_url = 'http://www.sejm.gov.pl/Sejm9.nsf/'
url = f'{base_url}agent.xsp?symbol=posglos&NrKadencji=9'

In [None]:
# Start Firefox (no headless flag is set for this session) and load `url`.
options = webdriver.FirefoxOptions()
driver = webdriver.Firefox(options=options)
main_page = MainPage(driver, url).open()  # open() returns the page object itself
main_page.set_main_table()

# Link text -> href for every link in the main table, plus the texts of the
# 'right'-class cells that are stored later as the per-day vote counts.
dict_of_links = main_page.get_all_votings()
list_of_votings_count = main_page.get_votings_count()

# Data transformation

In [None]:
# Build the working frame: one row per link, with the link text as 'Date',
# its address as 'URL' and the scraped count as 'Votings number'.
data = (
    pd.DataFrame.from_dict(dict_of_links, orient='index', columns=['URL'])
    .reset_index()
    .rename(columns={'index': 'Date'})
)
data['Votings number'] = list_of_votings_count

In [None]:
# Convert the scraped date text (day, Polish month name, year, then ' r.')
# into real datetimes.
# Polish month names in the genitive form, mapped to two-digit month numbers.
month_mapping = {
    'sierpnia':'08',
    'lipca':'07',
    'czerwca':'06',
    'maja':'05',
    'kwietnia':'04',
    'marca':'03',
    'lutego':'02',
    'stycznia':'01',
    'grudnia':'12',
    'listopada':'11',
    'października':'10',
    'września':'09',
}

data['Date'] = pd.to_datetime(
    data['Date']
    # Series.replace with regex=True substitutes inside each string.
    .replace(month_mapping, regex=True)
    # Literal removal of the trailing ' r.' marker (no regex).
    .str.replace(' r.', '', regex=False)
    # Stray leading/trailing whitespace would otherwise become extra dashes.
    .str.strip()
    # Raw string: '\s' in a normal literal is an invalid escape sequence.
    # '+' collapses runs of whitespace into a single separator.
    .str.replace(r'\s+', '-', regex=True),
    format='%d-%m-%Y',
)

In [None]:
# Load the saved snapshot, preferring the workbook and falling back to the CSV
# when the workbook is missing (OSError), cannot be parsed (ValueError) or no
# Excel engine is installed (ImportError).
try:
    snapshotfile = pd.read_excel('temp.xlsx')
    # The workbook is saved with its index, which reads back as 'Unnamed: 0';
    # tolerate workbooks that were written without it.
    snapshotfile = snapshotfile.drop(columns='Unnamed: 0', errors='ignore')
    snapshotfile['date'] = pd.to_datetime(snapshotfile['date'])
except (ValueError, OSError, ImportError):
    snapshotfile = pd.read_csv('temp.csv')
    snapshotfile['date'] = pd.to_datetime(snapshotfile['date'])

# Number of saved votes per day, newest day first.
snap_votings_count = snapshotfile['date'].value_counts().sort_index(ascending=False)

In [None]:
# Attach to every day the number of votes already present in the snapshot.
# The counts are renamed explicitly because value_counts() names its result
# after the column before pandas 2.0 and 'count' from 2.0 on.
# Dropping a previous result first keeps the cell safe to re-run.
data = pd.merge(
    data.drop(columns='Votings number saved', errors='ignore'),
    snap_votings_count.rename('Votings number saved'),
    left_on='Date',
    right_index=True,
    how='left',
)
# Days absent from the snapshot have no saved votes. Assign the result back
# instead of calling fillna(inplace=True) on a column selection.
data['Votings number saved'] = data['Votings number saved'].fillna(0).astype(int)
data['Votings number'] = data['Votings number'].astype(int)

In [None]:
# Keep only the days for which the site lists more votes than the snapshot holds.
data_to_scrap = data.loc[data['Votings number'].gt(data['Votings number saved'])]

In [None]:
# Stop "Run All" here, with an explanation, when there is nothing new to fetch.
assert len(data_to_scrap) != 0, 'Nothing to scrape: the saved snapshot already covers every listed vote.'

In [None]:
# The links and counts have been copied into `data`, so the browser behind
# main_page is no longer needed.
main_page.quit()

In [None]:
class VotesPage:
    """Page object for one day's listing page and the votes found on it.

    The listing itself is read through ``driver``. Vote and party detail
    pages are opened in separate headless Firefox sessions, so the row
    elements of the listing stay valid while the details are fetched.
    Results accumulate in ``self.votings`` as dicts (see ``vote.getJson``).
    """

    # Columns that together identify a vote in the history file.
    _KEY_COLUMNS = ['date', 'sitting', 'title']

    def __init__(self, driver, url):
        self.driver = driver
        self.url = url
        self.header = None
        self.voting_topic_table = None
        self.historical_file = None
        self.votings = []

    def load_historical_votes(self, filename = 'temp.csv'):
        """Load the distinct (date, sitting, title) rows saved in ``filename``."""
        historical_file = pd.read_csv(filename)
        self.historical_file = historical_file[self._KEY_COLUMNS].drop_duplicates()

    def open(self):
        """Load ``url`` in the driver and return ``self`` for chaining."""
        self.driver.get(self.url)
        return self

    def set_header(self):
        """Store the words of the page's ``<h1>`` text, split on single spaces."""
        self.header = self.driver.find_element(By.TAG_NAME, "h1").text.split(" ")

    def get_header_date(self):
        """Return the date taken from the 4th header word (``dd-mm-yyyy``)."""
        self.set_header()
        date = self.header[3].split("-")
        return datetime.date(int(date[2]), int(date[1]), int(date[0]))

    def get_header_sitting(self):
        """Return the 7th header word, minus its last character, as an int."""
        self.set_header()
        return int(self.header[6][:-1])

    def set_voting_table(self):
        """Store the ``<tr>`` rows of the first ``<tbody>`` on the page."""
        self.voting_topic_table = self.driver.find_element(By.TAG_NAME, 'tbody').find_elements(By.TAG_NAME, 'tr')

    def _is_already_saved(self, voting):
        """Return True when the loaded history holds a row matching ``voting``."""
        date, sitting, title = voting.getUniqueRow()
        history = self.historical_file
        matches = (
            # The date is compared as text ('YYYY-MM-DD'), as produced by getUniqueRow().
            (history['date'].astype(str) == date)
            & (history['sitting'] == sitting)
            & (history['title'] == title)
        )
        return bool(matches.any())

    @staticmethod
    def _read_vote_summary(driver):
        """Return ``(totals, party_cells)`` from the vote page open in ``driver``.

        ``totals`` is ``(voted, agreed, against, stopped, notVoted)``. When
        the numbers cannot be read they are all 0; when even the table is
        missing, ``party_cells`` is empty as well.
        """
        party_cells = []
        try:
            party_cells = driver.find_element(By.TAG_NAME, "tbody").find_elements(By.CLASS_NAME, "left")
            bold_elements = driver.find_element(By.CLASS_NAME, "sub-title").find_elements(By.TAG_NAME, "strong")
            totals = tuple(int(bold_elements[i].text) for i in range(5))
        except Exception:
            # Best effort: record the vote with zeroed totals rather than drop it.
            totals = (0, 0, 0, 0, 0)
        return totals, party_cells

    @staticmethod
    def _read_party_votes(driver, party_name):
        """Return ``(name, party_name, result)`` tuples from the open party page."""
        cells = driver.find_element(By.TAG_NAME, "tbody").find_elements(By.CLASS_NAME, "left")
        # The 'left' cells are consumed in pairs: a name followed by a result.
        return [
            (name_cell.text, party_name, result_cell.text)
            for name_cell, result_cell in zip(cells[0::2], cells[1::2])
        ]

    def get_votes_data(self):
        """Scrape every listed vote that is not yet in the history.

        Rows already present in the history are reported and skipped. For the
        others, the totals and the per-deputy results are fetched and the
        resulting dict is appended to ``self.votings``. The history is loaded
        from the default file if ``load_historical_votes()`` was not called.
        """
        if self.historical_file is None:
            self.load_historical_votes()

        options = webdriver.FirefoxOptions()
        options.add_argument('-headless')

        self.set_voting_table()
        if not self.voting_topic_table:
            return

        # Date and sitting come from the page header, so read them once.
        header_date = self.get_header_date()
        header_sitting = self.get_header_sitting()

        # Helper browsers are started on first use and reused for later rows.
        helper_drivers = {}

        def helper_driver(role):
            if role not in helper_drivers:
                helper_drivers[role] = webdriver.Firefox(options=options)
            return helper_drivers[role]

        try:
            for row in self.voting_topic_table:
                voting = vote()
                voting.setDate(header_date)
                voting.setSitting(header_sitting)
                voting.setVoteNumber(int(row.find_element(By.CLASS_NAME, "bold").text))

                vote_row = row.find_elements(By.TAG_NAME, "td")[2]
                voting.setTitle(vote_row.text)

                vote_link = vote_row.find_element(By.TAG_NAME, 'a')

                if self._is_already_saved(voting):
                    print('{}:{}. Skipped: {}'.format(*voting.getUniqueRow()))
                    continue

                sleep(0.5)
                vote_driver = helper_driver('vote')
                vote_driver.get(vote_link.get_attribute('href'))

                totals, party_cells = self._read_vote_summary(vote_driver)
                voted, agreed, against, stopped, notVoted = totals
                voting.setVoted(voted)
                voting.setAgreed(agreed)
                voting.setAgainst(against)
                voting.setStopped(stopped)
                voting.setNotVoted(notVoted)
                voting.printInfo()

                current_names = []
                for cell in party_cells:
                    party_results_link = cell.find_element(By.TAG_NAME, "a")
                    party_name = party_results_link.text
                    # A second browser keeps the party cells of the vote page valid.
                    party_driver = helper_driver('party')
                    party_driver.get(party_results_link.get_attribute('href'))
                    sleep(0.5)
                    current_names.extend(self._read_party_votes(party_driver, party_name))

                # Placeholder row when no per-deputy result could be read.
                voting.setDeputiesVotes(current_names or [('N/A','N/A','N/A')])
                self.votings.append(voting.getJson())
        finally:
            # Always end the helper sessions, even when scraping failed midway.
            for helper in helper_drivers.values():
                try:
                    helper.quit()
                except Exception:
                    # A failing cleanup must not hide the original error.
                    pass

    def quit(self):
        """End the WebDriver session of the listing page.

        ``driver.quit()`` is used rather than ``driver.close()``: ``close()``
        only closes the current window and leaves the session alive.
        """
        self.driver.quit()

In [None]:
# Scrape each outstanding day and append its new votes to the saved history.
for new_url in data_to_scrap.URL.to_list():
    # Reset per iteration so the cleanup below never sees the previous page.
    newVotesPage = None
    try:
        clear_output(wait=True)
        options = webdriver.FirefoxOptions()
        options.add_argument('-headless')
        driver  = webdriver.Firefox(options=options)

        newVotesPage = VotesPage(driver, new_url)
        newVotesPage.open()
        sleep(0.5)

        newVotesPage.load_historical_votes()
        newVotesPage.get_votes_data()
    except Exception as e:
        # Best effort: report the problem and carry on with the next day.
        print('Catched Exception')
        print(e)

    finally:
        if newVotesPage is not None:
            # Release the browser whether or not scraping succeeded.
            try:
                newVotesPage.quit()
            except Exception as quit_error:
                print(quit_error)

            # Save exactly once per day. Saving in both the try body and here
            # would append the same votes to the history twice.
            if newVotesPage.votings:
                print("Saving")
                merge_history_data_with_new_data = update_history_file(pd.json_normalize(newVotesPage.votings))
                save_history_file(merge_history_data_with_new_data)
                print("Saved")