In [None]:
import requests
import pandas as pd
import time
import json
from collections import namedtuple
from bs4 import BeautifulSoup
import lxml.html
import glob

## Scrape members

In [None]:
# Record for one row of the MEP listing table.
# The typename matches the variable name (it used to be 'MEP_member'): pickle
# looks a class up by its __name__ in its module, so a mismatching typename
# makes instances unpicklable and makes repr() disagree with the binding.
MEP_Member = namedtuple(
    'MEP_Member',
    ['name', 'group', 'link', 'country', 'party', 'status'],
)

In [None]:
def scrap_data_MEP_members(html_string):
    """Parse the MEP listing page and return one ``MEP_Member`` per table row.

    Only the first ``<table>`` of the document is read. Rows without child
    nodes are dropped and the first remaining row is skipped (presumably the
    header row). For every other row the first five ``<td>`` cells are taken,
    in order, as name, group, country, party and status, and the ``href`` of
    the row's first ``<a>`` is taken as the link.

    Rows that cannot be parsed (no anchor, anchor without ``href``, fewer
    than five cells) are printed and skipped so that one malformed row does
    not abort the whole scrape.

    Args:
        html_string: HTML source of the listing page.

    Returns:
        list of ``MEP_Member`` tuples, in document order.

    Raises:
        IndexError: if the document contains no ``<table>`` element.
    """
    soup = BeautifulSoup(html_string, 'lxml')  # parse the HTML string

    table = soup.find_all('table')[0]  # only the first table is used

    # len(row) counts the row's child nodes; empty rows are dropped before
    # the first remaining row is skipped.
    rows = [r for r in table.find_all('tr') if len(r) > 0][1:]

    all_members = []
    for r in rows:
        try:
            link = r.find('a')['href']
            # Look the cells up once per row instead of once per field.
            cells = [td.text.strip() for td in r.find_all('td')[:5]]
            name, group, country, party, status = cells
        except (TypeError, KeyError, IndexError, ValueError):
            # TypeError: no <a> in the row; KeyError: <a> without href;
            # ValueError/IndexError: fewer than five cells.
            # Only parse failures are caught, so KeyboardInterrupt and
            # SystemExit still propagate.
            print(r)
            continue

        all_members.append(MEP_Member(name, group, link, country, party, status))

    return all_members


In [None]:
# Download the term-8 MEP listing and turn it into the members table.
# A timeout keeps a stalled connection from hanging the notebook, and
# raise_for_status() stops the cell on an HTTP error instead of parsing an
# error page as if it were the listing.
result = requests.get(
    "https://www.votewatch.eu//en/term8-european-parliament-members.html?limit=1000",
    timeout=60,
)
result.raise_for_status()

s = scrap_data_MEP_members(result.text)
df = pd.DataFrame(s)

# id is the 1-based row position in the listing.
# NOTE(review): the scraping loop later passes this id to the votes endpoint,
# which assumes the site's member ids follow listing order — confirm.
df['id'] = df.index + 1

df.to_csv('MEP_lists.csv', encoding='utf-8')

## Scrape votes

In [None]:
def scrap_votes(member_id, timeout=60):
    """Download all recorded votes of one member as a DataFrame.

    Queries the ``actions.php`` endpoint with ``iDisplayLength=20000`` and
    builds the frame from the ``all_votes`` entry of the JSON response.

    Args:
        member_id: value sent as ``euro_parlamentar_id``; also stored in the
            ``member_id`` column of the result.
        timeout: seconds to wait for the server before giving up, so a
            stalled connection cannot block a long scraping run forever.

    Returns:
        DataFrame with the columns delivered by the endpoint plus
        ``loyalty`` (the ``rol_af`` field of ``euro_vot_rol_euro_grup``)
        and ``member_id``.

    Raises:
        requests.RequestException: on timeout, connection failure or a
            non-success HTTP status.
        KeyError: if the response has no ``all_votes`` entry or no
            ``rol_af`` field.
    """
    url_votes = f"https://www.votewatch.eu/actions.php?euro_parlamentar_id={member_id}&form_category=get_mep_acte&sEcho=3&iColumns=6&sColumns=&iDisplayStart=0&iDisplayLength=20000"
    response = requests.get(url_votes, timeout=timeout)
    # Fail with the real HTTP error rather than a confusing JSON decode error.
    response.raise_for_status()
    result = response.json()

    df_votes = pd.DataFrame(result['all_votes'])
    # Expands each euro_vot_rol_euro_grup value into columns and keeps rol_af.
    # NOTE(review): presumably every value is a dict — confirm against the API.
    df_votes['loyalty'] = df_votes.euro_vot_rol_euro_grup.apply(pd.Series)['rol_af']
    df_votes['member_id'] = member_id
    return df_votes

In [None]:
# Number of leading ids to skip. NOTE(review): 180 looks like a leftover from
# resuming an interrupted run — set to 0 to scrape every member.
SKIP_FIRST_N = 180

# Frames collected since the last save point. They are concatenated once per
# save instead of growing a DataFrame on every iteration.
batch = []

for member in df.id.tolist()[SKIP_FIRST_N:]:
    print(str(member))

    try:
        df_votes = scrap_votes(member)
        batch.append(df_votes)
    except Exception:
        # Best effort: a failing member is reported and skipped. Catching
        # Exception (not a bare except) lets a kernel interrupt stop the loop.
        print(f'could not scrap votes for member {str(member)}')

    if member % 10 == 0:
        # Save point: flush the current batch to its own file and start over.
        print(f'save point on {str(member)}')
        all_votes = pd.concat(batch, axis=0) if batch else pd.DataFrame()
        all_votes.to_csv(f'MEP_votes_{str(member)}.csv')
        batch = []

# Whatever was collected after the last save point.
all_votes = pd.concat(batch, axis=0) if batch else pd.DataFrame()
all_votes.to_csv('MEP_votes_last.csv')

In [None]:
# Manual re-save of whatever `all_votes` currently holds; this is the same
# file the scraping loop cell writes when it finishes.
all_votes.to_csv('MEP_votes_last.csv')


## Read all votes files

In [None]:
# Paths of every saved votes CSV. glob returns matches in arbitrary,
# filesystem-dependent order, so the list is sorted to make positional access
# (e.g. all_votes_files[2]) reproducible between runs and machines.
# recursive=True was dropped: it only affects patterns containing '**'.
all_votes_files = sorted(glob.glob('MEP_votewatch/votes/*.csv'))

## Create votes domains table

In [None]:
# A single votes file is loaded as the sample from which the lookup tables
# are derived. NOTE(review): index 2 looks arbitrary and requires at least
# three files to be present — confirm the choice.
sample_votes_file = all_votes_files[2]
df = pd.read_csv(sample_votes_file)

In [None]:
# Domain lookup table: one row per distinct (id, name) pair found in the
# sample frame, ordered by and indexed on the domain id.
domain_columns = ['euro_domeniu_id', 'euro_domeniu_nume']
df_domains = (
    df[domain_columns]
    .drop_duplicates()
    .sort_values('euro_domeniu_id')
    .set_index('euro_domeniu_id')
)

In [None]:
# Write the domain lookup table to disk; to_csv keeps the index by default,
# so the frame's index becomes the first CSV column.
df_domains.to_csv('MEP_votes_domains.csv')

## Create acts table

In [None]:
# Acts lookup table: one row per distinct (act_nume_full, euro_act_id) pair.
# act_nume_full is treated as an HTML snippet from which the anchor text and
# the link target are extracted.
df_actes = df[['act_nume_full', 'euro_act_id']].drop_duplicates()

# Text between <a ...> and </a>. expand=False returns a Series rather than a
# one-column DataFrame, which is what a column assignment expects.
df_actes['acte_title'] = df_actes.act_nume_full.str.extract('<a[^>]*>(.*?)</a>', expand=False)

# Strip remaining markup from the title. The regex flag is explicit because
# the default of Series.str.replace changed to regex=False in pandas 2.0;
# without it the tag pattern is matched literally and no tags are removed.
df_actes['acte_title'] = (
    df_actes['acte_title']
    .str.replace('<b>', '', regex=False)
    .str.replace('<[^<]+?>', '', regex=True)
)

# Value of the first href attribute, with or without surrounding quotes.
df_actes['link'] = df_actes.act_nume_full.str.extract('href=[\'"]?([^\'" >]+)', expand=False)

df_actes.to_csv('MEP_votes_actes.csv')

## Filter votes on id

In [None]:
def filter_votes_data(file):
    """Load one votes CSV and keep only a fixed subset of its columns.

    Args:
        file: path (or any other source ``pd.read_csv`` accepts) of a votes
            CSV.

    Returns:
        DataFrame holding, in this order, the columns ``euro_act_id``,
        ``euro_domeniu_id``, ``euro_vot_valoare_text``, ``mysql_data_text``,
        ``loyalty`` and ``member_id``.

    Raises:
        KeyError: if any of those columns is missing from the file.
    """
    kept_columns = [
        'euro_act_id',
        'euro_domeniu_id',
        'euro_vot_valoare_text',
        'mysql_data_text',
        'loyalty',
        'member_id',
    ]
    full_frame = pd.read_csv(file)
    return full_frame[kept_columns]
    

In [None]:
# Trim every votes file to the needed columns, then concatenate once.
# Growing a DataFrame with concat inside the loop copies all accumulated rows
# on every iteration; collecting the frames first avoids that.
light_frames = [filter_votes_data(f) for f in all_votes_files]

# pd.concat raises on an empty list, so fall back to an empty frame when no
# files were found.
df_all_votes_light = pd.concat(light_frames, axis=0) if light_frames else pd.DataFrame()
    
    

In [None]:
# Save the combined, column-filtered votes of all files; the DataFrame index
# is written as the first CSV column (to_csv default).
df_all_votes_light.to_csv('all_votes.csv')

## Vote definitions

In [None]:
# Short vote codes mapped to readable labels.
# NOTE(review): presumably these are the values found in the
# `euro_vot_valoare_text` column — confirm.
votes_definitions = {
    'M': 'Documented Absence',
    'N': 'Did not vote',
    "-": "Against",
    "+": "In favour",
    "A": "Absent",
    "0": "Abstained",
}

In [None]:
# Two-column lookup table (vote_id, vote_name) built from the code -> label
# mapping, one row per entry in the mapping's insertion order, then written
# to disk without the index.
df_votes_definitions = pd.DataFrame(
    list(votes_definitions.items()),
    columns=['vote_id', 'vote_name'],
)
df_votes_definitions.to_csv('MEP_votes_definitions.csv', index=False)