In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

## What this code does:

    1. Create a data frame of parliament members' names and personal URLs.
    2. Open each URL to find their Twitter handle.
    3. Create a data frame with name, country, party and Twitter handle

# Locating member URLs on the website

In [49]:
# Listing page that links to every MEP's personal profile page.
url = 'https://www.europarl.europa.eu/meps/en/full-list/all'
# Browser-like User-Agent sent with the request.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

# The timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, headers=headers, timeout=30)
# Stop here on an HTTP error instead of parsing an error page into an empty
# table in the cells below.
response.raise_for_status()

In [50]:
# Parse the raw bytes of the listing page with Python's built-in HTML parser.
soup = BeautifulSoup(response.content, features='html.parser')

In [56]:
# Collect one record per MEP card on the listing page.
#
# Each card is an <a class="erpl_member-list-item-content"> whose href is the
# member's profile URL. Inside it, the name sits in the "erpl_title-h5" div and
# the "sln-additional-info" divs hold, in order: role, party, country.
mepurls = []
mepnames = []
meproles = []
mepparties = []
mepcountries = []
for a in soup.find_all('a', attrs={'class': 'erpl_member-list-item-content'}, href=True):
    # Look the info divs up once per card instead of once per field.
    info = a.find_all('div', attrs={'class': 'sln-additional-info'})
    mepurls.append(a['href'])
    mepnames.append(a.find('div', attrs={'class': 'erpl_title-h5'}).get_text())
    meproles.append(info[0].get_text())
    mepparties.append(info[1].get_text())
    mepcountries.append(info[2].get_text())

# Build the frame in one go from the parallel lists; the column order matches
# the tables displayed further down the notebook.
meps = pd.DataFrame(
    {'Name': mepnames,
     'Party': mepparties,
     'Country': mepcountries,
     'Role': meproles,
     'URL': mepurls,
    })

In [57]:
# Eyeball five randomly chosen rows to sanity-check the scraped columns.
meps.sample(5)

Unnamed: 0,Name,Party,Country,Role,URL
55,Leila CHAIBI,The Left,France,Substitute,https://www.europarl.europa.eu/meps/en/197529
28,Elżbieta Katarzyna ŁUKACIJEWSKA,PPE,Poland,Member,https://www.europarl.europa.eu/meps/en/96791
32,Tilly METZ,Verts/ALE,Luxembourg,Member,https://www.europarl.europa.eu/meps/en/193292
24,Elsi KATAINEN,Renew,Finland,Member,https://www.europarl.europa.eu/meps/en/191693
51,Petras AUŠTREVIČIUS,Renew,Lithuania,Substitute,https://www.europarl.europa.eu/meps/en/124766


In [58]:
# Show the raw HTML of one member card. `a` is whatever the loop over the
# listing anchors left bound (the last card it visited), so that cell must
# have been run first.
print(a)

<a class="erpl_member-list-item-content t-y-block" href="https://www.europarl.europa.eu/meps/en/197405" itemprop="url">
<div>
<div class="erpl_image-frame mb-2">
<img aria-hidden="true" src="/commonFrontResources/evostrap/3.0.0/lib/assets/img/frame/portraitsize_thumb.png"/>
<span>
<picture>
<img alt="Jörgen WARBORN" loading="lazy" src="https://www.europarl.europa.eu/mepphoto/197405.jpg"/>
</picture>
</span>
</div>
<div class="erpl_title-h5 t-item">Jörgen WARBORN</div>
<div>
<div class="sln-additional-info mb-25">Substitute</div>
<div class="sln-additional-info mb-25">PPE</div>
<div class="sln-additional-info">Sweden</div>
</div>
</div>
</a>


In [59]:
# Non-null count per column; equal counts mean no field is missing for any MEP.
meps.count()

Name       95
Party      95
Country    95
Role       95
URL        95
dtype: int64

In [60]:
# Test if it finds the Twitter URL
#
# Smoke test on the first MEP only. The probe_* names keep this cell from
# overwriting the notebook-level `response` and `soup` of the listing page,
# so the listing cells can still be re-run after it.

print(meps.loc[0, 'URL'])
probe_response = requests.get(meps.loc[0, 'URL'], headers=headers, timeout=30)
probe_soup = BeautifulSoup(probe_response.content, 'html.parser')
# find() returns None when the page has no such link; check for that instead
# of failing with an opaque TypeError on the subscript.
probe_link = probe_soup.find('a', attrs={'class': 'link_twitt'}, href=True)
print(probe_link['href'] if probe_link is not None else 'no Twitter link found on this page')

https://www.europarl.europa.eu/meps/en/96868
https://twitter.com/KarimaDelli


# Locating Twitter handles in personal pages

In [61]:
def fetch_twitter_handle(profile_url):
    """Return the Twitter handle linked from one MEP profile page.

    Downloads ``profile_url``, looks for the first ``<a class="link_twitt">``
    that has an ``href`` and strips the ``https://twitter.com/`` prefix from it.

    Returns an empty string when the page has no such link. Network errors
    raised by ``requests`` are not caught and propagate to the caller.
    """
    # Same headers as the listing request; the timeout stops one stalled page
    # from blocking the whole scrape.
    page = requests.get(profile_url, headers=headers, timeout=30)
    page_soup = BeautifulSoup(page.content, 'html.parser')
    link = page_soup.find('a', attrs={'class': 'link_twitt'}, href=True)
    # An explicit None check replaces the bare `except:`, which also hid
    # unrelated failures.
    if link is None:
        return ''
    return link['href'].replace("https://twitter.com/", "")


# One request per member, in row order, so the list lines up with the frame.
twitters = [fetch_twitter_handle(u) for u in meps['URL']]
meps['TwitterHandle'] = twitters

# Cleaning up

In [62]:
# cleaning all the rubbish I found looking at the data
#
# Fragments are removed one after another in list order, so the longer
# fragments (schemes, host, query parameters) go before the single characters.
dirty_strings = ["https://", "http://", "www.", "twitter.com", "lang=en", "lang=it", "lang=de", 
           "lang=fr", "ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor", "/status/456143806295855104", "?", "/","@"]

for dirt in dirty_strings:
    # regex=False: every fragment is literal text. Without it, older pandas
    # versions treat the pattern as a regular expression, where "." matches
    # any character and "?" is not a valid pattern on its own.
    meps["TwitterHandle"] = meps["TwitterHandle"].str.replace(dirt, "", regex=False)

  meps["TwitterHandle"] = meps["TwitterHandle"].str.replace(dirt, "")
  meps["TwitterHandle"] = meps["TwitterHandle"].str.replace(dirt, "")


In [63]:
# Eyeball five random rows again, now with the cleaned TwitterHandle column.
meps.sample(5)

Unnamed: 0,Name,Party,Country,Role,URL,TwitterHandle
53,Tom BERENDSEN,PPE,Netherlands,Substitute,https://www.europarl.europa.eu/meps/en/197778,tbwberendsen
59,Nicola DANTI,Renew,Italy,Substitute,https://www.europarl.europa.eu/meps/en/124821,DantiNicola
25,Kateřina KONEČNÁ,The Left,Czechia,Member,https://www.europarl.europa.eu/meps/en/23699,
29,Peter LUNDGREN,ECR,Sweden,Member,https://www.europarl.europa.eu/meps/en/124996,
75,Colm MARKEY,PPE,Ireland,Substitute,https://www.europarl.europa.eu/meps/en/209896,colmmarkey


Testing with known flawed handles

In [79]:
# Spot-check one handle by row label. .loc looks rows up by index label, so
# this raises KeyError when the frame has no row labelled 191.
# NOTE(review): the recorded meps.count() output shows 95 rows, so this label
# presumably comes from a run over a longer list — confirm before relying on it.
meps.loc[191, "TwitterHandle"]

'tfajon'

In [80]:
# Spot-check a second handle by row label (KeyError if label 84 is absent).
meps.loc[84, "TwitterHandle"]

'paoloborchia'

In [64]:
# Write the final table to mepstweet.csv, relative to the current working
# directory. to_csv also writes the row index by default, as an unnamed
# first column.
meps.to_csv("mepstweet.csv")