In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

## What I want to do:

    1. Create a data frame of parliament members' names and personal URLs.
    2. Open each URL to find their Twitter handle.
    3. Create a data frame with name, party and Twitter handle

# Locating member URLs on the website

In [2]:
# Problem: the table of politicians only appears after clicking a button on the
# site, so the parser cannot capture it from a plain page fetch.
# Solution: click the button in a browser, save the resulting HTML file to disk,
# and parse that local copy instead.

fname = 'deputes_recherche-multicritere.html'  # stored in the same folder as this notebook

# The saved page declares charset=utf-8, so decode it explicitly rather than
# relying on the platform default; the `with` block closes the file afterwards.
with open(fname, 'r', encoding='utf-8') as html_file:
    source_code = html_file.read()

In [3]:
# Parse the saved listing page using Python's built-in HTML parser.
soup = BeautifulSoup(markup=source_code, features='html.parser')

In [4]:
# Preview only the beginning of the parsed document: enough to confirm that the
# right page was loaded without flooding the notebook with the entire HTML.
print(str(soup)[:1000])

 <!DOCTYPE html>

<!--[if lt IE 7]>      <html class="no-js lt-ie9 lt-ie8 lt-ie7" lang="fr-FR"> <![endif]-->
<!--[if IE 7]>         <html class="no-js lt-ie9 lt-ie8" lang="fr-FR"> <![endif]-->
<!--[if IE 8]>         <html class="no-js lt-ie9" lang="fr-FR"> <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js" lang="fr-FR"> <!--<![endif]--> <head>
<meta charset="utf-8"/>
<title>Recherche multicritère - XVe législature - Assemblée nationale</title>
<meta content="text/html; charset=utf-8" name="Content-Type"/>
<meta content="fr-FR" name="Content-language"/>
<meta content="IE=edge,chrome=1" name="X-UA-Compatible"/>
<meta content="width=954" name="viewport"/>
<meta content="Assemblée Nationale" name="author"/>
<meta content="Assemblée Nationale" name="copyright"/>
<meta content="Presentation de l'Assemblee nationale, du palais Bourbon, de ses membres (deputes), de son fonctionnement et de son actualite : agenda, travaux en cours (amendements, rapports, commissions, lois), textes et dossi

In [5]:
# Getting the details into lists (the `mps` dataframe is built from them in the next cell)

BASE_URL = 'https://www2.assemblee-nationale.fr'
CONTACT_ANCHOR = '#deputes-contact'  # fragment appended to every personal URL

mpurls = []
mpnames = []
mpsurnames = []
mpparties = []  # NOTE(review): never populated below — party extraction is still missing

for row in soup.find_all('tr')[1:]:  # skip the row with headers
    cells = row.find_all('td')  # fetch the row's cells once instead of once per field
    profile_path = cells[0].find('a', href=True)['href']  # link found in the first cell
    mpurls.append(BASE_URL + profile_path + CONTACT_ANCHOR)
    mpsurnames.append(cells[2].get_text().strip())
    mpnames.append(cells[3].get_text().strip())

In [6]:
# Assemble the three parallel lists into a single table, one row per member

mps = pd.DataFrame(
    dict(
        Name=mpnames,
        Surname=mpsurnames,
        URL=mpurls,
    )
)

mps.sample(3)  # eyeball a few random rows: does the table look right?

Unnamed: 0,Name,Surname,URL
193,Michel,Fanget,https://www2.assemblee-nationale.fr/deputes/fi...
133,François,Cornut-Gentille,https://www2.assemblee-nationale.fr/deputes/fi...
352,Emmanuel,Maquet,https://www2.assemblee-nationale.fr/deputes/fi...


In [7]:
# Spot check: display one of the generated URLs so it can be verified by hand.
mpurls[4] # are the URLs functional?

'https://www2.assemblee-nationale.fr/deputes/fiche/OMC_PA721036#deputes-contact'

In [8]:
# Count the entries in each column to compare against the expected number of members.
mps.count() # Is the number of parliament members correct?

Name       570
Surname    570
URL        570
dtype: int64

In [9]:
# Trial run on a single member page: confirm that the Twitter link can be
# located before looping over every URL.
www = "https://www2.assemblee-nationale.fr/deputes/fiche/OMC_PA721036#deputes-contact"
response = requests.get(www, timeout=30)  # a timeout prevents hanging on a stalled connection
response.raise_for_status()  # fail loudly rather than parse an HTTP error page
# Name the parser explicitly (same one used for the listing page) so the result
# does not depend on which parsers happen to be installed. A separate variable
# keeps the listing page's `soup` intact for the cells that read it.
sample_soup = BeautifulSoup(response.text, 'html.parser')
sample_soup.select(".twitter")

[<a class="twitter topmargin" href="https://twitter.com/@LenaickADAM/">Consulter le compte Twitter de M. Lénaïck Adam <img src="/extension/ezswidl-pskn/design/ezswidl-pskn/images/icone-twitter.png"/></a>]

The problem could be solved by downloading personal pages: https://stackoverflow.com/questions/1825438/download-html-page-and-its-contents#1825465

# Locating twitter handle in personal pages

In [14]:
# Visit every personal page and record the href of its Twitter link, or an
# empty string when the page has no such link.
twitters = []
with requests.Session() as session:  # one session lets the requests share a connection pool
    for www in mps['URL']:
        page = session.get(www, timeout=30)  # a timeout prevents hanging on a stalled connection
        page_soup = BeautifulSoup(page.text, 'html.parser')  # explicit parser, as for the listing page

        # find() returns None when there is no matching link; test for that
        # directly instead of hiding every possible error behind a bare except.
        link = page_soup.find('a', attrs={'class': 'twitter'}, href=True)
        twt = link['href'] if link is not None else ''
        twt = twt.replace("https://twitter.com/", "")
        twitters.append(twt)

mps['TwitterHandle'] = twitters

In [15]:
# Inspect a few random rows to see what the extracted handles look like.
mps.sample(5)

Unnamed: 0,Name,Surname,URL,TwitterHandle
488,Laurent,Saint-Martin,https://www2.assemblee-nationale.fr/deputes/fi...,
118,Dino,Cinieri,https://www2.assemblee-nationale.fr/deputes/fi...,@DinoCinieri/
295,Jean-Luc,Lagleize,https://www2.assemblee-nationale.fr/deputes/fi...,@jeanluclagleize/
133,François,Cornut-Gentille,https://www2.assemblee-nationale.fr/deputes/fi...,
255,Alexandre,Holroyd,https://www2.assemblee-nationale.fr/deputes/fi...,@alexIholroyd/


# Cleaning up

In [18]:
# Cleaning all the rubbish found while looking at the data: leftover "/" and "@"
# characters around the handles.

dirty_strings = ["/", "@"]

for dirt in dirty_strings:
    # regex=False: treat each entry as literal text. The default for `regex`
    # differs between pandas versions, so state it explicitly.
    mps["TwitterHandle"] = mps["TwitterHandle"].str.replace(dirt, "", regex=False)

In [20]:
# Inspect a few random rows again to check the handles after the cleanup.
mps.sample(5)

Unnamed: 0,Name,Surname,URL,TwitterHandle
554,Patrick,Vignal,https://www2.assemblee-nationale.fr/deputes/fi...,
291,Daniel,Labaronne,https://www2.assemblee-nationale.fr/deputes/fi...,LabaronneDaniel
273,Régis,Juanico,https://www2.assemblee-nationale.fr/deputes/fi...,juanico
320,Sandrine,Le Feur,https://www2.assemblee-nationale.fr/deputes/fi...,
147,Typhanie,Degois,https://www2.assemblee-nationale.fr/deputes/fi...,Typhanie_Degois


Testing with known rubbish handles

In [21]:
# Save the final table (Name, Surname, URL, TwitterHandle) to a CSV file.
# NOTE(review): no `index=` argument is given, so pandas' default applies —
# confirm whether the row index is wanted as a column in the file.
mps.to_csv("FrenchPols.csv")