In [None]:
import io #For wrapping html text in a file-like object that pandas can read
import time #To have a wait timer when scraping, for politeness sake

import lxml #For parsing html
import pandas as pd #To put data into frames for joining into a final result, also used for printing to csv
import requests #For requesting the webpages which we will scrape
from bs4 import BeautifulSoup as bs #For inspecting html webpage in notebook

In [None]:
#For this project we start with the wikipedia page listing the current (2020-06-21) members of the US House of Representatives.
#From this base page we can get the representative of each congressional district together with their party affiliation,
#previous experience, education, when they assumed their current office, residence, and which year they were born.

url = "https://en.wikipedia.org/wiki/List_of_current_members_of_the_United_States_House_of_Representatives" #Url to wikipedia page
response = requests.get(url, timeout=30) #Request the page; the timeout stops the cell from hanging forever on a stalled connection
response.raise_for_status() #Fail here on an HTTP error instead of parsing an error page and failing later when indexing tables
soup = bs(response.content, 'lxml') #Create a BeautifulSoup object which we can display in the notebook and inspect

In [None]:
#Dump the whole parsed page, indented, so the html structure can be inspected by eye.
print(soup.prettify()) #Print the parsed html page

In [None]:
#Collect every <table> element found on the parsed page.
tables = soup.find_all(name='table')
tables #Leaving the list as the last expression makes Jupyter display it

In [None]:
#Narrow the search down to the sortable wikitables, i.e. the tables whose data can be sorted on the webpage.
sortable_tables = soup.find_all("table", class_="wikitable sortable")
#The third sortable table is the one we are interested in (the position was found by inspecting the page).
members_table = sortable_tables[2]
print(members_table.prettify()) #Print the table of interest

In [None]:
#Pandas has a built in function that turns an html table straight into a pandas frame.
#We feed it the members table selected above instead of downloading the page a second time and picking the table
#by a separate position: this guarantees that the frame and the scraped links come from the very same table.
#NOTE(review): this assumes the previously used read_html(...)[6] was the same table as members_table - confirm.
#The html text is wrapped in StringIO because newer pandas versions deprecate passing literal html strings.
congress_members_frame = pd.read_html(io.StringIO(str(members_table)))[0] #The fragment holds one table, so take the first frame

In [None]:
#A bare expression on the last line of a cell makes Jupyter render the frame.
congress_members_frame #Display the created data frame

In [None]:
#Now we will go through the table of congress members and scrape the links to their wikipedia pages
links_to_members = [] #list to store links
for row in members_table.find_all('tr'): #find all rows
    cells = row.find_all('td') #find all columns
    if len(cells) != 9: #the number of columns in the table of interest is 9, anything else is not a member row
        continue
    #The member cell usually holds a link to the portrait image before the link to the member's own page.
    #Rather than relying on the page link being the second anchor (which fails when there is no portrait),
    #keep only links to real articles: they start with /wiki/ and are not File: pages. This also drops
    #anchors without a href, footnote links and links to pages that do not exist.
    hrefs = [anchor.get('href') for anchor in cells[1].find_all('a')]
    page_hrefs = [href for href in hrefs
                  if href and href.startswith('/wiki/') and not href.startswith('/wiki/File:')]
    if page_hrefs: #Make sure that there is a link, vacancies have no links for example
        links_to_members.append('https://en.wikipedia.org' + page_hrefs[0]) #Add the unique link to the list

In [None]:
#Use the list created above to visit each member's page and extract the name of their spouse, if any, and number of children, if any.
#Names are scraped to get a unique key for later joining.

def parse_member_infobox(wikitext):
    """Extract (name, spouse, children) from the raw wikitext of a member's page.

    The text is scanned line by line for infobox style fields of the form
    ``| key = value``. Leading whitespace, the spacing around the key and the
    capitalisation of the key are all ignored.

    Recognised keys:
      * ``name``        - the name the member goes by (preferred as join key)
      * ``birth_name``  - fallback when no ``name`` is found
      * ``spouse...``   - any key starting with "spouse"
      * ``children...`` - any key starting with "children" (covers "childrens")
      * ``website...``  - treated as the end of the infobox, scanning stops there

    Fields with an empty value are ignored so that they cannot overwrite a
    value found earlier or hide a usable fallback. When a field occurs several
    times the last non-empty occurrence wins.

    Returns a tuple ``(name, spouse, children)``. Missing values use the
    defaults ``" "`` for name and children and ``"none"`` for spouse.
    """
    cname = " "
    bname = " "
    spouse = "none"
    children = " "
    for line in wikitext.splitlines():
        field = line.lstrip()
        if not field.startswith('|') or '=' not in field: #not a "| key = value" line
            continue
        key, _, value = field[1:].partition('=')
        key = key.strip().lower()
        value = value.strip()
        #Website appears to be the last part of the infobox so when we reach it we stop scanning the page.
        if key.startswith('website'):
            break
        if not value: #an empty field carries no information, keep what we already have
            continue
        if key == 'birth_name':
            bname = value
        elif key == 'name':
            cname = value
        elif key.startswith('spouse'):
            spouse = value
        elif key.startswith('children'):
            children = value
    if cname != " ": #We prefer their called name which should correspond better between tables
        name = cname
    elif bname != " ": #If we only find their birth name we use that instead to make manual pairing easier when cleaning data
        name = bname
    else: #If we do not find any name we fill it in as blank
        name = " "
    return name, spouse, children


names = [] #List to keep the names used as keys.
spouses = [] #List to keep name of spouses
childrens = [] #List to keep number of children
for member_url in links_to_members:
    try:
        #Request the page as raw wikitext, which makes the infobox easy to scan line by line.
        resp = requests.get(member_url, params={'action': 'raw'}, timeout=30)
        resp.raise_for_status()
        page = resp.text
    except requests.RequestException as err:
        #One unreachable page should not abort the whole run. Report it and record the default
        #values for this member so that the three lists stay aligned with links_to_members.
        print("Could not fetch {}: {}".format(member_url, err))
        page = ""
    name, spouse, children = parse_member_infobox(page)
    names.append(name) #Add the name to the list
    spouses.append(spouse) #Add the name of the spouse to the list
    childrens.append(children) #Add the number of children to the list
    time.sleep(0.5) #Wait this time to be polite


In [None]:
#Assemble the scraped per-member lists into a single frame. 'Member' comes first since it is the key used for joining.
member_personal_data = pd.DataFrame({
    'Member': names,
    'Spouse': spouses,
    'Childrens': childrens,
})

In [None]:
#Join the two tables using the member name as the key. A full outer join is used so that rows whose keys
#do not match are still kept, e.g. one frame might have the name Joe while the other has the name Joseph.
#An alternative would be to join on the position in the frames, however the vacancies would break that
#ordering unless they were placed last, or first.
result = congress_members_frame.merge(member_personal_data, on='Member', how='outer')
result.to_csv('congress_members.csv') #Write the results to a csv file.

In [None]:
#A bare expression on the last line of a cell makes Jupyter render the joined frame.
result #Show the result

In [None]:
#Spot check: fetch a single member's page as raw wikitext and print it, to see what the infobox fields look like.
url = "https://en.wikipedia.org/wiki/Liz_Cheney"
resp = requests.get(url, params=dict(action='raw')) #action=raw asks for the page source instead of the rendered html
page = resp.text
print(page)