# Get pictures for members of Congress

In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import requests

### Read in details of members of congress

In [2]:
# Load the member table stored under the "members" key of the HDF5 file
members = pd.read_hdf("list_of_members.h5", key="members")

In [3]:
members

Unnamed: 0,dob,gender,caucus,class,district,term_end,bioguide_id,thomas_id,wikidata_id,id.wikipedia,first_name,last_name,full_name,party,rss_url,term_start,state,state_rank,type,url
4,1847-07-29,M,,2.0,,1919-11-12,M000200,,Q1168633,Thomas S. Martin,Thomas,Martin,,Democrat,,1919-05-19,VA,,sen,
1,1866-05-17,M,,,-1.0,1917-03-03,A000013,,Q1702161,John Abercrombie (Congressman),John,Abercrombie,,Democrat,,1915-12-06,AL,,rep,
4,1864-12-22,M,,,8.0,1917-03-03,A000027,,Q776226,John A. M. Adair,John,Adair,,Democrat,,1915-12-06,IN,,rep,
6,1863-12-14,M,,,3.0,1917-03-03,A000064,,Q2595560,Wyatt Aiken,Wyatt,Aiken,,Democrat,,1915-12-06,SC,,rep,
2,1867-07-23,M,,,2.0,1917-03-03,A000111,,Q194779,Alfred G. Allen,Alfred,Allen,,Democrat,,1915-12-06,OH,,rep,
1,1855-01-08,M,,,19.0,1917-03-03,B000050,,Q7970610,Warren Worth Bailey,Warren,Bailey,,Democrat,,1915-12-06,PA,,rep,
5,1863-05-18,M,,,32.0,1917-03-03,B000133,,Q4757452,Andrew Jackson Barchfeld,Andrew,Barchfeld,,Republican,,1915-12-06,PA,,rep,
0,1877-12-16,M,,,20.0,1917-03-03,B000269,,Q5201163,Cyrus William Beales,Cyrus,Beales,,Republican,,1915-12-06,PA,,rep,
0,1869-08-05,M,,3.0,,1921-03-03,B000293,,Q741918,J. C. W. Beckham,John,Beckham,,Democrat,,1915-12-06,KY,,sen,
0,1878-02-20,M,,,10.0,1917-03-03,B000359,,Q1410588,Henry S. Benedict,Henry,Benedict,,Republican,,1915-12-06,CA,,rep,


### Subset of members whose images we want to download:
* All women house reps
* All male house reps from 1994 onwards

In [44]:
# House members only: every woman, plus men whose term_end is after 1994-01-01.
is_house_rep = members.type == "rep"
is_woman = members.gender == "F"
is_recent_man = (members.gender == "M") & (members.term_end > "1994-01-01")

member_photos = (
    members.loc[is_house_rep & (is_recent_man | is_woman), ["bioguide_id"]]
    .drop_duplicates()
)

### Get photos from unitedstates github
* All photos from this source are in the public domain

In [55]:
def get_member_photo(member_bioguide):
    """Download a member's portrait from theunitedstates.io unless it is already on disk.

    The image is stored as ``./member_photos/<member_bioguide>.jpg``; the
    directory is created when missing.

    Parameters
    ----------
    member_bioguide : str
        Bioguide id used both in the remote URL and in the local file name.

    Returns
    -------
    str or None
        Local path of the image, or ``None`` when the server answers with an
        HTTP error (for example when no portrait is published for this id).

    Raises
    ------
    urllib.error.URLError
        Any non-HTTP download failure (network down, truncated transfer, ...).
        No partial file is left behind in that case.
    """
    import os
    import urllib.error
    import urllib.request

    image_path = "./member_photos/{0}.jpg".format(member_bioguide)
    # If image doesn't yet exist
    if not os.path.isfile(image_path):
        # urlretrieve does not create missing directories
        os.makedirs(os.path.dirname(image_path), exist_ok=True)
        # Download under a scratch name and rename on success, so an interrupted
        # transfer can never be mistaken for a finished image on the next run.
        partial_path = image_path + ".part"
        try:
            urllib.request.urlretrieve(
                "https://theunitedstates.io/images/congress/original/{0}.jpg".format(member_bioguide),
                partial_path)
            os.replace(partial_path, image_path)
        except urllib.error.HTTPError:
            print("{0} ❌".format(member_bioguide))
            return None
        finally:
            if os.path.exists(partial_path):
                os.remove(partial_path)
    print("{0} ✔️".format(member_bioguide))
    return image_path

In [59]:
from multiprocessing import Pool

# Fetch the portraits with 16 worker processes. pool.map returns results in
# input order, so every path (or None) lines up with its bioguide id.
bioguide_ids = member_photos["bioguide_id"].tolist()
with Pool(16) as pool:
    member_photos["image"] = pool.map(get_member_photo, bioguide_ids)

H000900 ❌
M000207 ❌
V000112 ❌
G000407 ❌
G000225 ❌
C000107 ❌
O000033 ❌
M000481 ❌
K000262 ❌
B000561 ❌
R000318 ❌
R000252 ❌
C000345 ❌
G000272 ❌
R000055 ❌
N000125 ❌
K000312 ❌
C000389 ❌
C000488 ❌
H000114 ❌
G000368 ❌
K000328 ❌
H000448 ❌
L000073 ❌
L000098 ❌
N000136 ❌
M000372 ❌
L000045 ❌
O000061 ❌
E000217 ❌
O000154 ❌
F000110 ❌
N000016 ❌
C000178 ❌
F000111 ❌
C000767 ❌
M000684 ❌
R000417 ❌
P000215 ❌
V000124 ❌
M000669 ❌
G000471 ❌
F000187 ❌
D000051 ❌
S000109 ❌
P000328 ❌
M000795 ❌
F000261 ❌
D000267 ❌
C000722 ❌
S000125 ❌
R000072 ❌
P000287 ❌
S000462 ❌
H001015 ❌
R000243 ❌
R000109 ❌
J000149 ❌
S000587 ❌
R000458 ❌
K000110 ❌
S000607 ❌
B001158 ❌
K000118 ❌
P000505 ❌
B000592 ❌
K000181 ❌
W000634 ❌
M000195 ❌
C000467 ❌
F000418 ❌
S000014 ❌
G000408 ❌
H000174 ❌
S000798 ❌
P000005 ❌
S000136 ❌
E000064 ❌
W000068 ❌
A000018 ❌
D000046 ❌
F000348 ❌
D000452 ❌
S000622 ❌
E000184 ❌
P000171 ❌
W000139 ❌
L000497 ❌
E000186 ❌
B001220 ❌
P000326 ❌
L000225 ❌
R000259 ❌
R000481 ❌
F000128 ❌
H000788 ❌
P000444 ❌
L000267 ❌
W000256 ❌
S000056 ❌


In [62]:
member_photos

Unnamed: 0,bioguide_id,image
0,H000900,
0,R000318,
0,N000125,
0,L000073,
0,M000372,
0,O000061,
0,E000217,
0,O000154,
0,P000505,
0,W000634,


### For all reps that don't have photos, get them from wikidata

In [73]:
# Attach the Wikidata id of every member (joined on the columns both frames share)
wikidata_lookup = members[["bioguide_id", "wikidata_id"]].drop_duplicates()
member_photos = member_photos.merge(wikidata_lookup, how="inner")

In [74]:
member_photos.head()

Unnamed: 0,bioguide_id,image,wikidata_id
0,H000900,,Q8025679
1,R000318,,Q528513
2,N000125,,Q571792
3,L000073,,Q1736243
4,M000372,,Q2177707


In [82]:
def get_image_from_wiki(member_wikidata):
    """Download the image (Wikidata property P18) attached to a Wikidata entity.

    Parameters
    ----------
    member_wikidata : str
        Wikidata entity id, e.g. ``"Q8025679"``.

    Returns
    -------
    str or None
        Path of the local copy, ``./member_photos/wiki_<entity id>.<ext>``,
        where ``<ext>`` is taken from the Commons file name. ``None`` when the
        API responses do not contain an image for this entity.
    """
    import glob
    import os
    import urllib.request

    import requests

    # First check if the file already exists in ./member_photos/
    cached = glob.glob("./member_photos/wiki_{0}.*".format(member_wikidata))
    if cached:
        print("{0} ✔️".format(member_wikidata))
        return cached[0]

    try:
        # Pass query values via `params` so requests URL-encodes them; Commons
        # file names may contain characters such as "&", "+" or "#".
        claims = requests.get(
            "https://www.wikidata.org/w/api.php",
            params={"action": "wbgetclaims", "entity": member_wikidata,
                    "property": "P18", "format": "json"},
        ).json()
        photo_name = claims["claims"]["P18"][0]["mainsnak"]["datavalue"]["value"]

        image_info = requests.get(
            "https://commons.wikimedia.org/w/api.php",
            params={"action": "query", "prop": "imageinfo", "iiprop": "url",
                    "titles": "File:{0}".format(photo_name), "format": "json"},
        ).json()
        media_url = list(image_info["query"]["pages"].values())[0]["imageinfo"][0]["url"]
    except (KeyError, IndexError):
        # A missing key or an empty list means no usable image is recorded
        print("{0} ❌".format(member_wikidata))
        return None

    # Finally, retrieve the image file, keeping the extension of the Commons file name
    extension = photo_name.split(".")[-1]
    filename = "./member_photos/wiki_{0}.{1}".format(member_wikidata, extension)
    # Scratch name deliberately does not match the "wiki_<id>.*" cache pattern,
    # so an interrupted download is never picked up as a finished image.
    partial_path = "./member_photos/.partial_wiki_{0}.{1}".format(member_wikidata, extension)
    os.makedirs("./member_photos", exist_ok=True)
    try:
        urllib.request.urlretrieve(media_url, partial_path)
        os.replace(partial_path, filename)
    finally:
        if os.path.exists(partial_path):
            os.remove(partial_path)

    print("{0} ✔️".format(member_wikidata))
    return filename
        

In [102]:
from multiprocessing import Pool

# Only members still lacking an "image" path are looked up on Wikidata.
needs_wiki = member_photos.image.isnull()
missing_wikidata_ids = member_photos.loc[needs_wiki, "wikidata_id"].tolist()
with Pool(8) as pool:
    member_photos.loc[needs_wiki, "imgref_wiki"] = pool.map(get_image_from_wiki, missing_wikidata_ids)

Q529775 ✔️
Q1132872 ✔️
Q2527768 ✔️
Q8025679 ✔️
Q112622 ✔️
Q1251997 ✔️
Q1528172 ✔️
Q1252873 ✔️
Q461615 ✔️
Q178468 ✔️
Q528513 ✔️
Q6187332 ✔️
Q5644938 ✔️
Q1067999 ✔️
Q464724 ✔️
Q1689028 ✔️
Q1095185 ✔️
Q2146738 ✔️
Q571792 ✔️
Q577751 ✔️
Q2078827 ✔️
Q514368 ✔️
Q515131 ✔️
Q7920751 ✔️
Q512163 ✔️
Q1736243 ✔️
Q7922415 ✔️
Q538206 ✔️
Q1331285 ✔️
Q517866 ✔️
Q675570 ✔️
Q2177707 ✔️
Q1619314 ✔️
Q461657 ✔️
Q1304191 ✔️
Q2598113 ✔️
Q515269 ✔️
Q2439506 ✔️
Q1903065 ✔️
Q518515 ✔️
Q1181077 ✔️
Q6376235 ✔️
Q1052319 ✔️
Q868577 ✔️
Q370213 ✔️
Q5045141 ✔️
Q1717753 ✔️
Q1664399 ✔️
Q279351 ✔️
Q1138296 ✔️
Q232407 ✔️
Q1601865 ✔️
Q458322 ✔️
Q653258 ✔️
Q240965 ✔️
Q599174 ✔️
Q275753 ✔️
Q1776019 ✔️
Q1967257 ✔️
Q458473 ✔️
Q5340085 ✔️
Q3816306 ✔️
Q2279408 ✔️
Q1161029 ✔️
Q521721 ✔️
Q456750 ✔️
Q1249830 ✔️
Q2576584 ✔️
Q1296186 ✔️
Q5362988 ✔️
Q1529885 ✔️
Q113955 ✔️
Q8025372 ✔️
Q2045562 ✔️
Q1050952 ✔️
Q2655511 ✔️
Q6762003 ✔️
Q649176 ✔️
Q82423 ✔️
Q5372390 ✔️
Q519699 ✔️
Q1736542 ✔️
Q1927918 ✔️
Q1297019 ✔️
Q1264861 ✔️
Q456413 ✔️
Q51

In [134]:
member_photos = member_photos.set_index("bioguide_id")
# Add names: "<first_name> <last_name>" per bioguide_id
# NOTE(review): a bioguide_id with more than one distinct name in `members`
# would yield duplicate rows after the join — confirm names are unique per id.
full_names = (
    members.assign(full_name=members["first_name"] + " " + members["last_name"])
    .loc[:, ["bioguide_id", "full_name"]]
    .drop_duplicates()
    .set_index("bioguide_id")
)
member_photos = member_photos.join(full_names)
# Save to HDF. The second positional argument of to_hdf is the *key*, not the
# file mode, so both are spelled out: mode="w" recreates the file with a single
# dataset, which keeps pd.read_hdf("member_photos.h5") (no key) working.
member_photos.to_hdf("member_photos.h5", key="member_photos", mode="w")

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->['image', 'wikidata_id', 'imgref_wiki', 'img', 'crop', 'full_name']]

  return pytables.to_hdf(path_or_buf, key, self, **kwargs)


### Autodetect faces and do a square crop

In [1]:
import pandas as pd
%matplotlib inline

In [2]:
# No key is given: pandas can only infer it while the file holds a single dataset.
member_photos = pd.read_hdf("member_photos.h5")

In [3]:
# Use images from both github and wikidata: take the path in "image" and fall
# back to "imgref_wiki" for rows where "image" is missing.
member_photos["img"] = member_photos["image"].fillna(member_photos.imgref_wiki)

In [4]:
def check_file(bioguide_id):
    """Return the path of an existing crop for this member, or None.

    Searches ./member_photos/cropped/ for a file named <bioguide_id>.<anything>
    and returns the first match reported by glob.
    """
    import glob
    matches = glob.glob("./member_photos/cropped/{0}.*".format(bioguide_id))
    if not matches:
        return None
    return matches[0]

In [5]:
# Record the crop already on disk (if any) for every member in the index
member_photos["crop"] = [check_file(member_id) for member_id in member_photos.index.tolist()]

How many members are lacking photos?


In [5]:
member_photos.loc[member_photos["img"].isnull()]

Unnamed: 0_level_0,image,wikidata_id,imgref_wiki,img,crop,full_name
bioguide_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
S001139,,Q1933553,,,,Michael Synar


In [7]:
member_photos

Unnamed: 0_level_0,image,wikidata_id,imgref_wiki,img,crop,full_name
bioguide_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
H000900,,Q8025679,./member_photos/wiki_Q8025679.jpg,./member_photos/wiki_Q8025679.jpg,./member_photos/cropped/H000900.png,Winnifred Huck
R000318,,Q528513,./member_photos/wiki_Q528513.jpg,./member_photos/wiki_Q528513.jpg,./member_photos/cropped/R000318.png,Alice Robertson
N000125,,Q571792,./member_photos/wiki_Q571792.jpg,./member_photos/wiki_Q571792.jpg,,Mae Nolan
L000073,,Q1736243,./member_photos/wiki_Q1736243.jpg,./member_photos/wiki_Q1736243.jpg,,Katherine Langley
M000372,,Q2177707,./member_photos/wiki_Q2177707.jpg,./member_photos/wiki_Q2177707.jpg,,Ruth McCormick
O000061,,Q1304191,./member_photos/wiki_Q1304191.jpg,./member_photos/wiki_Q1304191.jpg,,Pearl Oldfield
E000217,,Q1664399,./member_photos/wiki_Q1664399.jpg,./member_photos/wiki_Q1664399.jpg,,Willa Eslick
O000154,,Q1776019,./member_photos/wiki_Q1776019.jpg,./member_photos/wiki_Q1776019.jpg,,Ruth Owen
P000505,,Q3816306,./member_photos/wiki_Q3816306.jpg,./member_photos/wiki_Q3816306.jpg,,Ruth Pratt
W000634,,Q1296186,./member_photos/wiki_Q1296186.jpg,./member_photos/wiki_Q1296186.jpg,,Effiegene Wingo


In [6]:
# Start with an all-None face_locations column unless one is already present
if "face_locations" not in member_photos:
    member_photos["face_locations"] = None

In [22]:
def detect_face(bioguide_id, n=0, display=False):
    """Use face_recognition module to get a padded crop around a detected face.

    Parameters
    ----------
    bioguide_id
        Index label of the member in the notebook-level ``member_photos`` frame.
    n : int
        Which of the detected faces to crop (position in the detection list).
    display : bool
        If true, only preview the source image and the crop; if false, save the
        crop to ``./member_photos/cropped/<bioguide_id>.png``.

    Returns
    -------
    str or None
        Path of the saved crop when ``display`` is false and face ``n`` exists;
        ``None`` otherwise (no source image, no such face, or preview mode).

    Notes
    -----
    Writes ``face_locations``, ``n_crops``, ``crop_id`` and ``crop`` for this
    member into ``member_photos``. When the function runs in a separate process
    (e.g. through ``multiprocessing.Pool``) those writes stay in that process;
    only the return value reaches the caller.
    """
    import os

    import face_recognition
    from PIL import Image
    import matplotlib.pyplot as plt
    from ipykernel.pylab.backend_inline import flush_figures

    img_src = member_photos.loc[bioguide_id, "img"]
    if pd.isnull(img_src):
        # If we don't have an image (None or NaN), just end
        return None

    img = face_recognition.load_image_file(img_src)

    # Reuse earlier detections; anything that is not a list/tuple (None, NaN)
    # means detection has not run yet for this member.
    cached_locations = member_photos.at[bioguide_id, "face_locations"]
    if isinstance(cached_locations, (list, tuple)):
        face_locations = cached_locations
    else:
        face_locations = face_recognition.face_locations(img)
        member_photos.at[bioguide_id, "face_locations"] = face_locations
    member_photos.loc[bioguide_id, "n_crops"] = len(face_locations)
    member_photos.loc[bioguide_id, "crop_id"] = n

    try:
        top, right, bottom, left = face_locations[n]
    except IndexError:
        # No face at position n: nothing to crop, optionally show the source
        if display:
            plt.imshow(img)
    else:
        # Move the box up by 20% of its height. The shift is computed once so
        # that top and bottom move by the same amount and the box keeps its
        # shape (shifting bottom by 20% of the already-moved box squashed it).
        shift = int((bottom - top) * 0.2)
        top -= shift
        bottom -= shift
        # min distance to edge of photo; negative when the moved box pokes past
        # an edge, in which case the "padding" below shrinks the crop instead
        min_dist = min(top, img.shape[0] - bottom, left, img.shape[1] - right)
        # padding = up to twice the face width, limited by the space available
        padding = min(int((right - left) * 2), min_dist)
        face_image = img[top - padding : bottom + padding, left - padding : right + padding]

        if not display:
            image_path = "./member_photos/cropped/{0}.png".format(bioguide_id)
            # Image.save does not create missing directories
            os.makedirs(os.path.dirname(image_path), exist_ok=True)
            Image.fromarray(face_image).save(image_path)
            member_photos.loc[bioguide_id, "crop"] = image_path
            return image_path

        fig = plt.figure()
        ax1 = fig.add_subplot(1, 3, 1)
        ax1.imshow(img)
        ax1.axis("off")

        ax2 = fig.add_subplot(1, 3, 2)
        ax2.imshow(face_image)
        ax2.axis("off")

    flush_figures()
    print(member_photos.loc[bioguide_id])

In [15]:
# Crop all images automatically
from multiprocessing import Pool

uncropped_ids = member_photos.index[member_photos.crop.isnull()].tolist()
with Pool(8) as pool:
    crop_paths = pool.map(detect_face, uncropped_ids)

# The workers run in separate processes, so whatever they write into their own
# copy of member_photos never reaches this kernel. The returned paths are the
# only result that comes back; store them instead of throwing them away.
for member_id, crop_path in zip(uncropped_ids, crop_paths):
    if crop_path is not None:
        member_photos.loc[member_id, "crop"] = crop_path

image                                          None
wikidata_id                                Q2302283
imgref_wiki       ./member_photos/wiki_Q2302283.jpg
img               ./member_photos/wiki_Q2302283.jpg
crop                                           None
full_name                      Gillespie Montgomery
face_locations                                   []
n_crops                                           0
crop_id                                           0
Name: M000865, dtype: object
image                                          None
wikidata_id                                Q1664399
imgref_wiki       ./member_photos/wiki_Q1664399.jpg
img               ./member_photos/wiki_Q1664399.jpg
crop                                           None
full_name                              Willa Eslick
face_locations                                   []
n_crops                                           0
crop_id                                           0
Name: E000217, dtype: object
image 

In [23]:
from ipywidgets import interactive, fixed, widgets, interact
import matplotlib.pyplot as plt

# Members that still have no crop, offered as (label, value) pairs. A dict keyed
# by full name would silently keep only one of several members sharing a name.
uncropped = member_photos.loc[member_photos.crop.isnull()]
bioguide_id_dropdown = widgets.Dropdown(
    options=list(zip(uncropped["full_name"].tolist(), uncropped.index.tolist())))

# Which detected face to use (0-5) and whether to preview instead of saving
n_range = widgets.IntSlider(min=0, max=5, value=0, continuous_update=False)
display_check = widgets.Checkbox(value=True)
interact(detect_face, bioguide_id=bioguide_id_dropdown,
         n=n_range, display=display_check)

<function __main__.detect_face>

In [21]:
def flag_img(flag_bool):
    """Store ``flag_bool`` as ``crop_flag`` for the member selected in the dropdown.

    Also shows that member's source image (when one is recorded) and prints the
    member's name and flag. Does nothing while the dropdown has no selection.
    """
    member_id = bioguide_id_dropdown.value
    if member_id is None:
        # Assigning through .loc with a None label would add a new row
        return
    member_photos.loc[member_id, "crop_flag"] = flag_bool
    # `img` is local to detect_face and not defined at notebook level in the
    # visible cells, so load the picture from the stored path instead.
    img_src = member_photos.loc[member_id, "img"]
    if not pd.isnull(img_src):
        plt.imshow(plt.imread(img_src))
    print(member_photos.loc[member_id][["full_name", "crop_flag"]])

flag_button = widgets.Checkbox(
    value=False,
    description='Flag image',
    icon='check'
)

interact(flag_img, flag_bool=flag_button)

<function __main__.flag_img>

In [37]:
member_photos

Unnamed: 0_level_0,image,wikidata_id,imgref_wiki,img,crop,full_name,face_locations,n_crops,crop_id,crop_flag
bioguide_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
H000900,,Q8025679,./member_photos/wiki_Q8025679.jpg,./member_photos/wiki_Q8025679.jpg,./member_photos/cropped/H000900.png,Winnifred Huck,,,,
R000318,,Q528513,./member_photos/wiki_Q528513.jpg,./member_photos/wiki_Q528513.jpg,./member_photos/cropped/R000318.png,Alice Robertson,,,,
N000125,,Q571792,./member_photos/wiki_Q571792.jpg,./member_photos/wiki_Q571792.jpg,./member_photos/cropped/N000125.png,Mae Nolan,"[(405, 1256, 1363, 298)]",1.0,0.0,
L000073,,Q1736243,./member_photos/wiki_Q1736243.jpg,./member_photos/wiki_Q1736243.jpg,./member_photos/cropped/L000073.png,Katherine Langley,"[(68, 151, 175, 44)]",1.0,0.0,
M000372,,Q2177707,./member_photos/wiki_Q2177707.jpg,./member_photos/wiki_Q2177707.jpg,./member_photos/cropped/M000372.png,Ruth McCormick,"[(116, 378, 223, 270)]",1.0,0.0,
O000061,,Q1304191,./member_photos/wiki_Q1304191.jpg,./member_photos/wiki_Q1304191.jpg,./member_photos/cropped/O000061.png,Pearl Oldfield,"[(167, 489, 390, 266)]",1.0,0.0,
E000217,,Q1664399,./member_photos/wiki_Q1664399.jpg,./member_photos/wiki_Q1664399.jpg,,Willa Eslick,[],0.0,0.0,
O000154,,Q1776019,./member_photos/wiki_Q1776019.jpg,./member_photos/wiki_Q1776019.jpg,./member_photos/cropped/O000154.png,Ruth Owen,"[(1436, 2540, 3092, 884)]",1.0,0.0,
P000505,,Q3816306,./member_photos/wiki_Q3816306.jpg,./member_photos/wiki_Q3816306.jpg,./member_photos/cropped/P000505.png,Ruth Pratt,"[(384, 1311, 705, 990)]",1.0,0.0,
W000634,,Q1296186,./member_photos/wiki_Q1296186.jpg,./member_photos/wiki_Q1296186.jpg,./member_photos/cropped/W000634.png,Effiegene Wingo,"[(869, 1313, 1668, 514)]",1.0,0.0,


In [510]:
# NOTE(review): `dled_images` is not defined in any visible cell of this notebook,
# so this cell would raise NameError on a fresh kernel — presumably
# `member_photos` is meant; confirm before relying on it.
# Also note that "w" is passed positionally as the HDF key, not as a file mode.
dled_images.to_hdf("dled_images.h5", "w")

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block3_values] [items->['First name', 'Last name', 'Party', 'Constituency', 'URI', 'full_name', 'clean_name', 'party_women', 'party_mysoc', 'mp_wikidata', 'mp_wikidata_id', 'party_wikidata', 'imgref', 'imgref_wiki', 'crop_flag', 'crop']]

  return pytables.to_hdf(path_or_buf, key, self, **kwargs)


In [29]:
# NOTE(review): "mp_images.h5" is not written by any visible cell and `mp_images`
# is not used in the visible cells below — confirm this cell is still needed.
mp_images = pd.read_hdf("mp_images.h5")

#### Let's create a base64 map for all mp ids

In [40]:
%%writefile mp_base64.py
#!/usr/bin/env python
def generate_base64(img_src):
    """Return a small JPEG thumbnail of an image as a base64-encoded string.

    Parameters
    ----------
    img_src : str
        Path of the image file to read.

    Returns
    -------
    str
        Base64 text of the JPEG bytes of a thumbnail requested at 16x16.
    """
    from PIL import Image
    from io import BytesIO
    import base64

    # The context manager closes the underlying file once the bytes are produced
    with Image.open(img_src) as img:
        img.thumbnail((16, 16))
        buffer = BytesIO()
        try:
            img.save(buffer, format="JPEG")
        except OSError:
            # JPEG cannot store every mode (e.g. images with an alpha channel
            # or a palette); retry with an RGB copy and a fresh buffer.
            buffer = BytesIO()
            img.convert("RGB").save(buffer, format="JPEG")
    return str(base64.b64encode(buffer.getvalue()), "utf-8")

# read all images and write a csv with base64 strings
import glob
import os
import pandas as pd

# NOTE(review): this reads ./mp_photos/..., while the notebook cells that create
# crops write to ./member_photos/cropped/ — confirm which directory is intended.
images = pd.DataFrame({'src':glob.glob("./mp_photos/cropped/small/*.jpg")})
# The id is the number between the last "-" and the first following "." of the
# *file name*; parsing the full path broke on the leading "./" or on directory
# names whenever the file name itself had no "-".
images["id"] = images["src"].apply(lambda x: int(os.path.basename(x).split("-")[-1].split(".")[0]))
images["base64"] = list(map(generate_base64, images["src"].tolist()))

images[["id", "base64"]].to_csv("mp_base64.csv", index=False)

Writing mp_base64.py
