# Google Image Search Baseline Analysis for Hillary Clinton and Donald Trump Images

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (15, 3)
plt.rcParams['font.family'] = 'sans-serif'

pd.set_option('display.width', 5000) 
pd.set_option('display.max_columns', 60)

## Scrape images and metadata from the saved web search source file

### Run `1_extract_images_urls.py` script 
* This script reads the htm source file saved from the Google Image search results page for the queries 'hillary clinton' or 'donald trump' for images from the 3rd September 2016 to the 28th October 2016.
* output will be:
    - metadata saved in `donald_trump_image_universe.csv` or `hillary_clinton_image_universe.csv`
    - Images saved in `donald_trump_image_universe` or `hillary_clinton_image_universe` directories

## Manually check images:
* Remove images that are not of the candidate
* Remove images where there are multiple faces
* Manually collect images that failed to download properly. This is sometimes because more time was needed to scrape it

### Run `2_remove_rejected_from_data.py` script
* Removes rows from the `donald_trump_image_universe.csv` or `hillary_clinton_image_universe.csv` spreadsheets, so it correctly references the remaining images in the `donald_trump_image_universe` or `hillary_clinton_image_universe` image directories.

## Programmatically identify unique images
* Read in each image, calculate a unique hash based on the image data, a bit like a fingerprint, and store the hash along with the image filename in a shelf (a persistent dictionary). Many images were repeats, so there were many image filenames stored with the same hash key. 

* The code is based on this blog post: https://realpython.com/blog/python/fingerprinting-images-for-near-duplicate-detection/ which I adapted to work with Python3

In [2]:
%run ../index.py --dataset ./hillary_clinton_image_universe --shelve hc_baseline.shelve

1d1a1a9e1e1b0d0f
8d4d4d4d49191b5a
68e4c4554d425e46
cd0d4d4d8e1b2b2b
09090d0d89898dcc
868b8f07078f2e2d
6a3a96868f9f1488
1606075d4e5d9d6f
decfc7c7cec6cecc
96b8869e8e8e8686
cacbcdc7c4c4c787
1b191d1e121b258d
92183818381c96c2
793333139312f362
33706c6c2c3c1a93
5b5b0f6f2f69484c
cb9e99999bdfbe8e
e66e6e6e6767e7c6
4f46666624ec8999
3434246574bca969
0d9d8d9d8d9b3f0d
4d6f6f5f9faf8b0f
0e4e4ece4e4fc989
1f1b1b1f1f1f1707
4c968e060e0d2d6c
0d0d8d8ccccaa3a7
96864e4e0f07e6f6
6d4c0fcc4c8c3636
0d4f0e8e2f2764d4
60664646464cc6c2
9b06c6cec6a7fbfb
050f0f0f070f6b5b
8b8a989e9e8ece46
7052d2d3d2d2b119
86ce5b4a8667454b
1f3b6969697b3232
acaeaaaaaca6a2a6
2b27032327273327
4c0e0e4d4d4c680c
cdcdcd4d664b4f13
af4f4f1b0d0d9f9e
246c8ccc4c6d4d18
db4dac6f6aa6674f
98383c3c3c3c2c8e
56271656243626b2
0fcf4f4fcfcf9b9f
1c3d3d35747676f7
07ce8e6c37683e3d
7f5f5b4f4f5bd2d2
1d4d4f474707cd8b
85868e8e9ad17131
9b2c3d3f9f9c5a18
0f1f4b1f9f1f3fcc
1e0d0e46c6462632
93ab6b6d4de9ed2b
4f0f1f1f0f4e0e1b
61890e6c4fe76b43
cd8d0d4f0e160636
4761c32c0d4e1a

In [3]:
%run ../index.py --dataset ./donald_trump_image_universe --shelve dt_baseline.shelve

4c8686864e4c4c0d
4ae5c0e47ebdba7a
59998a2869c9c9c8
474347434e6d5697
901d0d0e1d1dbedb
c4cbcc88cc6cc82b
cd8dcdcece4e4fcd
d43a3a3ab89c3d35
e0cc0e84ced695bd
8d8c4e0d07026a6a
1b0b3b3129694868
4d4d474f47059d0c
993b3333ab889c9e
4e9e8f86874b4667
0f0f4f4f4f4ccc8f
9d4c6652d8d89ada
01210d292c0f434b
4d5a595d1d1d0d2d
864e0627054f0d4d
3f0f4f1f2464646e
929a9a9a1e3c3232
4dcf8d4d230b0ecf
498d9d9d9c9b1f25
7171727ad8d8d859
cccece8e2b4b4f97
cfe3b3b2b23a3e1a
6717c77625ec6c6c
98989899091b1b5b
4e4d4e87474a4a4a
663ec6d8c8c8cc49
3e0e0d0d0d9b1d0c
4d4dcc4c4c9c4c4f
b18d4dcdcd8d8d0d
129e929696323dba
31342c2c29694ccc
31367252ded690b8
6c7471797c643430
1623232362764646
4f8f2b294ccf0766
460f0d0d494dcccc
34b63933b3337676
8d9d9f8b2b0b092e
2424246636363230
0d8eca0c6c24c4c6
347d5c7d6d3f3b3b
39d899195959751f
343a3b1b5adbf138
414347d3c92a28cc
cf4f93e06cce8ea9
3d3c2d2d2d6d6725
9098d8d9d84ccd4f
d3f0506064ececf2
07274fc6ceac9696
5327c76333371347
0f0f0f6d6d6d6cc4
d19233f26cac8d89
6561c4c441616a62
4d1e1e9e9b9b97cf
494d6d6d694ccc

## Add the hash to each row in our `hillary_clinton_remaining_images.csv` and `donald_trump_remaining_images.csv` dataframes

In [4]:
import shelve

In [5]:
# This is the same as script `3_baseline_imagesintoDF.py`

def add_hash_to_DF(shelf, data_csv, candidate):
    """Attach each image's fingerprint hash to its row and save the result.

    Parameters
    ----------
    shelf : str
        Shelf name without the ``.shelve`` suffix (e.g. ``'hc_baseline'``).
        Each key of the shelf is an image hash and each value is an iterable
        of the image file names that share that hash.
    data_csv : str
        Path of the CSV to read; it must contain an ``image_file`` column.
    candidate : str
        Prefix of the output file, which is written to
        ``<candidate>_baseline_hashedDF.csv`` (without the index).

    Notes
    -----
    * Rows whose ``image_file`` is not in the shelf keep their existing
      ``image_hash`` if the input already has that column, otherwise NaN.
    * The ``image_hash`` column is always present in the output, even when
      no file matched.
    """
    # Invert the shelf (hash -> [files]) into a flat file -> hash lookup.
    # If a file is listed under several hashes, the last one iterated wins,
    # which is what a row-by-row overwrite would also produce.
    file_to_hash = {}
    # The context manager closes the shelf even if something below raises.
    with shelve.open(shelf + '.shelve') as db:
        for key in db.keys():
            for f in db[key]:
                file_to_hash[f] = key

    data = pd.read_csv(data_csv)

    # One vectorised lookup instead of scanning every row for every file.
    hashes = data['image_file'].map(file_to_hash)
    if 'image_hash' in data.columns:
        # Do not wipe hashes that were already present for unmatched rows.
        hashes = hashes.fillna(data['image_hash'])
    data['image_hash'] = hashes

    data.to_csv(candidate + "_baseline_hashedDF.csv", index=False)

In [6]:
add_hash_to_DF('hc_baseline', 'hillary_clinton_remaining_images.csv', 'HC' )

In [7]:
add_hash_to_DF('dt_baseline', 'donald_trump_remaining_images.csv', 'DT' )

## Read in Google Scraper search results table 

In [8]:
# Peek at the first two rows only, to see which columns the hashed CSV holds
# before choosing the ones to load.
tmp = pd.read_csv('HC_baseline_hashedDF.csv', nrows=2)
tmp

Unnamed: 0.1,Unnamed: 0,candidate,count,image_file,image_url,news_source,news_source_url,present,image_hash
0,0,hillary_clinton,0,0_hillary_clinton.jpg,http://pixel.nymag.com/imgs/fashion/daily/2016...,New York Magazine,http://nymag.com/thecut/2016/09/hillary-clinto...,1,1d1a1a9e1e1b0d0f
1,1,hillary_clinton,1,1_hillary_clinton.jpg,http://i2.cdn.cnn.com/cnnnext/dam/assets/16101...,CNN.com,http://www.cnn.com/2016/12/21/politics/donald-...,1,efd3d3a9dbd2a1e1


In [9]:
# Columns to load from the hashed CSVs; the leftover `Unnamed: 0*` index
# columns, `count` and `present` seen in the preview above are left out.
cols = ['candidate', 'image_file', 'image_url', 'news_source', 'news_source_url', 'image_hash']

In [10]:
# Load each candidate's hashed table restricted to `cols`, then print the
# number of rows (one row per image) in each.
HC = pd.read_csv('HC_baseline_hashedDF.csv', usecols=cols)
DT = pd.read_csv('DT_baseline_hashedDF.csv', usecols=cols)
print(len(HC))
print(len(DT))

353
298


___
# NEWS SOURCE INFORMATION

In [11]:
HC.news_source.describe()

count                  353
unique                 144
top       Business Insider
freq                    16
Name: news_source, dtype: object

In [12]:
DT.news_source.describe()

count                  298
unique                 110
top       Business Insider
freq                    21
Name: news_source, dtype: object

In [13]:
HC.news_source_url.describe()

count                                                   353
unique                                                  330
top       http://www.harpersbazaar.com/celebrity/red-car...
freq                                                      5
Name: news_source_url, dtype: object

In [14]:
DT.news_source_url.describe()

count                                                   298
unique                                                  282
top       http://www.reviewjournal.com/opinion/editorial...
freq                                                      4
Name: news_source_url, dtype: object

## Getting the political leaning of each image's news source from [Allsides](https://www.allsides.com), for all images in the baseline dataset

![allsides](../logo-all-sides-medium.png) 
Allsides bias data was generously provided by [Allsides](https://www.allsides.com) 


In [15]:
def tag_bias_rating(candidate):
    """Add an Allsides-based ``bias_rating`` column to ``candidate`` in place.

    Parameters
    ----------
    candidate : pandas.DataFrame
        Table with a ``news_source`` column. A ``bias_rating`` column is
        added (or overwritten) with one integer per row:
        -2 Left, -1 Lean left, 0 Center, 1 Lean right, 2 Right, and
        999 when the source is not found or its Allsides code is not one of
        the five known codes.

    Notes
    -----
    Reads ``allsides_data.json`` from the working directory; it must provide
    ``news_source`` and ``bias_rating`` columns. Rows are written by position,
    so the function works whatever index ``candidate`` carries.
    """
    # Allsides numeric codes -> the signed five-point scale used in this notebook.
    allsides_code_to_scale = {71: -2, 72: -1, 73: 0, 74: 1, 75: 2}
    allsides = pd.read_json('allsides_data.json')

    # Many scraped source names lost their trailing `s`/`t` characters
    # (e.g. "Washington Po", "The Federali"), apparently during metadata
    # extraction. To match them, every Allsides name is registered under its
    # stripped variants as well. Note that str.strip(chars) removes any run of
    # the given characters from BOTH ends, not a literal suffix.
    # The unmodified name is registered too, so an intact source name matches.
    rating_by_name = {}
    for name, code in zip(allsides['news_source'], allsides['bias_rating']):
        if not isinstance(name, str):
            continue  # missing/invalid name: nothing to match against
        rating = allsides_code_to_scale.get(code, 999)
        for variant in (name, name.strip('st'), name.strip('s'), name.strip('t')):
            # Later Allsides rows overwrite earlier ones for the same variant,
            # i.e. the last matching row decides the rating.
            rating_by_name[variant] = rating

    candidate['bias_rating'] = [
        rating_by_name.get(source, 999) for source in candidate['news_source']
    ]

In [16]:
tag_bias_rating(HC)
tag_bias_rating(DT)

In [17]:
HC.bias_rating.value_counts()

 999    227
 0       30
-1       29
-2       28
 1       23
 2       16
Name: bias_rating, dtype: int64

In [18]:
DT.bias_rating.value_counts()

 999    175
 0       43
-2       38
-1       31
 1        6
 2        5
Name: bias_rating, dtype: int64

In [19]:
HC[HC.bias_rating == 999].news_source.unique()

array(['CNN.com', 'The Huffington Po', 'Daily Expre',
       'Cincinnati Enquirer', 'People', 'The Denver Po', 'Reveli',
       'Us Weekly', 'CNBC.com', 'The New Yorker', 'Deadline', 'NJ.com',
       'Star Tribune', 'The Daily Bea', 'WBUR', 'Legal Insurrection',
       'Conservative Review', 'Psychology Today', 'The Telegraph',
       'The Forward', 'Investor\\u0027s Business Daily',
       'Encyclopedia Britannica', 'Snopes.com', 'CBS New', 'Radar Online',
       'The American Mirror', 'Slate', 'National Enquirer',
       'News \\u0026 Observer', 'Kansas City Star', 'Politicus USA',
       'Giphy', 'The Stranger', 'Zillow', 'ABC News - Go.com',
       'US.Blasting.New', 'Money Nation', 'MinnPo',
       'Harper\\u0027s Bazaar', 'Middle East Monitor',
       'The Political Insider', 'Funny Or Die', 'MarketWatch',
       'Al Jazeera', 'Natural New', 'BBC.com', 'The Inquisitr',
       'Hollywood Reporter', 'Miami Herald', 'New York Daily New',
       'The Independen', 'DeSmogBlog', 'Hilla

In [20]:
len(HC[HC.bias_rating == 999].news_source.unique())

115

In [21]:
DT[DT.bias_rating == 999].news_source.unique()

array(['Wikipedia', 'Fortune', 'BBC.com', 'TheWrap', 'ABC News - Go.com',
       'Right Wing Watch', 'Cargo', 'Esquire', 'The New York Time',
       'Slate', 'The New Yorker', 'Variety', 'NDTV.com', 'VICE New',
       'Hollywood Reporter', 'Conservapedia', 'Wikiquote',
       'National Catholic Reporter', 'CNN.com', 'CNN Money', 'CBN.com',
       'Sporting New', 'News Examiner', 'Attn', 'The Fader', 'WYFF.com',
       'PBS', 'US News \\u0026 World Repor', 'People', 'Snopes.com',
       'am New York', 'LifeNews.com', 'The Daily Bea', 'Detroit Free Pre',
       'Charlotte Observer', 'Al Jazeera', 'Encyclopedia Britannica',
       'Foreign Policy', 'Las Vegas Review-Journal', 'CBS New',
       'Big Think', 'Newsday', 'The Slot - Jezebel', 'City Page',
       'OpenInve', 'Us Weekly', 'CNBC.com', 'Alterne',
       'Wall Street Journal', 'Catholic News Agency', 'Miami Herald',
       'Sports Illustrated', 'The Boston Globe', 'Deadline', 'THE VOTER',
       'Natural New', 'Washington Blade', 

In [22]:
len(DT[DT.bias_rating == 999].news_source.unique())

83

## Supplement Allsides bias rating with data from a [Facebook political bias ratings study](http://science.sciencemag.org/content/early/2015/05/06/science.aaa1160)
Data is available [here](https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/LDJ7MS)

In [23]:
facebook = pd.read_csv('../Facebook_study.csv')

In [24]:
facebook.head()

Unnamed: 0,p,avg_align,Unnamed: 2,Unnamed: 3,Unnamed: 4,Source,top_level_link,top_level_link_cleaned,avg_align.1,Total_articles
0,6abc.com,-0.5037,,,,The Hill,http://thehill.com/,thehill.com,0.1661,30.0
1,7online.com,-0.4067,,,,Politico,http://www.politico.com/,politico.com,-0.1334,27.0
2,aattp.org,-0.8936,,,,The Washington Post,https://www.washingtonpost.com,washingtonpost.com,-0.2568,25.0
3,abc11.com,-0.19,,,,The Huffington Post,http://www.huffingtonpost.com/,huffingtonpost.com,-0.6176,21.0
4,abc7.com,-0.31,,,,New York Times,http://www.nytimes.com/,nytimes.com,-0.5469,18.0


In [25]:
# Re-read the study data keeping only `p` (site domains such as 6abc.com) and
# `avg_align`; the other columns in the preview above are unused here.
# NOTE: this rebinds `cols`, which earlier held the HC/DT column list.
cols = ['p', 'avg_align']
facebook = pd.read_csv('../Facebook_study.csv', usecols=cols)
facebook.head()

Unnamed: 0,p,avg_align
0,6abc.com,-0.5037
1,7online.com,-0.4067
2,aattp.org,-0.8936
3,abc11.com,-0.19
4,abc7.com,-0.31


In [26]:
def tag_facebookbias_rating(candidate):
    """Add Facebook-study alignment columns to ``candidate`` in place.

    For every row, the host part of ``news_source_url`` is looked up in the
    ``p`` column of the module-level ``facebook`` table.

    Parameters
    ----------
    candidate : pandas.DataFrame
        Table with a ``news_source_url`` column. Two columns are added (or
        overwritten):

        * ``facebook_p`` - the matched domain, or ``''`` when none matched;
        * ``facebookbias_rating`` - the matched ``avg_align`` value, or 999
          when none matched.

    Notes
    -----
    A URL without a ``//`` separator is treated as starting directly with the
    host, and a missing (non-string) URL is simply left unrated. Rows are
    written by position, so any index on ``candidate`` is fine.
    """
    # domain -> alignment score; for a repeated domain the last row wins,
    # as it would in a full scan that keeps overwriting.
    align_by_domain = dict(zip(facebook['p'], facebook['avg_align']))

    matched_domains = []
    ratings = []
    for url in candidate['news_source_url']:
        if isinstance(url, str):
            # "http://www.cnn.com/2016/..." -> "www.cnn.com"
            _, sep, rest = url.partition('//')
            host = (rest if sep else url).split('/', 1)[0]
        else:
            host = None  # NaN / missing URL

        if host in align_by_domain:
            # Same "<url host> <facebook domain>" trace line as before.
            print(host, host)
            matched_domains.append(host)
            ratings.append(align_by_domain[host])
        else:
            matched_domains.append('')
            ratings.append(999)

    candidate['facebook_p'] = matched_domains
    candidate['facebookbias_rating'] = ratings

In [27]:
tag_facebookbias_rating(HC)
tag_facebookbias_rating(DT)

www.cnn.com www.cnn.com
thefederalist.com thefederalist.com
www.huffingtonpost.com www.huffingtonpost.com
www.salon.com www.salon.com
www.businessinsider.com www.businessinsider.com
www.businessinsider.com www.businessinsider.com
www.businessinsider.com www.businessinsider.com
www.cnn.com www.cnn.com
www.denverpost.com www.denverpost.com
www.cnn.com www.cnn.com
www.breitbart.com www.breitbart.com
www.businessinsider.com www.businessinsider.com
www.cnbc.com www.cnbc.com
www.newyorker.com www.newyorker.com
www.politico.com www.politico.com
www.nj.com www.nj.com
www.businessinsider.com www.businessinsider.com
www.thedailybeast.com www.thedailybeast.com
www.chicagotribune.com www.chicagotribune.com
thefederalist.com thefederalist.com
www.thegatewaypundit.com www.thegatewaypundit.com
legalinsurrection.com legalinsurrection.com
www.telegraph.co.uk www.telegraph.co.uk
www.breitbart.com www.breitbart.com
www.newyorker.com www.newyorker.com
www.npr.org www.npr.org
www.telegraph.co.uk www.telegr

In [28]:
def convert_facebookbias_toInts(col):
    """Bucket a Facebook-study alignment score onto the five-point scale.

    Returns 2 for scores in [0.6, 1], 1 for [0.2, 0.6), 0 for (-0.2, 0.2),
    -1 for (-0.6, -0.2] and -2 for anything at or below -0.6. Every other
    value - the 999 "no rating" sentinel, anything above 1, or NaN - maps
    to 999.
    """
    # Anything that is not <= 1 (sentinel, out-of-range, NaN) is "no rating".
    if not (col <= 1):
        return 999

    # From here on col <= 1, so each lower bound alone selects the bucket.
    if col >= 0.6:
        return 2
    if col >= 0.2:
        return 1
    if col > -0.2:
        return 0
    if col > -0.6:
        return -1
    return -2

In [29]:
HC['facebook_int'] = HC.facebookbias_rating.apply(convert_facebookbias_toInts)
DT['facebook_int'] = DT.facebookbias_rating.apply(convert_facebookbias_toInts)

In [30]:
HC.head()

Unnamed: 0,candidate,image_file,image_url,news_source,news_source_url,image_hash,bias_rating,facebook_p,facebookbias_rating,facebook_int
0,hillary_clinton,0_hillary_clinton.jpg,http://pixel.nymag.com/imgs/fashion/daily/2016...,New York Magazine,http://nymag.com/thecut/2016/09/hillary-clinto...,1d1a1a9e1e1b0d0f,-2,,999.0,999
1,hillary_clinton,1_hillary_clinton.jpg,http://i2.cdn.cnn.com/cnnnext/dam/assets/16101...,CNN.com,http://www.cnn.com/2016/12/21/politics/donald-...,efd3d3a9dbd2a1e1,999,www.cnn.com,-0.2705,-1
2,hillary_clinton,2_hillary_clinton.jpg,http://cdn.thefederalist.com/wp-content/upload...,The Federali,http://thefederalist.com/2016/10/20/the-five-m...,9f9d8f9ddf193656,1,thefederalist.com,0.8559,2
3,hillary_clinton,3_hillary_clinton.jpg,http://i.huffpost.com/gen/4727452/images/n-HIL...,The Huffington Po,http://www.huffingtonpost.com/isaac-saul/i-wro...,9819181899191c18,999,www.huffingtonpost.com,-0.6176,-2
4,hillary_clinton,4_hillary_clinton.jpg,http://media.salon.com/2016/10/hillary_clinton...,Salon,http://www.salon.com/2016/10/20/hillary-clinto...,4c0c0c4914363f3f,-2,www.salon.com,-0.8753,-2


In [31]:
def _rating_side(rating):
    """Classify one integer rating as 'Left', 'Center', 'Right', 'missing' (999) or None."""
    if rating < 0:
        return "Left"
    if rating == 0:
        return "Center"
    if 0 < rating < 3:
        return "Right"
    if rating == 999:
        return "missing"
    return None  # any other value (e.g. NaN) is not a usable rating


def combine_ratings(candidate):
    """Merge the Allsides and Facebook ratings into one ``combine_rating`` label.

    Parameters
    ----------
    candidate : pandas.DataFrame
        Table with ``bias_rating`` (Allsides) and ``facebook_int`` (Facebook
        study) columns, both on the -2..2 scale with 999 meaning "no rating".
        A ``combine_rating`` column is added (or overwritten) in place.

    The label is "Left", "Center" or "Right" when both scales fall on that
    side, or when one scale does and the other has no rating. It is
    "Unknown / unreliable" when neither scale has a rating or the two
    disagree. One trace line per row is printed, showing both ratings and
    the outcome ("Not Rated" for the unknown case).

    Rows are written by position, so any index on ``candidate`` is fine.
    """
    sides = ("Left", "Center", "Right")
    labels = []

    for allsides_rating, facebook_rating in zip(candidate['bias_rating'],
                                                candidate['facebook_int']):
        allsides_side = _rating_side(allsides_rating)
        facebook_side = _rating_side(facebook_rating)

        if allsides_side in sides and facebook_side in (allsides_side, "missing"):
            # Both agree, or only Allsides has a rating.
            label = allsides_side
        elif allsides_side == "missing" and facebook_side in sides:
            # Only the Facebook study has a rating.
            label = facebook_side
        else:
            # Absent on both scales, or the scales disagree.
            label = None

        print(allsides_rating, facebook_rating, label or "Not Rated")
        labels.append(label or "Unknown / unreliable")

    candidate['combine_rating'] = labels

In [32]:
combine_ratings(HC)
combine_ratings(DT)

-2 999 Left
999 -1 Left
1 2 Right
999 -2 Left
-2 -2 Left
0 0 Center
0 0 Center
999 999 Not Rated
0 0 Center
999 999 Not Rated
999 -1 Left
999 999 Not Rated
999 -1 Left
999 -1 Left
2 2 Right
999 999 Not Rated
999 999 Not Rated
0 0 Center
999 0 Center
999 999 Not Rated
999 -2 Left
0 0 Center
999 999 Not Rated
999 -1 Left
999 999 Not Rated
0 0 Center
999 -1 Left
0 -1 Not Rated
999 999 Not Rated
1 2 Right
2 2 Right
999 2 Right
999 999 Not Rated
999 999 Not Rated
999 999 Not Rated
999 0 Center
2 2 Right
999 999 Not Rated
999 999 Not Rated
999 999 Not Rated
999 -2 Left
0 -2 Not Rated
999 0 Center
-2 -1 Left
999 999 Not Rated
999 0 Center
999 0 Center
2 2 Right
999 999 Not Rated
999 999 Not Rated
999 999 Not Rated
-1 -1 Left
-1 -1 Left
999 -2 Left
-2 -1 Left
999 -2 Left
-2 999 Left
999 999 Not Rated
2 2 Right
999 999 Not Rated
999 -1 Left
-2 -1 Left
999 999 Not Rated
999 999 Not Rated
-2 999 Left
999 -2 Left
999 999 Not Rated
999 999 Not Rated
999 999 Not Rated
-1 -1 Left
999 -1 Left
999 999 

In [33]:
HC.combine_rating.value_counts()

Unknown / unreliable    136
Left                    121
Center                   48
Right                    48
Name: combine_rating, dtype: int64

In [34]:
DT.combine_rating.value_counts()

Left                    136
Unknown / unreliable     98
Center                   48
Right                    16
Name: combine_rating, dtype: int64