# Google Image Search Baseline Analysis for Hillary Clinton and Donald Trump Images

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (15, 3)
plt.rcParams['font.family'] = 'sans-serif'

pd.set_option('display.width', 5000) 
pd.set_option('display.max_columns', 60)

## Read in Google Scraper search results table 

In [2]:
tmp = pd.read_csv('HC_baseline_hashedDF.csv', nrows=2)
tmp

Unnamed: 0.1,Unnamed: 0,candidate,count,image_file,image_url,news_source,news_source_url,present,image_hash
0,0,hillary_clinton,0,0_hillary_clinton.jpg,http://pixel.nymag.com/imgs/fashion/daily/2016...,New York Magazine,http://nymag.com/thecut/2016/09/hillary-clinto...,1,1d1a1a9e1e1b0d0f
1,1,hillary_clinton,1,1_hillary_clinton.jpg,http://i2.cdn.cnn.com/cnnnext/dam/assets/16101...,CNN.com,http://www.cnn.com/2016/12/21/politics/donald-...,1,efd3d3a9dbd2a1e1


In [3]:
cols = ['candidate', 'image_file', 'image_url', 'news_source', 'news_source_url', 'image_hash']

In [4]:
HC = pd.read_csv('HC_baseline_hashedDF.csv', usecols=cols)
DT = pd.read_csv('DT_baseline_hashedDF.csv', usecols=cols)
print(len(HC))
print(len(DT))

353
298


___
# NEWS SOURCE INFORMATION

In [5]:
HC.news_source.describe()

count                  353
unique                 144
top       Business Insider
freq                    16
Name: news_source, dtype: object

In [6]:
DT.news_source.describe()

count                  298
unique                 110
top       Business Insider
freq                    21
Name: news_source, dtype: object

In [7]:
HC.news_source_url.describe()

count                                                   353
unique                                                  330
top       http://www.harpersbazaar.com/celebrity/red-car...
freq                                                      5
Name: news_source_url, dtype: object

In [8]:
DT.news_source_url.describe()

count                                                   298
unique                                                  282
top       http://www.reviewjournal.com/opinion/editorial...
freq                                                      4
Name: news_source_url, dtype: object

## Get Political Leaning from Allsides.com for Unique News Sources

In [9]:
HC_unique_source_list = pd.DataFrame(HC.news_source.unique().tolist(), columns=['news_source'])

In [10]:
DT_unique_source_list = pd.DataFrame(DT.news_source.unique().tolist(), columns=['news_source'])

In [11]:
allsides = pd.read_json("../../allsides_api/allsides_data.json")
allsides.head()

Unnamed: 0,allsides_url,bias_rating,news_source,url
0,http://www.allsides.com/node/20678,71,Bruce Braley,http://www.brucebraley.com
1,http://www.allsides.com/node/28762,71,Tallahassee Democrat,http://www.tallahassee.com/
2,http://www.allsides.com/node/20582,71,Mark Udall,http://markudall.com/
3,http://www.allsides.com/node/28766,71,Care 2,http://www.care2.com
4,http://www.allsides.com/node/33340,71,Herald Democrat,http://heralddemocrat.com/


In [12]:
HC_unique_source_list.head()

Unnamed: 0,news_source
0,New York Magazine
1,CNN.com
2,The Federali
3,The Huffington Po
4,Salon


In [13]:
def tag_bias_rating(candidate, allsides):
    
    candidate['bias_rating'] = 0

    for i, valuei in enumerate(candidate.news_source):
        for j, valuej in enumerate(allsides.news_source):
            if valuei == valuej.strip('st') or valuei == valuej.strip('s') or valuei == valuej.strip('t'):
                #print(i, valuei, j, valuej)
                
                if allsides.bias_rating[j] == 71: # Left
                    candidate.bias_rating[i] = -2
                    
                elif allsides.bias_rating[j] == 72: # Lean left
                    candidate.bias_rating[i] = -1
                    
                elif allsides.bias_rating[j] == 73: # center
                    candidate.bias_rating[i] = 0
                    
                elif allsides.bias_rating[j] == 74: # lean right
                    candidate.bias_rating[i] = 1      
                    
                elif allsides.bias_rating[j] == 75: # Right
                    candidate.bias_rating[i] = 2                   
                   
                else:
                    candidate.bias_rating[i] == np.nan
 

In [14]:
tag_bias_rating(HC_unique_source_list, allsides)
tag_bias_rating(DT_unique_source_list, allsides)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [15]:
HC_unique_source_list.bias_rating.value_counts()

 0    123
-1      8
 2      6
-2      5
 1      2
Name: bias_rating, dtype: int64

In [16]:
HC_unique_source_list[HC_unique_source_list.bias_rating == 2.0]

Unnamed: 0,news_source,bias_rating
10,Breitbar,2
22,The Gateway Pundi,2
40,National Review,2
69,Fox New,2
123,Washington Free Beacon,2
133,Washington Examiner,2


In [17]:
DT_unique_source_list.bias_rating.value_counts()

 0    91
-2     8
-1     6
 2     3
 1     2
Name: bias_rating, dtype: int64

In [18]:
DT_unique_source_list[DT_unique_source_list.bias_rating == 1.0]

Unnamed: 0,news_source,bias_rating
34,Washington Time,1
63,The Federali,1


In [19]:
DT_unique_source_list[DT_unique_source_list.bias_rating == -1.0]

Unnamed: 0,news_source,bias_rating
20,Newsweek,-1
25,Vanity Fair,-1
30,Washington Po,-1
54,NBC New,-1
72,PolitiFac,-1
92,Los Angeles Time,-1


In [20]:
DT_unique_source_list[DT_unique_source_list.bias_rating == 2 ]

Unnamed: 0,news_source,bias_rating
0,Fox New,2
10,Breitbar,2
40,National Review,2


## Getting Political Leaning from Allsides from ALL News Sources (cumulative)

In [21]:
HC['bias_rating'] = ''
tag_bias_rating(HC, allsides)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [22]:
DT['bias_rating'] = ''
tag_bias_rating(DT, allsides)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [29]:
DT.bias_rating.value_counts()

 0    218
-2     38
-1     31
 1      6
 2      5
Name: bias_rating, dtype: int64

In [30]:
HC.bias_rating.value_counts()

 0    257
-1     29
-2     28
 1     23
 2     16
Name: bias_rating, dtype: int64

## Save to CSV to be used in "imageAPI_analysis.ipynb"

In [27]:
#HC.to_csv('HC_news.csv', index=False)

In [28]:
#DT.to_csv('DTnews.csv', index=False)