In [None]:
# Important: Code works only in Python 2
# Google Search API: https://github.com/abenassi/Google-Search-API
# tldextract: https://pypi.python.org/pypi/tldextract

# I installed these packages to run this code
# pip install Google-Search-API
# pip install tldextract
# pip install urlparse  (urlparse ships with the Python 2 standard library, so this install should not be needed)

For each query string, this notebook fetches the top Google Search results, checks whether each result's domain is in the Master List provided by the Southern Poverty Law Center, and reports how many times those domains appear for each search.

Inputs: 
- Master List -> HateDomainMasterList_Clean.csv
- List of Queries -> ['are immigrants more likely to be on welfare', 'are immigrants allowed to vote', ...] (example)

Output: 
- Table with every result for each query, its rank in the search results, and whether its domain is in the Master List (exported as .csv).



In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tldextract
from google import google
from collections import Counter
from urlparse import urlparse

%matplotlib inline

ImportError: No module named tldextract

### Functions

Step-by-step explanation of this function at the end of the notebook (Appendix)

Function inputs: query, Master List, and the number of Google Search pages to fetch.

In [11]:
def search(query, master_list, pages = 2):
    """Search Google for ``query`` and flag results whose domain is in ``master_list``.

    Parameters
    ----------
    query : str
        Search string passed to ``google.search``.
    master_list : iterable of str
        Domains in ``"domain.suffix"`` form (e.g. ``'fairus.org'``) to look for.
    pages : int, optional
        Number of Google result pages to request (default 2).

    Returns
    -------
    pandas.DataFrame
        One row per search result, in the order returned by the search, with
        the columns 'Query', 'Results' (the link), 'Page Domain', 'Rank'
        (1-based position) and 'In Master List' ('Yes' or 'No').

    Side effect: prints the query together with a ``Counter`` of the matched
    domains.
    """
    # Domains that are never reported as a match, even when the master list
    # contains them (presumably because they are shared platforms hosting
    # many unrelated sites -- TODO confirm).
    ignored_domains = ('google.com', 'facebook.com', 'wordpress.com', 'blogspot.com')
    known_domains = set(master_list)  # constant-time membership tests

    search_results = google.search(query, pages)
    links = [hit.link for hit in search_results]

    # Reduce every link to "domain.suffix". This string is used both for the
    # 'Page Domain' column and for the master-list lookup, so the two always agree.
    root = []
    for link in links:
        extracted = tldextract.extract(link)
        root.append("{}.{}".format(extracted.domain, extracted.suffix))

    # One boolean per result: True when the domain counts as a master-list hit.
    is_hit = [site not in ignored_domains and site in known_domains for site in root]
    in_list = ['Yes' if hit else 'No' for hit in is_hit]
    res = [site for site, hit in zip(root, is_hit) if hit]

    # Single-argument print(...) produces the same text on Python 2 and Python 3.
    print("Query: {} Links Found: {}".format(query, Counter(res)))

    # Consolidated table; `columns=` fixes the column order explicitly.
    column_order = ['Query', 'Results', 'Page Domain', 'Rank', 'In Master List']
    results = pd.DataFrame(
        {'Query': [query] * len(links),
         'Results': links,
         'Page Domain': root,
         'Rank': list(range(1, len(links) + 1)),
         'In Master List': in_list},
        columns=column_order)

    return results
    

### Loading Master List


In [15]:
# Load the master list and keep only the rows that have a primary website
master = (
    pd.read_csv('starting dataset(s)/HateDomainMasterList_Clean.csv')
      .dropna(subset=['Website'])
)

In [16]:
# Preview the first rows of the master list
master.head()

Unnamed: 0,Group,Ideology,Website,Website 2,Website 3
0,American Border Patrol,Anti-Immigrant,http://www.americanborderpatrol.com,,
1,American Immigration Control Foundation,Anti-Immigrant,http://www.immigrationcontrol.com,,
2,Americans for Legal Immigration (ALIPAC),Anti-Immigrant,http://www.alipac.us,,
3,Borderkeepers of Alabama,Anti-Immigrant,http://www.borderkeepersofalabama.org,,
4,Californians for Population Stabilization,Anti-Immigrant,http://www.capsweb.org,,


In [None]:
# Reduce every website URL to its "domain.suffix" form and store it in a new column
domain = [
    "{}.{}".format(parts.domain, parts.suffix)
    for parts in (tldextract.extract(url) for url in master['Website'])
]
master['Domain'] = domain

In [17]:
# Plain Python list of the master-list domains, skipping entries whose text is 'nan'
cleanedList = [entry for entry in list(master['Domain']) if str(entry) != 'nan']
master_list = cleanedList

### Individual query results

Run the function that gathers the results from the first n pages (default is 2) of Google Search, checks which ones are in the Master List, and returns a DataFrame showing the results.

In [26]:
# Run a single query through search() and display the resulting table.
# NOTE(review): the saved output below was produced for the query
# "are immigrants allowed to vote", not the query in this cell --
# re-run the cell to refresh the output.
results = search('bias against white males', master_list, pages = 2)
results

Query: are immigrants allowed to vote Links Found: Counter({'fairus.org': 1})


Unnamed: 0,Query,Results,Page Domain,Rank,In Master List
0,are immigrants allowed to vote,https://en.wikipedia.org/wiki/Right_of_foreign...,wikipedia.org,1,No
1,are immigrants allowed to vote,http://www.newsweek.com/immigrants-are-getting...,newsweek.com,2,No
2,are immigrants allowed to vote,http://www.politifact.com/florida/statements/2...,politifact.com,3,No
3,are immigrants allowed to vote,https://www.thoughtco.com/can-i-vote-1951751,thoughtco.com,4,No
4,are immigrants allowed to vote,https://fairus.org/issue/societal-impact/nonci...,fairus.org,5,Yes
5,are immigrants allowed to vote,http://www.foxnews.com/us/2017/09/13/illegal-i...,foxnews.com,6,No
6,are immigrants allowed to vote,http://www.foxnews.com/politics/2017/09/16/mar...,foxnews.com,7,No
7,are immigrants allowed to vote,http://time.com/4859478/immigrant-voters-history/,time.com,8,No
8,are immigrants allowed to vote,http://time.com/4669899/illegal-citizens-votin...,time.com,9,No
9,are immigrants allowed to vote,https://www.washingtontimes.com/news/2016/apr/...,washingtontimes.com,10,No


### Multiple query results
Accepts a list of query strings and loops over it, calling the function that gathers the results from the first n pages (default is 2) of Google Search, checks which ones are in the Master List, and returns a DataFrame showing the results.

In [33]:
# Queries to run, in order: immigration-related ones first, then other topics.
query_list = [
    # Immigration
    "are immigrants more likely to be on welfare",
    "are immigrants allowed to vote",
    "cost of illegal immigration",
    "map of sanctuary cities",
    "percentage of immigrants on welfare",
    "how do illegal immigrants get welfare",
    "number of deportations by year",
    "history of us immigration policy",
    "can illegal aliens get welfare",
    "us citizens losing jobs to immigrants",
    # Other topics
    "is homosexuality a sin",
    "Allahu Akbar",
    "transgender bathroom",
    "are you born gay",
    "islam mohammed",
    "Black lives matter",
    "Where Obama was born",
    "gay people in military",
    "Confederate statues taken down",
    "White and slavery in US",
    "Percentage of minorities in US",
]

In [34]:
# Run every query once and stack the per-query tables into a single DataFrame.
# The frames are collected first and concatenated in ONE call: concatenating
# inside the loop re-copies all accumulated rows on every iteration.
per_query_frames = []
for q in query_list:
    results = search(q, master_list, pages = 2)
    per_query_frames.append(results)

# ignore_index=True gives the combined table a fresh 0..n-1 index
# (replaces the former reset_index().drop('index', 1) step).
consolidated = pd.concat(per_query_frames, ignore_index=True)

# Overall number of hits per master-list domain across all queries.
# Single-argument print(...) produces the same text on Python 2 and Python 3.
print("\n Consolidated count: {}".format(
    Counter(consolidated[consolidated['In Master List'] == 'Yes']['Page Domain'])))
consolidated

Query: are immigrants more likely to be on welfare Links Found: Counter({'cis.org': 2, 'fairus.org': 1})
Query: are immigrants allowed to vote Links Found: Counter({'fairus.org': 1})
Query: cost of illegal immigration Links Found: Counter({'fairus.org': 1, 'cis.org': 1})
Query: map of sanctuary cities Links Found: Counter({'cis.org': 2})
Query: percentage of immigrants on welfare Links Found: Counter({'fairus.org': 1, 'cis.org': 1})
Query: how do illegal immigrants get welfare Links Found: Counter({'fairus.org': 1, 'cis.org': 1})
Query: number of deportations by year Links Found: Counter({'cis.org': 1})
Query: history of us immigration policy Links Found: Counter({'cis.org': 2, 'fairus.org': 1, 'history.com': 1, 'cairco.org': 1})
Query: can illegal aliens get welfare Links Found: Counter({'fairus.org': 1, 'cis.org': 1})
Query: us citizens losing jobs to immigrants Links Found: Counter({'cis.org': 1})
Query: is homosexuality a sin Links Found: Counter()
Query: Allahu Akbar Links Found: 

Unnamed: 0,Query,Results,Page Domain,Rank,In Master List
0,are immigrants more likely to be on welfare,https://cis.org/Report/Welfare-Use-Immigrant-a...,cis.org,1,Yes
1,are immigrants more likely to be on welfare,https://cis.org/Report/Cost-Welfare-Use-Immigr...,cis.org,2,Yes
2,are immigrants more likely to be on welfare,https://www.vox.com/policy-and-politics/2017/8...,vox.com,3,No
3,are immigrants more likely to be on welfare,https://www.usatoday.com/story/news/nation/201...,usatoday.com,4,No
4,are immigrants more likely to be on welfare,https://fairus.org/issue/workforce-and-economy...,fairus.org,5,Yes
5,are immigrants more likely to be on welfare,https://newrepublic.com/article/122714/immigra...,newrepublic.com,6,No
6,are immigrants more likely to be on welfare,https://www.cato.org/blog/cis-exaggerates-cost...,cato.org,7,No
7,are immigrants more likely to be on welfare,http://www.nationalreview.com/corner/444543/im...,nationalreview.com,8,No
8,are immigrants more likely to be on welfare,https://www.washingtonpost.com/world/national-...,washingtonpost.com,9,No
9,are immigrants more likely to be on welfare,https://www.brookings.edu/research/welfare-ben...,brookings.edu,10,No


### Filtering only the ones that appear in Master List

In [35]:
# Rows whose page domain was flagged as being in the master list
consolidated.loc[consolidated['In Master List'] == "Yes"]

Unnamed: 0,Query,Results,Page Domain,Rank,In Master List
0,are immigrants more likely to be on welfare,https://cis.org/Report/Welfare-Use-Immigrant-a...,cis.org,1,Yes
1,are immigrants more likely to be on welfare,https://cis.org/Report/Cost-Welfare-Use-Immigr...,cis.org,2,Yes
4,are immigrants more likely to be on welfare,https://fairus.org/issue/workforce-and-economy...,fairus.org,5,Yes
4,are immigrants allowed to vote,https://fairus.org/issue/societal-impact/nonci...,fairus.org,5,Yes
0,cost of illegal immigration,https://fairus.org/issue/publications-resource...,fairus.org,1,Yes
6,cost of illegal immigration,https://cis.org/Report/Deportation-vs-Cost-Let...,cis.org,7,Yes
0,map of sanctuary cities,https://cis.org/Map-Sanctuary-Cities-Counties-...,cis.org,1,Yes
1,map of sanctuary cities,https://cis.org/sites/cis.org/files/Sanctuary-...,cis.org,2,Yes
0,percentage of immigrants on welfare,https://cis.org/Report/Welfare-Use-Immigrant-a...,cis.org,1,Yes
8,percentage of immigrants on welfare,https://fairus.org/issue/workforce-and-economy...,fairus.org,9,Yes


In [36]:
# Export the full consolidated results table (all queries, hits and non-hits) to a .csv file
consolidated.to_csv('starter_list.csv')

In [37]:
# Number of master-list hits per query (rows are filtered to hits before grouping,
# so queries without any hit do not appear)
consolidated[consolidated['In Master List'] == 'Yes'].groupby('Query')['Results'].count()

Query
White and slavery in US                        2
are immigrants allowed to vote                 1
are immigrants more likely to be on welfare    3
can illegal aliens get welfare                 2
cost of illegal immigration                    2
history of us immigration policy               5
how do illegal immigrants get welfare          2
map of sanctuary cities                        2
number of deportations by year                 1
percentage of immigrants on welfare            2
us citizens losing jobs to immigrants          1
Name: Results, dtype: int64

## Appendix

### Explaining the function step-by-step

#### Getting links from Google Search
Receive the Query (in string format) and return the results from n pages

In [14]:
# Fetch `num_page` pages of Google results for one example query, then
# collect and print the link of every result in the order returned.
query = "are immigrants allowed to vote"
num_page = 2
search_results = google.search(query, num_page)
links = [term.link for term in search_results]
for link in links:
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(link)

https://en.wikipedia.org/wiki/Right_of_foreigners_to_vote_in_the_United_States
http://www.newsweek.com/immigrants-are-getting-right-vote-cities-across-america-664467
http://www.politifact.com/florida/statements/2017/sep/15/adam-putnam/san-francisco-takoma-park-noncitizens-elections/
https://www.thoughtco.com/can-i-vote-1951751
https://fairus.org/issue/societal-impact/noncitizens-voting-violations-and-us-elections
http://www.foxnews.com/politics/2017/09/16/maryland-city-rescinds-vote-that-allowed-illegal-immigrants-to-ballot-in-local-elections.html
http://www.foxnews.com/us/2017/09/13/illegal-immigrants-get-ok-to-vote-in-maryland-citys-elections.html
http://time.com/4859478/immigrant-voters-history/
https://www.washingtontimes.com/news/2016/apr/4/supreme-court-illegals-count-apportionment/
http://www.nationalreview.com/corner/450417/maryland-illegal-immigration-and-voting
http://joshblackman.com/blog/2013/05/09/does-the-u-s-constitution-allow-non-citizens-to-vote/
http://immigration.fin

#### Getting the registered domain (domain + suffix, e.g. `wikipedia.org`) from those links

In [26]:
# Reduce each result link to its "domain.suffix" form
root = [
    "{}.{}".format(parts.domain, parts.suffix)
    for parts in (tldextract.extract(term.link) for term in search_results)
]
root

['wikipedia.org',
 'newsweek.com',
 'politifact.com',
 'thoughtco.com',
 'fairus.org',
 'foxnews.com',
 'foxnews.com',
 'time.com',
 'washingtontimes.com',
 'nationalreview.com',
 'joshblackman.com',
 'findlaw.com',
 'thinkprogress.org',
 'latimes.com',
 'snopes.com',
 'baltimoresun.com',
 'thedailybeast.com',
 'fox5dc.com',
 'telegraph.co.uk',
 'npr.org']

#### Check which domains are in the Master List and print them

In [27]:
# Flag each result domain 'Yes'/'No' depending on whether it is in the master list.
# Membership is an exact match against the list, as in search(). Testing against
# '.'.join(master_list) would be a substring test, so e.g. 'time.com' would match
# any master-list entry that merely ends in 'time.com'.
# `root` already holds "domain.suffix" strings, so no second extraction is needed.
ignored_domains = ['google.com', 'facebook.com', 'wordpress.com', 'blogspot.com']  # same exclusions as search()
res = []
in_list = []
for site in root:
    if site not in ignored_domains and site in master_list:
        res.append(site)
        in_list.append('Yes')
    else:
        in_list.append('No')
Counter(res)

Counter({'fairus.org': 1})

#### Show the results in a Pandas DataFrame

In [24]:
# Assemble one row per search result: query, link, domain, 1-based rank and
# master-list flag. (The former `domain = root` assignment was never read and
# overwrote the `domain` list built in the master-list cell, so it is removed.)
search_term = [query] * len(search_results)
rank = list(range(1, len(search_results) + 1))

# `columns=` fixes the column order explicitly.
column_order = ['Query', 'Results', 'Page Domain', 'Rank', 'In Master List']
results = pd.DataFrame(
    {'Query': search_term, 'Results': links, 'Page Domain': root,
     'Rank': rank, 'In Master List': in_list},
    columns=column_order)
results

Unnamed: 0,Query,Results,Page Domain,Rank,In Master List
0,are immigrants allowed to vote,https://en.wikipedia.org/wiki/Right_of_foreign...,wikipedia.org,1,No
1,are immigrants allowed to vote,http://www.newsweek.com/immigrants-are-getting...,newsweek.com,2,No
2,are immigrants allowed to vote,http://www.politifact.com/florida/statements/2...,politifact.com,3,No
3,are immigrants allowed to vote,https://www.thoughtco.com/can-i-vote-1951751,thoughtco.com,4,No
4,are immigrants allowed to vote,https://fairus.org/issue/societal-impact/nonci...,fairus.org,5,Yes
5,are immigrants allowed to vote,http://www.foxnews.com/politics/2017/09/16/mar...,foxnews.com,6,No
6,are immigrants allowed to vote,http://www.foxnews.com/us/2017/09/13/illegal-i...,foxnews.com,7,No
7,are immigrants allowed to vote,http://time.com/4859478/immigrant-voters-history/,time.com,8,No
8,are immigrants allowed to vote,https://www.washingtontimes.com/news/2016/apr/...,washingtontimes.com,9,No
9,are immigrants allowed to vote,http://www.nationalreview.com/corner/450417/ma...,nationalreview.com,10,No
