### Install Required Packages

In [1]:
! pip install requests
! pip install beautifulsoup4

You should consider upgrading via the 'pip install --upgrade pip' command.
You should consider upgrading via the 'pip install --upgrade pip' command.


### Collect URLs for Search Terms

In [3]:
from bs4 import BeautifulSoup
import requests
import re

# Search phrases submitted to the news search, one request per phrase.
list_of_search_terms = ["police misconduct", "officer arrested", "officer charged", "officer fired",
                        "officer terminated", "officer disciplinary", "police lawsuit", "police settlement",
                        "sheriff lawsuit", "sheriff arrested", "sheriff charged", "deputy fired", "deputy arrested",
                        "police sued", "sheriff sued", "racial bias", "officer racism", "sheriff racism",
                        "illegal search", "excessive force", "false arrest", "wrong door raid", "police civil case",
                        "sheriff civil case", "police brutality"]

# Search URL template; {0} is replaced by the search phrase with spaces turned into "+".
url = "https://www.bing.com/news/search?q={0}"

# href prefixes to skip: script pseudo-links and site-relative paths
# (presumably the search site's own navigation -- confirm against a live results page).
bad_urls = ["javascript:", "/news/search?q=", "/rewards", "/images", "/videos", "/maps", "/shop", "/profile",
            "/search", "/?FORM"]
# str.startswith accepts a tuple of prefixes; build it once instead of once per link.
bad_url_prefixes = tuple(bad_urls)
# Substrings that disqualify a link wherever they appear in the href.
bad_url_fragments = ("go.microsoft.com", "coronaviruslinks")

# Seconds to wait for the search page before giving up, so one stalled request
# cannot hang the whole collection run.
SEARCH_TIMEOUT_SECONDS = 10

# Maps each search phrase to the set of hrefs kept from its results page.
compiled_urls = {}

for search_word in list_of_search_terms:
    compiled_urls[search_word] = set()
    formatted_search_word = search_word.replace(" ", "+")

    try:
        response = requests.get(url.format(formatted_search_word), timeout=SEARCH_TIMEOUT_SECONDS)
        # An error page contains no search results; treat it like a failed request.
        response.raise_for_status()
    except requests.exceptions.RequestException as error:
        # Leave this phrase with an empty set and keep going with the remaining phrases.
        print("Search failed for '{0}': {1}".format(search_word, error))
        continue

    soup = BeautifulSoup(response.text, 'html.parser')
    url_search_results = soup.find_all('a', href=True)

    for url_s in url_search_results:
        href = url_s['href']
        if (href != "#"
                and not href.startswith(bad_url_prefixes)
                and not any(fragment in href for fragment in bad_url_fragments)):
            compiled_urls[search_word].add(href)

In [6]:
# Example: display the set of URLs collected for the "police misconduct" search term.
compiled_urls["police misconduct"]

{'http://www.dailyjournal.net/2020/06/26/us-racial-injustice-police-misconduct/',
 'https://patch.com/massachusetts/worcester/worcester-pd-use-force-incident-one-36-under-investigation',
 'https://www.latimes.com/world-nation/story/2020-06-26/us-police-registry-would-fail-without-changes-in-states',
 'https://www.masslive.com/news/2020/07/springfield-police-officer-who-forced-man-out-of-station-by-the-throat-pleads-guilty-to-assault-incident-prompts-federal-lawsuit.html',
 'https://www.mercurynews.com/2020/07/21/bay-area-news-group-sues-san-jose-for-failure-to-release-police-discipline-use-of-force-records/',
 'https://www.miamiherald.com/news/local/community/miami-dade/article244081767.html',
 'https://www.newsday.com/long-island/nassau/police-misconduct-hotline-nassau-legislators-1.46504030',
 'https://www.nytimes.com/aponline/2020/07/23/us/ap-us-police-accountability-new-york.html',
 'https://www.nytimes.com/reuters/2020/07/24/business/24reuters-global-race-police-insurance-focus.ht

### Gather Article Information

In [17]:
import csv

# Seconds to wait for an article page. A finite value is required for
# requests to ever raise a timeout; `timeout=None` waits indefinitely.
ARTICLE_TIMEOUT_SECONDS = 15

# One row per collected URL: [search term, url, headline, article text].
rows = []
for search_word in compiled_urls:
    urls = compiled_urls[search_word]
    # The loop variable is deliberately not called `url`: that name holds the
    # search URL template in the collection cell, and overwriting it would
    # break re-running that cell.
    for article_url in urls:
        # Retrieve page text
        try:
            page = requests.get(article_url, timeout=ARTICLE_TIMEOUT_SECONDS).text
        except requests.exceptions.RequestException:
            # Timeout, connection failure, invalid URL, etc.: record the URL with
            # empty headline/article and move on to the next URL (not `break`,
            # which would drop every remaining URL for this search term).
            rows.append([search_word, article_url, "", ""])
            continue

        # Turn page into BeautifulSoup object to access HTML tags. The parser is
        # named explicitly so results do not depend on which parsers are installed.
        soup = BeautifulSoup(page, 'html.parser')

        print(article_url)

        # Get headline: text of the first <h1>, or "" when the page has none.
        h1_tag = soup.find('h1')
        headline = h1_tag.get_text() if h1_tag is not None else ""

        # Get the text of every <p> tag, stripped of surrounding whitespace.
        # find_all returns an empty result (never None) when nothing matches.
        p_tags_text = [tag.get_text().strip() for tag in soup.find_all('p')]

        # Keep only paragraphs that contain a period and no newline characters.
        sentence_list = [sentence for sentence in p_tags_text
                         if '\n' not in sentence and '.' in sentence]
        # Combine list items into string.
        article = ' '.join(sentence_list)

        rows.append([search_word, article_url, headline, article])

https://www.nytimes.com/reuters/2020/07/24/business/24reuters-global-race-police-insurance-focus.html
https://lasvegassun.com/news/2020/jul/24/henderson-police-officer-arrested-after-dui-crash/
https://www.usatoday.com/story/news/nation/2020/07/23/derek-chauvin-ex-minneapolis-cop-george-floyd-death-tax-charges/5492181002/
https://www.foxnews.com/sports/rays-tweet-arrest-fatal-shooting-breonna-taylor-opening-day
https://abcnews.go.com/US/derek-chauvin-officer-accused-killing-george-floyd-charged/story?id=71941032
https://www.msn.com/en-us/news/crime/dallas-police-officer-arrested-on-charge-of-transportation-of-child-pornography/ar-BB179sHx
https://www.foxnews.com/us/philly-da-warns-feds-arrested-storm-protests
https://www.msn.com/en-us/news/crime/dallas-officer-arrested-on-charge-of-transportation-of-child-pornography-police-say/ar-BB1791WJ
https://www.stltoday.com/news/watch-now-two-velda-city-police-officers-charged-with-first-degree-assault/video_25965f40-fdcf-5294-8c74-c2251ce17815.

https://www.chicagotribune.com/opinion/commentary/ct-opinion-police-misconduct-settlements-raja-krishnamoorthi-20200720-f63im4kx2rcpjfqt6y4wtnlkqy-story.html
https://www.usatoday.com/in-depth/news/nation/2020/07/24/rubber-bullets-less-lethal-weapons-victims-police-protesters-decades/5410519002/
https://www.startribune.com/las-vegas-police-reach-2-2m-settlement-in-chokehold-death/571810202/
https://www.msn.com/en-us/news/us/seattle-police-chief-new-limits-on-anti-protest-gear-will-mean-adjusted-law-enforcement/ar-BB17aBe9
https://www.msn.com/en-us/news/crime/federal-judge-blocks-seattle-council-s-law-banning-police-anti-riot-gear/ar-BB17ancx
https://www.foxnews.com/politics/seattle-city-councils-limits-on-police-anti-protest-gear-can-proceed-judge-says-report
https://www.washingtonpost.com/local/legal-issues/family-of-man-fatally-shot-by-police-reaches-35-million-settlement/2020/07/21/da918e9a-cb6c-11ea-bc6a-6841b28d9093_story.html
https://www.nytimes.com/aponline/2020/07/22/us/ap-us-po

https://www.news4jax.com/news/local/2020/07/22/lawsuit-against-rnc-adds-sheriff-williams-concerns-to-legal-arguments/
https://www.nbcnews.com/tech/tech-news/facebook-management-ignored-internal-research-showing-racial-bias-current-former-n1234746
https://www.washingtontimes.com/news/2020/jul/22/police-bias-training-push-has-startups-racing-cash/
https://www.msn.com/en-us/news/us/past-time-obama-urges-biden-to-address-racial-bias-in-policing-in-video-contrast-with-trump/ar-BB176ufo
https://www.consumerreports.org/car-insurance/car-insurance-rates-to-be-studied-for-racial-bias/
https://news.yahoo.com/trump-repeals-fair-housing-rule-200513035.html
https://thehill.com/policy/technology/508780-facebook-researchers-say-their-findings-on-racial-bias-ignored-by-superiors
https://www.msn.com/en-us/news/technology/facebook-to-look-more-closely-at-potential-racial-bias-on-its-platforms/ar-BB171vYU
https://www.wsj.com/articles/insurance-group-to-scrutinize-rate-guidelines-for-racial-bias-115954948

https://www.msn.com/en-us/sports/nba/lebron-james-uses-media-interview-after-first-scrimmage-to-shed-light-on-justice-for-breonna-taylor/ar-BB177Z9h
https://kdvr.com/news/national/a-key-miscalculation-by-officers-contributed-to-the-tragic-death-of-breonna-taylor/
https://www.postandcourier.com/opinion/editorials/editorial-limit-which-sc-judges-can-issue-no-knock-warrants-to-authorize-police-raids/article_b0e34ca6-c5e9-11ea-9640-ff165d08a511.html
https://www.thestar.co.uk/news/crime/women-reveal-horror-having-gun-pointed-them-terrifying-house-raid-sheffield-2911976
https://www.journalnow.com/news/national/officers-in-deadly-breonna-taylor-raid-thought-she-was-alone/article_da080a5c-59ee-56a0-bd00-69a15734e939.html
https://www.stuff.co.nz/nelson-mail/122132836/burglar-raids-pharmacy-twice-in-one-morning-after-stealing-wrong-medication
https://www.wdrb.com/in-depth/lawsuit-lmpd-swat-team-raids-vacant-home-handcuffs-wrong-couple-days-after-suspect-already-arrested/article_ca6bf77a-c6a9-11e

### Output to CSV File for Ingestion Team

In [18]:
# Column headers, in the same order as the values in each entry of `rows`.
fields = ["Search Term", "Source", "Title", "Article Content"]
filename = "ws1_web_crawler_results.csv"

# newline='' is what the csv module expects for files it writes: it prevents
# extra blank rows on Windows and keeps newlines embedded in quoted fields intact.
# An explicit UTF-8 encoding keeps non-ASCII article text from failing or being
# mis-encoded under the platform's default encoding.
with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(fields)
    csvwriter.writerows(rows)