Foundation Project - Indian School of Business.

This code will scrape news from different news articles based on search term . Here we are collecting news related to amazon.

### Install Required Packages

In [1]:
! pip install requests
! pip install beautifulsoup4



### Collect URLs for Search Terms

In [2]:
from bs4 import BeautifulSoup
import requests
import re

list_of_search_terms = ["amazon", 
                        "amazon workers dissatisfaction",
                        "amazon innovation",
                        "amazon customer satisfaction",
                        "amazon India",
                        "amazon covid19"
                        ]

url = "https://www.bing.com/news/search?q={0}"
compiled_urls = {}

for search_word in list_of_search_terms:
    compiled_urls[search_word] = set()
    formatted_search_word = search_word.replace(" ", "+")
    response = requests.get(url.format(formatted_search_word))
    soup = BeautifulSoup(response.text, 'html.parser')
    url_search_results = soup.find_all('a', href=True)
    
    bad_urls = ["javascript:", "/news/search?q=", "/rewards", "/images", "/videos", "/maps", "/shop", "/profile", 
               "/search", "/?FORM","/news?FORM=HDRSC6"]
    
    for url_s in url_search_results:
        if ((not url_s['href'].startswith(tuple(bad_urls))) and 
        ("go.microsoft.com" not in url_s['href']) and 
        (url_s['href'] != "#")): 
            compiled_urls[search_word].add(url_s['href'])

In [3]:
# Example set of collected URLS:
compiled_urls["amazon workers dissatisfaction"]

{'https://www.globalbankingandfinance.com/from-legacy-to-digital-how-financial-institutions-can-make-the-transition/',
 'https://www.piworld.com/article/tracking-the-latest-trends-impacting-the-mail/',
 'https://www.politico.eu/article/muddled-messaging-fuels-backlash-against-lockdown-in-france/',
 'https://www.politicsweb.co.za/iservice/brazils-rollercoaster-2014-election',
 'https://www.traveller.com.au/how-to-spot-fake-reviews-on-travel-websites-h1rsgv'}

### Gather Article Information

In [7]:
import csv

rows = []
for search_word in compiled_urls:
    urls = compiled_urls[search_word]
    for url in urls:
        print(url)
        # Retrieve page text
        try:
            page = requests.get(url, timeout=None).text
#         except requests.exceptions.Timeout:
#             # Timeout occurred
#             row = [search_word, url, "", ""]
#             rows.append(row)
#             break
            # Turn page into BeautifulSoup object to access HTML tags
            soup = BeautifulSoup(page)

            print(url)

            # Get headline
            if soup.find('h1') != None:
                headline = soup.find('h1').get_text()
            else:
                headline = ""

            # Get text from all <p> tags.
            p_tags = soup.find_all('p')
            # Get the text from each of the “p” tags and strip surrounding whitespace.
            if p_tags != None:
                p_tags_text = [tag.get_text().strip() for tag in p_tags]
            else:
                p_tags_text = ""

            # Filter out sentences that contain newline characters '\n' or don't contain periods.
            sentence_list = [sentence for sentence in p_tags_text if not '\n' in sentence]
            sentence_list = [sentence for sentence in sentence_list if '.' in sentence]
            # Combine list items into string.
            article = ' '.join(sentence_list)

            row = [search_word, url, headline, article]
            rows.append(row)
        except:
            print("not able to scrape the site")

https://www.moneycontrol.com/news/business/amazon-not-entitled-to-object-cannot-control-frl-board-harish-salve-tells-delhi-high-court-6136131.html
https://www.moneycontrol.com/news/business/amazon-not-entitled-to-object-cannot-control-frl-board-harish-salve-tells-delhi-high-court-6136131.html
https://www.gizbot.com/apps/news/amazon-quiz-answers-for-november-19-here-s-your-chance-to-win-rs-20-000-amazon-pay-balance-071070.html
https://www.gizbot.com/apps/news/amazon-quiz-answers-for-november-19-here-s-your-chance-to-win-rs-20-000-amazon-pay-balance-071070.html
https://www.businessinsider.in/business/ecommerce/news/amazon-layoffs-several-employees-in-prime-air-drone-project-report/articleshow/79305510.cms
https://www.businessinsider.in/business/ecommerce/news/amazon-layoffs-several-employees-in-prime-air-drone-project-report/articleshow/79305510.cms
https://indianexpress.com/article/technology/tech-news-technology/amazon-adds-hindi-support-for-alexa-on-fire-tv-all-you-need-to-know/
https

https://www.indiainfoline.com/article/general-blog/seven-megatrends-that-you-could-see-in-business-2021-120110600414_1.html
https://www.scoop.co.nz/stories/BU2011/S00347/vodafone-nz-to-roll-out-amazon-connect-in-contact-centres-to-further-improve-customer-experience.htm
https://www.scoop.co.nz/stories/BU2011/S00347/vodafone-nz-to-roll-out-amazon-connect-in-contact-centres-to-further-improve-customer-experience.htm
https://yourstory.com/2020/11/amazon-india-launches-step-accelerate-growth-sellers
https://yourstory.com/2020/11/amazon-india-launches-step-accelerate-growth-sellers
https://www.thenewsminute.com/article/amazon-india-launches-performance-based-benefits-program-7-lakh-sellers-137745
https://www.thenewsminute.com/article/amazon-india-launches-performance-based-benefits-program-7-lakh-sellers-137745
https://www.livemint.com/companies/news/amazon-india-launches-new-programme-to-assess-sellers-11605532660785.html
https://www.livemint.com/companies/news/amazon-india-launches-new-pr

### Output to CSV File for Ingestion Team

In [8]:
fields = ["Search Term", "Source", "Title", "Article Content"]
filename = "ws1_web_crawler_results.csv"

with open(filename, 'w',encoding="utf-8") as csvfile: 
    csvwriter = csv.writer(csvfile)  
    csvwriter.writerow(fields)   
    csvwriter.writerows(rows)
    
# with open(fname, "w", encoding="utf-8") as csvfile:
#     f.write(html)

In [None]:
pwd