In [1]:
import pandas as pd
import numpy as np
from requests import get
from bs4 import BeautifulSoup
import os
from acquire import get_blog_articles, get_news_articles

In [2]:
# On a top-to-bottom run this calls the get_blog_articles imported from the local
# `acquire` module; the notebook only redefines the function in a later cell.
posts = get_blog_articles()


In [3]:
# Uses the `acquire` module's get_news_articles: `get_fresh_news` is that version's
# keyword (the definition later in this notebook names its flag `update_cache`).
# Presumably get_fresh_news=True forces a new scrape instead of a cached read -- TODO confirm in acquire.py.
news = get_news_articles(desired_categories=['Business','Sports','Technology','Entertainment'], get_fresh_news=True)


Scraping category:  Business
Total News Articles in Category:  25
Scraping category:  Sports
Total News Articles in Category:  25
Scraping category:  Technology
Total News Articles in Category:  25
Scraping category:  Entertainment
Total News Articles in Category:  25


In [4]:
# Preview only the first three records rather than dumping the whole result.
news[:3]


[{'title': 'Rupee hits record low of 79.97 against US dollar',
  'author': 'Ridham Gambhir',
  'datetime': '2022-07-18T10:00:15.000Z',
  'category': 'business',
  'original': "The rupee hit a record low of 79.97 against the US dollar on Monday after opening at 79.76. The Finance Ministry, while speaking about the matter said that global factors such as the Russia-Ukraine war, soaring crude oil prices and tightening of global financial conditions are the major reasons for the rupee's weakening."},
 {'title': 'Rupee closes at an all-time low of 79.98 against US dollar',
  'author': 'Ridham Gambhir',
  'datetime': '2022-07-18T11:00:17.000Z',
  'category': 'business',
  'original': 'The rupee on Monday hit a fresh record low as it ended closer to the 80-mark to close at 79.98 against the US dollar. This was the seventh consecutive session when the rupee weakened. So far this year, the currency has weakened 7.05% against the US dollar. Meanwhile, BSE Sensex closed 760 points higher at 54,52

## 1.) Codeup Blog Articles

Visit Codeup's Blog and record the urls for at least 5 distinct blog posts. For each post, you should scrape at least the post's title and content.

Encapsulate your work in a function named get_blog_articles that will return a list of dictionaries, with each dictionary representing one article. The shape of each dictionary should look like this:


{
    'title': 'the title of the article',
    'content': 'the full text content of the article'
}
Plus any additional properties you think might be helpful.

Bonus: Scrape the text of all the articles linked on codeup's blog page.


In [5]:
# Exploratory fetch of the blog index page; get_blog_articles below repeats these
# same steps inside a function.
url = 'https://codeup.com/blog/'
headers = {'User-Agent': 'Codeup Data Science'} # Some websites don't accept the python-requests default user-agent
response = get(url, headers=headers)


# Parse the raw response bytes with the built-in 'html.parser' backend
soup = BeautifulSoup(response.content, 'html.parser')

In [6]:
def get_post_details(post):
    """Build a dictionary describing one post card from the Codeup blog index page.

    Parameters
    ----------
    post : parsed element for a single post card (the caller selects these with
        ``article.et_pb_post``). Only ``select``, ``select_one`` and ``text`` are used.

    Returns
    -------
    dict with the keys ``url``, ``title``, ``date_published`` and ``content``.

    Raises
    ------
    IndexError
        If the card contains no ``<a>`` element or no ``span.published`` element.
    KeyError
        If the first ``<a>`` element has no ``href`` attribute.
    """
    output = {}
    # Extract URL from the first link inside the card
    output['url'] = post.select('a')[0].attrs['href']
    # Extract title. The text of the whole card also contains the published date
    # (read separately below), so prefer the heading element on its own.
    # Assumes the heading carries the WordPress `entry-title` class -- TODO confirm
    # against the live markup. If it is absent we fall back to the previous
    # behaviour (the stripped text of the whole card).
    heading = post.select_one('.entry-title')
    title_source = heading if heading is not None else post
    output['title'] = title_source.text.strip()
    # Extract date published
    output['date_published'] = post.select('span.published')[0].text
    # Follow the post URL and pull the body text of the article
    output['content'] = get_blog_content(output['url'])

    return output

def get_blog_content(url):
    """Download one blog post and return the text of its body.

    The stripped text of every ``div.entry-content`` element on the page is
    concatenated, in document order, with no separator. An empty string is
    returned when the page has no such element.
    """
    # A custom user-agent is sent because some sites reject the python-requests default.
    page = get(url, headers={'User-Agent': 'Codeup Data Science'})
    parsed_page = BeautifulSoup(page.content, 'html.parser')

    return "".join(
        section.text.strip() for section in parsed_page.select('div.entry-content')
    )

def get_blog_articles(return_dataframe = False):
    """Scrape every post card listed on the codeup.com/blog index page.

    return_dataframe: when True the per-post dictionaries are wrapped in a pandas
    DataFrame; otherwise (the default) they are returned as a plain list.

    Each dictionary is whatever get_post_details produces for one
    ``article.et_pb_post`` element.
    """
    # A custom user-agent is sent because some sites reject the python-requests default.
    index_page = get('https://codeup.com/blog/', headers={'User-Agent': 'Codeup Data Science'})
    index_soup = BeautifulSoup(index_page.content, 'html.parser')

    # One record per post card found on the index page
    post_records = [get_post_details(card) for card in index_soup.select('article.et_pb_post')]

    return pd.DataFrame(post_records) if return_dataframe else post_records


In [7]:
# Scrape the blog with the notebook's own get_blog_articles (defined in the cell
# above) and ask for a DataFrame so the result can be previewed as a table.
articles = get_blog_articles(return_dataframe=True)
# Show only the first rows instead of the full frame
articles.head()

Unnamed: 0,url,title,date_published,content
0,https://codeup.com/featured/what-jobs-can-you-...,What Jobs Can You Get After a Coding Bootcamp?...,"Jul 14, 2022",Have you been considering a career in Cloud Ad...
1,https://codeup.com/data-science/jobs-after-a-c...,What Jobs Can You Get After a Coding Bootcamp?...,"Jul 7, 2022",If you are interested in embarking on a career...
2,https://codeup.com/workshops/san-antonio/in-pe...,In-Person Workshop: Learn to Code – JavaScript...,"Jul 6, 2022",Join us for our live in-person JavaScript cras...
3,https://codeup.com/workshops/in-person-worksho...,In-Person Workshop: Learn to Code – Python on ...,"Jun 20, 2022","According to LinkedIn, the “#1 Most Promising ..."
4,https://codeup.com/workshops/dallas/free-javas...,Free JavaScript Workshop at Codeup Dallas on 6...,"Jun 19, 2022",Event Info: \nLocation – Codeup Dallas\nTime –...


In [8]:
def get_category_news_cards(category):
    """Fetch one inshorts category page and return its news-card elements.

    category: category name in any letter case; it is lower-cased before being
    appended to the URL.

    Returns whatever the parsed page yields for the selector
    ``div.news-card.z-depth-1`` (one element per news card).
    """
    # The lower-casing matters: a capitalized category name leads to a different website!
    category_url = '{}/{}'.format(r'https://inshorts.com/en/read', category.lower())

    # A custom user-agent is sent because some sites reject the python-requests default.
    page = get(category_url, headers={'User-Agent': 'Codeup Data Science'})

    return BeautifulSoup(page.content, 'html.parser').select('div.news-card.z-depth-1')

def get_news_details(news_card, category):
    """Turn one news card into a dictionary of article information.

    news_card: the parsed element for a single card from a category page
    category: the category the card was scraped from; stored lower-cased

    Returns a dict with the keys ``headline``, ``author``, ``datetime``,
    ``category`` and ``content`` (in that order).

    NOTE(review): the assignment text in this notebook asks for a ``title`` key,
    while this function stores the headline under ``headline`` -- confirm which
    one downstream code expects.
    """
    headline = news_card.select('div.news-card-title')[0].find("span").text

    # Author name and timestamp live in the same author/time block
    byline = news_card.select('div.news-card-author-time')[0]
    author = byline.find('span', class_='author').text
    # The timestamp is read from the span's `content` attribute, not its text
    published_at = byline.find('span', class_='time').attrs['content']

    category_key = category.lower()
    body_text = news_card.select('div.news-card-content')[0].find('div').text

    return {
        'headline': headline,
        'author': author,
        'datetime': published_at,
        'category': category_key,
        'content': body_text,
    }
    
def get_each_news_in_category(category):
    """Scrape every article in one category.

    Fetches the category's news cards, prints how many were found, and returns
    a list with one get_news_details dictionary per card.
    """
    cards = get_category_news_cards(category)
    print("Total News Articles in Category: ", len(cards))

    article_details = []
    for card in cards:
        article_details.append(get_news_details(card, category))
    return article_details
    
def get_news_categories(soup):
    """List the category names offered on the inshorts homepage.

    soup: the parsed homepage

    Returns the lower-cased text of every ``li.active-category`` item inside the
    first ``ul.category-list``, except the first item.
    """
    category_list = soup.select('ul.category-list')[0]

    names = []
    # The first entry is skipped -- presumably it is an "all news" aggregate
    # rather than a real category; TODO confirm against the live page.
    for item in category_list.select('li.active-category')[1:]:
        names.append(item.text.lower())
    return names

def get_news_articles(desired_categories = 'all', update_cache = False):
    """ Returns news article information from https://inshorts.com/ .

    desired_categories: 'all' by default, or a list of category names (any letter
        case). Names that the site does not list are reported and skipped.
    update_cache: if True, always scrapes fresh news and rewrites the cache file.
        If False (the default), the cache file is returned when it exists and a
        fresh scrape only happens when it does not.

    Returns
    -------
    When news is scraped: a list of dictionaries, one per article.
    When the cache is used: the pandas DataFrame read from 'news.csv'.
    NOTE(review): the two paths return different types, and the cached path does
    not look at `desired_categories` at all -- callers should be aware of both.

    Side effects: after a scrape, the results are written to 'news.csv' in the
    current working directory (overwriting any existing file).
    """

    # Filepath for cache
    news_cache_file = 'news.csv'

    # Must be `not`, not `~`: bitwise inversion of a bool gives -1 or -2, both of
    # which are truthy, so `~update_cache` made it impossible to bypass the cache.
    if not update_cache:
        if os.path.exists(news_cache_file):
            return pd.read_csv(news_cache_file)
        print("News cache does not exist, acquiring fresh news...")

    url = 'https://inshorts.com/en/read'
    headers = {'User-Agent': 'Codeup Data Science'} # Some websites don't accept the python-requests default user-agent
    response = get(url, headers=headers)

    # Make a soup variable holding the response content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Lower-cased category names the site currently offers
    categories = get_news_categories(soup)

    # Either every category from the site, or just the ones the caller asked for
    scrape_all = desired_categories == 'all'
    requested = categories if scrape_all else desired_categories

    # Initialize news list
    news = []

    for cat in requested:
        # A requested category that the site does not list is skipped
        if not scrape_all and cat.lower() not in categories:
            print(cat,"does not exist at site, skipping this category")
            continue
        print("Scraping category: ", cat)
        news += get_each_news_in_category(cat)

    # Write results to cache
    pd.DataFrame(news).to_csv(news_cache_file, index = None)

    return news

## 2.) News Articles

We will now be scraping text data from inshorts, a website that provides a brief overview of many different topics.

Write a function that scrapes the news articles for the following topics:

- Business
- Sports
- Technology
- Entertainment

The end product of this should be a function named get_news_articles that returns a list of dictionaries, where each dictionary has this shape:


{
    'title': 'The article title',
    'content': 'The article content',
    'category': 'business' # for example
}

Hints:

a. Start by inspecting the website in your browser. Figure out which elements will be useful.
b. Start by creating a function that handles a single article and produces a dictionary like the one above.
c. Next create a function that will find all the articles on a single page and call the function you created in the last step for every article on the page.
d. Now create a function that will use the previous two functions to scrape the articles from all the pages that you need, and do any additional processing that needs to be done.



In [9]:
# update_cache=False: if news.csv already exists it is returned as-is, and the
# category list passed here is not consulted on that path.
news = get_news_articles(desired_categories=['sports','Business','Technology','Entertainment'], update_cache=False)


In [10]:
# Display the result. The tabular output recorded below suggests this run was
# served from the CSV cache (a DataFrame) rather than a fresh scrape (a list).
news

Unnamed: 0,title,author,datetime,category,original
0,Rupee hits record low of 79.97 against US dollar,Ridham Gambhir,2022-07-18T10:00:15.000Z,business,The rupee hit a record low of 79.97 against th...
1,Rupee closes at an all-time low of 79.98 again...,Ridham Gambhir,2022-07-18T11:00:17.000Z,business,The rupee on Monday hit a fresh record low as ...
2,A fighter to the core: Mahindra praises PV Sin...,Ridham Gambhir,2022-07-17T08:17:31.000Z,business,Businessman Anand Mahindra took to Twitter to ...
3,"BCCI had ₹40 cr in bank when I joined & ₹47,68...",Ridham Gambhir,2022-07-17T06:35:36.000Z,business,"In an Instagram post, Lalit Modi asserted that..."
4,It is not a sacrifice at all: Bill Gates on pl...,Hiral Goyal,2022-07-17T06:38:37.000Z,business,"Microsoft Co-founder Bill Gates, who plans to ..."
...,...,...,...,...,...
95,"Singers want to be like Arijit today, they're ...",Udit Gupta,2022-07-18T10:06:33.000Z,entertainment,Folk artist Mame Khan has said that he doesn't...
96,"Rejected 'Judwaa 2', didn't want to do 'bachpa...",Udit Gupta,2022-07-18T10:13:41.000Z,entertainment,Actor Siddharth Nigam has revealed that he was...
97,'PS-1' director Mani Ratnam receives legal not...,Amartya Sharma,2022-07-18T10:25:04.000Z,entertainment,Filmmaker Mani Ratnam and actor Vikram have re...
98,Grace & style of Dhanush is something to behol...,Amartya Sharma,2022-07-18T10:27:56.000Z,entertainment,"Regé-Jean Page, speaking about Dhanush in 'The..."



## 3.) Bonus: cache the data

Write your code such that the acquired data is saved locally in some form or fashion. Your functions that retrieve the data should prefer to read the local data instead of having to make all the requests every time the function is called. Include a boolean flag in the functions to allow the data to be acquired "fresh" from the actual sources (re-writing your local cache).