In [1]:
import pandas as pd
import numpy as np

from requests import get
import re
from bs4 import BeautifulSoup
import os

### Codeup Blogs

**Goals:** Write a function to scrape urls from main Codeup blog web page and write a function that returns a dictionary of blog titles and text for each blog page. 

#### Grab Title from Page

Here I use the `.find()` method on my soup with the `<h1>` tag and its `itemprop` attribute equal to `headline`. As always, there is no one way to accomplish our task, so I'm demonstrating one way to scrape the headline, not THE way to scrape the headline. 

In [3]:
url = 'https://codeup.com/codeups-data-science-career-accelerator-is-here/'
headers = {'User-Agent': 'Codeup Data Science'} 
    
response = get(url, headers=headers)
response.ok

True

In [3]:
# Here's our long string; we'll use this to make our soup object

print(type(response.text))

<class 'str'>


In [5]:
# Use BeautifulSoup using our response string

soup = BeautifulSoup(response.text, 'html.parser')

# Now we have our BeautifulSoup object, we can use its built-in methods and properties

print(type(soup))

<class 'bs4.BeautifulSoup'>


In [92]:
soup.select('h1')[0].text

'Codeup’s Data Science Career Accelerator is Here!'

In [93]:
# This is what the h1 element contains. I want to access the itemprop headline

title = soup.find('h1').text
title

'Codeup’s Data Science Career Accelerator is Here!'

In [32]:
print(type(title))

<class 'str'>


#### Grab Text from Page

In [96]:
class_text = soup.find('div', class_="jupiterx-post-content").text
print(class_text)

The rumors are true! The time has arrived. Codeup has officially opened applications to our new Data Science career accelerator, with only 25 seats available! This immersive program is one of a kind in San Antonio, and will help you land a job in Glassdoor’s #1 Best Job in America.
Data Science is a method of providing actionable intelligence from data. The data revolution has hit San Antonio, resulting in an explosion in Data Scientist positions across companies like USAA, Accenture, Booz Allen Hamilton, and HEB. We’ve even seen UTSA invest $70 M for a Cybersecurity Center and School of Data Science. We built a program to specifically meet the growing demands of this industry.
Our program will be 18 weeks long, full-time, hands-on, and project-based. Our curriculum development and instruction is led by Senior Data Scientist, Maggie Giust, who has worked at HEB, Capital Group, and Rackspace, along with input from dozens of practitioners and hiring partners. Students will work with real

In [34]:
print(type(content))

<class 'str'>


#### Build Blog Function

In [35]:
urls = ['https://codeup.com/codeups-data-science-career-accelerator-is-here/',
        'https://codeup.com/data-science-myths/',
        'https://codeup.com/data-science-vs-data-analytics-whats-the-difference/',
        'https://codeup.com/10-tips-to-crush-it-at-the-sa-tech-job-fair/',
        'https://codeup.com/competitor-bootcamps-are-closing-is-the-model-in-danger/']

def get_blog_articles(urls, cached=True):
    '''
    This function takes in a list of Codeup Blog urls and a parameter
    with default cache == False which returns a df from a csv file.
    If cache == True, the function scrapes the title and text for each url, 
    creates a list of dictionaries with the title and text for each blog, 
    converts list to df, and returns df.
    '''
    if cached == True:
        df = pd.read_json('big_blogs.json', index_col=0)
    else:
        headers = {'User-Agent': 'Codeup Bayes Data Science'} 

        # Create an empty list to hold dictionaries
        articles = []

        # Loop through each url in our list of urls
        for url in urls:

            # get request to each url saved in response
            response = get(url, headers=headers)

            # Create soup object from response text and parse
            soup = BeautifulSoup(response.text, 'html.parser')

            # Save the title of each blog in variable title
            title = soup.find('h1', itemprop='headline').text

            # Save the text in each blog to variable text
            content = soup.find('div', itemprop='text').text

            # Create a dictionary holding the title and content for each blog
            article = {'title': title, 'content': content}

            # Add each dictionary to the articles list of dictionaries
            articles.append(article)
            
        # convert our list of dictionaries to a df
        df = pd.DataFrame(articles)

        # Write df to a json file for faster access
        df.to_json('big_blogs.json')
    
    return df

#### Test Function

In [37]:
# Here cache == True, so the function will do a fresh scrape of the urls

blogs = get_blog_articles(urls=urls, cached=False)
blogs

Unnamed: 0,title,content
0,Codeup’s Data Science Career Accelerator is Here!,The rumors are true! The time has arrived. Cod...
1,Data Science Myths,By Dimitri Antoniou and Maggie GiustData Scien...
2,Data Science VS Data Analytics: What’s The Dif...,"By Dimitri AntoniouA week ago, Codeup launched..."
3,10 Tips to Crush It at the SA Tech Job Fair,10 Tips to Crush It at the SA Tech Job FairSA ...
4,Competitor Bootcamps Are Closing. Is the Model...,Competitor Bootcamps Are Closing. Is the Model...


#### Bonus URL Scrape

In [101]:
# I'm going to hit Codeup's main blog page to scrape the urls

url = 'https://codeup.com/resources/#blog'
headers = {'User-Agent': 'Codeup Data Science'} 

# Request the HTML
response = get(url, headers=headers)

# Create the soup object to parse the HTML
soup = BeautifulSoup(response.text, 'html.parser')

In [102]:
# I'm using the `a` element with class_ to get a list of tag elements from my soup object

link_list = soup.find_all('a', class_='jet-listing-dynamic-link__link')
link_list[:2]

[<a class="jet-listing-dynamic-link__link" href="https://codeup.com/introducing-salary-refund-guarantee/"><span class="jet-listing-dynamic-link__label">Introducing Our Salary Refund Guarantee</span></a>,
 <a class="jet-listing-dynamic-link__link" href="https://codeup.com/introducing-salary-refund-guarantee/"><span class="jet-listing-dynamic-link__label">Read More</span></a>]

In [103]:
# Using find_all has returned a bs ResultSet with 99 bs tags inside

print(f'Our variable link_list is a {type(link_list)}.')
print(f'Our element ResultSet is made up of {type(link_list[0])}.')
print(f'Our ResultSet contains {len(link_list)} element tags.')

Our variable link_list is a <class 'bs4.element.ResultSet'>.
Our element ResultSet is made up of <class 'bs4.element.Tag'>.
Our ResultSet contains 40 element tags.


In [104]:
# Create empty urls list and for each tag above, grab the href/link
# Add each link to the urls list

urls = []
for link in link_list:
    urls.append(link.get('href'))

In [105]:
# Wow, 99 links! Ready to scrape titles and text from each

print(len(urls))
urls[:10]

40


['https://codeup.com/introducing-salary-refund-guarantee/',
 'https://codeup.com/introducing-salary-refund-guarantee/',
 'https://codeup.com/new-scholarship/',
 'https://codeup.com/new-scholarship/',
 'https://codeup.com/codeup-in-houston/',
 'https://codeup.com/codeup-in-houston/',
 'https://codeup.com/math-in-data-science/',
 'https://codeup.com/math-in-data-science/',
 'https://codeup.com/covid-19-data-challenge/',
 'https://codeup.com/covid-19-data-challenge/']

#### Bonus URL Function

In [17]:
def get_all_urls():
    '''
    This function scrapes all of the Codeup blog urls from
    the main Codeup blog page and returns a list of urls.
    '''
    # The main Codeup blog page with all the urls
    url = 'https://codeup.com/resources/#blog'
    
    headers = {'User-Agent': 'Codeup Data Science'} 
    
    # Send request to main page and get response
    response = get(url, headers=headers)
    
    # Create soup object using response
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Create empty list to hold the urls for all blogs
    urls = []
    
    # Create a list of the element tags that hold the href/links
    link_list = soup.find_all('a', class_='jet-listing-dynamic-link__link')
    
    # get the href/link from each element tag in my list
    for link in link_list:
        
        # Add the link to my urls list
        urls.append(link.get('href'))
        
    return urls

In [18]:
# Now I can use my same function with my new urls list function!
# cache == True does a fresh scrape.

big_blogs = get_blog_articles(urls=get_all_urls(), cache=True)

In [19]:
big_blogs.head()

Unnamed: 0,title,content
0,From Bootcamp to Bootcamp: Two Military Vetera...,Are you a veteran or active-duty military memb...
1,How to Get Started On Any Programming Exercise,Programming is hard. Whether you’re just begin...
2,The Best Path to a Career in Data Science,"In our blog, “The Best Path To A Career In Sof..."
3,Getting Hired in a Remote Environment,As a career accelerator with a tuition refund ...
4,The Remote Codeup Student Experience,Communities across Texas have now lived in a r...


In [20]:
big_blogs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99 entries, 0 to 98
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    99 non-null     object
 1   content  99 non-null     object
dtypes: object(2)
memory usage: 1.7+ KB


In [21]:
# cache == False reads in a df from the `big_blogs.csv`.

big_blogs = get_blog_articles(urls=get_all_urls(), cache=False)
big_blogs.head()

Unnamed: 0,title,content
0,From Bootcamp to Bootcamp: Two Military Vetera...,Are you a veteran or active-duty military memb...
1,How to Get Started On Any Programming Exercise,Programming is hard. Whether you’re just begin...
2,The Best Path to a Career in Data Science,"In our blog, “The Best Path To A Career In Sof..."
3,Getting Hired in a Remote Environment,As a career accelerator with a tuition refund ...
4,The Remote Codeup Student Experience,Communities across Texas have now lived in a r...


### Inshorts News Articles

**Goal:**  Write a function that scrapes the news articles for the following topics:

- Business


- Sports


- Technology


- Entertainment

In [3]:
url = 'https://inshorts.com/en/read/entertainment'

response = get(url)
response.ok

True

In [4]:
soup = BeautifulSoup(response.text, 'html.parser')

#### Scrape News Cards from Main Page

In [24]:
# Scrape a ResultSet of all the news cards on the page and look at first card

cards = soup.find_all('div', class_='news-card')
print(type(cards))
cards[0]

<class 'bs4.element.ResultSet'>


<div class="news-card z-depth-1" itemscope="" itemtype="http://schema.org/NewsArticle">
<span content="" itemid="https://inshorts.com/en/news/prithviraj-shares-pic-of-transformation-after-having-dangerously-low-fat-percentage-1590492948306" itemprop="mainEntityOfPage" itemscope="" itemtype="https://schema.org/WebPage"></span>
<span itemprop="author" itemscope="itemscope" itemtype="https://schema.org/Person">
<span content="Daisy Mowke" itemprop="name"></span>
</span>
<span content="Prithviraj shares pic of transformation after having 'dangerously low fat percentage'" itemprop="description"></span>
<span itemprop="image" itemscope="" itemtype="https://schema.org/ImageObject">
<meta content="https://static.inshorts.com/inshorts/images/v1/variants/jpg/m/2020/05_may/26_tue/img_1590491282323_173.jpg?" itemprop="url"/>
<meta content="864" itemprop="width"/>
<meta content="483" itemprop="height"/>
</span>
<span itemprop="publisher" itemscope="itemscope" itemtype="https://schema.org/Organizati

#### Scrape the Title from Each News Card

In [25]:
# Save the title of each news card to list titles

titles = []
for card in cards:
    title = card.find('span', itemprop='headline').text
    titles.append(title)
    
titles[:5]

["Prithviraj shares pic of transformation after having 'dangerously low fat percentage'",
 'Akshay Kumar resumes outdoor shooting amid lockdown; pics from set surface online',
 'Karan Johar confirms 2 house helps tested COVID-19 +ve, says he tested -ve',
 "Actress Preksha Mehta commits suicide at 25, wrote 'Death of dreams' in Insta story",
 "Nolan crashed a real plane into a real building in 'Tenet': Actor John Washington"]

#### Scrape Author from News Cards

In [26]:
# Save the author of the news card to list authors

authors = []
for card in cards:
    author = card.find('span', class_='author').text
    authors.append(author)
    
authors[:5]

['Daisy Mowke', 'Daisy Mowke', 'Daisy Mowke', 'Daisy Mowke', 'Daisy Mowke']

#### Scrape Text from News Cards

In [27]:
# Save the text of each article to a list of texts

texts = []
for card in cards:
    text = card.find('div', itemprop='articleBody').text
    texts.append(text)
    
texts[:2]

['South Indian actor Prithviraj Sukumaran today shared a picture of his physical transformation. "One month since we finished the last of...bare body scenes for \'Aadujeevitham\'. On the last day, I had dangerously low fat percentage and visceral fat levels," he wrote. Prithviraj, who was stranded in Jordan with the film crew for almost three months, returned to Kochi on Friday.',
 'Akshay Kumar has become the first Bollywood actor to shoot on outdoor location amid lockdown. He shot for a project with director R Balki. Several pictures and videos from the shoot have surfaced on social media in which the team, including Akshay and Balki, are seen wearing masks. They can also be seen maintaining social distancing.']

In [28]:
# Create an empty list, articles, to hold the dictionaries for each article
articles = []

# Loop through each news card on the page and get what we want
for card in cards:
    title = card.find('span', itemprop='headline' ).text
    author = card.find('span', class_='author').text
    content = card.find('div', itemprop='articleBody').text
    
    # Create a dictionary, article, for each news card
    article = {'title': title, 'author': author, 'content': content}
    
    # Add the dictionary, article, to our list of dictionaries, articles.
    articles.append(article)

In [29]:
# Here we see our list contains 24-25 dictionaries for news cards

print(len(articles))
articles[:2]

25


[{'title': "Prithviraj shares pic of transformation after having 'dangerously low fat percentage'",
  'author': 'Daisy Mowke',
  'content': 'South Indian actor Prithviraj Sukumaran today shared a picture of his physical transformation. "One month since we finished the last of...bare body scenes for \'Aadujeevitham\'. On the last day, I had dangerously low fat percentage and visceral fat levels," he wrote. Prithviraj, who was stranded in Jordan with the film crew for almost three months, returned to Kochi on Friday.'},
 {'title': 'Akshay Kumar resumes outdoor shooting amid lockdown; pics from set surface online',
  'author': 'Daisy Mowke',
  'content': 'Akshay Kumar has become the first Bollywood actor to shoot on outdoor location amid lockdown. He shot for a project with director R Balki. Several pictures and videos from the shoot have surfaced on social media in which the team, including Akshay and Balki, are seen wearing masks. They can also be seen maintaining social distancing.'}]

#### Build Article Function

In [30]:
def get_news_articles(cache=False):
    '''
    This function uses a cache parameter with default cache == False to give the option of 
    returning in a df of inshorts topics and info by reading a csv file or
    of doing a fresh scrape of inshort pages with topics business, sports, technology,
    and entertainment and writing the returned df to a csv file.
    '''
    # default to read in a csv instead of scrape for df
    if cache == False:
        df = pd.read_csv('articles.csv', index_col=0)
        
    # cache == True completes a fresh scrape for df    
    else:
    
        # Set base_url and headers that will be used in get request

        base_url = 'https://inshorts.com/en/read/'
        headers = {'User-Agent': 'Codeup Data Science'}
        
        # List of topics to scrape
        topics = ['business', 'sports', 'technology', 'entertainment']

        # Create an empty list, articles, to hold our dictionaries
        articles = []

        for topic in topics:

            # Get a response object from the main inshorts page
            response = get(base_url + topic, headers=headers)

            # Create soup object using response from inshort
            soup = BeautifulSoup(response.text, 'html.parser')

            # Scrape a ResultSet of all the news cards on the page
            cards = soup.find_all('div', class_='news-card')

            # Loop through each news card on the page and get what we want
            for card in cards:
                title = card.find('span', itemprop='headline' ).text
                author = card.find('span', class_='author').text
                content = card.find('div', itemprop='articleBody').text

                # Create a dictionary, article, for each news card
                article = ({'topic': topic, 
                            'title': title, 
                            'author': author, 
                            'content': content})

                # Add the dictionary, article, to our list of dictionaries, articles.
                articles.append(article)
            
        # Why not return it as a DataFrame?!
        df = pd.DataFrame(articles)
        
        # Write df to csv for future use
        df.to_csv('articles.csv')
    
    return df

In [31]:
# Test our function with cache == True to do a freash scrape and write to `articles.csv`

df = get_news_articles(cache=True)
df.head()

Unnamed: 0,topic,title,author,content
0,business,Firm whose stock surged 1000% in 2020 starts h...,Krishna Veera Vanamali,US biotech company Novavax said it has started...
1,business,India's economic growth seen at 1.2% in Q4 FY2...,Dharna,India's economy is estimated to have grown at ...
2,business,TVS Motor cuts employees' salaries by up to 20...,Dharna,TVS Motor Company has said it is cutting the s...
3,business,"Lockdown extensions won't help, cases will con...",Anushka Dixit,Mahindra Group Chairman Anand Mahindra said th...
4,business,Uber India fires 600 employees reducing 25% of...,Dharna,"Uber is firing 600 employees in India, or 25% ..."


In [32]:
df.topic.value_counts()

sports           25
business         25
entertainment    25
technology       24
Name: topic, dtype: int64

In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99 entries, 0 to 98
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   topic    99 non-null     object
 1   title    99 non-null     object
 2   author   99 non-null     object
 3   content  99 non-null     object
dtypes: object(4)
memory usage: 3.2+ KB


In [34]:
# Test our function to read in the df from `articles.csv`

df = get_news_articles(cache=False)
df.head()

Unnamed: 0,topic,title,author,content
0,business,Firm whose stock surged 1000% in 2020 starts h...,Krishna Veera Vanamali,US biotech company Novavax said it has started...
1,business,India's economic growth seen at 1.2% in Q4 FY2...,Dharna,India's economy is estimated to have grown at ...
2,business,TVS Motor cuts employees' salaries by up to 20...,Dharna,TVS Motor Company has said it is cutting the s...
3,business,"Lockdown extensions won't help, cases will con...",Anushka Dixit,Mahindra Group Chairman Anand Mahindra said th...
4,business,Uber India fires 600 employees reducing 25% of...,Dharna,"Uber is firing 600 employees in India, or 25% ..."


In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 99 entries, 0 to 98
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   topic    99 non-null     object
 1   title    99 non-null     object
 2   author   99 non-null     object
 3   content  99 non-null     object
dtypes: object(4)
memory usage: 3.9+ KB
