In [1]:
import pandas as pd
import numpy as np

from requests import get
import re
from bs4 import BeautifulSoup
import os

### Codeup Blogs

**Goals:** Write a function to scrape urls from main Codeup blog web page and write a function that returns a dictionary of blog titles and text for each blog page. 

#### Grab Title from Page

Here I use the `.find()` method on my soup with the `<h1>` tag. As always, there is no one way to accomplish our task, so I'm demonstrating one way to scrape the headline, not THE way to scrape the headline. 

In [2]:
url = 'https://codeup.com/codeups-data-science-career-accelerator-is-here/'
headers = {'User-Agent': 'Codeup Data Science'} 
    
response = get(url, headers=headers)
response.ok

True

In [3]:
# Here's our long string of HTML; we'll use response.text to make our soup object.

print(type(response.text))

<class 'str'>


In [4]:
# Create our Soup object by passing our HTML string and choice of parser.

soup = BeautifulSoup(response.text, 'html.parser')

# Now we have our BeautifulSoup object and can use its built-in methods and attributes.

print(type(soup))

<class 'bs4.BeautifulSoup'>


In [5]:
# The h1 element holds my title.

title = soup.find('h1').text
title

'Codeup’s Data Science Career Accelerator is Here!'

#### Grab Text from Page

In [6]:
content = soup.find('div', class_="jupiterx-post-content").text
print(content)

The rumors are true! The time has arrived. Codeup has officially opened applications to our new Data Science career accelerator, with only 25 seats available! This immersive program is one of a kind in San Antonio, and will help you land a job in Glassdoor’s #1 Best Job in America.
Data Science is a method of providing actionable intelligence from data. The data revolution has hit San Antonio, resulting in an explosion in Data Scientist positions across companies like USAA, Accenture, Booz Allen Hamilton, and HEB. We’ve even seen UTSA invest $70 M for a Cybersecurity Center and School of Data Science. We built a program to specifically meet the growing demands of this industry.
Our program will be 18 weeks long, full-time, hands-on, and project-based. Our curriculum development and instruction is led by Senior Data Scientist, Maggie Giust, who has worked at HEB, Capital Group, and Rackspace, along with input from dozens of practitioners and hiring partners. Students will work with real

In [7]:
print(type(content))

<class 'str'>


#### Build Blog Function

In [8]:
# Create helper function that requests and parses HTML returning soup object.

def make_soup(url):
    '''
    This helper function takes in a url and requests and parses HTML
    returning a soup object.
    '''
    headers = {'User-Agent': 'Codeup Data Science'} 
    response = get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup

In [9]:
def get_blog_articles(urls, cached=False):
    '''
    This function takes in a list of Codeup Blog urls and a parameter
    with default cached == False which scrapes the title and text for each url, 
    creates a list of dictionaries with the title and text for each blog, 
    converts list to df, and returns df.
    If cached == True, the function returns a df from a json file.
    '''
    if cached == True:
        df = pd.read_json('big_blogs.json')
        
    # cached == False completes a fresh scrape for df     
    else:
        headers = {'User-Agent': 'Codeup Bayes Data Science'} 

        # Create an empty list to hold dictionaries
        articles = []

        # Loop through each url in our list of urls
        for url in urls:

            # Make request and soup object using helper
            soup = make_soup(url)

            # Save the title of each blog in variable title
            title = soup.find('h1').text

            # Save the text in each blog to variable text
            content = soup.find('div', class_="jupiterx-post-content").text

            # Create a dictionary holding the title and content for each blog
            article = {'title': title, 'content': content}

            # Add each dictionary to the articles list of dictionaries
            articles.append(article)
            
        # convert our list of dictionaries to a df
        df = pd.DataFrame(articles)

        # Write df to a json file for faster access
        df.to_json('big_blogs.json')
    
    return df

#### Test Function

In [10]:
# Here cached == False, so the function will do a fresh scrape of the urls and write data to a json file.

urls = ['https://codeup.com/codeups-data-science-career-accelerator-is-here/',
        'https://codeup.com/data-science-myths/',
        'https://codeup.com/data-science-vs-data-analytics-whats-the-difference/',
        'https://codeup.com/10-tips-to-crush-it-at-the-sa-tech-job-fair/',
        'https://codeup.com/competitor-bootcamps-are-closing-is-the-model-in-danger/']

blogs = get_blog_articles(urls=urls, cached=False)
blogs

Unnamed: 0,title,content
0,Codeup’s Data Science Career Accelerator is Here!,The rumors are true! The time has arrived. Cod...
1,Data Science Myths,By Dimitri Antoniou and Maggie Giust\nData Sci...
2,Data Science VS Data Analytics: What’s The Dif...,"By Dimitri Antoniou\nA week ago, Codeup launch..."
3,10 Tips to Crush It at the SA Tech Job Fair,SA Tech Job Fair\nThe third bi-annual San Anto...
4,Competitor Bootcamps Are Closing. Is the Model...,Competitor Bootcamps Are Closing. Is the Model...


#### Bonus URL Scrape

In [11]:
# I'm going to hit Codeup's main blog page to scrape the urls

url = 'https://codeup.com/resources/#blog'
headers = {'User-Agent': 'Codeup Data Science'} 

# Request the HTML
response = get(url, headers=headers)

# Create the soup object to parse the HTML
soup = BeautifulSoup(response.text, 'html.parser')

In [12]:
# I'm filtering my soup to return a list of all anchor elements from my HTML.

urls_list = soup.find_all('a', class_='jet-listing-dynamic-link__link')
urls_list

[<a class="jet-listing-dynamic-link__link" href="https://codeup.com/introducing-salary-refund-guarantee/"><span class="jet-listing-dynamic-link__label">Introducing Our Salary Refund Guarantee</span></a>,
 <a class="jet-listing-dynamic-link__link" href="https://codeup.com/introducing-salary-refund-guarantee/"><span class="jet-listing-dynamic-link__label">Read More</span></a>,
 <a class="jet-listing-dynamic-link__link" href="https://codeup.com/new-scholarship/"><span class="jet-listing-dynamic-link__label">Announcing: The Annie Easley Scholarship to Support the Black Community in Tech</span></a>,
 <a class="jet-listing-dynamic-link__link" href="https://codeup.com/new-scholarship/"><span class="jet-listing-dynamic-link__label">Read More</span></a>,
 <a class="jet-listing-dynamic-link__link" href="https://codeup.com/codeup-in-houston/"><span class="jet-listing-dynamic-link__label">Codeup Launches Houston!</span></a>,
 <a class="jet-listing-dynamic-link__link" href="https://codeup.com/codeu

In [13]:
# Filter the href attribute value for each anchor element in my list; we scraped 40 urls.

# I'm using a set comprehension to return only unique urls because there are two links for each article.
urls = {link.get('href') for link in urls_list}

# I'm converting my set to a list of urls.
urls = list(urls)

print(f'There are {len(urls)} unique links in our urls list.')
print()
urls 

There are 20 unique links in our urls list.



['https://codeup.com/how-were-celebrating-world-mental-health-day-from-home/',
 'https://codeup.com/journey-into-web-development/',
 'https://codeup.com/build-your-career-in-tech/',
 'https://codeup.com/codeup-alumni-make-water/',
 'https://codeup.com/codeup-wins-civtech-datathon/',
 'https://codeup.com/what-is-machine-learning/',
 'https://codeup.com/education-is-an-investment/',
 'https://codeup.com/introducing-salary-refund-guarantee/',
 'https://codeup.com/transition-into-data-science/',
 'https://codeup.com/codeup-inc-5000/',
 'https://codeup.com/succeed-in-a-coding-bootcamp/',
 'https://codeup.com/what-is-python/',
 'https://codeup.com/what-data-science-career-is-for-you/',
 'https://codeup.com/codeup-in-houston/',
 'https://codeup.com/what-to-expect-at-codeup/',
 'https://codeup.com/from-slacker-to-data-scientist/',
 'https://codeup.com/codeups-application-process/',
 'https://codeup.com/math-in-data-science/',
 'https://codeup.com/new-scholarship/',
 'https://codeup.com/covid-1

#### Bonus URL Function

In [15]:
def get_all_urls():
    '''
    This function scrapes all of the Codeup blog urls from
    the main Codeup blog page and returns a list of urls.
    '''
    # The base url for the main Codeup blog page
    url = 'https://codeup.com/resources/#blog'
    headers = {'User-Agent': 'Codeup Data Science'} 
    
    # Make request and soup object using helper
    soup = make_soup(url)
    
    # Create a list of the anchor elements that hold the urls.
    urls_list = soup.find_all('a', class_='jet-listing-dynamic-link__link')
    
    # I'm using a set comprehension to return only unique urls because list contains duplicate urls.
    urls = {link.get('href') for link in urls_list}

    # I'm converting my set to a list of urls.
    urls = list(urls)
        
    return urls

In [16]:
# Now I can use my same function with my new function.
# cached == False does a fresh scrape.

big_blogs = get_blog_articles(urls=get_all_urls(), cached=False)

In [17]:
big_blogs.head(10)

Unnamed: 0,title,content
0,How We’re Celebrating World Mental Health Day ...,World Mental Health Day is on October 10th. Al...
1,Alumni Share their Journey into Web Development,Everyone starts somewhere. Many developers out...
2,Build Your Career in Tech: Advice from Alumni!,"Bryan Walsh, Codeup Web Development alum, and ..."
3,How Codeup Alumni are Helping to Make Water,Imagine having a kit mailed to you with all th...
4,Codeup Grads Win CivTech Datathon,Many Codeup alumni enjoy competing in hackatho...
5,What is Machine Learning?,"There’s a lot we can learn about machines, and..."
6,Your Education is an Investment,You have many options regarding educational ro...
7,Introducing Our Salary Refund Guarantee,"Here at Codeup, we believe it’s time to revolu..."
8,What is the Transition into Data Science Like?,Alumni Katy Salts and Brandi Reger joined us a...
9,Codeup on Inc. 5000 Fastest Growing Private Co...,We’re excited to announce a huge Codeup achiev...


In [18]:
big_blogs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    20 non-null     object
 1   content  20 non-null     object
dtypes: object(2)
memory usage: 448.0+ bytes


In [19]:
# cached == True reads in a df from `big_blogs.json`.

big_blogs = get_blog_articles(urls=get_all_urls(), cached=True)
big_blogs.head()

Unnamed: 0,title,content
0,How We’re Celebrating World Mental Health Day ...,World Mental Health Day is on October 10th. Al...
1,Alumni Share their Journey into Web Development,Everyone starts somewhere. Many developers out...
2,Build Your Career in Tech: Advice from Alumni!,"Bryan Walsh, Codeup Web Development alum, and ..."
3,How Codeup Alumni are Helping to Make Water,Imagine having a kit mailed to you with all th...
4,Codeup Grads Win CivTech Datathon,Many Codeup alumni enjoy competing in hackatho...


___

### Inshorts News Articles

**Goal:**  Write a function that scrapes the news articles for the following topics:

- Business


- Sports


- Technology


- Entertainment

In [20]:
url = 'https://inshorts.com/en/read/entertainment'

response = get(url)
response.ok

True

In [21]:
soup = BeautifulSoup(response.text, 'html.parser')

#### Scrape News Cards from Main Page

In [22]:
# Scrape a ResultSet of all the news cards on the page and inspect the elements on the first card.

cards = soup.find_all('div', class_='news-card')

print(f'There are {len(cards)} news cards on this page.')
print()
cards[0]

There are 25 news cards on this page.



<div class="news-card z-depth-1" itemscope="" itemtype="http://schema.org/NewsArticle">
<span content="" itemid="https://inshorts.com/en/news/nazia-nasim-becomes-the-first-crorepati-of-kbc-season-12-1605077219002" itemprop="mainEntityOfPage" itemscope="" itemtype="https://schema.org/WebPage"></span>
<span itemprop="author" itemscope="itemscope" itemtype="https://schema.org/Person">
<span content="Roshan Gupta" itemprop="name"></span>
</span>
<span content="Nazia Nasim becomes the first 'Crorepati' of KBC Season 12" itemprop="description"></span>
<span itemprop="image" itemscope="" itemtype="https://schema.org/ImageObject">
<meta content="https://static.inshorts.com/inshorts/images/v1/variants/jpg/m/2020/11_nov/11_wed/img_1605072990016_282.jpg?" itemprop="url"/>
<meta content="864" itemprop="width"/>
<meta content="483" itemprop="height"/>
</span>
<span itemprop="publisher" itemscope="itemscope" itemtype="https://schema.org/Organization">
<span content="https://inshorts.com/" itemprop="

#### Scrape the Title from Each News Card

In [23]:
# Create a list of titles using the span element and itemprop attribute with text method.

titles = [card.find('span', itemprop='headline').text for card in cards]
titles[:5]

["Nazia Nasim becomes the first 'Crorepati' of KBC Season 12",
 "Should've done my research: Cardi B on pic of her as Goddess Durga",
 'A spiritual leader tried to take advantage of me at 18: Actress Anupria',
 'Tamil Nadu theatres reopen after 8 months amid COVID-19 pandemic',
 'COVID-19 medication made me look heavy, trolls called me fat: Tamannaah']

#### Scrape Author from News Cards

In [24]:
# Create a list of authors using the span element and class attribute with text method.

authors = [card.find('span', class_='author').text for card in cards]
authors[:5]

['Roshan Gupta', 'Daisy Mowke', 'Anmol Sharma', 'Atul Mishra', 'Daisy Mowke']

#### Scrape Text from News Cards

In [25]:
# Create a list of content strings using the div element and itemprop attribute with text method.

content = [card.find('div', itemprop='articleBody').text for card in cards]
content[:5]

['Celebrating the win of Nazia Nasim, native of Jharkhand and a Delhi resident, KBC will witness the first \'Crorepati\' of this season. Amidst a round of applause for Nazia, host Amitabh Bachchan said, "What an incredible game you have played. Jahan aapki nazar gayee woh humesha sahee nikla. (You (Nazia) always managed to pick the right choice)."',
 'Cardi B has apologised after criticism over a "bare-bodied" picture in which she was posing as Goddess Durga to promote her sneaker collection. "The creatives told me I was going to represent a Goddess who represents strength, femininity and liberation, and that\'s something I love," she stated. "I don\'t like disrespecting anyone\'s religion...I should\'ve done my research," the rapper added.',
 'Bollywood actress Anupria Goenka has revealed that a spiritual leader tried to take advantage of her when she was 18 years old. She added that her family trusted him heavily and even she had started to believe in him. "[The incident] scarred me 

In [26]:
# Create an empty list, articles, to hold the dictionaries for each article.
articles = []

# Loop through each news card on the page and get what we want
for card in cards:
    title = card.find('span', itemprop='headline' ).text
    author = card.find('span', class_='author').text
    content = card.find('div', itemprop='articleBody').text
    
    # Create a dictionary, article, for each news card
    article = {'title': title, 'author': author, 'content': content}
    
    # Add the dictionary, article, to our list of dictionaries, articles.
    articles.append(article)

In [27]:
# Here we see our list contains 24-25 dictionaries for news cards

print(len(articles))
articles[0]

25


{'title': "Nazia Nasim becomes the first 'Crorepati' of KBC Season 12",
 'author': 'Roshan Gupta',
 'content': 'Celebrating the win of Nazia Nasim, native of Jharkhand and a Delhi resident, KBC will witness the first \'Crorepati\' of this season. Amidst a round of applause for Nazia, host Amitabh Bachchan said, "What an incredible game you have played. Jahan aapki nazar gayee woh humesha sahee nikla. (You (Nazia) always managed to pick the right choice)."'}

#### Build Article Function

In [29]:
def get_news_articles(cached=False):
    '''
    This function with default cached == False does a fresh scrape of inshort pages with topics 
    business, sports, technology, and entertainment and writes the returned df to a json file.
    cached == True returns a df read in from a json file.
    '''
    # option to read in a json file instead of scrape for df
    if cached == True:
        df = pd.read_json('articles.json')
        
    # cached == False completes a fresh scrape for df    
    else:
    
        # Set base_url and headers that will be used in get request
        base_url = 'https://inshorts.com/en/read/'
        headers = {'User-Agent': 'Codeup Data Science'}
        
        # List of topics to scrape
        topics = ['business', 'sports', 'technology', 'entertainment']
        
        # Create an empty list, articles, to hold our dictionaries
        articles = []

        for topic in topics:
            
            # Create url with topic endpoint
            topic_url = base_url + topic
            
            # Make request and soup object using helper
            soup = make_soup(topic_url)

            # Scrape a ResultSet of all the news cards on the page
            cards = soup.find_all('div', class_='news-card')

            # Loop through each news card on the page and get what we want
            for card in cards:
                title = card.find('span', itemprop='headline' ).text
                author = card.find('span', class_='author').text
                content = card.find('div', itemprop='articleBody').text

                # Create a dictionary, article, for each news card
                article = ({'topic': topic, 
                            'title': title, 
                            'author': author, 
                            'content': content})

                # Add the dictionary, article, to our list of dictionaries, articles.
                articles.append(article)
            
        # Create a DataFrame from list of dictionaries
        df = pd.DataFrame(articles)
        
        # Write df to csv for future use
        df.to_json('articles.json')
    
    return df

In [30]:
# Test our function with cached == False to do a freash scrape and create `articles.json` file.

df = get_news_articles(cached=False)
df.head()

Unnamed: 0,topic,title,author,content
0,business,China COVID-19 vaccine trial halted in Brazil ...,Ankush Verma,The final-stage trial of China's Sinovac COVID...
1,business,Chinese tech firms lose $290B in market value ...,Pragya Swastik,China's biggest technology companies like Alib...
2,business,Amazon accuses Future Retail of insider tradin...,Pragya Swastik,Amazon has asked SEBI to investigate Future Re...
3,business,Ensure all accounts are linked with Aadhaar by...,Pragya Swastik,Finance Minister Nirmala Sitharaman on Tuesday...
4,business,Cabinet approves PLI scheme worth ₹1.46 lakh c...,Krishna Veera Vanamali,The Union Cabinet on Wednesday approved Produc...


In [31]:
df.topic.value_counts()

entertainment    25
technology       25
sports           25
business         24
Name: topic, dtype: int64

In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99 entries, 0 to 98
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   topic    99 non-null     object
 1   title    99 non-null     object
 2   author   99 non-null     object
 3   content  99 non-null     object
dtypes: object(4)
memory usage: 3.2+ KB


In [33]:
# Test our function to read in the df from `articles.csv`

df = get_news_articles(cached=True)
df.head()

Unnamed: 0,topic,title,author,content
0,business,China COVID-19 vaccine trial halted in Brazil ...,Ankush Verma,The final-stage trial of China's Sinovac COVID...
1,business,Chinese tech firms lose $290B in market value ...,Pragya Swastik,China's biggest technology companies like Alib...
2,business,Amazon accuses Future Retail of insider tradin...,Pragya Swastik,Amazon has asked SEBI to investigate Future Re...
3,business,Ensure all accounts are linked with Aadhaar by...,Pragya Swastik,Finance Minister Nirmala Sitharaman on Tuesday...
4,business,Cabinet approves PLI scheme worth ₹1.46 lakh c...,Krishna Veera Vanamali,The Union Cabinet on Wednesday approved Produc...


In [34]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 99 entries, 0 to 98
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   topic    99 non-null     object
 1   title    99 non-null     object
 2   author   99 non-null     object
 3   content  99 non-null     object
dtypes: object(4)
memory usage: 3.9+ KB
