In [5]:
from requests import get
from bs4 import BeautifulSoup
import os
import pandas as pd
import acquire as a

#import
import warnings
warnings.filterwarnings("ignore")

# Exercises

By the end of this exercise, you should have a file named acquire.py that contains the specified functions. If you wish, you may break your work into separate files for each website (e.g. acquire_codeup_blog.py and acquire_news_articles.py), but the end function should be present in acquire.py (that is, acquire.py should import get_blog_articles from the acquire_codeup_blog module.)

### 1. Codeup Blog Articles

Scrape the article text from the following pages:

- https://codeup.com/codeups-data-science-career-accelerator-is-here/
- https://codeup.com/data-science-myths/
- https://codeup.com/data-science-vs-data-analytics-whats-the-difference/
- https://codeup.com/10-tips-to-crush-it-at-the-sa-tech-job-fair/
- https://codeup.com/competitor-bootcamps-are-closing-is-the-model-in-danger/

Encapsulate your work in a function named get_blog_articles that will return a list of dictionaries, with each dictionary representing one article. The shape of each dictionary should look like this:

```
{
    'title': 'the title of the article',
    'content': 'the full text content of the article'
}


```

## Problem 1: Get some blog articles from a few addresses, put them in a function called get_blog_articles()
### Approach: Do the thing once, do it in a loop, do the loop in a function

###Tools we will use:

1. Browser inspector (command + option + i), right-click inspect element, right-click copy css selector
2. Beautiful Soup to Parse page

### Find and select tips:
- To find elements with the class="link", <a href="codeup.com" class="link">Codeup.com</a>, we use:
    
    - If you're using soup.find, you can use "_class=link" in your arguments
    - If we use soup.select, we can put .link in soup.select(".link")
- Some other CSS selectors to use with .select:

    - .class_name
    - #id_name, and IDs are unique to a page
    - tag selectors. If we do soup.select("a"), we'll get back a list of all the anchor tags
- main, header, footer, section, article, div are generic containers for content. These are boxes of content
- anchor tag, strong, or a span is an in-line chunk of content

### Build upward in complexity!
- Do our desired thing once, and put that in a function
- Use that function in a loop
- Put that loop in a function

In [14]:
#get the url
url = 'https://codeup.com/codeups-data-science-career-accelerator-is-here'

In [19]:
# # Some websites don't accept the pyhon-requests default user-agent
headers = {'User-Agent': 'Codeup Data Science Student'} 
#or                  (check robots.txt)
#headers = {"User-Agent": "Mozilla/4.5 (compatible; HTTrack 3.0x; Windows 98)"}

In [20]:
#get response object from the server

response = get(url, headers=headers)

In [21]:
#to check response 
print (response.text [:500])

<!DOCTYPE html><html lang="en-US"><head >	<meta charset="UTF-8" />
	<meta name="viewport" content="width=device-width, initial-scale=1" />
	<meta name='robots' content='index, follow, max-image-preview:large, max-snippet:-1, max-video-preview:-1' />
<style type="text/css" id="nab-alternative-loader-style"></style>
<script type="text/javascript" id="nelio-ab-testing-kickoff">/*nelio-ab-testing-kickoff*//* <![CDATA[ */( function() { var ua = window.navigator.userAgent || ''; if ( -1 !== ua.indexOf


In [22]:
# Make a soup variable holding the response content
soup = BeautifulSoup(response.text)


###  soup.find to find one thing
###  soup.find_all to find all the matching things
###  soup.select to find all the matching things (as a list of tags)

In [None]:
#get the title ( go to the url page and do inspect)
article_title = soup.find('h1', class_='jupiterx-post-title').text
#other way
# #h1 on its own works here, but not necessisarily everywhere. Pages can have > 1 h1 tag
#title = soup.find('h1')

article_title

In [None]:
# # If we wanted to be more specific
# # Give me the h1 that also has the jupiterx-post-title class
title = soup.select('.jupiterx-post-title')[0].text

In [None]:
title

In [None]:
# get the content
article = soup.find('div', class_='jupiterx-post-content').get_text(strip =True)
article

In [None]:
#other way

content2 = soup.select('.jupiterx-post-content')[0].text

In [None]:
content2

#### Each piece of soup we access is another soup object with the same methods and properties available
-  soup.element.text
-  soup.time.text

In [None]:
# soup.element["attribute_name"]
# If you have an attribute name and need that attribute's value, then we use dictionary syntax
# soup.time["datetime"]

In [None]:
# # soup.select("img") is wayyy to broad, since it returns every image on the page
# # so we need to get to know our data, our html structure
# # Let's get more specific
#div_for_image = soup.select('.jupiterx-post-image')

In [None]:
# image_src = div_for_image.picture.img['data-src']
# image_src

In [None]:
def get_article_parts (url,tag, cl  ):
    headers = {'User-Agent': 'Codeup Data Science'}
    response = get(url, headers=headers)
    soup = BeautifulSoup(response.text)
    article = soup.find(tag, class_=cl).get_text(strip = True)

    return article

In [None]:
url = 'https://codeup.com/codeups-data-science-career-accelerator-is-here/'
content =get_article_parts(url, 'div', 'jupiterx-post-content')

In [None]:
content

In [None]:
title = get_article_parts (url,'h1', 'jupiterx-post-title'  )

In [None]:
title

In [None]:
#other way to do it
soup.title.string

## create a function

In [None]:
def get_codeup_blog (url):
    '''
    '''
    #set the headers
    headers = {'User-Agent': 'Codeup Data Science'}
    
    #Get rhe http response object from the server
    response = get(url, headers=headers)
    
    soup = BeautifulSoup(response.text)
    
    # get the content
    content = soup.find('div', class_='jupiterx-post-content').get_text(strip = True)
    
    #get the title
    title = soup.find('h1', class_='jupiterx-post-title').text

    #get the date
    published_date = soup.time.text
    
    #create a dictionary
    dic = {
        'title': title,
        'content': content,
        'published_date': published_date
    }
    return dic

In [None]:
one = get_codeup_blog('https://codeup.com/codeups-data-science-career-accelerator-is-here')

In [None]:
one

In [None]:

web_list = ['https://codeup.com/codeups-data-science-career-accelerator-is-here', 'https://codeup.com/data-science-myths',
            'https://codeup.com/data-science-vs-data-analytics-whats-the-difference',
            'https://codeup.com/10-tips-to-crush-it-at-the-sa-tech-job-fair',
            'https://codeup.com/competitor-bootcamps-are-closing-is-the-model-in-danger']

In [None]:
def get_glob_articles2 (web_list):   
    list_dic=[]
    for web in web_list:
        headers = {'User-Agent': 'Codeup Data Science'}
        response = get(web, headers=headers)
        soup = BeautifulSoup(response.text)
        # get the content
        content = soup.find('div', class_='jupiterx-post-content').get_text(strip = True)
        #get the title
        title = get_article_parts (url,'h1', 'jupiterx-post-title'  )
        
        #get the date
        published_date = soup.time.text
        #create a dictionary
        dic = {
            'title': title,
            'content': content,
            'published_date': published_date
        }
        #append to a list
        list_dic.append(dic)
    return list_dic

In [None]:
def get_glob_articles (urls):
    #create a list of dictionaries
    list_dic = [get_codeup_blog(url) for url in urls]
    
    return pd.DataFrame(list_dic)

In [None]:
list_articles = get_glob_articles(web_list)

In [None]:
list_articles



Plus any additional properties you think might be helpful.

**Bonus:**

Scrape the text of all the articles linked on codeup's blog page.

In [None]:
# test my functions


In [6]:
web_list = ['https://codeup.com/codeups-data-science-career-accelerator-is-here', 'https://codeup.com/data-science-myths',
            'https://codeup.com/data-science-vs-data-analytics-whats-the-difference',
            'https://codeup.com/10-tips-to-crush-it-at-the-sa-tech-job-fair',
            'https://codeup.com/competitor-bootcamps-are-closing-is-the-model-in-danger']

In [7]:
list_articles = a.get_glob_articles(web_list)

In [8]:
list_articles

Unnamed: 0,title,content,published_date
0,Codeup’s Data Science Career Accelerator is Here!,The rumors are true! The time has arrived. Cod...,"September 30, 2018"
1,Data Science Myths,By Dimitri Antoniou and Maggie GiustData Scien...,"October 31, 2018"
2,Data Science VS Data Analytics: What’s The Dif...,"By Dimitri AntoniouA week ago, Codeuplaunched ...","October 17, 2018"
3,10 Tips to Crush It at the SA Tech Job Fair,SA Tech Job FairThe third bi-annualSan Antonio...,"August 14, 2018"
4,Competitor Bootcamps Are Closing. Is the Model...,Competitor Bootcamps Are Closing. Is the Model...,"August 14, 2018"


### 2. News Articles

We will now be scraping text data from inshorts, a website that provides a brief overview of many different topics.

https://inshorts.com/en/read

Write a function that scrapes the news articles for the following topics:

- Business
- Sports
- Technology
- Entertainment

The end product of this should be a function named get_news_articles that returns a list of dictionaries, where each dictionary has this shape:

```    
{
    'title': 'The article title',
    'content': 'The article content',
    'category': 'business' # for example
}

```

**Hints:**

- Start by inspecting the website in your browser. Figure out which elements will be useful.
- Start by creating a function that handles a single article and produces a dictionary like the one above.
- Next create a function that will find all the articles on a single page and call the function you created in the last step for every article on the page.
- Now create a function that will use the previous two functions to scrape the articles from all the pages that you need, and do any additional processing that needs to be done.

- Start by inspecting the website in your browser. Figure out which elements will be useful.

this is the url the main  https://inshorts.com/en/read if you want to check articles from:
 - Business  url = https://inshorts.com/en/read/business
 - Sports url = https://inshorts.com/en/read/sports
 - Technology url = https://inshorts.com/en/read/technology
 - Entretainment url = https://inshorts.com/en/read/entretainment
 

In [23]:
categories = ['business' , 'bports' , 'technology', 'entertainment', 'science', 'world']

In [24]:
base_url = ' https://inshorts.com/en/read/'

In [25]:
first_page = base_url + categories[0]

In [26]:
first_page

' https://inshorts.com/en/read/business'

In [27]:
headers

{'User-Agent': 'Codeup Data Science Student'}

In [35]:
#get the content

response = get(first_page, headers=headers)

In [36]:
response.text[:400]

'<!doctype html>\n<html lang="en">\n\n<head>\n  <meta charset="utf-8" />\n  <style>\n    /* The Modal (background) */\n    .modal_contact {\n        display: none; /* Hidden by default */\n        position: fixed; /* Stay in place */\n        z-index: 8; /* Sit on top */\n        left: 0;\n        top: 0;\n        width: 100%; /* Full width */\n        height: 100%;\n        overflow: auto; /* Enable scroll if ne'

In [37]:
#create soup object
soup = BeautifulSoup(response.text)

In [39]:
articles = soup.select (".news-card")

In [42]:
articles[0]

<div class="news-card z-depth-1" itemscope="" itemtype="http://schema.org/NewsArticle">
<span content="" itemid="https://inshorts.com/en/news/amazon-job-posting-fuels-speculations-about-plan-to-accept-payments-in-crypto-1627312165039" itemprop="mainEntityOfPage" itemscope="" itemtype="https://schema.org/WebPage"></span>
<span itemprop="author" itemscope="itemscope" itemtype="https://schema.org/Person">
<span content="Pragya Swastik" itemprop="name"></span>
</span>
<span content="Amazon job posting fuels speculations about plan to accept payments in crypto" itemprop="description"></span>
<span itemprop="image" itemscope="" itemtype="https://schema.org/ImageObject">
<meta content="https://static.inshorts.com/inshorts/images/v1/variants/jpg/m/2021/07_jul/26_mon/img_1627309467319_923.jpg?" itemprop="url"/>
<meta content="864" itemprop="width"/>
<meta content="483" itemprop="height"/>
</span>
<span itemprop="publisher" itemscope="itemscope" itemtype="https://schema.org/Organization">
<span 

In [43]:
articles[0].select("[itemprop='articleBody']")[0]

<div itemprop="articleBody">A new job posting by Amazon has fuelled speculations that the e-commerce major may begin accepting Bitcoin, Ether and other cryptocurrencies as a form of payment. According to the job posting, Amazon's Payments Acceptance &amp; Experience team is hiring a 'Digital Currency and Blockchain Product Lead'. Following the speculations around Amazon's plan, Bitcoin surged near $40,000 on Monday.</div>

In [47]:
#create a function ther get a single article
def get_article(article, category):
    # Attribute selector
    title = article.select("[itemprop='headline']")[0].text
    
    # article body
    content = article.select("[itemprop='articleBody']")[0].text
    
    #create a dictionary
    output = {}
    #save info 
    output["title"] = title
    output["content"] = content
    output["category"] = category
    
    return output


In [48]:
#create a function to get all the articles
def get_articles(category, base ="https://inshorts.com/en/read/"):
    """
    This function takes in a category as a string. Category must be an available category in inshorts
    Returns a list of dictionaries where each dictionary represents a single inshort article
    """
    
    # We concatenate our base_url with the category
    url = base + category
    
    # Set the headers
    headers = {"User-Agent": "Codeup Data Scient student"}

    # Get the http response object from the server
    response = get(url, headers=headers)

    # Make soup out of the raw html
    soup = BeautifulSoup(response.text)
    
    # Ignore everything, focusing only on the news cards
    articles = soup.select(".news-card")
    
    output = []
    
    # Iterate through every article tag/soup 
    for article in articles:
        
        # Returns a dictionary of the article's title, body, and category
        article_data = get_article(article, category) 
        
        # Append the dictionary to the list
        output.append(article_data)
    
    # Return the list of dictionaries
    return output


In [49]:
# Example of using the get_articles function sending in the category name that's part of the URL
# get_articles("business")

In [50]:
def get_all_news_articles(categories):
    """
    Takes in a list of categories where the category is part of the URL pattern on inshorts
    Returns a dataframe of every article from every category listed
    Each row in the dataframe is a single article
    """
    all_inshorts = []

    for category in categories:
        all_category_articles = get_articles(category)
        all_inshorts = all_inshorts + all_category_articles

    df = pd.DataFrame(all_inshorts)
    return df

In [51]:

categories = ["business", "sports", "technology", "entertainment", "science", "world"]
df = get_all_news_articles(categories)

In [55]:
df

Unnamed: 0,title,content,category
0,Amazon job posting fuels speculations about pl...,A new job posting by Amazon has fuelled specul...,business
1,China's ex-teacher turned billionaire no more ...,"China's Larry Chen, a former teacher who becam...",business
2,"Musk takes a jibe at rival car companies, says...",Tesla CEO and the world's second-richest perso...,business
3,Govt paid Infosys ₹164.5 crore for new Income ...,The government paid ₹164.5 crore to Infosys to...,business
4,"Unemployment rate rises in both urban, rural a...",India's unemployment rate soared to 7.14% in t...,business
...,...,...,...
142,US offers further air support to Afghan troops...,The US will continue to carry out airstrikes a...,world
143,Russian PM Mishustin visits Pacific islands cl...,During his tour of Russia's Far East and Siber...,world
144,Ugandan govt spends $30 mn on cars for lawmake...,The Ugandan government was criticised after it...,world
145,Equatorial Guinea to close UK embassy over san...,Equatorial Guinea's Foreign Minister said that...,world


### 3. Bonus: cache the data

Write your code such that the acquired data is saved locally in some form or fashion. Your functions that retrieve the data should prefer to read the local data instead of having to make all the requests everytime the function is called. Include a boolean flag in the functions to allow the data to be acquired "fresh" from the actual sources (re-writing your local cache).