In [1]:
import os
import sys
import pprint

import pandas as pd
import re
import json

from requests import get
from bs4 import BeautifulSoup

In [2]:
def get_news_article(url, category, timeout=10):
    """
    Fetch one Inshorts article page and return the text of its news link.

    Parameters
    ----------
    url : str
        Complete URL of the article page.
    category : str
        Category label for the article. Not used by this function; it is
        kept in the signature so existing callers can keep passing it.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).

    Returns
    -------
    str
        The raw ``.text`` of the first ``<a class="clickable">`` element
        whose ``href`` starts with ``/en/news``. Surrounding whitespace is
        not stripped.

    Raises
    ------
    requests.HTTPError
        If the server responds with a 4xx/5xx status code.
    ValueError
        If the page contains no matching anchor element.
    """
    headers = {'User-Agent': 'Codeup Ada Data Science'}
    # Without an explicit timeout a stalled server would block the scrape forever.
    response = get(url, headers=headers, timeout=timeout)
    # Fail loudly on error pages instead of trying to parse them.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    article = soup.find('a', class_='clickable', attrs={'href': re.compile("^/en/news")})

    # soup.find returns None when nothing matches; report which URL failed
    # rather than surfacing an opaque AttributeError on `.text`.
    if article is None:
        raise ValueError('No news link found on page: ' + str(url))

    return article.text

In [3]:
# Spot-check get_news_article on a single article URL from the business category.
get_news_article('https://inshorts.com/en/news/volkswagen-unit-porsche-fined-%E2%82%B94100-crore-over-diesel-scandal-1557251200422', 'business')

'\nVolkswagen unit Porsche fined ₹4,100 crore over diesel scandal\n'

In [4]:
def get_news_links(url, category, timeout=10):
    """
    Collect the absolute URLs of the news links found on a listing page.

    Parameters
    ----------
    url : str
        Complete URL of the page to scan (e.g. a category listing page).
    category : str
        Category label. Not used by this function; it is kept in the
        signature so existing callers can keep passing it.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).

    Returns
    -------
    list of str
        One ``'https://inshorts.com' + href`` string for every
        ``<a class="clickable">`` element whose ``href`` starts with
        ``/en/news``, in document order. Empty if nothing matches.

    Raises
    ------
    requests.HTTPError
        If the server responds with a 4xx/5xx status code.
    """
    headers = {'User-Agent': 'Codeup Ada Data Science'}
    # Without an explicit timeout a stalled server would block the scrape forever.
    response = get(url, headers=headers, timeout=timeout)
    # Fail loudly on error pages instead of silently returning no links.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    # find_all is the current bs4 spelling; findAll is only a legacy alias.
    anchors = soup.find_all('a', class_='clickable', attrs={'href': re.compile("^/en/news")})
    # The regex guarantees each href is site-relative, so prefixing the host is enough.
    return ['https://inshorts.com' + anchor.get('href') for anchor in anchors]

In [5]:
# Spot-check get_news_links on the business category listing page.
get_news_links('https://inshorts.com/en/read/business', 'business')

['https://inshorts.com/en/news/nirav-modi-denied-bail-for-third-time-despite-him-offering-₹18-cr-1557331847600',
 'https://inshorts.com/en/news/volkswagen-unit-porsche-fined-₹4100-crore-over-diesel-scandal-1557251200422',
 'https://inshorts.com/en/news/indian-technician-towing-kuwait-airways-plane-crushed-to-death-1557285659996',
 'https://inshorts.com/en/news/trump-lost-$1-bn-over-a-decade-avoided-paying-taxes-for-8-yrs-nyt-1557323760724',
 'https://inshorts.com/en/news/us-warns-india-against-tariffs-over-scrapping-of-trade-benefits-1557245457108',
 'https://inshorts.com/en/news/lakshmi-mittal-hid-links-with-brothers-defaulting-firms-ruias-1557308216401',
 'https://inshorts.com/en/news/tata-steel-says-us-china-have-two-days-left-to-make-trade-deal-1557318352334',
 'https://inshorts.com/en/news/hamdard-pakistan-offers-to-help-after-rooh-afza-shortage-in-india-1557310538175',
 'https://inshorts.com/en/news/resigned-pledged-shares-provided-₹250-cr-to-banks-naresh-goyal-1557250273105',
 '

In [6]:
def traverse(o, tree_types=(list, tuple)):
    """
    Lazily flatten an arbitrarily nested structure, depth-first.

    Any object that is an instance of ``tree_types`` is treated as a
    container and descended into; every other object is yielded as a leaf,
    in left-to-right order.
    """
    # Leaf: anything that is not one of the container types is yielded as-is.
    if not isinstance(o, tree_types):
        yield o
        return

    # Container: recurse into each element and forward its leaves.
    for child in o:
        yield from traverse(child, tree_types)

In [7]:
# def get_news_texts(*categories):
#     news_links = []
#     for category in categories:
#         links = get_news_links(("https://inshorts.com/en/read/") + category, category)
#         news_links.append(links)
#     blog_texts = []
#     for value in traverse(news_links):
#         blog_texts.append(get_news_article(value), category)
#     return blog_texts

In [8]:
def get_news_texts(*categories):
    """
    Scrape every news link listed under each of the given categories.

    For each category name, the listing page
    ``https://inshorts.com/en/read/<category>`` is passed to
    ``get_news_links``, and each returned link is passed to
    ``get_news_article``.

    Returns a list of dicts, one per link, with the keys ``"link"``,
    ``"category"`` and ``"content"``.
    """
    # Categories are processed in order; the links of one category are fully
    # fetched before the next category's listing page is requested.
    return [
        {
            "link": article_url,
            "category": name,
            "content": get_news_article(article_url, name),
        }
        for name in categories
        for article_url in get_news_links("https://inshorts.com/en/read/" + name, name)
    ]

In [9]:
# get_news_texts('business', 'sports')

In [10]:
# Scrape all four categories; this issues one request per listing page
# plus one request per article link found on it.
corpus = get_news_texts(
    'business',
    'sports',
    'technology',
    'entertainment',
)

In [11]:
# Build a DataFrame from the scraped records (one row per dict in corpus) and display it.
df = pd.DataFrame(corpus)
df

Unnamed: 0,category,content,link
0,business,\nNirav Modi denied bail for third time despit...,https://inshorts.com/en/news/nirav-modi-denied...
1,business,"\nVolkswagen unit Porsche fined ₹4,100 crore o...",https://inshorts.com/en/news/volkswagen-unit-p...
2,business,\nIndian technician towing Kuwait Airways plan...,https://inshorts.com/en/news/indian-technician...
3,business,"\nTrump lost $1 bn over a decade, avoided payi...",https://inshorts.com/en/news/trump-lost-$1-bn-...
4,business,\nUS warns India against tariffs over scrappin...,https://inshorts.com/en/news/us-warns-india-ag...
5,business,\nLakshmi Mittal hid links with brothers' defa...,https://inshorts.com/en/news/lakshmi-mittal-hi...
6,business,"\nTata Steel says US, China have two days left...",https://inshorts.com/en/news/tata-steel-says-u...
7,business,\nHamdard Pakistan offers to help after Rooh A...,https://inshorts.com/en/news/hamdard-pakistan-...
8,business,"\nResigned, pledged shares, provided ₹250 cr t...",https://inshorts.com/en/news/resigned-pledged-...
9,business,\n$40M worth bitcoins stolen from 4th largest ...,https://inshorts.com/en/news/$40m-worth-bitcoi...
