In [1]:
#!pip install feedparser
#!pip install newspaper3k

import feedparser as fp
import numpy as np
import json
import newspaper
from newspaper import Article
from time import mktime
from datetime import datetime
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import csv

In [2]:
# Upper bound on the number of articles downloaded per news company; the
# scraping loop compares its per-company counter against this value.
LIMIT = 1000000000

# Flat list of every downloaded article record, across all companies.
articles_array = []

# Per-company results, stored under data['newspapers'][<company name>].
data = {'newspapers': {}}

# Load the JSON file describing the news sites. The file is read as UTF-8
# explicitly so the result does not depend on the platform's default text
# encoding (the CSV export in this notebook is written as UTF-8 as well).
with open('newspaper.json', encoding='utf-8') as data_file:
    companies = json.load(data_file)

In [3]:
def _build_article(content, link, published):
    """Assemble the record stored for one downloaded article.

    Parameters
    ----------
    content
        Parsed article object; its ``title``, ``text``, ``authors``,
        ``top_image`` and ``movies`` attributes are copied into the record.
    link
        URL of the article.
    published
        Publication date, or ``None`` when unknown. Any value exposing
        ``isoformat()`` (e.g. a ``datetime``) is stored as an ISO 8601 string
        so both scraping paths produce the same representation.

    Returns
    -------
    dict
        Record with the keys ``link``, ``published``, ``title``, ``text``,
        ``authors``, ``top_image`` and ``movies``.
    """
    if hasattr(published, 'isoformat'):
        published = published.isoformat()
    return {
        'link': link,
        'published': published,
        'title': content.title,
        'text': content.text,
        'authors': content.authors,
        'top_image': content.top_image,
        'movies': content.movies,
    }


# Number of the next article for the current company (1-based); reset after
# every company so LIMIT applies per company.
count = 1

# Iterate through each news company
for company, value in companies.items():
    # If a RSS link is provided in the JSON file, this will be the first choice.
    # RSS feeds often give more consistent and correct data.
    # If you do not want to scrape from the RSS feed, leave the RSS attribute
    # empty (or omit it) in the JSON file: a missing or empty value selects the
    # newspaper-library fallback below.
    rss_url = value.get('rss')
    if rss_url:
        d = fp.parse(rss_url)
        print("Downloading articles from ", company)
        newsPaper = {
            "rss": rss_url,
            "link": value['link'],
            "articles": []
        }
        for entry in d.entries:
            if count > LIMIT:
                break
            # Entries without a usable publish date are skipped to keep the
            # data consistent. The parsed tuple is checked (not just the raw
            # 'published' string) because it is None when the date string
            # could not be parsed, which previously crashed the conversion.
            published_parsed = getattr(entry, 'published_parsed', None)
            if published_parsed is None:
                continue
            try:
                # Build the datetime directly from the tuple's fields. The
                # former mktime()/fromtimestamp() round trip interpreted the
                # tuple as local time and could shift it by an hour in zones
                # that observe DST.
                published = datetime(*published_parsed[:6])
                content = Article(entry.link)
                content.download()
                content.parse()
            except Exception as e:
                # If the entry is malformed or the download fails (ex. 404),
                # report it and carry on with the next article.
                print(e)
                print("continuing...")
                continue
            article = _build_article(content, entry.link, published)
            newsPaper['articles'].append(article)
            articles_array.append(article)
            print(count, "articles downloaded from", company, ", url: ", entry.link)
            count = count + 1
    else:
        # This is the fallback method if a RSS-feed link is not provided.
        # It uses the python newspaper library to extract articles
        print("Building site for ", company)
        paper = newspaper.build(value['link'], memoize_articles=False)
        newsPaper = {
            "link": value['link'],
            "articles": []
        }
        for content in paper.articles:
            if count > LIMIT:
                break
            try:
                content.download()
                content.parse()
            except Exception as e:
                print(e)
                print("continuing...")
                continue
            # Articles are kept even when no publish date was found; in that
            # case the 'published' field of the record is None.
            article = _build_article(content, content.url, content.publish_date)
            newsPaper['articles'].append(article)
            articles_array.append(article)
            print(count, "articles downloaded from", company, " using newspaper, url: ", content.url)
            count = count + 1
    count = 1
    data['newspapers'][company] = newsPaper

Building site for  taifaleo
1 articles downloaded from taifaleo  using newspaper, url:  https://www.nation.co.ke/news/1056-1056-u6geog/index.html
2 articles downloaded from taifaleo  using newspaper, url:  https://www.nation.co.ke/video/1951480-1951480-1072vroz/index.html
3 articles downloaded from taifaleo  using newspaper, url:  https://www.nation.co.ke/photo/1951220-1951220-df7whxz/index.html
4 articles downloaded from taifaleo  using newspaper, url:  https://www.nation.co.ke/news/politics/1064-1064-4f88toz/index.html
5 articles downloaded from taifaleo  using newspaper, url:  https://www.nation.co.ke/news/africa/1066-1066-oo1nedz/index.html
6 articles downloaded from taifaleo  using newspaper, url:  https://www.nation.co.ke/news/world/1068-1068-y0kl4cz/index.html
7 articles downloaded from taifaleo  using newspaper, url:  https://www.nation.co.ke/news/world/Germany-to-unveil-plan-to-tackle-global-warming/1068-5276056-wieror/index.html
8 articles downloaded from taifaleo  using news

55 articles downloaded from taifaleo  using newspaper, url:  https://nairobinews.nation.co.ke/life/how-police-bungled-case-of-carjacker-who-stole-former-ntv-employees-id
56 articles downloaded from taifaleo  using newspaper, url:  https://nairobinews.nation.co.ke/videos/senator-loitiptip-brushes-off-claims-of-impregnating-and-threating-a-woman-he-met-online
57 articles downloaded from taifaleo  using newspaper, url:  https://nairobinews.nation.co.ke/editors-picks/ugandan-speaker-warns-mps-against-engaging-in-reckless-sex
58 articles downloaded from taifaleo  using newspaper, url:  https://nairobinews.nation.co.ke/news/media-ejected-from-sh1-7-billion-city-hall-aar-medical-scheme-scandal-probe
59 articles downloaded from taifaleo  using newspaper, url:  https://nairobinews.nation.co.ke/news/transgender-activist-audrey-mbugua-gets-updated-kcse-certificate
60 articles downloaded from taifaleo  using newspaper, url:  https://nairobinews.nation.co.ke/life/primary-school-pupil-shocks-kenyans

8 articles downloaded from tuko  using newspaper, url:  https://kiswahili.tuko.co.ke/315455-mwanga-wa-kiswahili-utata-wa-vitatevitata.html
9 articles downloaded from tuko  using newspaper, url:  https://kiswahili.tuko.co.ke/315462-picha-za-msanii-victoria-kimani-zilizowaacha-wengi-wakimwaga-mate.html
10 articles downloaded from tuko  using newspaper, url:  https://kiswahili.tuko.co.ke/315457-polisi-wapata-ksh-1m-zilizoibiwa-zimefichwa-kwenye-zizi-la-ngombe.html
11 articles downloaded from tuko  using newspaper, url:  https://kiswahili.tuko.co.ke/315440-bilionea-chris-kirubi-awaomba-wakenya-kufanyiwa-uchunguzi-wa-mapema-wa-saratani.html
12 articles downloaded from tuko  using newspaper, url:  https://kiswahili.tuko.co.ke/315436-mwanaspoti-mahiri-afichua-kuwa-ana-virusi-vya-ukimwi.html
13 articles downloaded from tuko  using newspaper, url:  https://kiswahili.tuko.co.ke/315452-mwanamuziki-ray-c-afichua-kuwa-amepoteza-mimba-yake.html
14 articles downloaded from tuko  using newspaper, url:

In [4]:
# Finally, save the collected articles as a CSV file.
# The file is opened in a ``with`` block so it is flushed and closed as soon as
# the rows are written (the handle was previously never closed, so buffered
# rows could stay unwritten while the kernel was alive). ``newline=''`` is what
# the csv module requires to avoid spurious blank lines / mangled embedded
# newlines in the article text.
try:
    with open('Swahilinews.csv', 'w', encoding='utf-8', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['Title', 'Authors','Text','Image','Videos','Link','Published_Date'])
        for article in articles_array:
            # One row per article, in the same column order as the header.
            writer.writerow([
                article['title'],
                article['authors'],
                article['text'],
                article['top_image'],
                article['movies'],
                article['link'],
                article['published'],
            ])
except Exception as e:
    # Best effort: report the failure instead of aborting the notebook run.
    print(e)