In [1]:
import feedparser as fp
import json
import newspaper
from newspaper import Article
from time import mktime
from datetime import datetime
import pandas as pd
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
import numpy

In [24]:
# Set the limit for number of articles to download
LIMIT = 100

data = {}
data['newspapers'] = {}

In [25]:
# Loads the JSON files with news sites
with open('NewsPapers.json') as data_file:
    companies = json.load(data_file)

count = 1

factory = StopWordRemoverFactory()
stopword = factory.create_stop_word_remover()

In [26]:
companies

{'cnn': {'link': 'https://www.cnnindonesia.com/'},
 'detik': {'link': 'https://www.detik.com/'},
 'kompas': {'link': 'https://www.kompas.com/'},
 'liputan 6': {'link': 'https://www.liputan6.co/'},
 'kapanlagi': {'link': 'https://www.kapanlagi.com/'},
 'merdeka': {'link': 'https://www.merdeka.com/'},
 'tempo': {'link': 'https://www.tempo.co/'}}

In [27]:
dff = pd.DataFrame.from_dict(companies)
dff.info()
dff.head()

<class 'pandas.core.frame.DataFrame'>
Index: 1 entries, link to link
Data columns (total 7 columns):
cnn          1 non-null object
detik        1 non-null object
kompas       1 non-null object
liputan 6    1 non-null object
kapanlagi    1 non-null object
merdeka      1 non-null object
tempo        1 non-null object
dtypes: object(7)
memory usage: 64.0+ bytes


Unnamed: 0,cnn,detik,kompas,liputan 6,kapanlagi,merdeka,tempo
link,https://www.cnnindonesia.com/,https://www.detik.com/,https://www.kompas.com/,https://www.liputan6.co/,https://www.kapanlagi.com/,https://www.merdeka.com/,https://www.tempo.co/


In [28]:
# Iterate through each news company
for company, value in companies.items():
    # If a RSS link is provided in the JSON file, this will be the first choice.
    # Reason for this is that, RSS feeds often give more consistent and correct data.
    # If you do not want to scrape from the RSS-feed, just leave the RSS attr empty in the JSON file.
    if 'rss' in value:
        d = fp.parse(value['rss'])
        print("Downloading articles from ", company)
        newsPaper = {
            "rss": value['rss'],
            "link": value['link'],
            "articles": []
        }
        for entry in d.entries:
            # Check if publish date is provided, if no the article is skipped.
            # This is done to keep consistency in the data and to keep the script from crashing.
            if hasattr(entry, 'published'):
                if count > LIMIT:
                    break
                article = {}
                article['link'] = entry.link
                date = entry.published_parsed
                article['published'] = datetime.fromtimestamp(mktime(date)).isoformat()
                try:
                    content = Article(entry.link)
                    content.download()
                    content.parse()
                    content.nlp()
                except Exception as e:
                    # Iflen() the download for some reason fails (ex. 404) the script will continue downloading
                    # the next article.
                    print(e)
                    print("continuing...")
                    continue
                article['title'] = content.title
                article['text'] = content.text
                article['keywords']=stopword.remove(' '.join(content.keywords))
                article['summary']=content.summary
                newsPaper['articles'].append(article)
                print(count, "articles downloaded from", company, ", url: ", entry.link)
                count = count + 1
    else:
        # This is the fallback method if a RSS-feed link is not provided.
        # It uses the python newspaper library to extract articles
        print("Building site for ", company)
        paper = newspaper.build(value['link'], memoize_articles=False)
        newsPaper = {
            "link": value['link'],
            "articles": []
        }
        noneTypeCount = 0
        for content in paper.articles:
            if count > LIMIT:
                break
            try:
                content.download()
                content.parse()
                content.nlp()
            except Exception as e:
                print(e)
                print("continuing...")
                continue
            # Again, for consistency, if there is no found publish date the article will be skipped.
            # After 10 downloaded articles from the same newspaper without publish date, the company will be skipped.
            if content.publish_date is None:
                print(count, " Article has date of type None...")
                noneTypeCount = noneTypeCount + 1
                if noneTypeCount > 10:
                    print("Too many noneType dates, aborting...")
                    noneTypeCount = 0
                    break
                count = count + 1
                continue
            article = {}
            article['title'] = content.title
            article['text'] = content.text
            article['link'] = content.url
            article['published'] = content.publish_date.isoformat()
            article['keywords']=stopword.remove(' '.join(content.keywords))
            article['summary']=content.summary
            newsPaper['articles'].append(article)
            print(count, "articles downloaded from", company, " using newspaper, url: ", content.url)
            count = count + 1
            noneTypeCount = 0
    count = 1
    data['newspapers'][company] = newsPaper

Building site for  cnn
1 articles downloaded from cnn  using newspaper, url:  https://www.cnnindonesia.com/nasional/20200909143216-20-544456/update-corona-9-september-203342-positif-145200-sembuh
2 articles downloaded from cnn  using newspaper, url:  https://www.cnnindonesia.com/nasional/20200909150125-20-544480/55-rumah-sakit-rujukan-covid-19-di-bali-hampir-penuh
3 articles downloaded from cnn  using newspaper, url:  https://www.cnnindonesia.com/nasional/20200909143403-20-544460/pemkot-surabaya-minta-maaf-ada-logo-pdip-di-program-tv-guruku
4 articles downloaded from cnn  using newspaper, url:  https://www.cnnindonesia.com/internasional/20200909142950-113-544452/bom-bunuh-diri-sasar-wapres-afghanistan-10-warga-tewas
5 articles downloaded from cnn  using newspaper, url:  https://www.cnnindonesia.com/internasional/20200909132730-113-544410/kim-jong-un-desak-pemulihan-daerah-diterjang-topan-maysak
6 articles downloaded from cnn  using newspaper, url:  https://www.cnnindonesia.com/internas

48 articles downloaded from cnn  using newspaper, url:  https://www.cnnindonesia.com/gaya-hidup/20200908132622-277-543969/tanda-tanaman-hias-terlalu-banyak-disiram
49 articles downloaded from cnn  using newspaper, url:  https://www.cnnindonesia.com/gaya-hidup/20200907115853-277-543556/5-tanaman-berdaun-hitam-elegan-untuk-dekorasi-rumah
50 articles downloaded from cnn  using newspaper, url:  https://www.cnnindonesia.com/gaya-hidup/20200902161939-277-542053/tanaman-hias-termahal-rekor-di-selandia-baru-harga-rp77-juta
51 articles downloaded from cnn  using newspaper, url:  https://www.cnnindonesia.com/hiburan/20200724162414-225-528645/encounter-dan-drama-korea-yang-dibintangi-park-bo-gum
52 articles downloaded from cnn  using newspaper, url:  https://www.cnnindonesia.com/hiburan/20200831132117-220-541124/7-drama-korea-terbaru-tayang-september-2020
53 articles downloaded from cnn  using newspaper, url:  https://www.cnnindonesia.com/hiburan/20200724100012-225-528457/city-hunter-dan-drama-ko

34 articles downloaded from detik  using newspaper, url:  https://oto.detik.com/berita/d-5164320/kadishub-dki-kendaraan-listrik-jangan-anget-anget-tahi-ayam-seperti-cng
35 articles downloaded from detik  using newspaper, url:  https://oto.detik.com/berita/d-5163216/tertarik-ganti-mobil-hybrid-daftar-harganya-mulai-rp-400-jutaan
36 articles downloaded from detik  using newspaper, url:  https://oto.detik.com/berita/d-5156666/kalau-sepeda-listrik-dan-mobil-listrik-dikawinkan-begini-jadinya
37 articles downloaded from detik  using newspaper, url:  https://oto.detik.com/berita/d-5144596/royal-enfield-siap-ikutan-jual-motor-listrik
38 articles downloaded from detik  using newspaper, url:  https://oto.detik.com/oto-galeri/d-5165600/modifikasi-imut-imut-honda-super-cub
39 articles downloaded from detik  using newspaper, url:  https://oto.detik.com/oto-galeri/d-5164935/pajak-parkir-dki-jakarta-naik-jadi-30-di-tengah-pandemi
40 articles downloaded from detik  using newspaper, url:  https://oto.d

86 articles downloaded from detik  using newspaper, url:  https://hot.detik.com/celeb/d-5165848/ada-doa-abdul-somad-di-balik-rx-king-ade-jigo-ditukar-mobil-mewah
87 articles downloaded from detik  using newspaper, url:  https://wolipop.detik.com/home/d-5124760/hemat-beli-berbagai-bahan-pembersih-rumah-di-e-catalogue-transmart
88 articles downloaded from detik  using newspaper, url:  https://news.detik.com/berita/d-5116406/buat-momen-lebaran-lebih-seru-dengan-belanja-di-e-catalogue-transmart
89 articles downloaded from detik  using newspaper, url:  https://health.detik.com/berita-detikhealth/d-5049596/bunuh-virus-dari-luar-rumah-dengan-cuci-pakaian-hingga-bersih
90 articles downloaded from detik  using newspaper, url:  https://wolipop.detik.com/sale-and-shop/d-5049588/new-normal-3-barang-ini-wajib-dibawa-saat-kembali-bekerja
91 articles downloaded from detik  using newspaper, url:  https://health.detik.com/berita-detikhealth/d-5049576/hand-sanitizer-gel--spray-mana-yang-lebih-ampuh-cega

37 articles downloaded from kompas  using newspaper, url:  https://sorot.kompas.com/meikarta/read/2017/11/27/183200428/meikarta-solusi-mengurangi-beban-jakarta-dan-bandung
38 articles downloaded from kompas  using newspaper, url:  https://sorot.kompas.com/meikarta/read/2017/11/26/201700028/inilah-pilar-green-sustainable-living-meikarta-
39 articles downloaded from kompas  using newspaper, url:  https://sorot.kompas.com/meikarta/read/2017/11/27/181258328/meikarta-terobosan-lippo-di-tengah-mahal-dan-sulitnya-lahan-hunian
40 articles downloaded from kompas  using newspaper, url:  https://sorot.kompas.com/meikarta/read/2017/11/26/191200528/berkat-business-and-commercial-hub-meikarta-jadi-area-bisnis-baru
41 articles downloaded from kompas  using newspaper, url:  https://sorot.kompas.com/meikarta/read/2017/11/26/175850128/silicon-valley-ala-meikarta-jadi-kekuatan-cbd-baru-di-cikarang
42 articles downloaded from kompas  using newspaper, url:  https://sorot.kompas.com/meikarta/read/2017/11/25

83 articles downloaded from kompas  using newspaper, url:  https://foto.kompas.com/photo/read/2020/9/9/15996281775f5/1/krisis-lahan-pemakaman-covid-19-di-tpu-pondok-ranggon
84 articles downloaded from kompas  using newspaper, url:  https://foto.kompas.com/photo/read/2020/9/7/15994735082ac/1/cegah-penularan-rabies-hewan-peliharaan-di-jakarta-divaksin
85 articles downloaded from kompas  using newspaper, url:  https://foto.kompas.com/video/read/2020/9/4/84e615992361670f92b453/belajar-dari-rumah-meja-melayang-kok-bisa-tensegrity-bagian-2
86 articles downloaded from kompas  using newspaper, url:  https://foto.kompas.com/video/read/2020/8/31/1598864224bd7/sosialisasi-bahaya-covid-19-petugas-keliling-bawa-peti-jenazah
87 articles downloaded from kompas  using newspaper, url:  https://foto.kompas.com/video/read/2018/11/30/3a3d154356065976a59508/jaga-ketahanan-energi-pemerintah-optimalkan-eksplorasi-hulu-migas
88  Article has date of type None...
89 articles downloaded from kompas  using newspa

Traceback (most recent call last):
  File "/root/anaconda3/lib/python3.7/encodings/idna.py", line 165, in encode
    raise UnicodeError("label empty or too long")
UnicodeError: label empty or too long

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/root/anaconda3/lib/python3.7/site-packages/newspaper/mthreading.py", line 46, in run
    func(*args, **kargs)
  File "/root/anaconda3/lib/python3.7/site-packages/newspaper/network.py", line 108, in send
    self.timeout, self.useragent, self.proxies, self.headers))
  File "/root/anaconda3/lib/python3.7/site-packages/requests/api.py", line 75, in get
    return request('get', url, params=params, **kwargs)
  File "/root/anaconda3/lib/python3.7/site-packages/requests/api.py", line 60, in request
    return session.request(method=method, url=url, **kwargs)
  File "/root/anaconda3/lib/python3.7/site-packages/requests/sessions.py", line 533, in request
    resp = self.send(prep, **

1 articles downloaded from tempo  using newspaper, url:  https://fokus.tempo.co/read/1383986/menakar-ancaman-jerman-agar-rusia-ungkap-dalang-peracun-alexei-navalny
2 articles downloaded from tempo  using newspaper, url:  https://fokus.tempo.co/read/1383582/ancaman-kotak-kosong-di-pilkada-2020
3 articles downloaded from tempo  using newspaper, url:  https://seleb.tempo.co/read/1384039/pesannya-terus-diabaikan-nella-kharisma-inul-daratista-artis-daerah-rasa-diva
4 articles downloaded from tempo  using newspaper, url:  https://seleb.tempo.co/read/1383973/puji-acara-tv-edukatif-jepang-jerome-polin-bandingkan-dengan-indonesia
5 articles downloaded from tempo  using newspaper, url:  https://seleb.tempo.co/read/1384120/anya-geraldine-akui-punya-perasaan-terhadap-rizky-febian
6 articles downloaded from tempo  using newspaper, url:  https://gaya.tempo.co/read/1384099/kepribadian-bisa-dibaca-lewat-posisi-bercinta-anda-pilih-yang-mana
7 articles downloaded from tempo  using newspaper, url:  https

50 articles downloaded from tempo  using newspaper, url:  https://tekno.tempo.co/read/1384290/astrazeneca-setop-sementara-uji-coba-vaksin-covid-19-setelah-satu-relawan-sakit
51 articles downloaded from tempo  using newspaper, url:  https://tekno.tempo.co/read/1384244/apple-bikin-acara-khusus-pada-15-september-rilis-iphone-12
52 articles downloaded from tempo  using newspaper, url:  https://tekno.tempo.co/read/1384218/peneliti-hong-kong-sampel-tinja-bisa-dipakai-deteksi-covid-19
53 articles downloaded from tempo  using newspaper, url:  https://bola.tempo.co/read/1384403/dikaitkan-dengan-rumor-transfer-liverpool-thiago-alcantara-diberi-libur-latihan
54 articles downloaded from tempo  using newspaper, url:  https://bola.tempo.co/read/1384393/nomor-kostum-bergengsi-di-barcelona-pemiliknya-yang-bertahan-dan-pergi
55 articles downloaded from tempo  using newspaper, url:  https://bola.tempo.co/read/1384368/9-rumor-bursa-transfer-terkini-juventus-barcelona-ac-milan-roma-chelsea
56 articles dow

In [29]:
cc=0
for col in dff.columns:    
    try:
        if cc==0:
            df=pd.DataFrame(data["newspapers"][col]["articles"])
            cc=1
        else:
            df=df.append(data["newspapers"][col]["articles"],ignore_index = True)
    except Exception as e:
                print("e")


cnn
0
detik
1
kompas
1
liputan 6
1
e
kapanlagi
1
e
merdeka
1
e
tempo
1


In [31]:
# Finally it saves the articles as a JSON-file.
try:
    with open('scraped_articles.json', 'w') as outfile:
        json.dump(data, outfile)
except Exception as e: 
    print(e)

# Saves as csv
df.to_json('scraped_articles.json')

print("saved!")

saved!


In [32]:
df = pd.read_json("scraped_articles.json") 
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 357 entries, 0 to 99
Data columns (total 6 columns):
keywords     357 non-null object
link         357 non-null object
published    357 non-null object
summary      357 non-null object
text         357 non-null object
title        357 non-null object
dtypes: object(6)
memory usage: 19.5+ KB


Unnamed: 0,keywords,link,published,summary,text,title
0,positif covid19 dari update 203342 sembuh itu ...,https://www.cnnindonesia.com/nasional/20200909...,2020-09-09T15:35:05,"Jakarta, CNN Indonesia --Kasus positif virus c...","Jakarta, CNN Indonesia --\n\nKasus positif vir...","Update Corona 9 September: 203.342 Positif, 14..."
1,tempat covid19 tidur hampir rumah di kasus 55 ...,https://www.cnnindonesia.com/nasional/20200909...,2020-09-09T15:34:38,"Denpasar, CNN Indonesia --Kepala Dinas Kesehat...","Denpasar, CNN Indonesia --\n\nKepala Dinas Kes...",55 Rumah Sakit Rujukan Covid-19 di Bali Hampir...
10,riedl mantan berduka alfred timnas ditinggal d...,https://www.cnnindonesia.com/olahraga/20200909...,2020-09-09T13:56:47,"Jakarta, CNN Indonesia --Mantan pelatih timnas...","Jakarta, CNN Indonesia --\n\nMantan pelatih ti...",Mantan Pelatih Malaysia Berduka Ditinggal Alfr...
100,tentang tengah jakarta dki tahun 30 naik parki...,https://oto.detik.com/oto-galeri/d-5164935/paj...,2020-09-08T20:57:18,FOLLOW detikOtoFoto Oto Pajak Parkir DKI Jakar...,FOLLOW detikOto\n\nFoto Oto Pajak Parkir DKI J...,Pajak Parkir DKI Jakarta Naik Jadi 30% di Teng...
101,bemo milik dirombak ibu keliling membaca perpu...,https://oto.detik.com/oto-galeri/d-5164920/fot...,2020-09-08T19:41:20,Bemo milik Sutino ini kerap dinanti anak-anak ...,Bemo milik Sutino ini kerap dinanti anak-anak ...,Foto Bemo yang Dirombak Jadi Perpustakaan Keli...
