In [1]:
import json

from protest_impact.data.news.download import get_protest_article_metadata
from protest_impact.util import project_root

# Read the list of scrapable newspapers from the project's data directory.
with open(project_root / "data" / "news" / "scrapable_mediacloud_newspapers.json") as f:
    scrapable_newspapers = json.loads(f.read())

In [18]:
import spacy

from protest_impact.data.protests.config import search_regex

# Load the small German spaCy model with the listed components disabled, then add
# the "sentencizer" component; `kwic` below only needs `doc.sents`.
# NOTE(review): "tokenizer" is presumably not a pipeline component name (the
# tokenizer is not part of the component list) — confirm whether it can be
# dropped from `disable`.
nlp = spacy.load("de_core_news_sm", disable=["parser", "tagger", "ner", "tokenizer"])
nlp.add_pipe("sentencizer")


def kwic(text):
    """Return a keyword-in-context excerpt of ``text``.

    The text is split into sentences with the module-level ``nlp`` pipeline.
    Every sentence matching ``search_regex`` is kept together with its direct
    neighbours (one sentence before, one after).

    Formatting of the result:
      * consecutive kept sentences are joined with a single space
        (sentence spans carry no trailing whitespace, so plain concatenation
        would glue them together);
      * a gap of skipped sentences is marked with ``"\\n...\\n"``, including a
        leading marker when the excerpt does not start at the first sentence.

    Args:
        text: The full article text.

    Returns:
        The excerpt as a string; ``""`` when no sentence matches.
    """
    sents = list(nlp(text).sents)

    # Indices of matching sentences plus their neighbours, limited to valid indices.
    selected = set()
    for i, sent in enumerate(sents):
        if search_regex.search(sent.text):
            selected.update(j for j in (i - 1, i, i + 1) if 0 <= j < len(sents))

    pieces = []
    for nr in sorted(selected):
        if nr - 1 in selected:
            # Continues the previous kept sentence: restore the separating space.
            pieces.append(" ")
        elif nr > 0:
            # Something was skipped before this sentence.
            pieces.append("\n...\n")
        pieces.append(sents[nr].text)
    return "".join(pieces)

In [2]:
# newspapers that are too slow to practically scrape
# 131441 braunschweiger-zeitung.de
# 346325 mittelbayerische.de
# 385456 Salzgitter Zeitung - Salzgitter Zeitung
# 385468 Wolfsburger Nachrichten - Wolfsburger Nachrichten
# 385543 http://www.wz.de/home
# 385588 page | MDR.DE

# media_ids of the outlets listed in the comment above; these are left out.
slow_newspapers = [131441, 346325, 385456, 385468, 385543, 385588]
# Keep every scrapable outlet whose media_id is not on the slow list.
scraped_newspapers = list(
    filter(
        lambda outlet: outlet["media_id"] not in slow_newspapers,
        scrapable_newspapers,
    )
)

In [4]:
# Persist the filtered newspaper list as pretty-printed JSON (version 1).
with open(
    project_root / "data" / "news" / "scrapable_mediacloud_newspapers_v1.json",
    mode="w",
) as f:
    f.write(json.dumps(scraped_newspapers, indent=2))

In [22]:
import dataclasses
import re
from collections import Counter

import jsonlines
from dateutil import parser
from tqdm import tqdm

from protest_impact.types import NewsItem
from protest_impact.util import fulltext_path, website_name

# Export all downloaded protest articles of the selected newspapers to one
# JSON Lines file, one record per article:
#   {"text": <cleaned fulltext>, "meta": {date, title, url, homepage, ...}}
#
# Per newspaper the work is done in two steps:
#   1. load every available fulltext once and count how often each stripped
#      line occurs across the newspaper's articles;
#   2. drop lines that occur very often (site boilerplate such as navigation
#      or share buttons), re-flow the text and write the record.

# A newline that is neither preceded nor followed by another newline, i.e. a
# line break inside a paragraph. Blank-line paragraph breaks are left alone.
single_newline = re.compile(r"(?<!\n)\n(?!\n)")

# Records are written with ensure_ascii=False, so pin the encoding instead of
# relying on the platform default; JSON Lines is UTF-8 by convention.
with open(
    project_root / "data" / "news" / "protest_news.jsonl", "w", encoding="utf-8"
) as writer:
    for i, newspaper in enumerate(scraped_newspapers):
        print(i, newspaper["name"])
        articles = list(get_protest_article_metadata(newspaper["media_id"]))
        print("\t", len(articles))

        # Step 1: read each fulltext exactly once and keep it for step 2.
        fulltexts = []
        line_counts = Counter()
        missing = 0
        for article in articles:
            # Articles without a publish date are skipped (not counted as missing).
            if article["publish_date"] is None:
                continue
            # Metadata-only item; it is only used to locate the fulltext file.
            # NOTE(review): the 4th positional argument is presumably the
            # content field — confirm against the NewsItem definition.
            article = NewsItem(
                parser.parse(article["publish_date"]),
                article["url"],
                article["title"],
                None,
            )
            path = fulltext_path(article)
            if not path.exists():
                missing += 1
                continue
            with open(path) as reader:
                article_with_fulltext = NewsItem.from_str(reader.read())
            fulltexts.append(article_with_fulltext)
            for line in article_with_fulltext.content.split("\n"):
                stripped = line.strip()
                # Empty lines and the "* * *" separator never count as boilerplate.
                if stripped not in ["", "* * *"]:
                    line_counts[stripped] += 1

        # Boilerplate: among the 100 most frequent lines, those occurring more
        # often than half the number of articles.
        # NOTE(review): the threshold uses len(articles), which also includes
        # articles skipped above (no date / no fulltext) — confirm intended.
        common_lines = [
            line
            for line, count in line_counts.most_common(100)
            if count > len(articles) * 0.5
        ]
        print("\t", common_lines)
        common_line_set = set(common_lines)

        # Step 2: clean and write, in the same order the articles were loaded.
        for article_with_fulltext in fulltexts:
            content = "\n".join(
                stripped
                for stripped in map(
                    str.strip, article_with_fulltext.content.split("\n")
                )
                if stripped not in common_line_set
            )
            # Join hard-wrapped lines of a paragraph into one line.
            content = single_newline.sub(" ", content)
            data_item = {
                "text": content,
                "meta": {
                    "date": article_with_fulltext.date.isoformat(),
                    "title": article_with_fulltext.title,
                    "url": article_with_fulltext.url,
                    "homepage": website_name(article_with_fulltext.url),
                    "crawl_engine": "mediacloud",
                    "crawl_query": "protest",
                },
            }
            writer.write(json.dumps(data_item, ensure_ascii=False) + "\n")
        print(f"\t{missing} missing out of {len(articles)}")

0 hna
	 799
	 ['1. Startseite', 'Teilen']
	0 missing out of 799
1 tz.de
	 9461
	 ['1. tz', 'Teilen']
	0 missing out of 9461
2 merkur-online
	 810
	 ['1. Startseite', 'Teilen', '2. Lokales']
	0 missing out of 810
3 Christ & Welt: ZEIT für Glaube, Geist und Gesellschaft
	 7556
	 ['## Vielen Dank! Wir haben Ihnen eine E-Mail geschickt.', 'Prüfen Sie Ihr Postfach und bestätigen Sie das Newsletter-Abonnement.', 'Mit Ihrer Registrierung nehmen Sie die Datenschutzerklärung zur Kenntnis.', 'Newsletter']
	0 missing out of 7556
4 wn.de
	 21520
	 ['Startseite']
	0 missing out of 21520
5 augsburger-allgemeine.de
	 5828
	 []
	0 missing out of 5828
6 kreiszeitung.de
	 3855
	 ['1. Startseite', 'Teilen']
	0 missing out of 3855
7 op-online.de
	 9369
	 ['1. Startseite', 'Teilen']
	0 missing out of 9369
8 Oberpfalznetz
	 10760
	 []
	260 missing out of 10760
9 Nachrichten aus Amberg - Onetz
	 10548
	 []
	271 missing out of 10548
10 page - Solinger Tageblatt
	 2845
	 []
	0 missing out of 2845
11 Nachrichte

In [24]:
# shuffle lines
import random

# The JSONL records contain non-ASCII text; read and write them as UTF-8
# explicitly instead of relying on the platform default encoding.
with open(
    project_root / "data" / "news" / "protest_news.jsonl", encoding="utf-8"
) as reader:
    lines = reader.readlines()

# Fixed seed so the shuffled order is reproducible across runs.
random.seed(2023020302)
random.shuffle(lines)
with open(
    project_root / "data" / "news" / "protest_news_shuffled.jsonl",
    "w",
    encoding="utf-8",
) as writer:
    writer.writelines(lines)

In [35]:
from protest_impact.data.protests.config import search_regex

# Display the lower-cased source pattern of the protest keyword regex.
search_regex.pattern.lower()

'.*protest.*|versammlung.*|demonstr.*|kundgebung.*|kampagne.*|soziale bewegung.*|hausbesetzung.*|streik.*|unterschriftensammlung.*|hasskriminalität.*|unruhen.*|aufruhr.*|aufstand.*|boykott.*|riot.*|aktivis.*|widerstand.*|mobilisierung.*|bürgerinitiative.*|bürgerbegehren.*'

In [32]:
# Start a Prodigy `textcat.manual` session on the exported articles
# (dataset "protest_news_test", labels "relevant" and "irrelevant").
# NOTE(review): this reads protest_news.jsonl rather than
# protest_news_shuffled.jsonl — confirm the unshuffled file is intended.
!prodigy textcat.manual protest_news_test ../../data/news/protest_news.jsonl --label relevant,irrelevant

Using 2 label(s): relevant, irrelevant

✨  Starting the web server at http://localhost:8080 ...
Open the app in your browser and start annotating!

^C
