In [23]:
# Enable IPython's autoreload extension in "complete" mode so edited
# project modules are re-imported without restarting the kernel.
%load_ext autoreload
%autoreload complete

Getting time-series data for many words from MediaCloud. See also previous notebook on different MediaCloud approaches.


## Approach 1


In [12]:
from os import environ

from requests import get

from src.util.cache import get_cached

# Word matrix for one media source (media_id 69752), restricted to
# publish dates between 2020-01-01 and 2023-01-01.
word_matrix_params = {
    "q": "media_id:69752",
    "fq": "publish_date:[2020-01-01T00:00:00Z TO 2023-01-01T00:00:00Z]",
    "key": environ["MEDIACLOUD_API_KEY"],
}
response = get_cached(
    "https://api.mediacloud.org/api/v2/stories_public/word_matrix/",
    params=word_matrix_params,
    headers={"Accept": "application/json"},
)
result = response.json()

In [3]:
# Preview the first two entries of the raw word matrix.
list(result["word_matrix"].items())[:2]

[('1483914825',
  {'11417': 1,
   '12954': 1,
   '13035': 1,
   '14218': 1,
   '1460': 2,
   '14808': 1,
   '1563': 1,
   '157': 1,
   '18367': 1,
   '19429': 1,
   '19430': 1,
   '19431': 3,
   '19432': 1,
   '19433': 1,
   '19434': 1,
   '19435': 1,
   '19436': 2,
   '19437': 1,
   '19438': 2,
   '19439': 1,
   '19440': 1,
   '19441': 1,
   '19442': 1,
   '196': 1,
   '2762': 1,
   '2972': 3,
   '3135': 1,
   '3605': 1,
   '4167': 1,
   '4437': 1,
   '5539': 2,
   '6059': 1,
   '6442': 1,
   '6825': 1,
   '93': 1,
   '9973': 1}),
 ('1483945263',
  {'0': 1,
   '1': 1,
   '10': 1,
   '100': 3,
   '101': 4,
   '102': 1,
   '103': 1,
   '104': 1,
   '105': 1,
   '106': 1,
   '107': 1,
   '108': 1,
   '109': 1,
   '11': 1,
   '110': 1,
   '111': 1,
   '112': 5,
   '12': 1,
   '13': 1,
   '14': 1,
   '15': 1,
   '16': 1,
   '17': 1,
   '18': 1,
   '19': 1,
   '2': 1,
   '20': 1,
   '21': 1,
   '22': 2,
   '23': 1,
   '24': 1,
   '25': 1,
   '26': 1,
   '27': 1,
   '28': 1,
   '29': 1,
   '

In [4]:
from collections import Counter


def transform_list(list_, word_list=None):
    """Turn one word-matrix row into ``(word, count)`` pairs, most frequent first.

    Args:
        list_: Mapping from a word-list index (given as a string) to a count.
        word_list: Sequence whose entries hold the word at position 1.
            Defaults to the notebook-global ``result["word_list"]`` so
            existing single-argument calls keep working.

    Returns:
        A list of ``(word, count)`` tuples as produced by
        ``Counter.most_common()``.
    """
    if word_list is None:
        word_list = result["word_list"]

    counts = Counter()
    for index, count in list_.items():
        # Accumulate instead of assigning: if two indices resolve to the
        # same word, their counts are summed rather than overwritten.
        counts[word_list[int(index)][1]] += count
    return counts.most_common()


# Resolve the word counts of the first row in the word matrix.
first_row = list(result["word_matrix"].values())[0]
transform_list(first_row)

[('zoo', 3),
 ('tiere', 3),
 ('folgen', 2),
 ('silvesterfeuerwerke', 2),
 ('tierschutzbund', 2),
 ('brand', 2),
 ('negativen', 1),
 ('dramatischen', 1),
 ('verursacht', 1),
 ('tierschützer', 1),
 ('orten', 1),
 ('deutsche', 1),
 ('dpa', 1),
 ('bonn', 1),
 ('feuerwerkskörper', 1),
 ('belege', 1),
 ('sicherheitszonen', 1),
 ('tragödie', 1),
 ('böllerei', 1),
 ('schutzzonen', 1),
 ('lebten', 1),
 ('brückner', 1),
 ('krefeld', 1),
 ('unkontrollierte', 1),
 ('schrecklich', 1),
 ('deutschen', 1),
 ('private', 1),
 ('kritisiert', 1),
 ('landwirtschaftliche', 1),
 ('james', 1),
 ('betrieb', 1),
 ('gefordert', 1),
 ('tierheim', 1),
 ('fordern', 1),
 ('min', 1),
 ('untersagt', 1)]

In [5]:
def transform_matrix(matrix):
    """Resolve every row of a word matrix into ranked ``(word, count)`` pairs.

    Args:
        matrix: Mapping from a row key to that row's index-to-count mapping.
            NOTE(review): judging by the displayed output the keys look
            like story ids rather than media ids — confirm.

    Returns:
        A dict with the same keys as ``matrix``, each mapped to the result
        of ``transform_list`` for that row.
    """
    # Iterate over the argument itself. Previously the parameter was ignored
    # and the global ``result["word_matrix"]`` was always used instead.
    return {
        stories_id: transform_list(row)
        for stories_id, row in matrix.items()
    }


# Resolve the whole matrix and preview its first two entries.
word_matrix = transform_matrix(result["word_matrix"])
list(word_matrix.items())[:2]

[('1483914825',
  [('zoo', 3),
   ('tiere', 3),
   ('folgen', 2),
   ('silvesterfeuerwerke', 2),
   ('tierschutzbund', 2),
   ('brand', 2),
   ('negativen', 1),
   ('dramatischen', 1),
   ('verursacht', 1),
   ('tierschützer', 1),
   ('orten', 1),
   ('deutsche', 1),
   ('dpa', 1),
   ('bonn', 1),
   ('feuerwerkskörper', 1),
   ('belege', 1),
   ('sicherheitszonen', 1),
   ('tragödie', 1),
   ('böllerei', 1),
   ('schutzzonen', 1),
   ('lebten', 1),
   ('brückner', 1),
   ('krefeld', 1),
   ('unkontrollierte', 1),
   ('schrecklich', 1),
   ('deutschen', 1),
   ('private', 1),
   ('kritisiert', 1),
   ('landwirtschaftliche', 1),
   ('james', 1),
   ('betrieb', 1),
   ('gefordert', 1),
   ('tierheim', 1),
   ('fordern', 1),
   ('min', 1),
   ('untersagt', 1)]),
 ('1483945263',
  [('eis', 5),
   ('eisfläche', 4),
   ('dlrg', 4),
   ('betreten', 4),
   ('mönchsdeggingen', 3),
   ('warnt', 3),
   ('tragfähige', 2),
   ('eingebrochenen', 2),
   ('sicher', 2),
   ('minuten', 2),
   ('verunfal

In [6]:
# Size of the word list and number of rows in the word matrix.
len(result["word_list"]), len(result["word_matrix"])

(55070, 1718)

In [7]:
from src.data.news.config import region_tags

# Display the configured mapping from region name to tag id.
region_tags

{'germany_national': 34412409,
 'germany_regional': 38379816,
 'baden-württemberg': 262985085,
 'bayern': 38379825,
 'berlin': 38379823,
 'brandenburg': 38379827,
 'bremen': 38379829,
 'hamburg': 38379819,
 'hessen': 38379821,
 'mecklenburg-vorpommern': 262985084,
 'niedersachsen': 38379831,
 'nordrhein-westfalen': 38379817,
 'rheinland-pfalz': 38379833,
 'saarland': 38379835,
 'sachsen': 38379837,
 'sachsen-anhalt': 38379839,
 'schleswig-holstein': 38379841,
 'thüringen': 38379843}

In [15]:
# Word matrix for all media tagged with the Berlin region tag, restricted
# to a single day (2020-01-04).
response = get(
    "https://api.mediacloud.org/api/v2/stories_public/word_matrix/",
    params={
        "q": f"tags_id_media:{region_tags['berlin']}",
        "fq": "publish_date:[2020-01-04T00:00:00Z TO 2020-01-05T00:00:00Z]",
        "key": environ["MEDIACLOUD_API_KEY"],
    },
    headers={"Accept": "application/json"},
)
# Fail right here on an HTTP error instead of with a confusing KeyError
# when later cells index into the parsed payload.
response.raise_for_status()
result = response.json()

In [14]:
# Pull both parts out of the response and report their sizes.
word_list, word_matrix = (result[key] for key in ("word_list", "word_matrix"))
(len(word_list), len(word_matrix))

(0, 0)

This is nice, but it would take many requests:


In [10]:
# Estimated number of requests needed for full coverage.
# NOTE(review): presumably 16 regions x 365 days x 4 years (2019-2022),
# i.e. one request per region and day — confirm.
16 * 365 * (2022 - 2019 + 1)

23360

The rate limit is 10,000 requests, so it would take 2 weeks to get the complete data. How long would the requests themselves take, roughly?


In [16]:
# Rough wall-clock estimate for the full download, expressed in days.
# NOTE(review): the factor 20 looks like an assumed ~20 s per request — confirm.
request_count = 16 * 365 * (2022 - 2019 + 1)
seconds = request_count * 20
hours = seconds / 3600
days = hours / 24
days

5.407407407407407

That's a lot, but might be feasible. Also, regions other than Berlin are probably smaller and faster.


In [17]:
# Word matrix for all media tagged with the Niedersachsen region tag,
# restricted to a single day (2020-01-04).
response = get(
    "https://api.mediacloud.org/api/v2/stories_public/word_matrix/",
    params={
        "q": f"tags_id_media:{region_tags['niedersachsen']}",
        "fq": "publish_date:[2020-01-04T00:00:00Z TO 2020-01-05T00:00:00Z]",
        "key": environ["MEDIACLOUD_API_KEY"],
    },
    headers={"Accept": "application/json"},
)
# Fail right here on an HTTP error instead of with a confusing KeyError
# when later cells index into the parsed payload.
response.raise_for_status()
result = response.json()

In [None]:
# Pull both parts out of the response and report their sizes.
word_list, word_matrix = (result[key] for key in ("word_list", "word_matrix"))
(len(word_list), len(word_matrix))

In [20]:
# Word matrix for all media tagged with the Hessen region tag, restricted
# to a single day (2020-01-04), this time with an explicit "rows" value.
response = get(
    "https://api.mediacloud.org/api/v2/stories_public/word_matrix/",
    params={
        "q": f"tags_id_media:{region_tags['hessen']}",
        "fq": "publish_date:[2020-01-04T00:00:00Z TO 2020-01-05T00:00:00Z]",
        "rows": 100_000,
        "key": environ["MEDIACLOUD_API_KEY"],
    },
    headers={"Accept": "application/json"},
)
# Fail right here on an HTTP error instead of with a confusing KeyError
# when later cells index into the parsed payload.
response.raise_for_status()
result = response.json()

In [21]:
# Pull both parts out of the response and report their sizes.
word_list, word_matrix = (result[key] for key in ("word_list", "word_matrix"))
(len(word_list), len(word_matrix))

(20867, 1129)

That took rather long ...


## Approach 2


In [8]:
from datetime import date
from os import environ

# NOTE(review): the first code cell of this notebook imports get_cached from
# src.util.cache, while this cell uses src.cache — confirm which path is current.
from src.cache import get_cached
from src.data.news.config import region_tags
from src.data.news.sources.mediacloud import search

# First page (up to 1000 rows) of Berlin stories for January 2020,
# requested with per-story word counts ("wc": 1).
story_list_params = {
    "last_processed_stories_id": 0,
    "rows": 1000,
    "q": f"tags_id_media:{region_tags['berlin']}",
    "fq": "publish_date:[2020-01-01T00:00:00Z TO 2020-02-01T00:00:00Z]",
    "wc": 1,
    "key": environ["MEDIACLOUD_API_KEY"],
}
response = get_cached(
    "https://api.mediacloud.org/api/v2/stories_public/list/",
    params=story_list_params,
    headers={"Accept": "application/json"},
)
result = response.json()

In [12]:
# Number of returned stories, plus the second story as a sample record.
len(result), result[1]

(1000,
 {'ap_syndicated': False,
  'collect_date': '2020-01-01 01:03:48.026433',
  'feeds': None,
  'guid': 'https://www.rbb24.de/kultur/berlinale/beitraege/2020/gruetters-berlinale-jubilaeum-70-glamour-berlin-filmfestival.html',
  'language': 'de',
  'media_id': 385507,
  'media_name': 'radioBERLIN 88,8',
  'media_url': 'http://www.radioberlin.de/',
  'processed_stories_id': 1880312907,
  'publish_date': '2020-01-01 01:03:46',
  'stories_id': 1483758545,
  'story_tags': [],
  'title': 'Gr&#252;tters: Berlinale geht nicht ohne "Glamourfaktor"',
  'url': 'https://www.rbb24.de/kultur/berlinale/beitraege/2020/gruetters-berlinale-jubilaeum-70-glamour-berlin-filmfestival.html',
  'word_count': [{'count': 5, 'stem': 'festiv', 'term': 'festival'},
   {'count': 1, 'stem': 'sieht', 'term': 'sieht'},
   {'count': 1, 'stem': 'zusätzlichen', 'term': 'zusätzlichen'},
   {'count': 1, 'stem': 'ungewöhnlicher', 'term': 'ungewöhnlichere'},
   {'count': 2, 'stem': 'zudem', 'term': 'zudem'},
   {'count':

In [15]:
# Distinct publish days (first 10 characters of publish_date) in this page.
{a["publish_date"][:10] for a in result}

{'2020-01-01', '2020-01-02', '2020-01-03'}

In [19]:
# Same story-list query for Hessen, then the distinct publish days that
# appear in the first page of up to 1000 rows.
story_list_params = {
    "last_processed_stories_id": 0,
    "rows": 1000,
    "q": f"tags_id_media:{region_tags['hessen']}",
    "fq": "publish_date:[2020-01-01T00:00:00Z TO 2020-02-01T00:00:00Z]",
    "wc": 1,
    "key": environ["MEDIACLOUD_API_KEY"],
}
response = get_cached(
    "https://api.mediacloud.org/api/v2/stories_public/list/",
    params=story_list_params,
    headers={"Accept": "application/json"},
)
result = response.json()
# Skip entries without a usable publish_date before slicing out the day.
set(
    story["publish_date"][:10]
    for story in result
    if "publish_date" in story and story["publish_date"]
)

{'2020-01-01', '2020-01-02'}

In [21]:
# Same story-list query for Baden-Württemberg, then the distinct publish
# days that appear in the first page of up to 1000 rows.
story_list_params = {
    "last_processed_stories_id": 0,
    "rows": 1000,
    "q": f"tags_id_media:{region_tags['baden-württemberg']}",
    "fq": "publish_date:[2020-01-01T00:00:00Z TO 2020-02-01T00:00:00Z]",
    "wc": 1,
    "key": environ["MEDIACLOUD_API_KEY"],
}
response = get_cached(
    "https://api.mediacloud.org/api/v2/stories_public/list/",
    params=story_list_params,
    headers={"Accept": "application/json"},
)
result = response.json()
# Skip entries without a usable publish_date before slicing out the day.
set(
    story["publish_date"][:10]
    for story in result
    if "publish_date" in story and story["publish_date"]
)

{'2020-01-01',
 '2020-01-02',
 '2020-01-03',
 '2020-01-04',
 '2020-01-05',
 '2020-01-06',
 '2020-01-07',
 '2020-01-08',
 '2020-01-09',
 '2020-01-10',
 '2020-01-11',
 '2020-01-12',
 '2020-01-13',
 '2020-01-14',
 '2020-01-15',
 '2020-01-16',
 '2020-01-17',
 '2020-01-18',
 '2020-01-19',
 '2020-01-20',
 '2020-01-21',
 '2020-01-22',
 '2020-01-23',
 '2020-01-24',
 '2020-01-25',
 '2020-01-26',
 '2020-01-27',
 '2020-01-29',
 '2020-01-30',
 '2020-01-31',
 '2020-02-01'}

In [22]:
# First page (up to 1000 rows) of Berlin stories for December 2022, to see
# how far into the month a single page reaches.
response = get_cached(
    "https://api.mediacloud.org/api/v2/stories_public/list/",
    params={
        "last_processed_stories_id": 0,
        "rows": 1000,
        "q": f"tags_id_media:{region_tags['berlin']}",
        "fq": "publish_date:[2022-12-01T00:00:00Z TO 2023-01-01T00:00:00Z]",
        "wc": 1,
        "key": environ["MEDIACLOUD_API_KEY"],
    },
    headers={"Accept": "application/json"},
)
result = response.json()
# Latest publish timestamp in the page. The set of publish days that used to
# be built before this line was never displayed or used, so it was dropped.
max(
    story["publish_date"]
    for story in result
    if "publish_date" in story and story["publish_date"]
)

'2022-12-01 07:50:03'

For some regions, this approach saves a lot of time; for others, it's much slower ...