This notebook generates all the data and writes it to CSV files, rather than relying on the cached custom functions. This reduces complexity a bit.


In [1]:
%load_ext autoreload
%autoreload complete

In [2]:
from protest_impact.data.protests import get_climate_protests, get_climate_queries

# Mapping keyed by discourse type. Each value is indexed as a pair below:
# [0] is the query string and [1] is a callable that is applied to a text.
queries = get_climate_queries()

In [3]:
# Smoke test: for every query, show its query string followed by the result of
# its matcher callable on three example sentences.
example_sentences = (
    "Das Klima ist angenehm.",
    "Schon wieder Klimaproteste.",
    "Die Proteste sind vorbei.",
)
for entry in queries.values():
    print(entry[0])
    matcher = entry[1]
    for sentence in example_sentences:
        print(matcher(sentence))

(klima OR klimawandel OR erderwärmung OR klimaschutz OR klimagerechtigkeit OR "hambacher forst" OR hambi OR lütz* OR klimakrise OR klimakatastrophe OR klimakollaps OR klimanotstand OR klimagerechtigkeit OR klimaneutralität OR klimaneutral OR klimaziel OR klimaschutzpaket OR klimaschutzgesetz* OR kohleausstieg OR "erneuerbare energie*" OR bürgerrat OR bürgerräte) AND (protest* OR demo OR demonstr* OR kundgebung OR versamm* OR "soziale bewegung" OR hausbesetz* OR streik* OR unterschriften* OR petition OR hasskriminalität OR unruhen OR aufruhr OR aufstand OR rebell* OR blockade OR blockier* OR sitzblock* OR boykott* OR riot OR aktivis* OR bürgerinitiative OR bürgerbegehren OR marsch OR aufmarsch OR parade OR mahnwache OR hungerstreik OR "ziviler ungehorsam" OR "fridays for future" OR fridaysforfuture OR fridays4future OR "extinction rebellion" OR "just stop oil" OR "letzte generation" OR "ultima generazione" OR "ende gelände" OR klimabewegung OR klimaaktivis* OR klimastreik*)
False
False


In [4]:
import json
from datetime import date
from itertools import product

import numpy as np
import pandas as pd
from joblib.hashing import hash
from protest_impact.data.news.sources.dereko import get_scraped_entries
from protest_impact.synthetic_region import (
    filter_regions,
    get_regional_counts_for_protest,
)
from protest_impact.util import project_root
from tqdm.notebook import tqdm

# Fixed seed and the random generator derived from it.
SEED = 20230429
rng = np.random.default_rng(SEED)

# Scraped entries keyed by discourse type. Only "climate" is loaded here; the
# other two discourse types ("climate_and_protest", "climate_not_protest") are
# currently switched off.
dereko_entries = {}
for t in ("climate",):
    dereko_entries[t] = get_scraped_entries(discourse_type=t)


def get_data(parameters):
    """Build and persist the regional time series for all protests matching ``parameters``.

    For every protest event returned by ``get_climate_protests`` the regional
    counts are computed with ``get_regional_counts_for_protest`` and filtered
    with ``filter_regions``. Events for which either step returns ``None`` are
    skipped. For every remaining event two files are written to
    ``data/synthetic-region/time-series`` below the project root:

    * ``<hash>.json`` -- the parameters merged with the event fields,
    * ``<hash>_<prefix>.csv`` -- the filtered counts, where ``<prefix>`` is the
      discourse type without its last eight characters.

    Args:
        parameters: dict with one concrete value for each of the keys
            ``start_date``, ``end_date``, ``protest_group``, ``source``,
            ``discourse_type`` and ``prediction_interval``.

    Returns:
        A list with one dict per written event: the parameters and event
        fields (date values converted to ISO strings) plus ``hash`` and
        ``csv`` (the path of the CSV file).

    Raises:
        KeyError: if ``parameters["discourse_type"]`` is not a key of
            ``queries``.
        TypeError: if an event has a field named like one of the parameters
            (the two dicts are merged as keyword arguments).
    """
    p = type(
        "Parameters", (object,), parameters
    )  # convert to object for better legibility
    protests = get_climate_protests(
        start_date=p.start_date, end_date=p.end_date, groups=[p.protest_group]
    )
    query = queries[p.discourse_type]
    events = protests.to_dict(orient="records")

    # The module-level preload may cover only some discourse types. Load a
    # missing one on first use instead of failing with a KeyError in the middle
    # of a run.
    # NOTE(review): assumes get_scraped_entries accepts every discourse type
    # that is a key of `queries` -- confirm.
    if events and p.discourse_type not in dereko_entries:
        dereko_entries[p.discourse_type] = get_scraped_entries(
            discourse_type=p.discourse_type
        )

    out_dir = project_root / "data" / "synthetic-region" / "time-series"
    # Create the target directory up front so the first write cannot fail on a
    # fresh checkout.
    out_dir.mkdir(parents=True, exist_ok=True)

    args_list = []
    for event in tqdm(events):
        df = get_regional_counts_for_protest(
            query_str=query[0],
            query_func=query[1],
            region=event["admin1"],
            event_date=event["event_date"],
            source=p.source,
            n_days_train=None,
            n_days_predict=p.prediction_interval,
            dereko_entries=dereko_entries[p.discourse_type],
        )
        if df is None:
            continue
        df = filter_regions(
            df=df,
            region=event["admin1"],
            event_date=event["event_date"],
            reference_events=protests,
            n_days_protest_free_pre=1,
            n_days_protest_free_post=1,
            min_control_regions=0,
            min_count=0,
        )
        if df is None:
            continue

        args = dict(**parameters, **event)
        # `hash` is the joblib hash imported at the top of the notebook, not the
        # builtin. discourse_type is left out, so all discourse types of one
        # event share the same hash and therefore the same JSON file; the CSV
        # name carries the discourse type instead.
        h = hash([v for k, v in args.items() if k != "discourse_type"])
        json_path = out_dir / f"{h}.json"
        # [:-8] drops the last eight characters: "climate_and_protest" ->
        # "climate_and", "climate_not_protest" -> "climate_not", and "climate"
        # -> "" (file name "<hash>_.csv").
        # NOTE(review): the empty prefix for "climate" looks accidental, but the
        # names are kept as they are because readers of these files may rely on
        # them -- confirm.
        csv_path = out_dir / f"{h}_{p.discourse_type[:-8]}.csv"

        # Build the JSON-friendly record as a new dict instead of rewriting
        # `args` while iterating over it.
        record = {
            k: (v.isoformat() if isinstance(v, date) else v) for k, v in args.items()
        }
        with open(json_path, "w") as f:
            json.dump(record, f)
        df.to_csv(csv_path)
        args_list.append(dict(**record, hash=h, csv=csv_path))
    return args_list


# Parameter grid: each key lists the values to try, and every combination of
# these values is passed to get_data once.
parameters = {
    "start_date": [date(2020, 1, 1)],
    "end_date": [date(2022, 12, 31)],
    "protest_group": ["fff", "alg", "xr", "eg"],
    "source": ["dereko_scrape"],  # , "mediacloud"
    "discourse_type": ["climate", "climate_not_protest", "climate_and_protest"],
    "prediction_interval": [28],
}

args_list = []  # one record per written event, across all combinations
evaluated_parameters = []  # every combination that was started
parameter_names = list(parameters.keys())
for values in tqdm(list(product(*parameters.values()))):
    combination = dict(zip(parameter_names, values))
    # Record the combination before running it, so that an interrupted run
    # still shows which combination was in progress.
    evaluated_parameters.append(combination)
    args_list.extend(get_data(combination))

df = pd.DataFrame(evaluated_parameters)
df

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/466 [00:00<?, ?it/s]

  0%|          | 0/466 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [7]:
# Collect the per-event records into one table and store it as the overview
# file next to the time-series directory.
overview_path = project_root / "data" / "synthetic-region" / "overview.csv"
df = pd.DataFrame(args_list)
df.to_csv(overview_path, index=False)
df.head()

Unnamed: 0,start_date,end_date,protest_group,source,discourse_type,co_terms,prediction_interval,event_date,sub_event_type,assoc_actor_1,admin1,location,notes,weekday,region_code,size,hash,csv
0,2020-01-01,2022-12-31,fff,dereko_scrape,climate,,28,2022-11-05T00:00:00,Peaceful protest,FFF: Fridays for Future; Students (Germany),Sachsen,Dresden,"On 5 November 2022, members of FFF (including ...",Saturday,SN,,fddd78e20627fe6c2e8925b266bbe89f,/Users/david/Repositories/protest-impact/data/...
1,2020-01-01,2022-12-31,fff,dereko_scrape,climate,,28,2022-10-12T00:00:00,Peaceful protest,FFF: Fridays for Future; Students (Germany),Nordrhein-Westfalen,Euskirchen,"Around 12 October 2022 (as reported), more tha...",Wednesday,NW,100.0,ab21bbb239a63d784f94eb12098d6e4f,/Users/david/Repositories/protest-impact/data/...
2,2020-01-01,2022-12-31,fff,dereko_scrape,climate,,28,2022-10-07T00:00:00,Peaceful protest,FFF: Fridays for Future; Students (Germany),Nordrhein-Westfalen,Aachen,"On 7 October 2022, about 50 activists affiliat...",Friday,NW,50.0,d18b35a7667f5c5645ac77b7f2115338,/Users/david/Repositories/protest-impact/data/...
3,2020-01-01,2022-12-31,fff,dereko_scrape,climate,,28,2022-10-07T00:00:00,Peaceful protest,FFF: Fridays for Future; Students (Germany),Berlin,Berlin,"On 7 October 2022, activists affiliated with t...",Friday,BE,,46c419fd32d7883f292cbcb2837109bd,/Users/david/Repositories/protest-impact/data/...
4,2020-01-01,2022-12-31,fff,dereko_scrape,climate,,28,2022-10-07T00:00:00,Peaceful protest,FFF: Fridays for Future; Students (Germany),Sachsen,Dresden,"On 7 October 2022, activists affiliated with t...",Friday,SN,,2c55a6c8f4c60db74ed102be264c5d8a,/Users/david/Repositories/protest-impact/data/...
