In [41]:
%load_ext autoreload
%autoreload complete
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from tqdm.notebook import tqdm

from protest_impact.util import cache, project_root

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [42]:
from protest_impact.data.protests import get_climate_protests

# Load the protest events of the two climate groups under study; the actor
# column of the result labels them "FFF" and "ALG".
climate_groups = ["fff", "alg"]
protests = get_climate_protests(groups=climate_groups)
protests.head()

Unnamed: 0,event_date,sub_event_type,assoc_actor_1,admin1,location,notes,weekday,region_code,size,n_protests,largest_protest_size,has_mixed_groups
0,2020-01-10,Peaceful protest,FFF,Bayern,Munchen,"On 10 January 2020, about 500 FFF activists an...",Friday,BY,550.0,5,500.0,False
1,2020-01-13,Peaceful protest,FFF,Bayern,Munchen,"On 13 January 2020, about 150 people, includin...",Monday,BY,150.0,2,150.0,False
2,2020-01-18,Peaceful protest,FFF,Bayern,Bad Neustadt an der Saale,"On 18 January 2020, about 25 people, including...",Saturday,BY,25.0,1,25.0,False
3,2020-01-24,Peaceful protest,FFF,Bayern,Bad Tolz,"On 24 January 2020, dozens of FFF activists an...",Friday,BY,50.0,1,50.0,False
4,2020-01-31,Peaceful protest,FFF,Bayern,Puchheim,"On 31 January 2020, FFF activists and students...",Friday,BY,0.0,1,,False


In [43]:
# Collapse the events to one row per (day, state, actor), keeping the summed
# size and the number of rows that carry a non-null size.
# NOTE(review): "count" counts input rows, while the input already has an
# `n_protests` column (e.g. 5 for a single row) -- confirm that the row count
# is the intended "number of protests".
group_keys = ["event_date", "admin1", "assoc_actor_1"]
size_stats = protests.groupby(group_keys).agg({"size": ["sum", "count"]})
# The aggregation yields two-level column labels ("size", "sum") and
# ("size", "count"); join each pair into "size_sum" / "size_count".
size_stats.columns = list(map("_".join, size_stats.columns))
protests = size_stats.reset_index()
protests.head()

Unnamed: 0,event_date,admin1,assoc_actor_1,size_sum,size_count
0,2020-01-02,Nordrhein-Westfalen,FFF,100.0,1
1,2020-01-03,Hamburg,FFF,300.0,1
2,2020-01-03,Hessen,FFF,145.0,1
3,2020-01-03,Nordrhein-Westfalen,FFF,450.0,1
4,2020-01-10,Bayern,FFF,550.0,1


In [44]:
# Reshape from long to wide: one row per (day, state) and, for each of the two
# statistics, one column per actor. Actor/statistic combinations that do not
# occur on a given day and state come out as NaN and are set to 0.
pivot_spec = dict(
    index=["event_date", "admin1"],
    columns="assoc_actor_1",
    values=["size_sum", "size_count"],
)
protests = protests.pivot(**pivot_spec).fillna(0)
protests.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,size_sum,size_sum,size_count,size_count
Unnamed: 0_level_1,assoc_actor_1,ALG,FFF,ALG,FFF
event_date,admin1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2020-01-02,Nordrhein-Westfalen,0.0,100.0,0.0,1.0
2020-01-03,Hamburg,0.0,300.0,0.0,1.0
2020-01-03,Hessen,0.0,145.0,0.0,1.0
2020-01-03,Nordrhein-Westfalen,0.0,450.0,0.0,1.0
2020-01-10,Bayern,0.0,550.0,0.0,1.0


In [45]:
# Build the complete daily panel: every calendar day of 2020-2022 crossed with
# every state that occurs in the protest index. Combinations that had no row
# before are inserted with 0 in all columns.
dates = pd.date_range(start="2020-01-01", end="2022-12-31", freq="D")
admin1s = protests.index.get_level_values(1).unique()
# The date level of this product index has no name, so it shows up as
# "level_0" once the index is reset.
full_index = pd.MultiIndex.from_product([dates, admin1s])
protests = protests.reindex(index=full_index, fill_value=0)
protests.head(25)

Unnamed: 0_level_0,Unnamed: 1_level_0,size_sum,size_sum,size_count,size_count
Unnamed: 0_level_1,assoc_actor_1,ALG,FFF,ALG,FFF
Unnamed: 0_level_2,admin1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2020-01-01,Nordrhein-Westfalen,0.0,0.0,0.0,0.0
2020-01-01,Hamburg,0.0,0.0,0.0,0.0
2020-01-01,Hessen,0.0,0.0,0.0,0.0
2020-01-01,Bayern,0.0,0.0,0.0,0.0
2020-01-01,Niedersachsen,0.0,0.0,0.0,0.0
2020-01-01,Schleswig-Holstein,0.0,0.0,0.0,0.0
2020-01-01,Berlin,0.0,0.0,0.0,0.0
2020-01-01,Rheinland-Pfalz,0.0,0.0,0.0,0.0
2020-01-01,Bremen,0.0,0.0,0.0,0.0
2020-01-01,Sachsen,0.0,0.0,0.0,0.0


In [46]:
# Join the two column levels (statistic, actor) into flat names such as
# "size_sum_FFF", then move the (date, state) index back into columns.
protests.columns = list(map("_".join, protests.columns))

# FFF columns are relabelled as the "moderate" series and ALG columns as the
# "radical" series; the unnamed date level arrives as "level_0".
column_names = {
    "level_0": "date",
    "admin1": "region",
    "size_sum_FFF": "size_moderate",
    "size_count_FFF": "count_moderate",
    "size_sum_ALG": "size_radical",
    "size_count_ALG": "count_radical",
}
protests = protests.reset_index().rename(columns=column_names)
protests.head()

Unnamed: 0,date,region,size_radical,size_moderate,count_radical,count_moderate
0,2020-01-01,Nordrhein-Westfalen,0.0,0.0,0.0,0.0
1,2020-01-01,Hamburg,0.0,0.0,0.0,0.0
2,2020-01-01,Hessen,0.0,0.0,0.0,0.0
3,2020-01-01,Bayern,0.0,0.0,0.0,0.0
4,2020-01-01,Niedersachsen,0.0,0.0,0.0,0.0


In [47]:
from datetime import date

from protest_impact.data.news import get_regional_count_df
from protest_impact.data.protests import get_climate_queries

# Build a long table of daily news-coverage counts: one row per (date, region)
# and one column per (query, source) combination, named "<query>_<source>".
sources = ["mediacloud"]  # TODO: "dereko_scrape"
# Each entry is indexed as (query string, query function) below.
queries = get_climate_queries()

# Collect one frame per region and concatenate once at the end; growing a
# DataFrame with pd.concat inside the loop copies all previous rows on every
# iteration.
region_frames = []
for region in protests["region"].unique():
    # None marks "no frame yet". Testing `.empty` instead would silently drop
    # a first query that legitimately returned zero rows, and its column.
    region_df = None
    for source in sources:
        for query_name, query in queries.items():
            df = get_regional_count_df(
                region=region,
                start_date=date(2020, 1, 1),
                end_date=date(2022, 12, 31),
                source=source,
                query_string=query[0],
                query_func=query[1],
            )
            df = df.rename(columns={"count": f"{query_name}_{source}"})
            if region_df is None:
                region_df = df
            else:
                # Outer join keeps dates that appear in only one of the series.
                region_df = region_df.merge(df, on="date", how="outer")
    if region_df is None:
        # No source/query combination was requested; nothing to add.
        continue
    region_df["region"] = region
    region_frames.append(region_df)

if region_frames:
    coverage_df = pd.concat(region_frames, axis=0, ignore_index=True)
else:
    # pd.concat raises on an empty list; fall back to an empty frame.
    coverage_df = pd.DataFrame()
coverage_df.head()

Unnamed: 0,date,climate_and_protest_mediacloud,climate_not_protest_mediacloud,climate_mediacloud,region
0,2020-01-01,10,42,52,Nordrhein-Westfalen
1,2020-01-02,14,68,82,Nordrhein-Westfalen
2,2020-01-03,17,37,54,Nordrhein-Westfalen
3,2020-01-04,12,17,29,Nordrhein-Westfalen
4,2020-01-05,8,17,25,Nordrhein-Westfalen


In [48]:
# Attach the coverage counts to the protest panel. The outer join keeps
# (date, region) pairs that exist on only one side; those rows get NaN in the
# columns of the other table.
join_keys = ["date", "region"]
protests = protests.merge(coverage_df, how="outer", on=join_keys)
protests.head(25)

Unnamed: 0,date,region,size_radical,size_moderate,count_radical,count_moderate,climate_and_protest_mediacloud,climate_not_protest_mediacloud,climate_mediacloud
0,2020-01-01,Nordrhein-Westfalen,0.0,0.0,0.0,0.0,10,42,52
1,2020-01-01,Hamburg,0.0,0.0,0.0,0.0,5,9,14
2,2020-01-01,Hessen,0.0,0.0,0.0,0.0,18,58,76
3,2020-01-01,Bayern,0.0,0.0,0.0,0.0,1,21,22
4,2020-01-01,Niedersachsen,0.0,0.0,0.0,0.0,4,24,28
5,2020-01-01,Schleswig-Holstein,0.0,0.0,0.0,0.0,1,6,7
6,2020-01-01,Berlin,0.0,0.0,0.0,0.0,4,13,17
7,2020-01-01,Rheinland-Pfalz,0.0,0.0,0.0,0.0,2,3,5
8,2020-01-01,Bremen,0.0,0.0,0.0,0.0,0,3,3
9,2020-01-01,Sachsen,0.0,0.0,0.0,0.0,3,1,4


In [49]:
from protest_impact.data import german_regions
from protest_impact.instrumental_variables import get_instruments

def _capital_of(region_name):
    """Return the `capital` of the first `german_regions` entry named `region_name`."""
    capitals = [
        entry["capital"] for entry in german_regions if entry["name"] == region_name
    ]
    return capitals[0]


# get_instruments is given a copy of the panel with a `location` column (the
# capital of each region) and with the date column named `event_date`.
protests_with_cities = protests.assign(
    location=protests["region"].apply(_capital_of)
).rename(columns={"date": "event_date"})
instr = get_instruments(protests_with_cities)
# Optional restriction to a subset of instruments, currently disabled:
# instr = instr[["pres", "prcp", "new_tests_smoothed_per_thousand", "stringency_index"]]
# Prefix every instrument column so it can be told apart in the combined frame.
instr.columns = ["instr_" + name for name in instr.columns]
# NOTE(review): the column-wise concat aligns rows by index label; this
# presumes get_instruments returns rows indexed like its input -- confirm.
protests = pd.concat([protests, instr], axis=1)
protests.head()

Unnamed: 0,date,region,size_radical,size_moderate,count_radical,count_moderate,climate_and_protest_mediacloud,climate_not_protest_mediacloud,climate_mediacloud,instr_prcp,...,instr_longterm_tsun,instr_longterm_wdir,instr_longterm_wpgt,instr_longterm_wspd,instr_retail_and_recreation,instr_grocery_and_pharmacy,instr_residential,instr_transit_stations,instr_parks,instr_workplaces
0,2020-01-01,Nordrhein-Westfalen,0.0,0.0,0.0,0.0,10,42,52,0.0,...,102.0,182.4,45.03,18.28,0.0,0.0,0.0,0.0,0.0,0.0
1,2020-01-01,Hamburg,0.0,0.0,0.0,0.0,5,9,14,0.0,...,62.4,204.7,49.61,19.59,0.0,0.0,0.0,0.0,0.0,0.0
2,2020-01-01,Hessen,0.0,0.0,0.0,0.0,18,58,76,0.0,...,,204.100644,36.636,10.74,0.0,0.0,0.0,0.0,0.0,0.0
3,2020-01-01,Bayern,0.0,0.0,0.0,0.0,1,21,22,0.0,...,118.8,224.340385,35.955769,9.548269,0.0,0.0,0.0,0.0,0.0,0.0
4,2020-01-01,Niedersachsen,0.0,0.0,0.0,0.0,4,24,28,0.0,...,85.2,222.4,45.08,27.62,0.0,0.0,0.0,0.0,0.0,0.0


In [50]:
# Add the English weekday name ("Monday", ...) of each row's date.
weekday_names = protests["date"].dt.day_name()
protests = protests.assign(weekday=weekday_names)
protests.head()

Unnamed: 0,date,region,size_radical,size_moderate,count_radical,count_moderate,climate_and_protest_mediacloud,climate_not_protest_mediacloud,climate_mediacloud,instr_prcp,...,instr_longterm_wdir,instr_longterm_wpgt,instr_longterm_wspd,instr_retail_and_recreation,instr_grocery_and_pharmacy,instr_residential,instr_transit_stations,instr_parks,instr_workplaces,weekday
0,2020-01-01,Nordrhein-Westfalen,0.0,0.0,0.0,0.0,10,42,52,0.0,...,182.4,45.03,18.28,0.0,0.0,0.0,0.0,0.0,0.0,Wednesday
1,2020-01-01,Hamburg,0.0,0.0,0.0,0.0,5,9,14,0.0,...,204.7,49.61,19.59,0.0,0.0,0.0,0.0,0.0,0.0,Wednesday
2,2020-01-01,Hessen,0.0,0.0,0.0,0.0,18,58,76,0.0,...,204.100644,36.636,10.74,0.0,0.0,0.0,0.0,0.0,0.0,Wednesday
3,2020-01-01,Bayern,0.0,0.0,0.0,0.0,1,21,22,0.0,...,224.340385,35.955769,9.548269,0.0,0.0,0.0,0.0,0.0,0.0,Wednesday
4,2020-01-01,Niedersachsen,0.0,0.0,0.0,0.0,4,24,28,0.0,...,222.4,45.08,27.62,0.0,0.0,0.0,0.0,0.0,0.0,Wednesday


In [51]:
import holidays

from protest_impact.data import german_regions

# Public holidays per state for 2020-2022, keyed by the state's name and
# stored as lists of `datetime.date` objects.
german_holidays = dict()
for region in german_regions:
    german_holidays[region["name"]] = list(
        holidays.Germany(years=range(2020, 2023), subdiv=region["code"]).keys()
    )

# Membership is tested once per panel row, so look dates up in sets rather
# than scanning the lists, and iterate over just the two needed columns
# instead of materialising every full row with DataFrame.apply(axis=1).
holiday_sets = {name: set(days) for name, days in german_holidays.items()}
protests["is_holiday"] = [
    # The panel holds timestamps; the holiday calendar holds plain dates.
    day.date() in holiday_sets[name]
    for day, name in zip(protests["date"], protests["region"])
]
protests.head()

Unnamed: 0,date,region,size_radical,size_moderate,count_radical,count_moderate,climate_and_protest_mediacloud,climate_not_protest_mediacloud,climate_mediacloud,instr_prcp,...,instr_longterm_wpgt,instr_longterm_wspd,instr_retail_and_recreation,instr_grocery_and_pharmacy,instr_residential,instr_transit_stations,instr_parks,instr_workplaces,weekday,is_holiday
0,2020-01-01,Nordrhein-Westfalen,0.0,0.0,0.0,0.0,10,42,52,0.0,...,45.03,18.28,0.0,0.0,0.0,0.0,0.0,0.0,Wednesday,True
1,2020-01-01,Hamburg,0.0,0.0,0.0,0.0,5,9,14,0.0,...,49.61,19.59,0.0,0.0,0.0,0.0,0.0,0.0,Wednesday,True
2,2020-01-01,Hessen,0.0,0.0,0.0,0.0,18,58,76,0.0,...,36.636,10.74,0.0,0.0,0.0,0.0,0.0,0.0,Wednesday,True
3,2020-01-01,Bayern,0.0,0.0,0.0,0.0,1,21,22,0.0,...,35.955769,9.548269,0.0,0.0,0.0,0.0,0.0,0.0,Wednesday,True
4,2020-01-01,Niedersachsen,0.0,0.0,0.0,0.0,4,24,28,0.0,...,45.08,27.62,0.0,0.0,0.0,0.0,0.0,0.0,Wednesday,True


In [52]:
# Sanity check: how many (date, region) rows are flagged as public holidays.
holiday_flags = protests["is_holiday"]
holiday_flags.value_counts()

False    14887
True       457
Name: is_holiday, dtype: int64

In [53]:
from protest_impact.util import project_root

# Persist the assembled daily panel; the row index is not written.
output_path = project_root / "data" / "daily_data_neat.csv"
protests.to_csv(output_path, index=False)