In [134]:
%load_ext autoreload
%autoreload complete

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [106]:
from src.data.news.coverage_filter import filter_protests
from src.data.protests.random import get_random_events

random_events = get_random_events(200, seed=0)

In [140]:
from src.models.synthetic_region import (
    evaluate_multiple_protests,
    get_regional_counts_for_protest,
    metrics,
    synthetic_region,
)
from src.models.synthetic_region.models import models


def evaluate(method):
    return evaluate_multiple_protests(
        "klimaschutz",
        random_events,
        source="mediacloud",
        data_kwargs=dict(
            n_days_train=7 * 4 * 6,
            n_days_predict=7 * 4,
            n_days_protest_free_pre=0,
            n_days_protest_free_post=0,
            min_control_regions=1,
            min_count=3,
        ),
        method_kwargs=dict(
            method=method,
        ),
    )

In [108]:
results, agg_df = evaluate(models["lasso"])

  0%|          | 0/200 [00:00<?, ?it/s]



In [109]:
import numpy as np

mae = np.mean(
    [
        metrics(result["df"], result["protest"]["event_date"])["mae"]
        for result in results
    ]
)
mae

0.34737938810092894

In [110]:
protest = random_events.iloc[5]
protest

event_date    2021-01-03 00:00:00
admin1                     Berlin
Name: 5, dtype: object

## Develop co-term approach

In [111]:
term = "klimaschutz"
co_terms = [
    "klimawandel",
    "erderwärmung",
    "klima",
    "klimakrise",
    "umweltschutz",
    "umwelt",
    "energie",
]
df = get_regional_counts_for_protest(
    term,
    random_events.iloc[5],
    random_events,
    co_terms=co_terms,
    source="mediacloud",
    min_count=1,
)

In [112]:
pivot_df = df.pivot(index="date", columns=["region", "term"], values="count").sample(10)
pivot_df.columns = ["_".join(col) for col in pivot_df.columns]
pivot_df

Unnamed: 0_level_0,Berlin_klimaschutz,Bayern_klimaschutz,Bayern_klimawandel,Bayern_erderwärmung,Bayern_klima,Bayern_klimakrise,Bayern_umweltschutz,Bayern_umwelt,Bayern_energie,Brandenburg_klimaschutz,...,Sachsen_energie,Sachsen-Anhalt_umwelt,Schleswig-Holstein_klimaschutz,Schleswig-Holstein_klimawandel,Schleswig-Holstein_klima,Schleswig-Holstein_umweltschutz,Schleswig-Holstein_umwelt,Schleswig-Holstein_energie,Thüringen_umwelt,Thüringen_energie
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-01-26,7,49,12,6,19,6,4,45,58,2,...,17,1,8,5,6,0,8,10,0,0
2020-10-06,10,41,17,0,20,5,5,49,45,4,...,8,0,5,5,4,2,1,2,1,2
2020-11-29,5,6,21,0,13,3,17,41,30,0,...,1,0,0,3,0,4,2,0,0,0
2021-01-27,9,38,48,0,43,10,11,49,54,2,...,3,2,4,3,5,0,1,4,0,0
2020-08-25,10,10,12,1,4,2,4,34,37,3,...,11,2,1,0,1,1,1,5,0,0
2020-09-05,2,0,3,0,3,0,2,11,26,0,...,4,1,0,2,2,1,0,3,0,1
2020-08-29,5,14,5,1,15,0,5,11,9,1,...,5,0,2,0,0,0,3,2,0,0
2021-01-09,1,3,3,0,5,0,1,16,12,0,...,2,0,2,0,0,0,1,1,0,9
2020-12-11,13,41,14,28,36,4,10,61,34,4,...,15,0,11,3,4,0,6,7,1,2
2020-08-13,1,4,18,2,19,5,3,64,38,2,...,9,3,1,2,4,0,2,3,0,0


In [113]:
list(pivot_df.columns)[:10]

['Berlin_klimaschutz',
 'Bayern_klimaschutz',
 'Bayern_klimawandel',
 'Bayern_erderwärmung',
 'Bayern_klima',
 'Bayern_klimakrise',
 'Bayern_umweltschutz',
 'Bayern_umwelt',
 'Bayern_energie',
 'Brandenburg_klimaschutz']

In [114]:
result = synthetic_region(df, protest, method=models["ridge"])

In [115]:
result

{'df':            date      real     synth
 -168 2020-07-19  0.648649  0.590125
 -167 2020-07-20  0.908108  0.672452
 -166 2020-07-21  1.037838  1.042021
 -165 2020-07-22  1.297297  1.059444
 -164 2020-07-23  1.297297  1.480653
 ...         ...       ...       ...
  23  2021-01-26  0.908108  1.500248
  24  2021-01-27  1.167568  1.068240
  25  2021-01-28  1.167568  1.467433
  26  2021-01-29  1.037838  0.771684
  27  2021-01-30  0.778378  0.569134
 
 [196 rows x 3 columns],
 'model': Ridge()}

In [116]:
metrics(result["df"], protest["event_date"])["mae"]

0.3866697739594705

In [117]:
df = get_regional_counts_for_protest(
    term,
    random_events.iloc[5],
    random_events,
    co_terms=[],
    source="mediacloud",
    min_count=1,
)
result = synthetic_region(df, protest, method=models["ridge"])
metrics(result["df"], protest["event_date"])["mae"]

0.33671392182511334

## Evaluate it a bit

In [141]:
def evaluate(method, co_terms):
    return evaluate_multiple_protests(
        "klimaschutz",
        random_events,
        co_terms=co_terms,
        source="mediacloud",
        data_kwargs=dict(
            n_days_train=7 * 4 * 6,
            n_days_predict=7 * 4,
            n_days_protest_free_pre=0,
            n_days_protest_free_post=0,
            min_control_regions=1,
            min_count=3,
        ),
        method_kwargs=dict(
            method=method,
        ),
    )

## Ridge without co-terms

In [161]:
results, agg_df = evaluate(models["ridge"], co_terms=[])

  0%|          | 0/200 [00:00<?, ?it/s]



In [162]:
np.mean(
    [
        metrics(result["df"], result["protest"]["event_date"])["mae"]
        for result in results
    ]
)

0.3489002474891261

In [163]:
from pprint import pprint

for i in range(3):
    coefs = results[i]["model"].coef_[1:]
    coef_keys = list(results[i]["pivod_df"].columns)
    coef_dict = dict(
        intercept=results[i]["model"].coef_[0], **dict(zip(coef_keys, coefs))
    )
    pprint(coef_dict)

KeyError: 'pivod_df'

### Ridge with co-terms

In [None]:
result, agg_df = evaluate(models["ridge"], co_terms=co_terms)

  0%|          | 0/200 [00:00<?, ?it/s]



In [None]:
np.mean(
    [
        metrics(result["df"], result["protest"]["event_date"])["mae"]
        for result in results
    ]
)

0.3489002474891261

In [None]:
from pprint import pprint

for i in range(3):
    coefs = results[i]["model"].coef_[1:]
    coef_keys = list(results[i]["pivod_df"].columns)
    coef_dict = dict(
        intercept=results[i]["model"].coef_[0], **dict(zip(coef_keys, coefs))
    )
    pprint(coef_dict)

{'Bayern_klimaschutz': -0.01104267688345693,
 'Berlin_klimaschutz': -0.009068066221370664,
 'Brandenburg_klimaschutz': 0.047663823228989925,
 'Hamburg_klimaschutz': 0.10797645830280174,
 'Hessen_klimaschutz': -0.015731680663638763,
 'Niedersachsen_klimaschutz': 0.14249412785196136,
 'Nordrhein-Westfalen_klimaschutz': 0.08071721615161516,
 'date': 0.04900032235943789}
{'Bayern_klimaschutz': 0.03722281351140024,
 'Berlin_klimaschutz': 0.5317863933653137,
 'Hamburg_klimaschutz': 0.020379163074132452,
 'Hessen_klimaschutz': 0.10976177348461812,
 'Niedersachsen_klimaschutz': 0.23653712472266128,
 'Nordrhein-Westfalen_klimaschutz': -0.07771681162655657,
 'date': 0.04086166711944825}
{'Bayern_klimaschutz': 0.5081336034968045,
 'Berlin_klimaschutz': 0.08257743207045759,
 'Hamburg_klimaschutz': 0.1296655246936484,
 'Hessen_klimaschutz': 1.2698653873121872,
 'Niedersachsen_klimaschutz': 0.5047033951456039,
 'date': 0.09883809680874461}


### LR without co-terms

In [164]:
results, agg_df = evaluate(models["linear_regression"], co_terms=[])
np.mean(
    [
        metrics(result["df"], result["protest"]["event_date"])["mae"]
        for result in results
    ]
)

  0%|          | 0/200 [00:00<?, ?it/s]



0.34891155034747884

### LR with co-terms

In [165]:
results, agg_df = evaluate(models["linear_regression"], co_terms=co_terms)
np.mean(
    [
        metrics(result["df"], result["protest"]["event_date"])["mae"]
        for result in results
    ]
)

  0%|          | 0/200 [00:00<?, ?it/s]



0.42488046546663943

In [167]:
for i in range(3):
    coefs = results[i]["model"].coef_[1:]
    coef_keys = list(results[i]["pivot_df"].columns)
    coef_dict = dict(
        intercept=results[i]["model"].coef_[0], **dict(zip(coef_keys, coefs))
    )
    pprint(coef_dict)

{'Bayern_energie': 0.08572986235824859,
 'Bayern_klima': -0.04085274388440199,
 'Bayern_klimakrise': 0.003794549567738942,
 'Bayern_klimaschutz': 0.0245401215794851,
 'Bayern_klimawandel': -0.04018567025794976,
 'Bayern_umwelt': -0.07035097442629695,
 'Bayern_umweltschutz': 0.031674915124234176,
 'Berlin_energie': -0.09435461856107613,
 'Berlin_klima': -0.005322323655145321,
 'Berlin_klimaschutz': -0.01990604069936806,
 'Berlin_klimawandel': -0.009160771582237145,
 'Berlin_umwelt': 0.1291719794336645,
 'Brandenburg_klimaschutz': 0.018886345297489556,
 'Hamburg_energie': -0.01901636570630277,
 'Hamburg_klima': 0.0010393194464942926,
 'Hamburg_klimaschutz': 0.1312327491517297,
 'Hamburg_klimawandel': -0.04094856039927591,
 'Hamburg_umwelt': -0.005601016279916819,
 'Hessen_energie': 0.01812031190311121,
 'Hessen_erderwärmung': -0.030471787476502907,
 'Hessen_klima': 0.014352746558799047,
 'Hessen_klimakrise': -0.02735532676838419,
 'Hessen_klimaschutz': 0.010146922136838018,
 'Hessen_klim

### Random forest with co-terms

In [168]:
results, agg_df = evaluate(models["random_forest"], co_terms=co_terms)
np.mean(
    [
        metrics(result["df"], result["protest"]["event_date"])["mae"]
        for result in results
    ]
)

  0%|          | 0/200 [00:00<?, ?it/s]



0.38429744777418384

In [170]:
results, agg_df = evaluate(models["gradient_boosting"], co_terms=co_terms)
np.mean(
    [
        metrics(result["df"], result["protest"]["event_date"])["mae"]
        for result in results
    ]
)

  0%|          | 0/200 [00:00<?, ?it/s]



0.39599639916907337