In [84]:
%load_ext autoreload
%autoreload complete

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from matplotlib_inline.backend_inline import set_matplotlib_formats
from tqdm.notebook import tqdm

from src.cache import cache

set_matplotlib_formats("svg")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [85]:
import json

from src.paths import external_data

# read all json files from folder data/external/nexis/climate/json
# in files like 2020-01-01/*.json

data = []
for file in tqdm(list(external_data.glob("nexis/climate/json/**/*.json"))):
    with open(file) as f:
        item = json.load(f)
        # parse date field
        item["date"] = pd.to_datetime(item["date"])
        data.append(item)

  0%|          | 0/29013 [00:00<?, ?it/s]

In [90]:
from pprint import pprint

df = pd.DataFrame(data)
df["text"] = df["text"].str.removeprefix(")").str.strip()
# group texts and titles for each date together
df["text"] = (
    df["title"]
    + "\n\n"
    + df["location"].fillna("")
    + " "
    + df["text"].fillna("").str[:1000]
)
df["date"] = df["date"].dt.date
s = df.groupby("date").agg({"text": "\n\n".join})["text"]
# fill missing dates with empty strings
s = s.reindex(
    pd.date_range(pd.Timestamp("2020-01-01"), pd.Timestamp("2022-12-31")), fill_value=""
)
s = (
    s.shift(2)
    + "\n\n---\n\n"
    + s.shift(4)
    + "\n\n---\n\n"
    + s.shift(3)
    + "\n\n---\n\n"
    + s.shift(2)
    + "\n\n---\n\n"
    + s.shift(1)
)
texts = s

# texts.sample(10)

In [91]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.naive_bayes import BernoulliNB

from src.models.regression import get_lagged_df

df = get_lagged_df(
    "occ_protest", lags=range(-7, 1), ignore_group=True, region_dummies=True
)
X_ts = df.drop(columns=["occ_protest", "occ_protest_lag0"])
y = df.occ_protest

tscv = TimeSeriesSplit(n_splits=5)
cvs = cross_val_score(BernoulliNB(), X_ts, y, cv=tscv, scoring="f1")
print(f"Cross-validated F1 score: {cvs.mean():.3f} +/- {cvs.std():.3f}")

Cross-validated F1 score: 0.135 +/- 0.045


In [92]:
# import nltk
# nltk.download('stopwords')

In [95]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

# use cross_val_score with text processing pipeline (including german stopwords)

vec = TfidfVectorizer(
    stop_words=stopwords.words("german"),
    ngram_range=(1, 2),
    max_features=1000,
    min_df=5,
    max_df=0.8,
    norm="l2",
    sublinear_tf=True,
)
text_clf = Pipeline([("tfidf", vec), ("clf", BernoulliNB())])

X_text = texts.iloc[7:].repeat(13).reset_index(drop=True)
cvs = cross_val_score(text_clf, X_text, y, cv=tscv, scoring="f1", n_jobs=4)
print(f"Cross-validated F1 score: {cvs.mean():.3f} +/- {cvs.std():.3f}")

Cross-validated F1 score: 0.069 +/- 0.021
