## Imports


In [1285]:
import random
import re
import string
import os

import nltk
import numpy as np
import pandas as pd

from collections import Counter

from nltk.corpus import stopwords
from nltk.util import ngrams
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, cohen_kappa_score, f1_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression

nltk.download("stopwords")
stop_words = set(stopwords.words("spanish"))

pd.set_option("display.max_colwidth", None)

DATA_PATH = "../../data/"

target = "target_layer_3"
label = "resultados"
n_splits = 5

SEED = 42
random.seed(SEED)
os.environ["PYTHONHASHSEED"] = str(SEED)
np.random.seed(SEED)

[nltk_data] Downloading package stopwords to /Users/dcast/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Read data


In [1286]:
df_labels_orig = pd.read_json(DATA_PATH + "all.json").reset_index(drop=True)

df_fomento = pd.read_json(
    DATA_PATH + "additional_fomento_de_la_participacion.json"
).reset_index(drop=True)
df_colab = pd.read_json(
    DATA_PATH + "additional_colaboracion_ciudadana.json"
).reset_index(drop=True)

df_unlabeled = pd.read_json(DATA_PATH + "tweets_sample.json")
df_validated_labels = pd.read_json(DATA_PATH + "all_labels_20220927.json").reset_index(
    drop=True
)

## Generate dataset with multi-class targets


In [1287]:
layer_1_base = [
    "transparencia",
    "colaboracion_ciudadana",
    "fomento_de_la_participacion",
]
layer_2_transparencia = [
    "agradecimientos_y_actos_simbolicos",
    "decisiones",
    "estado_de_servicio",
]
layer_3_decisiones = ["racionalidad", "contenido", "resultados"]

In [1288]:
df_updated = df_unlabeled[df_labels_orig.columns[:-1]].merge(
    df_validated_labels[["id_tweet", "label"]], on="id_tweet", how="inner"
)

assert ~df_updated.id_tweet.isin(
    df_labels_orig.id_tweet
).all()  # Updated tweets are not in the original dataset

In [1289]:
def generate_target_layers(df):
    df["target_layer_1"] = (
        df.label.astype(str).str.findall("|".join(layer_1_base)).str[0]
    )
    df["target_layer_2"] = (
        df.label.astype(str).str.findall("|".join(layer_2_transparencia)).str[0]
    )
    df["target_layer_3"] = (
        df.label.astype(str).str.findall("|".join(layer_3_decisiones)).str[0]
    )
    return df


df = pd.concat([df_labels_orig, df_updated]).reset_index(drop=True)
df = generate_target_layers(df)

In [1290]:
def generate_processed_df(df_proc, ids_to_exclude, label):
    df_proc_ = df_proc[
        (df_proc.label.str[0] == label) & ~(df_proc.id_tweet.isin(ids_to_exclude))
    ].copy()
    df_proc_["target_layer_1"] = df_proc_.label.str[0]
    df_proc_["target_layer_2"] = np.nan
    df_proc_["target_layer_3"] = np.nan
    return df_proc_


df_fomento_reduc = generate_processed_df(
    df_fomento, df.id_tweet, "fomento_de_la_participacion"
)

wrong_ids_colab = [
    3015,
    3823,
    10551,
    9041,
    11775,
    11181,
    9354,
]  # Email thread: Etiquetado adicional
df_colab_reduc = generate_processed_df(
    df_colab,
    np.concatenate([df.id_tweet, df_fomento_reduc.id_tweet, wrong_ids_colab]),
    "colaboracion_ciudadana",
)

assert ~df_fomento_reduc.link.isin(df.link).any()
assert ~df_colab_reduc.link.isin(df.link).any()
assert ~df_colab_reduc.link.isin(df_fomento_reduc.link).any()

In [1291]:
df_output = df.copy()
df_labels_orig.shape, df_updated.shape, df_fomento_reduc.shape, df_colab_reduc.shape, df_output.shape

((1000, 30), (2100, 30), (93, 43), (74, 43), (3100, 33))

In [1292]:
df_labeling_1 = df_output[:999][["text", "target_layer_1", "target_layer_2", "target_layer_3"]].copy()
df_labeling_2 = df_output[999:2098][["text", "target_layer_1", "target_layer_2", "target_layer_3"]].copy()
df_labeling_3 = df_output[2098:][["text", "target_layer_1", "target_layer_2", "target_layer_3"]].copy()

In [1293]:
dfs = []
for i, df_ in enumerate([df_labeling_1, df_labeling_2, df_labeling_3]):
    dfs.append(
        pd.DataFrame(df_.value_counts(target, normalize=True))
        .T.assign(labeling=i)
    )
pd.concat(dfs)

target_layer_3,resultados,contenido,racionalidad,labeling
0,0.666667,0.205273,0.12806,0
0,0.722496,0.16913,0.108374,1
0,0.717996,0.200371,0.081633,2


In [1294]:
def generate_labeling_df(target, label):
    return pd.concat(
        [
            df_labeling_1.query(f"{target} == '{label}'").assign(labeling=1),
            df_labeling_2.query(f"{target} == '{label}'").assign(labeling=2),
            df_labeling_3.query(f"{target} == '{label}'").assign(labeling=3),
        ]
    )

df_labeling_comp = generate_labeling_df(target, label)

In [1295]:
def process_text(text):
    text = str(text).lower()
    text = re.sub(
        f"[{re.escape(string.punctuation)}]", " ", text
    )
    text = " ".join(text.split())
    return text

df_labeling_comp["clean_text"] = df_labeling_comp.text.map(process_text)

In [1296]:
def prepare_text(text, remove_stopwords=True):
        text = str(text).lower()
        text = re.sub(
            f"[{re.escape(string.punctuation)}]", " ", text
        )

        tokens = text.split()

        if remove_stopwords:
            tokens = [t for t in tokens if t not in stop_words]
        return " ".join(tokens)

def find_most_common(df_, n=100, ngr=1):
    if ngr == 1:
        tokens = nltk.word_tokenize(
            "\n".join(df_.query(f"{target} == '{label}'").text.map(prepare_text).tolist()),
            language="spanish",
        )
    else:
        tokens = nltk.word_tokenize(
            "\n".join(df_.query(f"{target} == '{label}'").text.map(lambda x: prepare_text(x, True)).tolist()),
            language="spanish",
        )
    return Counter(ngrams(tokens, ngr)).most_common(n)

for n in [1, 2]:
    pd.DataFrame(
        find_most_common(df_labeling_1, ngr=n)
    ).rename(columns={0: "word", 1: f"{label}_1"}).merge(
        pd.DataFrame(
            find_most_common(df_labeling_2, ngr=n)
        ).rename(columns={0: "word", 1: f"{label}_2"}),
        on="word",
        how="outer"
    ).merge(
        pd.DataFrame(
            find_most_common(df_labeling_3, ngr=n)
        ).rename(columns={0: "word", 1: f"{label}_3"}),
        on="word",
        how="outer"
    ).to_excel(DATA_PATH + f"drift/most_common_{label}_{n}gram.xlsx", index=False)

In [1297]:
accs_diff = []
df_labeling_comp = df_labeling_comp.sample(frac=1).reset_index(drop=True)
df_labeling_comp["kfold"] = -1

kf = StratifiedKFold(n_splits=n_splits)

for f, (t, v) in enumerate(kf.split(X=df_labeling_comp, y=df_labeling_comp.labeling)):
    df_labeling_comp.loc[v, "kfold"] = f

for f in range(n_splits):
    df_train = df_labeling_comp[df_labeling_comp.kfold != f].reset_index(drop=True)
    df_val = df_labeling_comp[df_labeling_comp.kfold == f].reset_index(drop=True)
    
    print(f"Split {f}: {df_train.shape[0]} observations for training / {df_val.shape[0]} observations for validation")
    vec = CountVectorizer(
        ngram_range=(1, 3), 
        stop_words="english",
    )
    svd = TruncatedSVD(n_components=120)

    X_count = vec.fit_transform(df_train.clean_text)
    X_train = svd.fit_transform(X_count)
    X_val = svd.transform(vec.transform(df_val.clean_text))

    y_train = df_train.labeling
    y_val = df_val.labeling

    model = LogisticRegression(solver="liblinear")
    model.fit(X_train, y_train)
    preds = model.predict(X_val)

    accs_diff.append(
        accuracy_score(y_val, preds) - df_val.labeling.value_counts(normalize=True).max()
    )

print("\nClassification report (last fold):")
print(classification_report(y_val, preds, zero_division=0))

Split 0: 944 observations for training / 237 observations for validation
Split 1: 945 observations for training / 236 observations for validation
Split 2: 945 observations for training / 236 observations for validation
Split 3: 945 observations for training / 236 observations for validation
Split 4: 945 observations for training / 236 observations for validation

Classification report (last fold):
              precision    recall  f1-score   support

           1       0.36      0.25      0.30        71
           2       0.39      0.53      0.45        88
           3       0.35      0.30      0.32        77

    accuracy                           0.37       236
   macro avg       0.37      0.36      0.36       236
weighted avg       0.37      0.37      0.36       236



In [1298]:
accs_diff

[-0.04219409282700426,
 -0.01694915254237289,
 -0.008474576271186474,
 -0.1059322033898305,
 0.0]

In [1299]:
print(f"Mean accuracy delta: {np.mean(accs_diff):.2f} +/- {np.std(accs_diff):.2f}")

Mean accuracy delta: -0.03 +/- 0.04
