In [None]:
!pip install --quiet sentence-transformers emoji

In [None]:
# Resolve where data and artifacts live. When `google.colab` can be imported
# the notebook mounts Google Drive and uses the Drive folders; otherwise it
# falls back to paths relative to the notebook.
try:
    from google.colab import drive
except ModuleNotFoundError:
    running_in_colab = False
    DATA_PATH = "../../data/"
    ARTIFACTS_PATH = "../../artifacts/"
else:
    # The flag is set in both branches so that later cells can always read it
    # (it used to be defined only when the import failed).
    running_in_colab = True
    drive.mount("/content/drive")
    DATA_PATH = "/content/drive/MyDrive/nlp-tweets-classification/ciudadanos/"
    ARTIFACTS_PATH = "/content/drive/MyDrive/nlp-tweets-classification/ciudadanos/"


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Imports


In [None]:
import os
import joblib
import random
import uuid

from emoji import demojize
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    silhouette_samples,
    silhouette_score,
    cohen_kappa_score,
    precision_score,
    recall_score,
    average_precision_score,
    f1_score,
    balanced_accuracy_score,
    classification_report,
    accuracy_score
)
from sklearn.model_selection import (
    StratifiedKFold,
    GridSearchCV,
    StratifiedShuffleSplit,
)
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier, XGBRFClassifier

# Do not truncate long text columns when a DataFrame is displayed.
pd.set_option("display.max_colwidth", None)

# Single seed shared by every random number generator seeded below.
SEED = 42

# Seed Python's built-in random module.
random.seed(SEED)
# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts, so
# setting it here presumably only affects processes spawned later — confirm.
os.environ["PYTHONHASHSEED"] = str(SEED)
# Seed NumPy's global random state.
np.random.seed(SEED)


## Utility functions


In [None]:
def clean_text(text):
    """Normalise a raw tweet before it is embedded.

    The text is split on whitespace; tokens containing "http" are discarded,
    the remaining tokens are passed through ``demojize`` with the Spanish
    language setting, and every "@" character is stripped from them.

    Returns the surviving tokens joined by single spaces.
    """
    cleaned_tokens = (
        demojize(token, language="es").replace("@", "")
        for token in text.split()
        if "http" not in token
    )
    return " ".join(cleaned_tokens)


def get_hyperparams_space(model_name):
    """Return the grid-search hyperparameter space for ``model_name``.

    The result is a freshly built dict mapping parameter names to the list of
    candidate values. Known names are "log", "rf", "svc", "xgb", "nb" and
    "lgbm"; "nb" has an empty space.

    Raises:
        Exception: if no space is defined for ``model_name``.
    """
    # The table is rebuilt on every call so that callers may mutate the result.
    spaces = {
        "log": {
            "C": [0.001, 0.01, 0.1, 1, 10, 100],
            "penalty": ["l1", "l2"],
            "class_weight": ["balanced", None],
        },
        "rf": {
            "n_estimators": [120, 500, 1200],
            "max_depth": [5, 8, 15, 25],
            "class_weight": ["balanced", None],
        },
        "svc": {
            "C": [0.001, 0.01, 0.1, 1, 10, 100],
            "class_weight": ["balanced", None],
        },
        "xgb": {
            "eta": [0.01, 0.05, 0.1],
            "gamma": [0.1, 0.5, 1],
            "max_depth": [3, 12, 25],
            "min_child_weight": [1, 3, 7],
            "subsample": [0.6, 0.8, 1],
            "colsample_bytree": [0.6, 0.8, 1],
        },
        "nb": {},
        "lgbm": {
            "learning_rate": [0.01, 0.05, 0.1],
        },
    }
    if model_name not in spaces:
        raise Exception(f"No hyperparams for model {model_name}")
    return spaces[model_name]


def get_model(model_name):
    """Build a new, unfitted classifier for the given short model name.

    Supported names: "lgbm", "hist", "log", "rf", "nb", "xgb" and "svc".

    NOTE(review): the training cell calls ``predict_proba`` on the model;
    ``LinearSVC`` ("svc") does not appear to provide it — confirm before
    selecting that model.

    Raises:
        ValueError: if ``model_name`` is not one of the supported names.
    """
    if model_name == "lgbm":
        return LGBMClassifier()
    elif model_name == "hist":
        return HistGradientBoostingClassifier()
    elif model_name == "log":
        # liblinear is used here; the search space for "log" tries both the
        # "l1" and "l2" penalties.
        return LogisticRegression(class_weight="balanced", solver="liblinear")
    elif model_name == "rf":
        return RandomForestClassifier(
            n_jobs=-1,
            n_estimators=1200,
        )
    elif model_name == "nb":
        return GaussianNB()
    elif model_name == "xgb":
        return XGBClassifier(
            tree_method="hist", use_label_encoder=False, max_depth=10, eta=0.1
        )
    elif model_name == "svc":
        return LinearSVC(class_weight="balanced")
    else:
        # The original passed the builtin `format` here, which produced the
        # message "<built-in function format>" instead of naming the model.
        raise ValueError(f"Unknown model name: {model_name!r}")

## Set notebook parameters


In [None]:
# Column of the labelled dataset that holds the class to predict.
col_target = "label"
# File name, relative to DATA_PATH, of the labelled dataset.
dataset_name = "all_citizens_labeled_20220911.json"
# Short model name passed to get_model / get_hyperparams_space.
model_name = "log"
# Number of cross-validation folds; with 1 the training cell uses a single
# stratified shuffle split instead of k-fold.
n_splits = 10
# True: read the embeddings from disk. False: recompute them with SentenceTransformer.
use_precalculated_embeddings = True
# True: run a grid search over get_hyperparams_space(model_name) inside every fold.
run_hyperparams_search = True 
# False: predict only on a 1000-row random sample of the unlabelled tweets.
use_full_dataset = True 

## Read data


In [None]:
# Labelled tweets used to train and validate the classifier.
df = pd.read_json(DATA_PATH + dataset_name).reset_index(drop=True)

# Authors present in the translated-tweets file (presumably the town-hall
# accounts — confirm); their tweets are excluded from the prediction set.
df_ayuntamientos = pd.read_csv(DATA_PATH + 'tweets_traducidos.csv', low_memory= False)
ayuntamientos = df_ayuntamientos.author.unique()

# Every collected tweet, one row per id_tweet.
df_full = pd.read_csv(DATA_PATH + "all_citizens_joined.csv").drop_duplicates(subset=["id_tweet"])
df_full["is_ayuntamiento"] = df_full.author.isin(ayuntamientos)

# Prediction set: rows whose relation is not "RT", whose lang is "es" and
# whose author is not in `ayuntamientos`.
df_test = df_full.loc[
    (df_full.relation != "RT") & (df_full.lang == "es") & ~df_full.is_ayuntamiento
].reset_index(drop=True)

if not use_full_dataset:
    # Cap the sample size so that small datasets do not raise, and seed it so
    # that re-running the cell picks the same rows.
    df_test = df_test.sample(n=min(1000, len(df_test)), random_state=SEED).reset_index(
        drop=True
    )

  exec(code_obj, self.user_global_ns, self.user_ns)


## Create embeddings


In [None]:
if use_precalculated_embeddings:
    df_embeddings = pd.read_json(DATA_PATH + "tweets_embeddings_ciudadanos.json")
else:
    # Imported here so the package is only required when embeddings are recomputed.
    from sentence_transformers import SentenceTransformer

    cleaned_tweets = [clean_text(tweet) for tweet in df_full.text]

    model = SentenceTransformer("hiiamsid/sentence_similarity_spanish_es")
    embeddings = model.encode(cleaned_tweets)

    # df_full still carries the index gaps left by drop_duplicates, while the
    # embeddings frame gets a fresh 0..n-1 index. concat(axis=1) aligns on the
    # index, so the links are re-indexed positionally first; otherwise links and
    # vectors would end up on different rows.
    df_embeddings = pd.concat(
        [df_full["link"].reset_index(drop=True), pd.DataFrame(embeddings)], axis=1
    )
    df_embeddings.to_json(DATA_PATH + "tweets_embeddings_ciudadanos.json")

# Every labelled tweet must have an embedding.
assert df.link.isin(df_embeddings.link).all(), "some labelled tweets have no embedding"

## Train model


In [None]:
# Embeddings of the tweets to classify (the "link" key is dropped after the join).
x_test = df_test[["link"]].merge(df_embeddings, how="inner", on="link").drop(columns="link")
# The predictions are later written back to df_test row by row, so a mismatch
# would only surface after training; fail early instead.
assert x_test.shape[0] == df_test.shape[0], (
    f"x_test has {x_test.shape[0]} rows but df_test has {df_test.shape[0]}"
)

# Labelled rows: the target in the first column followed by the embedding columns.
df_label = (
    df[["link", col_target]].merge(df_embeddings, how="inner", on="link").drop(columns="link")
)

le = LabelEncoder()
le.fit(df_label[col_target])
joblib.dump(le, ARTIFACTS_PATH + f"label_encoder_ciudadanos.joblib")

# Drop incomplete rows BEFORE resetting the index: the fold assignment below
# writes positional indices through .loc, which is only correct when the index
# is a gap-free 0..n-1 range.
df_label = df_label.dropna().sample(frac=1).reset_index(drop=True)
df_label["kfold"] = -1

if n_splits > 1:
    kf = StratifiedKFold(n_splits=n_splits)
    for f, (t, v) in enumerate(kf.split(X=df_label, y=df_label[col_target])):
        df_label.loc[v, "kfold"] = f
else:
    # Single hold-out split: validation rows get fold 0, the rest stay at -1.
    ss = StratifiedShuffleSplit(n_splits=1, test_size=0.15)
    t, v = next(ss.split(X=df_label, y=df_label[col_target]))
    df_label.loc[v, "kfold"] = 0

# Per-fold macro metrics.
f1_scores = []
kappa_scores = []
precision_scores = []
recall_scores = []
acc_scores = []
# Per-fold class probabilities predicted for x_test.
test_probas = []
# Per-fold, per-class metrics; lN refers to the N-th class of le.classes_.
l1_f1_scores = []
l2_f1_scores = []
l3_f1_scores = []
l4_f1_scores = []
l5_f1_scores = []
l1_precision_scores = []
l2_precision_scores = []
l3_precision_scores = []
l4_precision_scores = []
l5_precision_scores = []
l1_recall_scores = []
l2_recall_scores = []
l3_recall_scores = []
l4_recall_scores = []
l5_recall_scores = []

print(f"Training {model_name}")
for f in range(n_splits):
    df_train = df_label[df_label.kfold != f].reset_index(drop=True)
    df_val = df_label[df_label.kfold == f].reset_index(drop=True)

    print(
        f"Split {f}: {df_train.shape[0]} observations for training / {df_val.shape[0]} observations for validation"
    )
    # Skip the first column (target) and the last one (kfold).
    x_train = df_train.iloc[:, 1:-1]
    x_val = df_val.iloc[:, 1:-1]
    y_train = le.transform(df_train[col_target])
    y_val = le.transform(df_val[col_target])

    if run_hyperparams_search:
        space = get_hyperparams_space(model_name)
        cv_inner = StratifiedKFold(n_splits=3, shuffle=True)
        search = GridSearchCV(
            get_model(model_name), space, scoring="f1_macro", cv=cv_inner, refit=True
        )
        result = search.fit(x_train, y_train)
        # refit=True: the best estimator has already been refitted on the whole
        # training fold, so it is not fitted a second time.
        model = result.best_estimator_
    else:
        model = get_model(model_name)
        model.fit(x_train, y_train)

    # Persist after fitting so the artifact on disk is always a trained model.
    joblib.dump(model, ARTIFACTS_PATH + f"model_{model_name}_{f}.joblib")

    preds = model.predict(x_val)
    # NOTE(review): requires the model to implement predict_proba — confirm for "svc".
    test_probas.append(model.predict_proba(x_test))

    f1_ind_scores = f1_score(y_val, preds, average=None)
    precision_ind_scores = precision_score(y_val, preds, average=None)
    recall_ind_scores = recall_score(y_val, preds, average=None)

    l1_f1_scores.append(f1_ind_scores[0])
    l2_f1_scores.append(f1_ind_scores[1])
    l3_f1_scores.append(f1_ind_scores[2])
    # Classes 4 and 5 previously re-used indices 0 and 1 (copy-paste slip).
    l4_f1_scores.append(f1_ind_scores[3])
    l5_f1_scores.append(f1_ind_scores[4])

    l1_precision_scores.append(precision_ind_scores[0])
    l2_precision_scores.append(precision_ind_scores[1])
    l3_precision_scores.append(precision_ind_scores[2])
    l4_precision_scores.append(precision_ind_scores[3])
    l5_precision_scores.append(precision_ind_scores[4])

    l1_recall_scores.append(recall_ind_scores[0])
    l2_recall_scores.append(recall_ind_scores[1])
    l3_recall_scores.append(recall_ind_scores[2])
    l4_recall_scores.append(recall_ind_scores[3])
    l5_recall_scores.append(recall_ind_scores[4])

    f1_scores.append(f1_score(y_val, preds, average="macro"))
    kappa_scores.append(cohen_kappa_score(y_val, preds))
    precision_scores.append(precision_score(y_val, preds, average="macro"))
    recall_scores.append(recall_score(y_val, preds, average="macro"))
    acc_scores.append(accuracy_score(y_val, preds))

    # Running mean over the folds completed so far.
    print(f"f1: {np.mean(f1_scores):.2f}, kappa: {np.mean(kappa_scores):.2f}")

# One-row summary: metrics averaged over folds plus the rounded per-fold values.
df_results = pd.DataFrame(
    dict(
        target=col_target,
        model=[model_name],
        n_splits=n_splits,
        f1=np.mean(f1_scores),
        accuracy=np.mean(acc_scores),
        kappa=np.mean(kappa_scores),
        precision=np.mean(precision_scores),
        recall=np.mean(recall_scores),
        l1_f1=np.mean(l1_f1_scores),
        l2_f1=np.mean(l2_f1_scores),
        l3_f1=np.mean(l3_f1_scores),
        l4_f1=np.mean(l4_f1_scores),
        l5_f1=np.mean(l5_f1_scores),
        l1_precision=np.mean(l1_precision_scores),
        l2_precision=np.mean(l2_precision_scores),
        l3_precision=np.mean(l3_precision_scores),
        l4_precision=np.mean(l4_precision_scores),
        l5_precision=np.mean(l5_precision_scores),
        l1_recall=np.mean(l1_recall_scores),
        l2_recall=np.mean(l2_recall_scores),
        l3_recall=np.mean(l3_recall_scores),
        l4_recall=np.mean(l4_recall_scores),
        l5_recall=np.mean(l5_recall_scores),
        f1_scores=str([round(s, 2) for s in f1_scores]),
        kappa_scores=str([round(s, 2) for s in kappa_scores]),
        precision_scores=str([round(s, 2) for s in precision_scores]),
        recall_scores=str([round(s, 2) for s in recall_scores]),
    )
)

# Replace the positional lN_ prefixes with the actual class names.
df_results = df_results.rename(
    columns={
        "l1_f1": f"{le.classes_[0]}_f1",
        "l2_f1": f"{le.classes_[1]}_f1",
        "l3_f1": f"{le.classes_[2]}_f1",
        "l4_f1": f"{le.classes_[3]}_f1",
        "l5_f1": f"{le.classes_[4]}_f1",
        "l1_precision": f"{le.classes_[0]}_precision",
        "l2_precision": f"{le.classes_[1]}_precision",
        "l3_precision": f"{le.classes_[2]}_precision",
        "l4_precision": f"{le.classes_[3]}_precision",
        "l5_precision": f"{le.classes_[4]}_precision",
        "l1_recall": f"{le.classes_[0]}_recall",
        "l2_recall": f"{le.classes_[1]}_recall",
        "l3_recall": f"{le.classes_[2]}_recall",
        "l4_recall": f"{le.classes_[3]}_recall",
        "l5_recall": f"{le.classes_[4]}_recall",
    }
)

## Save performance metrics 


In [None]:
# Persist the metrics table; runs that used the hyperparameter search are
# written to a separate file.
results_filename = "results_best_hyperparams.csv" if run_hyperparams_search else "results.csv"
results_path = DATA_PATH + results_filename
df_results.to_csv(results_path, index=False)

In [None]:
# Transposed so that every metric is displayed on its own row.
df_results.T

In [None]:
# Class proportions in df_train, which at this point holds the training split
# of the last fold processed by the training loop.
df_train[col_target].value_counts(normalize=True)

## Add probabilities and predictions to dataset 

In [None]:
# Combine the per-fold predictions: sum the class probabilities of every fold
# model for each tweet and renormalise each row so that it sums to one.
# test_probas holds one predict_proba result on x_test per fold, so the sum
# over axis 0 has one row per tweet and one column per class.
summed_probas = np.sum(test_probas, axis=0)
col_probas = summed_probas / summed_probas.sum(axis=1, keepdims=True)

# Predicted label = class with the highest combined probability.
df_test[col_target] = le.inverse_transform(np.argmax(col_probas, axis=1))

# One "prob_<class>" column per class, in the encoder's class order.
df_probas = pd.DataFrame(col_probas).rename(
    columns={idx: f"prob_{class_name}" for idx, class_name in enumerate(le.classes_)}
)

# Concatenate df_test with df_probas. Probability columns left by a previous
# run of this cell are dropped first so that re-running does not duplicate them.
rows_prev = df_test.shape[0]
df_test = pd.concat(
    [df_test.drop(columns=df_probas.columns, errors="ignore"), df_probas], axis=1
)

assert df_test.shape[0] == rows_prev

## Add sentiment analysis and relational user columns

In [None]:
# Columns of the additional file that are discarded before it is merged into
# the predictions.
redundant_columns = [
    'text',
    'label',
    'destructivo_proba',
    'entretenido_proba',
    'expresivo_proba',
    'informativo_colaborador_proba',
    'informativo_demandante_proba',
]

# New names for the sentiment / emotion / hate-speech score columns.
score_column_names = {
    "NEG": "prob_neg",
    "NEU": "prob_neu",
    "POS": "prob_pos",
    "others": "prob_others",
    "joy": "prob_joy",
    "sadness": "prob_sadness",
    "anger": "prob_anger",
    "surprise": "prob_surprise",
    "disgust": "prob_disgust",
    "fear": "prob_fear",
    "hateful": "prob_hateful",
    "targeted": "prob_targeted",
    "aggressive": "prob_aggressive",
}

df_additional_cols = (
    pd.read_csv(
        DATA_PATH + "citizens_final_with_sentiment_analysis_and_relational_user_col_final.csv"
    )
    .drop(columns=redundant_columns)
    .rename(columns=score_column_names)
    # Keep a single row per tweet; the key is still called "id tweet" at this point.
    .drop_duplicates(subset=["id tweet"])
    # Lower-case the column names and replace spaces with underscores.
    .rename(columns=lambda name: name.lower().replace(" ", "_"))
)

## Save final output

In [None]:
# Inspect the columns of df_test before the final merge.
df_test.columns

In [None]:
# Attach the additional columns to the predictions; validate="1:1" makes the
# merge raise if id_tweet is duplicated on either side.
df_output = df_test.merge(
    df_additional_cols,
    how="left",
    on="id_tweet",
    validate="1:1",
)

df_output.to_csv(DATA_PATH + "output_final_ciudadanos.csv", index=False)

# Small extract for manual inspection. The size is capped so that outputs with
# fewer than 1000 rows do not raise, and the draw is seeded for repeatability.
sample_size = min(1000, len(df_output))
df_output.sample(n=sample_size, random_state=SEED).to_csv(
    DATA_PATH + "sample_output_ciudadanos.csv", index=False
)