In [84]:
!pip install --quiet sentence-transformers emoji

In [85]:
# Resolve data/artifact locations depending on where the notebook runs.
# The google.colab package is only importable inside Colab; when the import
# fails we fall back to repository-relative paths.
try:
    from google.colab import drive

    drive.mount("/content/drive")
    DATA_PATH = "/content/drive/MyDrive/nlp-tweets-classification/"
    ARTIFACTS_PATH = "/content/drive/MyDrive/nlp-tweets-classification/"
    # Define the flag on both branches so it can be read safely in Colab too.
    running_in_colab = True
except ModuleNotFoundError:
    DATA_PATH = "../../data/"
    ARTIFACTS_PATH = "../../artifacts/"
    running_in_colab = False


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Imports


In [86]:
import os
import joblib
import random
import uuid

from emoji import demojize
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    silhouette_samples,
    silhouette_score,
    cohen_kappa_score,
    precision_score,
    recall_score,
    average_precision_score,
    f1_score,
    balanced_accuracy_score,
    classification_report,
)
from sklearn.model_selection import (
    StratifiedKFold,
    GridSearchCV,
    StratifiedShuffleSplit,
)
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier, XGBRFClassifier

# Never truncate long text columns when a frame is displayed.
pd.set_option("display.max_colwidth", None)

# One seed shared by every random number generator seeded below.
SEED = 42

# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# setting it here presumably only affects child processes - confirm.
os.environ["PYTHONHASHSEED"] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)


## Utility functions


In [87]:
def clean_text(text):
    """Normalise a raw tweet string.

    The text is split on whitespace; every token containing "http" is
    dropped, the remaining tokens are passed through
    ``demojize(..., language="es")`` and stripped of "@" characters, and the
    result is re-joined with single spaces.
    """
    cleaned = [
        demojize(word, language="es").replace("@", "")
        for word in text.split()
        if "http" not in word
    ]
    return " ".join(cleaned)


def get_hyperparams_space(model_name):
    """Return the grid-search space for ``model_name``.

    Args:
        model_name: One of "log", "rf", "svc", "xgb", "nb" or "lgbm".

    Returns:
        A dict mapping estimator parameter names to the list of values to
        try. The dict is empty for "nb" (nothing is tuned).

    Raises:
        ValueError: If no search space is defined for ``model_name``. Note
            that "hist" is accepted by ``get_model`` but has no space here.
    """
    # Built on every call so callers always receive a fresh, mutable dict.
    spaces = {
        "log": dict(
            C=[0.001, 0.01, 0.1, 1, 10, 100],
            penalty=["l1", "l2"],
            class_weight=["balanced", None],
        ),
        "rf": dict(
            n_estimators=[120, 500, 1200],
            max_depth=[5, 8, 15, 25],
            class_weight=["balanced", None],
        ),
        "svc": dict(
            C=[0.001, 0.01, 0.1, 1, 10, 100],
            class_weight=["balanced", None],
        ),
        "xgb": dict(
            eta=[0.01, 0.05, 0.1],
            gamma=[0.1, 0.5, 1],
            max_depth=[3, 12, 25],
            min_child_weight=[1, 3, 7],
            subsample=[0.6, 0.8, 1],
            colsample_bytree=[0.6, 0.8, 1],
        ),
        "nb": dict(),
        "lgbm": dict(
            learning_rate=[0.01, 0.05, 0.1],
        ),
    }
    if model_name not in spaces:
        # ValueError matches get_model and is still caught by `except Exception`.
        raise ValueError(f"No hyperparams for model {model_name}")
    return spaces[model_name]


def get_model(model_name):
    """Build a fresh, unfitted classifier for ``model_name``.

    Args:
        model_name: One of "lgbm", "hist", "log", "rf", "nb", "xgb" or "svc".

    Returns:
        A new estimator instance configured with this notebook's defaults.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """
    if model_name == "lgbm":
        return LGBMClassifier()
    elif model_name == "hist":
        return HistGradientBoostingClassifier()
    elif model_name == "log":
        return LogisticRegression(class_weight="balanced", solver="liblinear")
    elif model_name == "rf":
        return RandomForestClassifier(
            n_jobs=-1,
            n_estimators=1200,
        )
    elif model_name == "nb":
        return GaussianNB()
    elif model_name == "xgb":
        return XGBClassifier(
            tree_method="hist", use_label_encoder=False, max_depth=10, eta=0.1
        )
    elif model_name == "svc":
        return LinearSVC(class_weight="balanced")
    else:
        # Report the offending name (the original passed the builtin `format`).
        raise ValueError(f"Unknown model name: {model_name!r}")


def get_mode_rows(a):
    """Return the most frequent row of ``a``.

    Each row is reinterpreted as one opaque ``np.void`` item covering the
    row's raw bytes, so ``np.unique`` can count whole rows; rows are therefore
    compared byte-wise.
    """
    rows = np.ascontiguousarray(a)
    row_nbytes = rows.dtype.itemsize * np.prod(rows.shape[1:])
    packed = rows.view(np.dtype((np.void, row_nbytes))).ravel()
    _, first_seen, freq = np.unique(packed, return_index=True, return_counts=True)
    # Map the winning unique item back to a position in the original array.
    return rows[first_seen[freq.argmax()]]


## Set notebook parameters


In [88]:
# --- What to train on -------------------------------------------------------
# Label column to predict; also selects which test set is loaded below.
col_target = "target_layer_3"
# File (under DATA_PATH) holding the labelled tweets.
dataset_name = "all_multiclass_20220911.json"

# --- How to train -----------------------------------------------------------
# Key understood by get_model / get_hyperparams_space.
model_name = "log"
# > 1: stratified k-fold; otherwise a single stratified shuffle split is used.
n_splits = 10
# Run a grid search per fold instead of using the default model.
run_hyperparams_search = True

# --- Data sources -----------------------------------------------------------
# Load cached embeddings instead of recomputing them.
use_precalculated_embeddings = True
# Predict on the full dataset rather than on the tweets sample file.
use_full_dataset = True

## Read data


In [89]:
# Training data: rows without a value in the target column are dropped.
df = pd.read_json(DATA_PATH + dataset_name).dropna(subset=[col_target])

# All tweets; use the translated text when present, else the original text.
df_translated = pd.read_csv(DATA_PATH + "tweets_traducidos.csv")
df_translated["text"] = df_translated.texto_traducido.combine_first(df_translated.text)

# df_test is the set of tweets to predict on.
if not use_full_dataset:
    # Sample tweets whose id does not appear in the training data.
    df_sample = pd.read_json(DATA_PATH + "tweets_sample.json")
    df_test = df_sample[~df_sample.id.isin(df.id)].reset_index()
else:
    # Layers are cascaded: layers 2 and 3 only score the tweets that the
    # previous layer's output file assigned to the class named in the query.
    if col_target == "target_layer_1":
        df_test = (
            pd.read_csv(DATA_PATH + "tweets_traducidos.csv")
        )
        df_test["text"] = df_test.texto_traducido.combine_first(df_test.text)
    elif col_target == "target_layer_2":
        df_test = (
            pd.read_csv(DATA_PATH + "output_target_layer_1.csv")
            .query("target_layer_1 == 'transparencia'")
            .reset_index()
        )
    elif col_target == "target_layer_3":
        df_test = (
            pd.read_csv(DATA_PATH + "output_target_layer_2.csv")
            .query("target_layer_2 == 'decisiones'")
            .reset_index()
        )
    else:
        # Fail here instead of with a NameError on df_test in a later cell.
        raise ValueError(
            f"Unsupported col_target {col_target!r}; expected one of "
            "'target_layer_1', 'target_layer_2', 'target_layer_3'"
        )

In [90]:
# `link` is the join key against the embeddings, so it must be present on
# every row and must not repeat.
assert df_translated.link.notna().all()
assert df_translated.link.is_unique


## Create embeddings


In [91]:
# Build (or load) df_embeddings: one row per tweet with its `link` plus the
# embedding values as columns.
if use_precalculated_embeddings:
    # Reuse the file written by a previous run of the else-branch below.
    df_embeddings = pd.read_json(DATA_PATH + "tweets_embeddings.json")
else:
    # Imported here so the package is only required when recomputing.
    from sentence_transformers import SentenceTransformer

    cleaned_tweets = [clean_text(tweet) for tweet in df_translated.text]

    # NOTE: `model` is rebound to the classifier in the training cell.
    model = SentenceTransformer("hiiamsid/sentence_similarity_spanish_es")
    embeddings = model.encode(cleaned_tweets)

    # Column-wise concat aligns on the index.
    # NOTE(review): assumes df_translated still has its default 0..n-1 index - confirm.
    df_embeddings = pd.concat([df_translated["link"], pd.DataFrame(embeddings)], axis=1)
    df_embeddings.to_json(DATA_PATH + "tweets_embeddings.json")

# Every labelled tweet and every translated tweet must have an embedding.
assert df.link.isin(df_embeddings.link).all()
assert df_translated.link.isin(df_embeddings.link).all()


## Train model


In [92]:
# Accumulates one summary row per trained configuration.
df_results = pd.DataFrame()

# Attach embeddings through `link`, then drop the key so that only the
# embedding columns (plus the target, for df_label) remain.
x_test = df_test[["link"]].merge(df_embeddings, how="inner", on="link").drop(columns="link")
df_label = (
    df[["link", col_target]].merge(df_embeddings, how="inner", on="link").drop(columns="link")
)

le = LabelEncoder()
le.fit(df_label[col_target])
# Make sure the per-target artifact folder exists before anything is dumped.
os.makedirs(ARTIFACTS_PATH + col_target, exist_ok=True)
joblib.dump(le, ARTIFACTS_PATH + f"{col_target}/label_encoder.joblib")

# Shuffle, then drop incomplete rows BEFORE resetting the index: the
# splitters below return positional indices which are used with .loc, so the
# index must be a gap-free 0..n-1 range.
df_label = df_label.sample(frac=1).dropna().reset_index(drop=True)
df_label["kfold"] = -1

if n_splits > 1:
    # Tag every row with the fold in which it is used for validation.
    kf = StratifiedKFold(n_splits=n_splits)
    for f, (t, v) in enumerate(kf.split(X=df_label, y=df_label[col_target])):
        df_label.loc[v, "kfold"] = f
else:
    # Single hold-out split: validation rows get fold 0, the rest stay at -1.
    ss = StratifiedShuffleSplit(n_splits=1, test_size=0.15)
    t, v = next(ss.split(X=df_label, y=df_label[col_target]))
    df_label.loc[v, "kfold"] = 0

# Per-fold, macro-averaged validation metrics.
f1_scores = []
kappa_scores = []
precision_scores = []
recall_scores = []
acc_scores = []  # never appended to in this cell
# Class probabilities predicted on x_test by each fold's model.
test_probas = []
# Per-class validation metrics; l1/l2/l3 are positions 0/1/2 of the per-class
# score arrays, so this cell assumes a target with exactly three classes.
l1_f1_scores = []
l2_f1_scores = []
l3_f1_scores = []
l1_precision_scores = []
l2_precision_scores = []
l3_precision_scores = []
l1_recall_scores = []
l2_recall_scores = []
l3_recall_scores = []

print(f"Training {model_name}")
for f in range(n_splits):
    df_train = df_label[df_label.kfold != f].reset_index(drop=True)
    df_val = df_label[df_label.kfold == f].reset_index(drop=True)

    print(
        f"Split {f}: {df_train.shape[0]} observations for training / {df_val.shape[0]} observations for validation"
    )
    # First column is the target and the last one is `kfold`; everything in
    # between is the embedding.
    x_train = df_train.iloc[:, 1:-1]
    x_val = df_val.iloc[:, 1:-1]
    y_train = le.transform(df_train[col_target])
    y_val = le.transform(df_val[col_target])

    if run_hyperparams_search:
        # Inner 3-fold grid search on this fold's training data only.
        space = get_hyperparams_space(model_name)
        cv_inner = StratifiedKFold(n_splits=3, shuffle=True)
        search = GridSearchCV(
            get_model(model_name), space, scoring="f1_macro", cv=cv_inner, refit=True
        )
        result = search.fit(x_train, y_train)
        model = result.best_estimator_
    else:
        model = get_model(model_name)

    model.fit(x_train, y_train)
    # Persist only after fitting; dumping first saved an unfitted estimator
    # whenever the hyper-parameter search was disabled.
    joblib.dump(model, ARTIFACTS_PATH + f"{col_target}/model_{model_name}_{f}.joblib")

    preds = model.predict(x_val)
    # Requires an estimator that implements predict_proba.
    test_probas.append(model.predict_proba(x_test))

    f1_ind_scores = f1_score(y_val, preds, average=None)
    precision_ind_scores = precision_score(y_val, preds, average=None)
    recall_ind_scores = recall_score(y_val, preds, average=None)

    l1_f1_scores.append(f1_ind_scores[0])
    l2_f1_scores.append(f1_ind_scores[1])
    l3_f1_scores.append(f1_ind_scores[2])

    l1_precision_scores.append(precision_ind_scores[0])
    l2_precision_scores.append(precision_ind_scores[1])
    l3_precision_scores.append(precision_ind_scores[2])

    l1_recall_scores.append(recall_ind_scores[0])
    l2_recall_scores.append(recall_ind_scores[1])
    l3_recall_scores.append(recall_ind_scores[2])

    f1_scores.append(f1_score(y_val, preds, average="macro"))
    kappa_scores.append(cohen_kappa_score(y_val, preds))
    precision_scores.append(precision_score(y_val, preds, average="macro"))
    recall_scores.append(recall_score(y_val, preds, average="macro"))

    # print(classification_report(y_val, preds, target_names=le.classes_))
    # Running mean over the folds finished so far, not this fold's own score.
    print(f"f1: {np.mean(f1_scores):.2f}, kappa: {np.mean(kappa_scores):.2f}")

# Append one summary row for this run: fold-averaged metrics, the per-class
# averages and the rounded per-fold values (stored as strings).
df_results = pd.concat(
    [
        df_results,
        pd.DataFrame(
            {
                "target": col_target,
                # Wrapped in a list so the frame has exactly one row.
                "model": [model_name],
                "n_splits": n_splits,
                "f1": np.mean(f1_scores),
                "kappa": np.mean(kappa_scores),
                "precision": np.mean(precision_scores),
                "recall": np.mean(recall_scores),
                "l1_f1": np.mean(l1_f1_scores),
                "l2_f1": np.mean(l2_f1_scores),
                "l3_f1": np.mean(l3_f1_scores),
                "l1_precision": np.mean(l1_precision_scores),
                "l2_precision": np.mean(l2_precision_scores),
                "l3_precision": np.mean(l3_precision_scores),
                "l1_recall": np.mean(l1_recall_scores),
                "l2_recall": np.mean(l2_recall_scores),
                "l3_recall": np.mean(l3_recall_scores),
                "f1_scores": str([round(s, 2) for s in f1_scores]),
                "kappa_scores": str([round(s, 2) for s in kappa_scores]),
                "precision_scores": str([round(s, 2) for s in precision_scores]),
                "recall_scores": str([round(s, 2) for s in recall_scores]),
            }
        ),
    ]
)

# Swap the positional l1/l2/l3 prefixes for the actual class names.
df_results = df_results.rename(
    columns={
        f"l{position + 1}_{metric}": f"{le.classes_[position]}_{metric}"
        for position in range(3)
        for metric in ("f1", "precision", "recall")
    }
)


Training log
Split 0: 1002 observations for training / 112 observations for validation
f1: 0.63, kappa: 0.48
Split 1: 1002 observations for training / 112 observations for validation
f1: 0.63, kappa: 0.51
Split 2: 1002 observations for training / 112 observations for validation
f1: 0.63, kappa: 0.48
Split 3: 1002 observations for training / 112 observations for validation
f1: 0.60, kappa: 0.44
Split 4: 1003 observations for training / 111 observations for validation
f1: 0.60, kappa: 0.44
Split 5: 1003 observations for training / 111 observations for validation
f1: 0.62, kappa: 0.45
Split 6: 1003 observations for training / 111 observations for validation
f1: 0.63, kappa: 0.46
Split 7: 1003 observations for training / 111 observations for validation
f1: 0.64, kappa: 0.47
Split 8: 1003 observations for training / 111 observations for validation
f1: 0.64, kappa: 0.48
Split 9: 1003 observations for training / 111 observations for validation
f1: 0.65, kappa: 0.49


## Save results


In [93]:
# Runs with and without hyper-parameter search are logged to separate files.
results_path = DATA_PATH + (
    "results_best_hyperparams.csv" if run_hyperparams_search else "results.csv"
)
# Append to the log; the header is written only when the file is created.
df_results.to_csv(
    results_path,
    mode="a",
    header=not os.path.exists(results_path),
    index=False,
)

In [94]:
# Transposed so each metric of the summary shows on its own line.
df_results.T

Unnamed: 0,0
target,target_layer_3
model,log
n_splits,10
f1,0.648702
kappa,0.493639
precision,0.657647
recall,0.650041
contenido_f1,0.544801
racionalidad_f1,0.547427
resultados_f1,0.853879


In [95]:
# Class balance of the training split from the LAST fold of the training loop
# (df_train is reassigned on every iteration).
df_train[col_target].value_counts(normalize=True)

resultados      0.673978
contenido       0.218345
racionalidad    0.107677
Name: target_layer_3, dtype: float64

In [96]:
# Ensemble the per-fold models: for every test row, sum the class
# probabilities over all folds and rescale them so they add up to 1.
col_probas = []
for j in range(x_test.shape[0]):
    temp_probas = np.array([fold_probas[j] for fold_probas in test_probas])
    col_probas.append(temp_probas.sum(axis=0) / temp_probas.sum())

# Predicted label = class with the highest ensembled probability.
df_test[col_target] = le.inverse_transform(np.argmax(np.array(col_probas), axis=1))

# One "prob_<class>" column per class known to the label encoder (not
# limited to three classes).
df_probas = pd.DataFrame(col_probas).rename(
    columns={position: f"prob_{label}" for position, label in enumerate(le.classes_)}
)

# Concat df_test with df_probas column-wise; the row count must not change.
# NOTE: re-running this cell appends the probability columns again.
rows_prev = df_test.shape[0]
df_test = pd.concat([df_test, df_probas], axis=1)

assert df_test.shape[0] == rows_prev

In [97]:
# Write the predictions. Full-dataset runs write output_<target>.csv, sample
# runs write sample_preds_<target>.csv; the exported columns are the same.
df_test[["id_tweet", "link", "text", col_target] + df_probas.columns.tolist()].to_csv(
    DATA_PATH
    + (f"output_{col_target}.csv" if use_full_dataset else f"sample_preds_{col_target}.csv"),
    index=False,
)